// Synchronously downloads the resource at Uri and stores the result in DownloadedPage.
// Network, protocol, read and unsupported-URI failures are reported on the console
// instead of being propagated; in those cases this call does not assign DownloadedPage.
private void DownloadPage()
{
    try
    {
        // Create is a static member of WebRequest; calling it through the
        // declaring type makes it clear that no HttpWebRequest is guaranteed.
        WebRequest request = WebRequest.Create(Uri);

        // The response owns the underlying connection, so it is disposed together
        // with the reader even when reading the body fails part-way through.
        using (WebResponse response = request.GetResponse())
        using (StreamReader streamReader = new StreamReader(response.GetResponseStream()))
        {
            DownloadedPage = new Page(streamReader.ReadToEnd(), Uri);
        }
    }
    catch (WebException ex)
    {
        Console.WriteLine("Network or protocol error : {0}", ex.Message);
    }
    catch (IOException ex)
    {
        // The connection can drop while the body is being read.
        Console.WriteLine("Error while reading the response : {0}", ex.Message);
    }
    catch (NotSupportedException ex)
    {
        Console.WriteLine("URI format not supported : {0}", ex.Message);
    }
    finally
    {
        // The completed flag is set true no matter what the outcome,
        // so the dispatcher thread can dispose of the fetcher appropriately.
        Completed = true;
    }
}
// Persists a page as two files in the target folder, both named after the page hash:
//   <pagehash>.link - a single line holding the absolute URI of the page
//   <pagehash>      - the body of the page, written through page.Document.Save
// Both writes are performed while holding _lockObject in order to avoid
// IO problems with possible concurrent writes.
public void Save(Page page)
{
    String bodyPath = Path.Combine(_targetFolder.FullName, page.Hash);
    String linkPath = bodyPath + ".link";

    lock (_lockObject)
    {
        using (StreamWriter linkWriter = new StreamWriter(linkPath))
        {
            linkWriter.WriteLine(page.Uri.AbsoluteUri);
        }

        page.Document.Save(bodyPath);
    }
}
// Called every time a page has been fetched.
// If the page is not in the cache yet, it is added to the cache and the
// NewPageFetched event is raised; the listener decides what to do with the page.
// NOTE: the cache is only consulted and updated when the event has at least one subscriber.
// Afterwards every link on the page that is not matched by any of the link
// filters is wrapped in a new Fetcher and added to the queue, and round it goes.
public void OnPageLoaded(Page page)
{
    // Snapshot the delegate: reading the event field twice (null check, then
    // invocation) can throw a NullReferenceException if the last subscriber
    // detaches on another thread in between.
    var handler = NewPageFetched;
    if (handler != null && _cache.Get(page.Uri) == null)
    {
        _cache.Add(page);
        handler(page);
    }

    // A link is followed only when no filter matches it.
    foreach (Uri link in page.Links.Where(x => !_linkFilters.Any(y => y.Matches(x))))
    {
        _queue.Enqueue(new Fetcher(link));
    }
}
//http://msdn.microsoft.com/en-us/library/86wf6409%28v=vs.71%29.aspx
// Completion callback for an asynchronous read on the response stream.
// Each non-empty chunk is decoded and appended to state.RequestData, then the next
// read is scheduled. When the stream is exhausted the accumulated text (if any)
// becomes DownloadedPage. Completed is set and the stream is closed both on
// normal completion and when the transfer fails, so the fetcher never stays
// pending forever.
private void OnStreamRead(IAsyncResult result)
{
    // Direct cast: a state object of the wrong type fails here with a clear
    // InvalidCastException instead of a NullReferenceException further down.
    RequestState state = (RequestState)result.AsyncState;

    // Retrieve the response stream stored in the request state.
    Stream responseStream = state.ResponseStream;

    // Stays true unless another read has been scheduled successfully.
    bool finished = true;
    try
    {
        // EndRead returns the number of bytes read; 0 means end of stream.
        int read = responseStream.EndRead(result);
        if (read > 0)
        {
            // Prepare a Char array buffer for the decoded characters.
            Char[] charBuffer = new Char[RequestState.BUFFER_SIZE * 2];

            // Decode the bytes just read; len is the number of characters produced.
            int len = state.StreamDecode.GetChars(state.BufferRead, 0, read, charBuffer, 0);

            // Append the recently read data to the RequestData StringBuilder
            // contained in RequestState.
            state.RequestData.Append(charBuffer, 0, len);

            // Continue reading until EndRead reports 0 bytes.
            responseStream.BeginRead(
                state.BufferRead,
                0,
                RequestState.BUFFER_SIZE,
                new AsyncCallback(OnStreamRead),
                state);
            finished = false;
        }
        else if (state.RequestData.Length > 0)
        {
            DownloadedPage = new Page(state.RequestData.ToString(), Uri);
        }
    }
    catch (IOException ex)
    {
        Console.WriteLine("Error while reading the response : {0}", ex.Message);
    }
    catch (WebException ex)
    {
        Console.WriteLine("Network or protocol error : {0}", ex.Message);
    }
    finally
    {
        if (finished)
        {
            Completed = true;

            // Close down the response stream.
            responseStream.Close();
        }
    }
}