/// <summary>
/// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
/// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
/// crawling threads that may be interrupted or aborted at any time it must be able
/// to handle ThreadAbortException and ThreadInterruptedException.
/// </summary>
/// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
/// object that encapsulates the url that must be crawled. Passed by reference because
/// the method may rewrite its Url (on redirect) and CRC fields.</param>
private void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
{
    try
    {
        UrlCrawlData urlData = new UrlCrawlData();
        HiResTimer timer = new HiResTimer();

        //create the web request and download the data
        HttpWebRequest pageRequest = null;
        try
        {
            pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
        }
        catch
        {
            //the url could not even be turned into a request; record it as a
            //BadRequest result so it is not retried blindly, then bail out.
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = true;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
            urlData.Data = String.Empty;
            //BUGFIX: UpdateStatistics was previously called twice in this path,
            //double-counting the failure; it is now called exactly once.
            UpdateStatistics(HttpStatusCode.BadRequest, 0);
            lock(crawledUrls.SyncRoot)
            {
                crawledUrls.Add(urlData);
            }
            return;
        }
        pageRequest.UserAgent = globals.UserAgent;
        pageRequest.Timeout = ExponentialBackoff.DefaultBackoff; //page timeout = 30 seconds
        HttpWebResponse pageResponse = null;
        try
        {
            timer.Start();
            pageResponse = (HttpWebResponse)pageRequest.GetResponse();
            //the above line might throw either WebException or UriFormatException
        }
        catch(WebException we)
        {
            HttpWebResponse response = (HttpWebResponse)we.Response;
            if(response != null)
            {
                //although an exception occured we're able to get the Status Code
                urlData.HttpStatusCode = response.StatusCode;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(response.StatusCode, response.ContentLength);
                //BUGFIX: the error response was never closed, leaking the
                //underlying connection until finalization.
                response.Close();
            }
            else
            {
                //no response at all (DNS failure, connect failure, ...)
                urlData.HttpStatusCode = HttpStatusCode.BadRequest;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(HttpStatusCode.BadRequest, 0);
            }
        }
        catch(UriFormatException)
        {
            //this will occur if the url is not valid
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = false;
            urlData.Data = String.Empty;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
        }
        finally
        {
            timer.Stop();
            urlData.TimeStamp = DateTime.UtcNow;
        }
        if(pageResponse != null)
        {
            //update the fields
            urlData.HttpStatusCode = pageResponse.StatusCode;
            //download and parse the contents of the url
            Stream receiveStream = pageResponse.GetResponseStream();
            StreamReader receivedBytes = new StreamReader(receiveStream, defaultEncoding);
            string contents = String.Empty;
            try
            {
                contents = receivedBytes.ReadToEnd();
            }
            catch
            {
                //it should be response timeout not request timeout
                urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                urlData.Updated = true;
                urlData.RetrievalTime = (int)timer.Duration;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                urlData.UrlToCrawl = urlToCrawl;
                //best-effort cleanup; the connection may already be dead
                try
                {
                    receivedBytes.Close();
                    receiveStream.Close();
                    pageResponse.Close();
                }
                catch {}
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
                UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                return;
            }
            //NOTE(review): re-encoding the decoded string as ASCII for the CRC
            //loses non-ASCII information; kept as-is because the stored CRC values
            //of previously crawled urls were computed the same way.
            byte [] buffer = Encoding.ASCII.GetBytes(contents);
            receiveStream.Close();
            receivedBytes.Close();
            UpdateStatistics(pageResponse.StatusCode, contents.Length);
            string redirectUrl = string.Empty;
            if(pageResponse.ResponseUri.AbsoluteUri != urlToCrawl.Url)
            {
                //the server redirected us; normalize and adopt the new url
                redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                urlData.RedirectedPriority = CleanupRedirectUrl(ref redirectUrl);
                if(urlToCrawl.Url != redirectUrl)
                {
                    urlData.Redirected = true;
                    urlToCrawl.Url = redirectUrl;
                }
            }
            Parser parser = SelectParser(pageResponse.ContentType);
            pageResponse.Close();
            //a changed CRC means the page content changed since the last crawl
            long CRC = CompressionUtils.BufferCRC(buffer);
            if(CRC != urlToCrawl.CRC)
            {
                urlData.Updated = true;
                urlToCrawl.CRC = CRC;
            }
            if(urlData.Updated)
            {
                urlData.RetrievalTime = (int)timer.Duration;
                //if redirected, calculate robots, domain & priority for redirect url
                if(urlData.Redirected)
                {
                    urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                    urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(ref redirectUrl, ref urlToCrawl, RobotsMetaTagValue.NoMeta);
                }
                //perform link extraction and content extraction
                ArrayList outlinks = null;
                try
                {
                    if((parser == htmlParser) || (parser == textParser))
                    {
                        //text-based parsers work on the decoded string
                        urlData.Data = parser.ExtractContent(ref contents, false);
                        if(urlData.Data == null)
                        {
                            urlData.Data = String.Empty;
                        }
                        outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                        if(outlinks == null)
                        {
                            outlinks = new ArrayList();
                        }
                    }
                    else
                    {
                        //binary parsers work on the raw buffer
                        urlData.Data = parser.ExtractContent(buffer, false);
                        if(urlData.Data == null)
                        {
                            urlData.Data = String.Empty;
                        }
                        outlinks = parser.ExtractLinks(buffer, ref urlToCrawl);
                        if(outlinks == null)
                        {
                            outlinks = new ArrayList();
                        }
                    }
                }
                catch
                {
                    //parsing failed; keep whatever was extracted so far
                    if(outlinks == null)
                    {
                        outlinks = new ArrayList();
                    }
                }
                urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                for(int i = 0; i < outlinks.Count; i++)
                {
                    urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                }
                //finally update the urlData object with the modified UrlToCrawl
                urlData.UrlToCrawl = urlToCrawl;
            }
        }
        //lock and update CrawledUrls
        lock(crawledUrls.SyncRoot)
        {
            crawledUrls.Add(urlData);
        }
    }
    catch(ThreadAbortException tae)
    {
        //The thread has been asked to abort. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
        }
        return;
    }
    catch(ThreadInterruptedException tie)
    {
        //The thread has been asked to join. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
        }
        return;
    }
    catch(Exception ex)
    {
        if(!(ex is ThreadAbortException)) // the ThreadAbortedException is rethrown
        {
            if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
            {
                globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
            }
        }
        //BUGFIX: was "throw ex;", which resets the stack trace; "throw;"
        //rethrows the original exception intact. PerformCrawling should catch it.
        throw;
    }
    finally
    {
        //NOTE(review): forcing a collection per crawled url is an anti-pattern
        //and likely hurts throughput; kept for now to preserve behavior — TODO remove.
        GC.Collect();
    }
}
/// <summary>
/// DetectConnectionSpeed attempts to detect the computer's internet connection speed
/// by measuring how much time it takes to download the contents of a web site.
/// </summary>
/// <returns>A <see cref="CWConnectionSpeed"/> value containing the estimated internet
/// connection speed, or <see cref="CWConnectionSpeed.Unknown"/> if the download failed
/// or the measurement was not meaningful.</returns>
public static CWConnectionSpeed DetectConnectionSpeed()
{
    CWConnectionSpeed retVal = CWConnectionSpeed.Unknown;
    try
    {
        WebClient client = new WebClient();
        HiResTimer timer = new HiResTimer();
        byte[] data;
        timer.Start();
        try
        {
            data = client.DownloadData("http://www.in.gr/");
        }
        catch
        {
            //download failed; an empty buffer makes us report Unknown below
            data = new byte[0];
        }
        finally
        {
            timer.Stop();
            client.Dispose();
        }
        if (data.Length > 0)
        {
            //BUGFIX: use 8.0 so the multiplication is done in double precision
            //(data.Length * 8 could overflow Int32 for very large downloads).
            //NOTE(review): assumes timer.Duration is in milliseconds so that
            //bits/Duration yields kilobits-per-second — TODO confirm against HiResTimer.
            double Kbps = ((data.Length * 8.0) / timer.Duration);
            //BUGFIX: guard against non-positive or non-finite rates (e.g. a zero
            //Duration); previously such values fell into the "<= 64" bucket or
            //were misreported as ATM.
            if (double.IsNaN(Kbps) || double.IsInfinity(Kbps) || Kbps <= 0)
                retVal = CWConnectionSpeed.Unknown;
            //determine which enumeration value fits best
            else if (Kbps <= 56)
                retVal = CWConnectionSpeed.Modem56K;
            else if (Kbps <= 64)
                retVal = CWConnectionSpeed.ISDN64K;
            else if (Kbps <= 128)
                retVal = CWConnectionSpeed.ISDN128K;
            else if (Kbps <= 256)
                retVal = CWConnectionSpeed.DSL256K;
            else if (Kbps <= 512)
                retVal = CWConnectionSpeed.DSL512K;
            else if (Kbps <= 1024)
                retVal = CWConnectionSpeed.DSL1M;
            else if (Kbps <= 1536)
                retVal = CWConnectionSpeed.T1;
            else if (Kbps <= 46080)
                retVal = CWConnectionSpeed.T3;
            else if (Kbps < 158720)
                retVal = CWConnectionSpeed.Fiber;
            else
                retVal = CWConnectionSpeed.ATM;
        }
    }
    catch
    {
        //never let speed detection throw; Unknown is a safe answer
    }
    return retVal;
}
/// <summary>
/// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
/// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
/// crawling threads that may be interrupted or aborted at any time it must be able
/// to handle ThreadAbortException and ThreadInterruptedException.
/// </summary>
/// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
/// object that encapsulates the url that must be crawled. Passed by reference because
/// the method may rewrite its Url (on redirect) and CRC fields.</param>
internal void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
{
    try
    {
        UrlCrawlData urlData = new UrlCrawlData();
        HiResTimer timer = new HiResTimer();

        //create the web request and download the data
        HttpWebRequest pageRequest = null;
        try
        {
            pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
        }
        catch
        {
            //the url could not even be turned into a request; record it as a
            //BadRequest result so it is not retried blindly, then bail out.
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = true;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
            urlData.Data = String.Empty;
            UpdateStatistics(HttpStatusCode.BadRequest, 0);
            lock(crawledUrls.SyncRoot)
            {
                crawledUrls.Add(urlData);
            }
            return;
        }
        pageRequest.UserAgent = globals.UserAgent;
        pageRequest.Timeout = Backoff.DefaultBackoff; //page timeout = 30 seconds
        pageRequest.KeepAlive = false; //one connection per url; don't pin sockets
        HttpWebResponse pageResponse = null;
        try
        {
            timer.Start();
            pageResponse = (HttpWebResponse)pageRequest.GetResponse();
            //the above line might throw either WebException or UriFormatException
        }
        catch(WebException we)
        {
            HttpWebResponse response = (HttpWebResponse)we.Response;
            if(response != null)
            {
                //although an exception occured we're able to get the Status Code
                urlData.HttpStatusCode = response.StatusCode;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(response.StatusCode, response.ContentLength);
                response.Close();
            }
            else
            {
                //no response at all (DNS failure, connect failure, ...)
                urlData.HttpStatusCode = HttpStatusCode.BadRequest;
                urlData.Updated = true;
                urlData.UrlToCrawl = urlToCrawl;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                UpdateStatistics(HttpStatusCode.BadRequest, 0);
            }
        }
        catch(UriFormatException)
        {
            //this will occur if the url is not valid
            urlData.HttpStatusCode = HttpStatusCode.BadRequest;
            urlData.Updated = false;
            urlData.Data = String.Empty;
            urlData.UrlToCrawl = urlToCrawl;
            urlData.OutLinks = new InternetUrlToIndex[0];
        }
        finally
        {
            timer.Stop();
            urlData.TimeStamp = DateTime.UtcNow;
        }
        if(pageResponse != null)
        {
            //update the fields
            urlData.HttpStatusCode = pageResponse.StatusCode;
            //download the raw bytes of the url in 4KB chunks
            Stream receiveStream = pageResponse.GetResponseStream();
            MemoryStream receivedBytes = new MemoryStream();
            byte [] buffer = new byte[4096];
            int read = 0;
            try
            {
                while((read = receiveStream.Read(buffer, 0, 4096)) > 0)
                {
                    receivedBytes.Write(buffer, 0, read);
                }
            }
            catch
            {
                //it should be response timeout not request timeout
                urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
                urlData.Updated = true;
                urlData.RetrievalTime = (int)timer.Duration;
                urlData.Data = String.Empty;
                urlData.OutLinks = new InternetUrlToIndex[0];
                urlData.UrlToCrawl = urlToCrawl;
                //best-effort cleanup; the connection may already be dead
                try
                {
                    receivedBytes.Close();
                    receiveStream.Close();
                    pageResponse.Close();
                }
                catch {}
                lock(crawledUrls.SyncRoot)
                {
                    crawledUrls.Add(urlData);
                }
                UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
                return;
            }
            buffer = receivedBytes.ToArray();
            Parser parser = SelectParser(pageResponse.ContentType);
            string contents = String.Empty;
            if(parser == htmlParser)
            {
                //NOTE(review): this switches on the HTTP Content-Encoding header
                //(a compression scheme), yet the default branch treats the value
                //as a character-set name; CharacterSet may be what was intended —
                //kept as-is, TODO confirm.
                Encoding encoding = null;
                switch(pageResponse.ContentEncoding)
                {
                    case "":
                    case "none":
                        contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                        //re-check the encoding against the document itself (meta tags)
                        encoding = DetectContentEncoding(ref contents);
                        if(encoding != defaultEncoding)
                        {
                            contents = encoding.GetString(buffer, 0, buffer.Length);
                        }
                        break;
                    case "gzip":
                    case "x-gzip":
                        //first decompress the stream and then re-check the encoding
                        byte [] decompressed_buffer = new byte [0];
                        DecompressGzippedContent(buffer, out decompressed_buffer);
                        contents = this.defaultEncoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                        encoding = DetectContentEncoding(ref contents);
                        if(encoding != defaultEncoding)
                        {
                            contents = encoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
                        }
                        break;
                    default:
                        try
                        {
                            encoding = Encoding.GetEncoding(pageResponse.ContentEncoding);
                            contents = encoding.GetString(buffer, 0, buffer.Length);
                        }
                        catch //(NotSupportedException)
                        {
                            //the encoding specified is unsupported.
                            encoding = defaultEncoding;
                            contents = String.Empty;
                        }
                        break;
                }
            }
            else
            {
                if(parser == textParser)
                {
                    try
                    {
                        contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
                    }
                    catch
                    {
                        //something went seriously wrong here. The crawler got a header that says the server is
                        //sending back a plain text document but for some reason we can't get the string contents.
                        contents = String.Empty;
                    }
                }
            }
            receiveStream.Close();
            receivedBytes.Close();
            UpdateStatistics(pageResponse.StatusCode, buffer.Length);
            string redirectUrl = string.Empty;
            if(pageResponse.ResponseUri.AbsoluteUri != urlToCrawl.Url)
            {
                //the server redirected us; normalize and adopt the new url
                redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
                urlData.RedirectedPriority = htmlParser.CleanUrlParams(ref redirectUrl); //CleanupRedirectUrl(ref redirectUrl);
                if(urlToCrawl.Url != redirectUrl)
                {
                    urlData.Redirected = true;
                    urlToCrawl.Url = redirectUrl;
                }
            }
            pageResponse.Close();
            //a changed CRC means the page content changed since the last crawl
            long CRC = CompressionUtils.BufferCRC(buffer);
            if(CRC != urlToCrawl.CRC)
            {
                urlData.Updated = true;
                urlToCrawl.CRC = CRC;
            }
            if(urlData.Updated)
            {
                urlData.RetrievalTime = (int)timer.Duration;
                //if redirected, calculate robots, domain & priority for redirect url
                if(urlData.Redirected)
                {
                    urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
                    urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(redirectUrl, urlToCrawl, RobotsMetaTagValue.NoMeta);
                }
                //perform link extraction and content extraction
                ArrayList outlinks = null;
                try
                {
                    if((parser == htmlParser) || (parser == textParser))
                    {
                        //text-based parsers work on the decoded string; the stored
                        //content is capped at 1MB and Base64-encoded
                        string clean = parser.ExtractContent(ref contents, false);
                        if(clean.Length > 1048576)
                        {
                            clean = clean.Substring(0, 1048576);
                        }
                        urlData.Data = InternetUtils.Base64Encode(clean);
                        if(urlData.Data == null)
                        {
                            urlData.Data = String.Empty;
                        }
                        outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
                        if(outlinks == null)
                        {
                            outlinks = new ArrayList();
                        }
                    }
                    else
                    {
                        //binary parsers extract text from the raw buffer first
                        contents = parser.ExtractContent(buffer, false);
                        if(contents.Length > 1048576)
                        {
                            contents = contents.Substring(0, 1048576);
                        }
                        urlData.Data = InternetUtils.Base64Encode(contents);
                        if(urlData.Data == null)
                        {
                            urlData.Data = String.Empty;
                        }
                        if(parser == pdfParser)
                        {
                            outlinks = textParser.ExtractLinks(ref contents, ref urlToCrawl);
                        }
                        else
                        {
                            outlinks = htmlParser.ExtractLinks(ref contents, ref urlToCrawl);
                        }
                        if(outlinks == null)
                        {
                            outlinks = new ArrayList();
                        }
                    }
                }
                catch
                {
                    //parsing failed; keep whatever was extracted so far
                    if(outlinks == null)
                    {
                        outlinks = new ArrayList();
                    }
                }
                urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
                for(int i = 0; i < outlinks.Count; i++)
                {
                    urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
                }
                //finally update the urlData object with the modified UrlToCrawl
                urlData.UrlToCrawl = urlToCrawl;
            }
        }
        //lock and update CrawledUrls
        lock(crawledUrls.SyncRoot)
        {
            crawledUrls.Add(urlData);
        }
    }
    catch(ThreadAbortException tae)
    {
        //The thread has been asked to abort. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
        }
        return;
    }
    catch(ThreadInterruptedException tie)
    {
        //The thread has been asked to join. Log information and return at once
        if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
        {
            globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
        }
        return;
    }
    catch(Exception ex)
    {
        if(!(ex is ThreadAbortException)) // the ThreadAbortedException is rethrown
        {
            if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
            {
                globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
            }
        }
        //BUGFIX: was "throw ex;", which resets the stack trace; "throw;"
        //rethrows the original exception intact. PerformCrawling should catch it.
        throw;
    }
    finally
    {
        //NOTE(review): forcing a collection per crawled url is an anti-pattern
        //and likely hurts throughput; kept for now to preserve behavior — TODO remove.
        GC.Collect();
    }
}
/// <summary>
/// DetectConnectionSpeed attempts to detect the computer's internet connection speed
/// by measuring how much time it takes to download the contents of a web site.
/// </summary>
/// <returns>A <see cref="CWConnectionSpeed"/> value containing the estimated internet
/// connection speed, or <see cref="CWConnectionSpeed.Unknown"/> if the download failed
/// or the measurement was not meaningful.</returns>
public static CWConnectionSpeed DetectConnectionSpeed()
{
    CWConnectionSpeed retVal = CWConnectionSpeed.Unknown;
    try
    {
        WebClient client = new WebClient();
        HiResTimer timer = new HiResTimer();
        byte[] data;
        timer.Start();
        try
        {
            data = client.DownloadData("http://www.in.gr/");
        }
        catch
        {
            //download failed; an empty buffer makes us report Unknown below
            data = new byte[0];
        }
        finally
        {
            timer.Stop();
            client.Dispose();
        }
        if (data.Length > 0)
        {
            //BUGFIX: use 8.0 so the multiplication is done in double precision
            //(data.Length * 8 could overflow Int32 for very large downloads).
            //NOTE(review): assumes timer.Duration is in milliseconds so that
            //bits/Duration yields kilobits-per-second — TODO confirm against HiResTimer.
            double Kbps = ((data.Length * 8.0) / timer.Duration);
            //BUGFIX: guard against non-positive or non-finite rates (e.g. a zero
            //Duration); previously such values fell into the "<= 64" bucket or
            //were misreported as ATM.
            if (double.IsNaN(Kbps) || double.IsInfinity(Kbps) || Kbps <= 0)
            {
                retVal = CWConnectionSpeed.Unknown;
            }
            //determine which enumeration value fits best
            else if (Kbps <= 56)
            {
                retVal = CWConnectionSpeed.Modem56K;
            }
            else if (Kbps <= 64)
            {
                retVal = CWConnectionSpeed.ISDN64K;
            }
            else if (Kbps <= 128)
            {
                retVal = CWConnectionSpeed.ISDN128K;
            }
            else if (Kbps <= 256)
            {
                retVal = CWConnectionSpeed.DSL256K;
            }
            else if (Kbps <= 512)
            {
                retVal = CWConnectionSpeed.DSL512K;
            }
            else if (Kbps <= 1024)
            {
                retVal = CWConnectionSpeed.DSL1M;
            }
            else if (Kbps <= 1536)
            {
                retVal = CWConnectionSpeed.T1;
            }
            else if (Kbps <= 46080)
            {
                retVal = CWConnectionSpeed.T3;
            }
            else if (Kbps < 158720)
            {
                retVal = CWConnectionSpeed.Fiber;
            }
            else
            {
                retVal = CWConnectionSpeed.ATM;
            }
        }
    }
    catch
    {
        //never let speed detection throw; Unknown is a safe answer
    }
    return(retVal);
}