/// <summary>
/// Constructs a new <see cref="InternetUrlToCrawl"/> object from an existing <see cref="InternetUrlToCrawl"/> object.
/// </summary>
/// <param name="IUrl">The existing <see cref="InternetUrlToCrawl"/> object.</param>
public InternetUrlToCrawl(InternetUrlToCrawl IUrl)
{
	ID = IUrl.ID;
	Url = IUrl.Url;
	//m_UrlMD5=IUrlVal.MD5;
	crc = IUrl.CRC;
	flagDomain = IUrl.FlagDomain;
	flagFetchRobots = IUrl.FlagFetchRobots;
	robotsDisallowedPaths = IUrl.RobotsDisallowedPaths;
}
/// <summary>
/// Returns the host name of an <see cref="InternetUrlToCrawl"/>.
/// </summary>
/// <param name="url">The <see cref="InternetUrlToCrawl"/> to examine.</param>
/// <returns>A string containing the Url's host name or IP address.</returns>
public static string HostName(InternetUrlToCrawl url)
{
	try
	{
		Uri uri = new Uri(url.Url);
		return uri.Host;
	}
	catch
	{
		return String.Empty;
	}
}
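// Usage sketch (illustrative, not part of the original source). Because HostName swallows the
// parsing exception internally, a malformed Url simply yields String.Empty, so the helper can be
// used directly when grouping urls by host. The local variable names below are arbitrary.
InternetUrlToCrawl example = new InternetUrlToCrawl();
example.Url = "http://www.example.com/path/page.html";
string host = InternetUtils.HostName(example); //"www.example.com"; an invalid Url would give ""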
/// <summary>
/// Constructs an instance of the <see cref="UrlCrawlData"/> class and initializes
/// it with the default values.
/// </summary>
public UrlCrawlData()
{
	url = new InternetUrlToCrawl();
	updated = false;
	redirected = false;
	redirectedFlagRobots = false;
	redirectedFlagDomain = DomainFlagValue.MustVisit;
	redirectedPriority = 255;
	httpStatusCode = HttpStatusCode.OK;
	data = String.Empty;
	timeStamp = DateTime.UtcNow;
	retrievalTime = 0;
	outLinks = null; //new ArrayList();
}
/// <summary>
/// Extracts links from the contents of a document.
/// </summary>
/// <param name="content">The contents of the document.</param>
/// <param name="contentUrl">The url of the document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <remarks>This method <b>ALWAYS</b> returns an empty ArrayList.</remarks>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
	ArrayList links = new ArrayList();
	ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
	OnExtractLinksComplete(e);
	return links;
}
/// <summary>
/// Logs the client's request and returns a set of urls that are ready to be crawled,
/// delegating the selection to the engine's SelectUrlsToCrawl method.
/// </summary>
/// <param name="ci">The <see cref="ClientInfo"/> of the client requesting urls to crawl.</param>
/// <param name="data">An array of <see cref="InternetUrlToCrawl"/> objects containing the selected urls.</param>
/// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/>
/// encapsulating the error that occurred if the operation fails.</returns>
public SerializedException SendUrlsToCrawl(ClientInfo ci, out InternetUrlToCrawl[] data)
{
	data = null;
	engine.LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
	return engine.SelectUrlsToCrawl(ci, ref data);
}
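// Usage sketch (illustrative, not part of the original source): how a web-service client might
// consume SendUrlsToCrawl. By the convention used in this code a null SerializedException means
// the call succeeded. The proxy and clientInfo names are assumed.
InternetUrlToCrawl[] urls;
SerializedException sx = proxy.SendUrlsToCrawl(clientInfo, out urls);
if(sx == null)
{
	for(int i = 0; i < urls.Length; i++)
	{
		//hand each url to a crawling thread, e.g. the CrawlUrl(ref InternetUrlToCrawl) method shown below
	}
}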
/// <summary>
/// Extracts links from the contents of a SWF document.
/// </summary>
/// <param name="content">The contents of the SWF document.</param>
/// <param name="contentUrl">The url of the SWF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
	ArrayList links = null;
	if((content == null) || (content.Length == 0))
	{
		throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
	}
	try
	{
		mutex.WaitOne();
		string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
		string swfFileName = FileName + ".swf";
		string htmFileName = FileName + ".htm";
		FileStream swf = null;
		StreamReader htm = null;
		try
		{
			//store the swf file
			swf = new FileStream(swfFileName, FileMode.Create);
			swf.Write(content, 0, content.Length);
			swf.Close();
			swf = null;
			//convert it to html
			bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
			if(success)
			{
				htm = new StreamReader(htmFileName, encoding);
				string html = htm.ReadToEnd();
				htm.Close();
				htm = null;
				links = parser.ExtractLinks(ref html, ref contentUrl);
			}
		}
		catch(Exception ex)
		{
			if(swf != null)
			{
				try { swf.Close(); } catch {}
			}
			if(htm != null)
			{
				try { htm.Close(); } catch {}
			}
			if(globals.Settings.LogLevel <= CWLogLevel.LogInfo)
			{
				globals.FileLog.LogInfo("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
			}
		}
		finally
		{
			File.Delete(swfFileName);
			File.Delete(htmFileName);
		}
	}
	catch(Exception ex)
	{
		if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
		{
			globals.FileLog.LogWarning("SwfParser failed to extract links from " + contentUrl.Url + ": " + ex.Message);
		}
	}
	finally
	{
		GC.Collect();
		mutex.ReleaseMutex();
	}
	ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
	OnExtractLinksComplete(e);
	return links;
}
/// <summary>
/// Extracts links from the contents of a SWF document.
/// </summary>
/// <param name="content">The contents of the SWF document.</param>
/// <param name="contentUrl">The url of the SWF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="NotSupportedException">Whenever this method is called.</exception>
/// <remarks>
/// Since a SWF document cannot be converted to a string this method <b>ALWAYS</b>
/// throws a <see cref="NotSupportedException"/>.
/// </remarks>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
	throw new NotSupportedException();
}
/// <summary>
/// Returns the host name of an <see cref="InternetUrlToCrawl"/>.
/// </summary>
/// <param name="url">The <see cref="InternetUrlToCrawl"/> to examine.</param>
/// <returns>A string containing the Url's host name or IP address.</returns>
public static string HostName(InternetUrlToCrawl url)
{
	try
	{
		Uri uri = new Uri(url.Url);
		return uri.Host;
	}
	catch
	{
		return String.Empty;
	}
}
/// <summary>
/// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
/// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
/// crawling threads that may be interrupted or aborted at any time it must be able
/// to handle ThreadAbortException and ThreadInterruptedException.
/// </summary>
/// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
/// object that encapsulates the url that must be crawled.</param>
private void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
{
	try
	{
		UrlCrawlData urlData = new UrlCrawlData();
		HiResTimer timer = new HiResTimer();
		//create the web request and download the data
		HttpWebRequest pageRequest = null;
		try
		{
			pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
		}
		catch
		{
			urlData.HttpStatusCode = HttpStatusCode.BadRequest; //TODO comment
			urlData.Updated = true;
			urlData.UrlToCrawl = urlToCrawl;
			urlData.OutLinks = new InternetUrlToIndex[0];
			urlData.Data = String.Empty;
			UpdateStatistics(HttpStatusCode.BadRequest, 0);
			lock(crawledUrls.SyncRoot)
			{
				crawledUrls.Add(urlData);
			}
			return;
		}
		pageRequest.UserAgent = globals.UserAgent;
		pageRequest.Timeout = ExponentialBackoff.DefaultBackoff; //page timeout = 30 seconds
		HttpWebResponse pageResponse = null;
		try
		{
			timer.Start();
			pageResponse = (HttpWebResponse)pageRequest.GetResponse();
			//the above line might throw either WebException or UriFormatException
		}
		catch(WebException we)
		{
			HttpWebResponse response = (HttpWebResponse)we.Response;
			if(response != null)
			{
				//although an exception occurred we're able to get the Status Code
				urlData.HttpStatusCode = response.StatusCode;
				urlData.Updated = true;
				urlData.UrlToCrawl = urlToCrawl;
				urlData.Data = String.Empty;
				urlData.OutLinks = new InternetUrlToIndex[0];
				UpdateStatistics(response.StatusCode, response.ContentLength);
			}
			else
			{
				urlData.HttpStatusCode = HttpStatusCode.BadRequest; //TODO comment
				urlData.Updated = true;
				urlData.UrlToCrawl = urlToCrawl;
				urlData.Data = String.Empty;
				urlData.OutLinks = new InternetUrlToIndex[0];
				UpdateStatistics(HttpStatusCode.BadRequest, 0);
			}
		}
		catch(UriFormatException)
		{
			//this will occur if the url is not valid
			urlData.HttpStatusCode = HttpStatusCode.BadRequest;
			urlData.Updated = false;
			urlData.Data = String.Empty;
			urlData.UrlToCrawl = urlToCrawl;
			urlData.OutLinks = new InternetUrlToIndex[0];
		}
		finally
		{
			timer.Stop();
			urlData.TimeStamp = DateTime.UtcNow;
		}
		if(pageResponse != null)
		{
			//update the fields
			urlData.HttpStatusCode = pageResponse.StatusCode;
			//download and parse the contents of the url
			Stream receiveStream = pageResponse.GetResponseStream();
			StreamReader receivedBytes = new StreamReader(receiveStream, defaultEncoding);
			string contents = String.Empty;
			try
			{
				contents = receivedBytes.ReadToEnd();
			}
			catch
			{
				//it should be response timeout not request timeout
				urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
				urlData.Updated = true;
				urlData.RetrievalTime = (int)timer.Duration;
				urlData.Data = String.Empty;
				urlData.OutLinks = new InternetUrlToIndex[0];
				urlData.UrlToCrawl = urlToCrawl;
				try
				{
					receivedBytes.Close();
					receiveStream.Close();
					pageResponse.Close();
				}
				catch {}
				lock(crawledUrls.SyncRoot)
				{
					crawledUrls.Add(urlData);
				}
				UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
				return;
			}
			byte[] buffer = Encoding.ASCII.GetBytes(contents);
			receiveStream.Close();
			receivedBytes.Close();
			UpdateStatistics(pageResponse.StatusCode, contents.Length);
			string redirectUrl = string.Empty;
			if(pageResponse.ResponseUri.AbsoluteUri != urlToCrawl.Url)
			{//now that was a bloody BUGBUG
				redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
				urlData.RedirectedPriority = CleanupRedirectUrl(ref redirectUrl);
				if(urlToCrawl.Url != redirectUrl)
				{
					urlData.Redirected = true;
					urlToCrawl.Url = redirectUrl;
				}
			}
			Parser parser = SelectParser(pageResponse.ContentType);
			pageResponse.Close();
			long CRC = CompressionUtils.BufferCRC(buffer);
			if(CRC != urlToCrawl.CRC)
			{
				urlData.Updated = true;
				urlToCrawl.CRC = CRC;
			}
			if(urlData.Updated)
			{
				urlData.RetrievalTime = (int)timer.Duration;
				//if redirected, calculate robots, domain & priority for redirect url
				if(urlData.Redirected)
				{
					urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
					urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(ref redirectUrl, ref urlToCrawl, RobotsMetaTagValue.NoMeta);
				}
				//perform link extraction and content extraction
				ArrayList outlinks = null;
				try
				{
					if((parser == htmlParser) || (parser == textParser))
					{
						urlData.Data = parser.ExtractContent(ref contents, false);
						if(urlData.Data == null) { urlData.Data = String.Empty; }
						outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
						if(outlinks == null) { outlinks = new ArrayList(); }
					}
					else
					{
						urlData.Data = parser.ExtractContent(buffer, false);
						if(urlData.Data == null) { urlData.Data = String.Empty; }
						outlinks = parser.ExtractLinks(buffer, ref urlToCrawl);
						if(outlinks == null) { outlinks = new ArrayList(); }
					}
				}
				catch
				{
					if(outlinks == null) { outlinks = new ArrayList(); }
				}
				urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
				for(int i = 0; i < outlinks.Count; i++)
				{
					urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
				}
				//finally update the urlData object with the modified UrlToCrawl
				urlData.UrlToCrawl = urlToCrawl;
			}
		}
		//lock and update CrawledUrls
		lock(crawledUrls.SyncRoot)
		{
			crawledUrls.Add(urlData);
		}
	}
	catch(ThreadAbortException tae)
	{
		//The thread has been asked to abort. Log information and return at once
		if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
		{
			globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
		}
		return;
	}
	catch(ThreadInterruptedException tie)
	{
		//The thread has been asked to join. Log information and return at once
		if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
		{
			globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
		}
		return;
	}
	catch(Exception ex)
	{
		if(!(ex is ThreadAbortException)) //the ThreadAbortException is rethrown
		{
			if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
			{
				globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
			}
		}
		throw; //PerformCrawling should catch this one
	}
	finally
	{
		GC.Collect();
	}
}
/// <summary>
/// Extracts links from the contents of a PDF document.
/// </summary>
/// <param name="content">The contents of the PDF document.</param>
/// <param name="contentUrl">The url of the PDF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="NotSupportedException">Whenever this method is called.</exception>
/// <remarks>
/// Since a PDF document cannot be converted to a string this method <b>ALWAYS</b>
/// throws a <see cref="NotSupportedException"/>.
/// </remarks>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
	throw new NotSupportedException("The ExtractLinks method of the PdfParser cannot accept a string as input.");
}
/// <summary>
/// Performs the extraction of links from a text document. It can extract simple
/// links that are separated from the rest of the text using spaces or line breaks
/// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
/// of <see cref="InternetUrlToIndex"/> objects.
/// </summary>
/// <remarks>
/// Besides the parsing and extraction of Urls, ExtractLinks also performs other
/// tasks as well, such as:<br/>
/// <list type="bullet">
/// <item>
/// <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
/// </item>
/// <item>
/// <description>Filtering of multiple links to the same url and to the document itself.</description>
/// </item>
/// <item>
/// <description>Filtering of session id variables in dynamic Urls and limiting
/// of the number of GET variables in dynamic Urls.</description>
/// </item>
/// <item>
/// <description>Flagging of Urls according to their country domain.</description>
/// </item>
/// </list>
/// <b>Update History</b>
/// <list type="table">
/// <listheader>
/// <term>Date</term>
/// <description>Description</description>
/// </listheader>
/// <item>
/// <term>15/09/04</term>
/// <description>First release. A lot more needs to be done.</description>
/// </item>
/// </list>
/// </remarks>
/// <param name="content">The text that must be parsed for links. It is passed as
/// an array of bytes containing the text contents in UTF8 binary format, in order
/// to reduce memory consumption.</param>
/// <param name="contentUrl">The Url from which the content comes.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
	ArrayList retVal = null;
	try
	{
		mutex.WaitOne();
		string html = Encoding.UTF8.GetString(content);
		retVal = ExtractLinks(ref html, ref contentUrl);
	}
	catch {}
	finally
	{
		mutex.ReleaseMutex();
	}
	return retVal;
}
/// <summary>
/// Performs the extraction of links from a text document. It can extract simple
/// links that are separated from the rest of the text using spaces or line breaks
/// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
/// of <see cref="InternetUrlToIndex"/> objects.
/// </summary>
/// <remarks>
/// Besides the parsing and extraction of Urls, ExtractLinks also performs other
/// tasks as well, such as:<br/>
/// <list type="bullet">
/// <item>
/// <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
/// </item>
/// <item>
/// <description>Filtering of multiple links to the same url and to the document itself.</description>
/// </item>
/// <item>
/// <description>Filtering of session id variables in dynamic Urls and limiting
/// of the number of GET variables in dynamic Urls.</description>
/// </item>
/// <item>
/// <description>Flagging of Urls according to their country domain.</description>
/// </item>
/// </list>
/// <b>Update History</b>
/// <list type="table">
/// <listheader>
/// <term>Date</term>
/// <description>Description</description>
/// </listheader>
/// <item>
/// <term>15/09/04</term>
/// <description>First release. A lot more needs to be done.</description>
/// </item>
/// </list>
/// </remarks>
/// <param name="content">The text that must be parsed for links. It is passed by
/// reference in order to reduce memory consumption.</param>
/// <param name="contentUrl">The Url from which the content comes.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
	ArrayList links = new ArrayList();
	// It is important to notice that if the FlagFetchRobots of the contentUrl is
	// true then the TextParser must remember this value because during the Robots
	// Filtering it will become false so as not to download the robots.txt file
	// every time a Url must be filtered.
	//bool FlagFetchRobots = contentUrl.FlagFetchRobots;
	try
	{
		//make sure only one thread will parse contents at a time.
		//mutex.WaitOne();
		if(contentUrl.FlagDomain != DomainFlagValue.MustVisit)
		{
			contentUrl.FlagDomain = ExtractDomainFlag(ref content);
			if(contentUrl.FlagDomain != DomainFlagValue.MustVisit)
			{
				if(InternetUtils.HostName(contentUrl).Contains("ebay.com"))
				{
					contentUrl.FlagDomain = DomainFlagValue.MustVisit;
				}
			}
		}
		//perform the hyperlink matching
		MatchCollection matches = hrefRegex.Matches(content);
		if(matches.Count > 0)
		{
			string documentUrl = contentUrl.Url;
			string baseUrl = BaseUrl(ref documentUrl);
			byte priority = 0;
			foreach(Match m in matches)
			{
				try
				{
					string url = m.Value.Trim();
					url = NormalizeUrl(ref url, ref baseUrl);
					priority = CleanUrlParams(ref url);
					if(FilterUrl(ref url, ref documentUrl))
					{
						InternetUrlToIndex iurl = new InternetUrlToIndex(url);
						iurl.Priority = priority;
						iurl.FlagDomain = domainFilter.FilterUrl(ref url);
						//[mod 24/2/05] No robots.txt checking is performed for non-greek urls
						if(iurl.FlagDomain == DomainFlagValue.MustVisit)
						{
							iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
						}
						else
						{
							iurl.FlagRobots = false;
						}
						if(!links.Contains(iurl))
						{
							links.Add(iurl);
						}
					}
				}
				catch
				{
					if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
					{
						globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
					}
					continue;
				}
			}
		}
	}
	catch(Exception ex)
	{
		if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
		{
			globals.FileLog.LogWarning(ex.Message);
		}
	}
	finally
	{
		//mutex.ReleaseMutex();
	}
	//contentUrl.FlagFetchRobots = FlagFetchRobots;
	ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
	OnExtractLinksComplete(e);
	links.TrimToSize();
	return links;
}
/// <summary>
/// Checks if the Robots Exclusion Standard allows the crawler to visit a url.
/// </summary>
/// <param name="targetUrl">The url that is to be validated.</param>
/// <param name="sourceUrl">The <see cref="InternetUrlToCrawl"/> containing the targetUrl.</param>
/// <param name="robotsMeta">A <see cref="RobotsMetaTagValue"/> flag indicating the
/// restrictions posed by the robots meta tag contained in the sourceUrl.</param>
/// <returns>A <see cref="Boolean"/> value indicating whether the crawler is
/// allowed (false) or disallowed (true) to visit the target Url.</returns>
/// <remarks>This method is safe for multi-threaded operations. However only one
/// thread will be able to perform a check at any given time.
/// </remarks>
public bool FilterUrl(string targetUrl, InternetUrlToCrawl sourceUrl, RobotsMetaTagValue robotsMeta)
{
	bool retVal = false; //assume that it's allowed to crawl the targetUrl
	try
	{
		mutex.WaitOne();
		//perhaps we should use the hash code of the hostnames as keys.
		string targetHost = InternetUtils.HostName(targetUrl);
		string sourceHost = InternetUtils.HostName(sourceUrl);
		RobotsTxtEntry robots = null;
		//Do we need to fetch the robots.txt for the source Url?
		if(sourceUrl.FlagFetchRobots)
		{
			//we must fetch the robots.txt from the source url host and update sourceUrl.
			robots = FetchRobots(sourceHost);
			sourceUrl.RobotsDisallowedPaths = ConcatenatePaths(robots.DisallowedPaths);
			sourceUrl.FlagFetchRobots = false; //fetch it only once
			//check if it exists in the Hashtable, if so update it, otherwise add it
			if(robotsTable.ContainsKey(sourceHost))
			{
				robotsTable[sourceHost] = robots;
			}
			else
			{
				robotsTable.Add(sourceHost, robots);
			}
		}
		else
		{
			//check if it exists in the Hashtable. If so check if it has expired, else just get it from InternetUrlToCrawl
			if(!robotsTable.TryGetValue(sourceHost, out robots))
			{
				robots = new RobotsTxtEntry();
				robots.DisallowedPaths = SplitPaths(sourceUrl.RobotsDisallowedPaths);
				robotsTable.Add(sourceHost, robots);
			}
			else
			{
				if(robots.ExpirationDate < DateTime.Today)
				{
					robots = FetchRobots(sourceHost);
					robotsTable[sourceHost] = robots;
				}
			}
		}
		if(targetHost != sourceHost)
		{
			//the target url is on a different host, we must get its robots.txt
			if(!robotsTable.TryGetValue(targetHost, out robots))
			{
				robots = FetchRobots(targetHost);
				robotsTable.Add(targetHost, robots);
			}
			else
			{
				if(robots.ExpirationDate < DateTime.Today)
				{
					robots = FetchRobots(targetHost);
					robotsTable[targetHost] = robots;
				}
			}
		}
		if((robotsMeta & RobotsMetaTagValue.NoFollow) > 0)
		{
			//if the meta tag has the NoFollow option set then we cannot crawl targetUrl
			retVal = true;
		}
		else
		{
			robots = robotsTable[targetHost];
			//if the DisallowedPaths is null then we can crawl targetUrl, otherwise we must check the disallowed paths
			if(robots.DisallowedPaths != null)
			{
				for(int i = 0; i < robots.DisallowedPaths.Length; i++)
				{
					if(targetUrl.IndexOf(robots.DisallowedPaths[i]) != -1)
					{
						//we found a match. It is therefore not allowed to crawl targetUrl
						retVal = true;
						break; //stop searching as soon as we have a match
					}
				}
			}
		}
	}
	catch(Exception e)
	{
		if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
		{
			globals.FileLog.LogWarning("RobotsFilter failed to filter " + targetUrl + ": " + e.ToString());
		}
	}
	finally
	{
		mutex.ReleaseMutex();
	}
	return retVal;
}
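// Usage sketch (illustrative, not part of the original source): this mirrors how TextParser.ExtractLinks
// above consults the filter before accepting an extracted link. A return value of true means the
// Robots Exclusion Standard disallows the visit. The url and contentUrl variables are assumed to hold
// a candidate link and the page it was found on.
InternetUrlToIndex iurl = new InternetUrlToIndex(url);
iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta); //true = disallowed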
/// <summary>
/// Constructs an instance of the <see cref="UrlCrawlData"/> class and initializes
/// it with the default values.
/// </summary>
public UrlCrawlData()
{
	url = new InternetUrlToCrawl();
	updated = false;
	redirected = false;
	redirectedFlagRobots = false;
	redirectedFlagDomain = DomainFlagValue.MustVisit;
	redirectedPriority = 255;
	httpStatusCode = HttpStatusCode.OK;
	data = String.Empty;
	timeStamp = DateTime.UtcNow;
	retrievalTime = 0;
	outLinks = null; //new ArrayList();
}
/// <summary>
/// Extracts the hypertext references (links) contained in a document.
/// </summary>
/// <param name="content">
/// The content of the document that will be parsed for links.
/// </param>
/// <param name="contentUrl">
/// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
/// document to be parsed for links and its associated robots.txt file.
/// </param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
/// the links contained in the parsed document.
/// </returns>
public abstract ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl);
/// <summary>
/// Extracts the hypertext references (links) contained in a document.
/// </summary>
/// <param name="content">
/// An array of bytes holding the content of the document that will be parsed for links.
/// </param>
/// <param name="contentUrl">
/// An <see cref="InternetUrlToCrawl"/> object encapsulating the Uri address of the
/// document to be parsed for links and its associated robots.txt file.
/// </param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects encapsulating
/// the links contained in the parsed document.
/// </returns>
public abstract ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl);
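// Minimal sketch (illustrative, not part of the original source) of a concrete parser satisfying the
// two abstract ExtractLinks overloads. The class name ExampleParser is hypothetical and only the
// link-extraction members are shown; the Parser base class declares other members (e.g. ExtractContent)
// that a real subclass would also have to override. The pattern (decode the byte buffer, delegate to
// the string overload, raise the completion event) follows TextParser elsewhere in this code.
public class ExampleParser : Parser
{
	public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
	{
		ArrayList links = new ArrayList();
		//...scan content and add InternetUrlToIndex objects to links...
		OnExtractLinksComplete(new ParserEventArgs(contentUrl.Url));
		return links;
	}

	public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
	{
		//decode the UTF8 buffer and delegate to the string overload
		string text = Encoding.UTF8.GetString(content);
		return ExtractLinks(ref text, ref contentUrl);
	}
}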
/// <summary>
/// Extracts links from the contents of a PDF document.
/// </summary>
/// <param name="content">The contents of the PDF document.</param>
/// <param name="contentUrl">The url of the PDF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
	ArrayList links = null;
	if((content == null) || (content.Length == 0))
	{
		throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
	}
	try
	{
		mutex.WaitOne();
		string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
		string pdfFileName = FileName + ".pdf";
		string txtFileName = FileName + ".txt";
		FileStream pdf = null;
		StreamReader txt = null;
		try
		{
			//store the pdf file
			pdf = new FileStream(pdfFileName, FileMode.Create);
			pdf.Write(content, 0, content.Length);
			pdf.Close();
			pdf = null;
			bool success = false;
			//convert it to text
			try
			{
				converter.loadFile(pdfFileName);
				converter.convertToTextFile(1, converter.numPages, txtFileName);
				success = true;
			}
			catch
			{
				success = false;
			}
			finally
			{
				converter.closeFile();
			}
			if(success)
			{
				txt = new StreamReader(txtFileName, encoding);
				string text = txt.ReadToEnd();
				txt.Close();
				txt = null;
				links = parser.ExtractLinks(ref text, ref contentUrl);
			}
			else
			{
				txt = null;
			}
		}
		catch(Exception ex)
		{
			if(pdf != null)
			{
				try { pdf.Close(); } catch {}
			}
			if(txt != null)
			{
				try { txt.Close(); } catch {}
			}
			if(globals.Settings.LogLevel <= CWLogLevel.LogInfo)
			{
				globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
			}
		}
		finally
		{
			File.Delete(pdfFileName);
			File.Delete(txtFileName);
		}
	}
	catch
	{
		if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
		{
			globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
		}
	}
	finally
	{
		GC.Collect();
		mutex.ReleaseMutex();
	}
	ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
	OnExtractLinksComplete(e);
	return links;
}
/// <summary>
/// Crawls a Url and creates a <see cref="UrlCrawlData"/> object that is stored in
/// the internal crawledUrls <see cref="ArrayList"/>. Since it runs in one of the
/// crawling threads that may be interrupted or aborted at any time it must be able
/// to handle ThreadAbortException and ThreadInterruptedException.
/// </summary>
/// <param name="urlToCrawl">A reference to the <see cref="InternetUrlToCrawl"/>
/// object that encapsulates the url that must be crawled.</param>
internal void CrawlUrl(ref InternetUrlToCrawl urlToCrawl)
{
	try
	{
		UrlCrawlData urlData = new UrlCrawlData();
		HiResTimer timer = new HiResTimer();
		//create the web request and download the data
		HttpWebRequest pageRequest = null;
		try
		{
			pageRequest = (HttpWebRequest)HttpWebRequest.Create(urlToCrawl.Url);
		}
		catch
		{
			urlData.HttpStatusCode = HttpStatusCode.BadRequest; //TODO comment
			urlData.Updated = true;
			urlData.UrlToCrawl = urlToCrawl;
			urlData.OutLinks = new InternetUrlToIndex[0];
			urlData.Data = String.Empty;
			UpdateStatistics(HttpStatusCode.BadRequest, 0);
			lock(crawledUrls.SyncRoot)
			{
				crawledUrls.Add(urlData);
			}
			return;
		}
		pageRequest.UserAgent = globals.UserAgent;
		pageRequest.Timeout = Backoff.DefaultBackoff; //page timeout = 30 seconds
		pageRequest.KeepAlive = false;
		HttpWebResponse pageResponse = null;
		try
		{
			timer.Start();
			pageResponse = (HttpWebResponse)pageRequest.GetResponse();
			//the above line might throw either WebException or UriFormatException
		}
		catch(WebException we)
		{
			HttpWebResponse response = (HttpWebResponse)we.Response;
			if(response != null)
			{
				//although an exception occurred we're able to get the Status Code
				urlData.HttpStatusCode = response.StatusCode;
				urlData.Updated = true;
				urlData.UrlToCrawl = urlToCrawl;
				urlData.Data = String.Empty;
				urlData.OutLinks = new InternetUrlToIndex[0];
				UpdateStatistics(response.StatusCode, response.ContentLength);
				response.Close();
			}
			else
			{
				urlData.HttpStatusCode = HttpStatusCode.BadRequest; //TODO comment
				urlData.Updated = true;
				urlData.UrlToCrawl = urlToCrawl;
				urlData.Data = String.Empty;
				urlData.OutLinks = new InternetUrlToIndex[0];
				UpdateStatistics(HttpStatusCode.BadRequest, 0);
			}
		}
		catch(UriFormatException)
		{
			//this will occur if the url is not valid
			urlData.HttpStatusCode = HttpStatusCode.BadRequest;
			urlData.Updated = false;
			urlData.Data = String.Empty;
			urlData.UrlToCrawl = urlToCrawl;
			urlData.OutLinks = new InternetUrlToIndex[0];
		}
		finally
		{
			timer.Stop();
			urlData.TimeStamp = DateTime.UtcNow;
		}
		if(pageResponse != null)
		{
			//update the fields
			urlData.HttpStatusCode = pageResponse.StatusCode;
			//download and parse the contents of the url
			Stream receiveStream = pageResponse.GetResponseStream();
			/*StreamReader receivedBytes = new StreamReader(receiveStream, defaultEncoding);*/
			MemoryStream receivedBytes = new MemoryStream();
			byte[] buffer = new byte[4096];
			int read = 0;
			try
			{
				while((read = receiveStream.Read(buffer, 0, 4096)) > 0)
				{
					receivedBytes.Write(buffer, 0, read);
				}
			}
			catch
			{
				//it should be response timeout not request timeout
				urlData.HttpStatusCode = HttpStatusCode.RequestTimeout;
				urlData.Updated = true;
				urlData.RetrievalTime = (int)timer.Duration;
				urlData.Data = String.Empty;
				urlData.OutLinks = new InternetUrlToIndex[0];
				urlData.UrlToCrawl = urlToCrawl;
				try
				{
					receivedBytes.Close();
					receiveStream.Close();
					pageResponse.Close();
				}
				catch {}
				lock(crawledUrls.SyncRoot)
				{
					crawledUrls.Add(urlData);
				}
				UpdateStatistics(HttpStatusCode.RequestTimeout, 0);
				return;
			}
			buffer = receivedBytes.ToArray();
			Parser parser = SelectParser(pageResponse.ContentType);
			string contents = String.Empty;
			if(parser == htmlParser)
			{
				Encoding encoding = null;
				switch(pageResponse.ContentEncoding)
				{
					case "":
					case "none":
						contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
						//re-check the encoding
						encoding = DetectContentEncoding(ref contents);
						if(encoding != defaultEncoding) { contents = encoding.GetString(buffer, 0, buffer.Length); }
						break;
					case "gzip":
					case "x-gzip":
						//first decompress the stream and then re-check the encoding
						byte[] decompressed_buffer = new byte[0];
						DecompressGzippedContent(buffer, out decompressed_buffer);
						contents = this.defaultEncoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length);
						//re-check the encoding
						encoding = DetectContentEncoding(ref contents);
						if(encoding != defaultEncoding) { contents = encoding.GetString(decompressed_buffer, 0, decompressed_buffer.Length); }
						break;
					default:
						try
						{
							encoding = Encoding.GetEncoding(pageResponse.ContentEncoding);
							contents = encoding.GetString(buffer, 0, buffer.Length);
						}
						catch //(NotSupportedException)
						{
							encoding = defaultEncoding; //the encoding specified is unsupported.
							contents = String.Empty;
						}
						break;
				}
			}
			else
			{
				if(parser == textParser)
				{
					try
					{
						contents = this.defaultEncoding.GetString(buffer, 0, buffer.Length);
					}
					catch
					{
						//something went seriously wrong here. The crawler got a header that says the server is
						//sending back a plain text document but for some reason we can't get the string contents.
						contents = String.Empty;
					}
				}
			}
			receiveStream.Close();
			receivedBytes.Close();
			UpdateStatistics(pageResponse.StatusCode, buffer.Length);
			string redirectUrl = string.Empty;
			if(pageResponse.ResponseUri.AbsoluteUri != urlToCrawl.Url)
			{
				redirectUrl = pageResponse.ResponseUri.AbsoluteUri;
				urlData.RedirectedPriority = htmlParser.CleanUrlParams(ref redirectUrl); //CleanupRedirectUrl(ref redirectUrl);
				if(urlToCrawl.Url != redirectUrl) //now that was a bloody BUGBUG
				{
					urlData.Redirected = true;
					urlToCrawl.Url = redirectUrl;
				}
			}
			pageResponse.Close();
			long CRC = CompressionUtils.BufferCRC(buffer);
			if(CRC != urlToCrawl.CRC)
			{
				urlData.Updated = true;
				urlToCrawl.CRC = CRC;
			}
			if(urlData.Updated)
			{
				urlData.RetrievalTime = (int)timer.Duration;
				//if redirected, calculate robots, domain & priority for redirect url
				if(urlData.Redirected)
				{
					urlData.RedirectedFlagDomain = domainFilter.FilterUrl(ref redirectUrl);
					urlData.RedirectedFlagRobots = robotsFilter.FilterUrl(redirectUrl, urlToCrawl, RobotsMetaTagValue.NoMeta);
				}
				//perform link extraction and content extraction
				ArrayList outlinks = null;
				try
				{
					if((parser == htmlParser) || (parser == textParser))
					{
						string clean = parser.ExtractContent(ref contents, false);
						if(clean.Length > 1048576) { clean = clean.Substring(0, 1048576); }
						urlData.Data = InternetUtils.Base64Encode(clean);
						if(urlData.Data == null) { urlData.Data = String.Empty; }
						outlinks = parser.ExtractLinks(ref contents, ref urlToCrawl);
						if(outlinks == null) { outlinks = new ArrayList(); }
					}
					else
					{
						contents = parser.ExtractContent(buffer, false);
						if(contents.Length > 1048576) { contents = contents.Substring(0, 1048576); }
						urlData.Data = InternetUtils.Base64Encode(contents);
						if(urlData.Data == null) { urlData.Data = String.Empty; }
						if(parser == pdfParser)
						{
							outlinks = textParser.ExtractLinks(ref contents, ref urlToCrawl);
						}
						else
						{
							outlinks = htmlParser.ExtractLinks(ref contents, ref urlToCrawl);
						}
						if(outlinks == null) { outlinks = new ArrayList(); }
					}
				}
				catch
				{
					if(outlinks == null) { outlinks = new ArrayList(); }
				}
				urlData.OutLinks = new InternetUrlToIndex[outlinks.Count];
				for(int i = 0; i < outlinks.Count; i++)
				{
					urlData.OutLinks[i] = (InternetUrlToIndex)outlinks[i];
				}
				//finally update the urlData object with the modified UrlToCrawl
				urlData.UrlToCrawl = urlToCrawl;
			}
		}
		//lock and update CrawledUrls
		lock(crawledUrls.SyncRoot)
		{
			crawledUrls.Add(urlData);
		}
	}
	catch(ThreadAbortException tae)
	{
		//The thread has been asked to abort. Log information and return at once
		if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
		{
			globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been asked to abort: " + tae.Message);
		}
		return;
	}
	catch(ThreadInterruptedException tie)
	{
		//The thread has been asked to join. Log information and return at once
		if(globals.Settings.LogLevel == CWLogLevel.LogInfo)
		{
			globals.FileLog.LogInfo(Thread.CurrentThread.Name + " has been interrupted: " + tie.Message);
		}
		return;
	}
	catch(Exception ex)
	{
		if(!(ex is ThreadAbortException)) //the ThreadAbortException is rethrown
		{
			if(globals.Settings.LogLevel <= CWLogLevel.LogWarning)
			{
				globals.FileLog.LogWarning("CrawlUrl running in " + Thread.CurrentThread.Name + " encountered an unexpected exception: " + ex.ToString());
			}
		}
		throw; //PerformCrawling should catch this one
	}
	finally
	{
		GC.Collect();
	}
}
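// Usage sketch (illustrative, not part of the original source): how a crawling thread might drive
// CrawlUrl over a batch of urls received from the server. Passing each array element by reference
// matters because CrawlUrl updates the Url, CRC and flag fields in place; the results accumulate in
// crawledUrls. The CrawlBatch method name and the urlsToCrawl parameter name are assumed.
private void CrawlBatch(InternetUrlToCrawl[] urlsToCrawl)
{
	for(int i = 0; i < urlsToCrawl.Length; i++)
	{
		CrawlUrl(ref urlsToCrawl[i]);
	}
	//crawledUrls now holds one UrlCrawlData per visited url, ready to be packaged and sent back
}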
/// <summary>
/// Selects and returns a set of urls that are ready to be crawled.
/// </summary>
/// <param name="ci">The <see cref="ClientInfo"/> of the client requesting urls to crawl.</param>
/// <param name="data">An array of <see cref="InternetUrlToCrawl"/> objects containing the selected urls.</param>
/// <returns>Null if the operation succeeds, or a <see cref="SerializedException"/>
/// encapsulating the error that occurred if the operation fails.</returns>
public SerializedException SelectUrlsToCrawl(ClientInfo ci, ref InternetUrlToCrawl[] data)
{
	SerializedException sx = null;
	try
	{
		if(!ConnectToDatabase())
		{
			throw new CWDBConnectionFailedException();
		}
		//we must use a transaction to make sure that if something goes wrong the
		//changes to the database will be rolled back.
		SqlTransaction transaction = dbcon.BeginTransaction(IsolationLevel.Serializable); //perhaps | repeatableread
		try
		{
			//first select the urls to crawl
			SqlCommand cmd = new SqlCommand("cw_select_urls_to_crawl", dbcon, transaction);
			cmd.CommandType = CommandType.StoredProcedure;
			cmd.CommandTimeout = 120;
			SqlDataAdapter da = new SqlDataAdapter(cmd);
			DataSet ds = new DataSet();
			da.Fill(ds);
			da.Dispose();
			cmd.Dispose();
			//now delete them from the table of urls to crawl
			data = new InternetUrlToCrawl[ds.Tables[0].Rows.Count];
			if(data.Length > 0)
			{
				int i = 0;
				foreach(DataRow dr in ds.Tables[0].Rows)
				{
					try
					{
						InternetUrlToCrawl url = new InternetUrlToCrawl((int)dr[0], (string)dr[1]);
						if(dr[2] != DBNull.Value) { url.CRC = (long)dr[2]; }
						if(dr[3] != DBNull.Value) { url.FlagDomain = (DomainFlagValue)((byte)dr[3]); }
						if(dr[4] != DBNull.Value)
						{
							url.RobotsDisallowedPaths = (string)dr[4];
						}
						else
						{
							RobotsTxtEntry entry = settings.Robots.GetEntry(InternetUtils.HostName(url));
							if(entry != null)
							{
								url.RobotsDisallowedPaths = ConcatenatePaths(entry.DisallowedPaths);
							}
							else
							{
								url.FlagFetchRobots = true;
							}
						}
						data[i++] = url;
					}
					catch
					{
						continue;
					}
				}
				SqlCommand statscmd = new SqlCommand("cw_update_client_statistics", dbcon, transaction);
				statscmd.CommandType = CommandType.StoredProcedure;
				statscmd.CommandTimeout = 120;
				statscmd.Parameters.Add("@client_id", SqlDbType.UniqueIdentifier);
				statscmd.Parameters.Add("@assigned", SqlDbType.BigInt);
				statscmd.Parameters.Add("@returned", SqlDbType.BigInt);
				statscmd.Parameters.Add("@type", SqlDbType.TinyInt);
				statscmd.Parameters[0].Value = ci.ClientID;
				statscmd.Parameters[1].Value = data.Length;
				statscmd.Parameters[2].Value = DBNull.Value;
				statscmd.Parameters[3].Value = 0;
				statscmd.ExecuteNonQuery();
				statscmd.Dispose();
				transaction.Commit();
			}
		}
		catch(Exception ex)
		{
			transaction.Rollback();
			if(settings.LogLevel <= CWLogLevel.LogWarning)
			{
				settings.Log.LogWarning("SelectUrlsToCrawl failed, Transaction was rolled back: " + ex.ToString());
			}
			throw;
		}
		finally
		{
			UpdateClientLastActive(ci);
			LogClientAction(ci, CWClientActions.LogSendUrlsToCrawl);
			if(!DisconnectFromDatabase())
			{
				throw new CWDBConnectionFailedException("Disconnect from database failure.");
			}
		}
	}
	catch(Exception e)
	{
		sx = new SerializedException(e.GetType().ToString(), e.Message, e.ToString());
		if(settings.LogLevel <= CWLogLevel.LogWarning)
		{
			settings.Log.LogWarning("SelectUrlsToCrawl failed: " + e.ToString());
		}
	}
	return sx;
}
/// <summary>
/// Checks if at least 30 seconds have passed since the last request to a given host
/// was made, so as not to slam it with simultaneous or frequent requests.
/// </summary>
/// <param name="targetUrl">
/// An <see cref="InternetUrlToCrawl"/> that is served by a host we wish to check.
/// </param>
/// <returns>
/// An integer containing the number of milliseconds a crawler thread must wait
/// before visiting this host.
/// </returns>
public int FilterUrl(ref InternetUrlToCrawl targetUrl)
{
	string hostName = InternetUtils.HostName(targetUrl);
	return FilterHost(ref hostName);
}
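// Usage sketch (illustrative, not part of the original source): a crawling thread can honor the
// politeness delay before issuing a request. The hostRequestFilter field name is assumed; the
// filter returns 0 when the host was last visited more than 30 seconds ago.
int delay = hostRequestFilter.FilterUrl(ref urlToCrawl);
if(delay > 0)
{
	Thread.Sleep(delay); //wait before hitting the same host again
}
CrawlUrl(ref urlToCrawl);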