/// <summary>
///   Processes a CrawlRequest by dispatching to the File, Image or WebPage handler
///   according to the DiscoveryType assigned by the request's DataType.  In all paths
///   the Discovery's AbsoluteUri is removed from the Crawl's UnassignedDiscoveries
///   before the method returns.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "fileManager">The file manager.</param>
/// <param name = "imageManager">The image manager.</param>
/// <param name = "webPageManager">The web page manager.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public override void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, FileManager<TArachnodeDAO> fileManager, ImageManager<TArachnodeDAO> imageManager, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    switch (crawlRequest.DataType.DiscoveryType)
    {
        case DiscoveryType.File:
            ProcessFile(crawlRequest, fileManager, arachnodeDAO);
            break;
        case DiscoveryType.Image:
            ProcessImage(crawlRequest, imageManager, arachnodeDAO);
            break;
        case DiscoveryType.WebPage:
            if (crawlRequest.Discovery.ExpectFileOrImage || _discoveryManager.IsCrawlRestricted(crawlRequest, crawlRequest.Parent.Uri.AbsoluteUri))
            {
                /*<img src="http://msn.com"></a> would return a WebPage.  If the Crawl was restricting crawling to fark.com, and a
                 * page on fark.com listed the aforementioned Image and we didn't make this check, then the Crawl would create a
                 * record in the WebPages table, which would not be correct.
                 * A CurrentDepth of '0' means a request was made for a File or an Image.
                 */
                //ANODET: Improve parsing... check the regular expressions...

                //reject the request: release the Discovery from the Crawl and return without inserting a WebPage record.
                crawlRequest.Crawl.UnassignedDiscoveries.Remove(crawlRequest.Discovery.Uri.AbsoluteUri);

                return;
#if !DEMO
                //throw new Exception("A CrawlRequest was created for a File or an Image, but the HttpWebResponse returned a WebPage.  This is typically indicative of invalid HTML.");
#endif
            }
            ProcessWebPage(crawlRequest, webPageManager, arachnodeDAO);
            break;
#if !DEMO
        case DiscoveryType.None:
            //no DataType could be assigned: optionally record the AbsoluteUri as disallowed instead of processing it.
            if (ApplicationSettings.InsertDisallowedAbsoluteUris)
            {
                arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int) crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, "Disallowed by unassigned DataType. (" + crawlRequest.DataType.ContentType + ")", ApplicationSettings.ClassifyAbsoluteUris);
            }
            break;
#endif
    }

    //remove the reference from the crawl.
    crawlRequest.Crawl.UnassignedDiscoveries.Remove(crawlRequest.Discovery.Uri.AbsoluteUri);
}
/// <summary>
///   Processes a WebPage Discovery: reports it to the console, increments the
///   WebPagesDiscovered counter, persists it via the WebPageManager, and then
///   processes its Discoveries — asynchronously through the Engine's
///   DiscoveryProcessors when enabled and available, otherwise inline.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "webPageManager">The web page manager.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
protected override void ProcessWebPage(CrawlRequest<TArachnodeDAO> crawlRequest, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    var crawl = crawlRequest.Crawl;

    //report and count the discovered WebPage.
    _consoleManager.OutputWebPageDiscovered(crawl.CrawlInfo.ThreadNumber, crawlRequest);
    Counters.GetInstance().WebPagesDiscovered(1);

    //persist the WebPage.
    webPageManager.ManageWebPage(crawlRequest);

    //the Crawler may(/will) be null if PostProcessing...
    bool handOffToDiscoveryProcessor = ApplicationSettings.ProcessDiscoveriesAsynchronously && !crawl.IsProcessingDiscoveriesAsynchronously && crawl.Crawler != null;

    if (!handOffToDiscoveryProcessor)
    {
        ProcessDiscoveries(crawlRequest, arachnodeDAO);
    }
    else
    {
        crawl.Crawler.Engine.DiscoveryProcessors[crawl.CrawlInfo.ThreadNumber].AddCrawlRequestToBeProcessed(crawlRequest);
    }
}

/// <summary>
///   Processes the Discoveries extracted from a CrawlRequest, in order:
///   EmailAddresses, then HyperLinks, then Files and Images.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public override void ProcessDiscoveries(CrawlRequest<TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //Email Addresses
    ProcessEmailAddresses(crawlRequest, arachnodeDAO);

    //HyperLinks
    ProcessHyperLinks(crawlRequest, arachnodeDAO);

    //Files and Images
    ProcessFilesAndImages(crawlRequest, arachnodeDAO);
}