/// <summary>
/// Routes a crawl request to the handler that matches its
/// <c>DataType.DiscoveryType</c> (file, image or web page) and afterwards
/// removes the request's discovery from the crawl's
/// <c>UnassignedDiscoveries</c>.
/// </summary>
/// <param name = "crawlRequest">The crawl request to dispatch.</param>
/// <param name = "fileManager">The file manager, forwarded to <c>ProcessFile</c>.</param>
/// <param name = "imageManager">The image manager, forwarded to <c>ProcessImage</c>.</param>
/// <param name = "webPageManager">The web page manager, forwarded to <c>ProcessWebPage</c>.</param>
/// <param name = "arachnodeDAO">The arachnode DAO used by the handlers and for recording disallowed absolute URIs.</param>
public override void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, FileManager<TArachnodeDAO> fileManager, ImageManager<TArachnodeDAO> imageManager, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    switch (crawlRequest.DataType.DiscoveryType)
    {
        case DiscoveryType.File:
        {
            ProcessFile(crawlRequest, fileManager, arachnodeDAO);
            break;
        }

        case DiscoveryType.Image:
        {
            ProcessImage(crawlRequest, imageManager, arachnodeDAO);
            break;
        }

        case DiscoveryType.WebPage:
        {
            /* <img src="http://msn.com"></a> would return a WebPage.  If the Crawl was restricting crawling to
             * fark.com, and a page on fark.com listed the aforementioned Image and we didn't make this check,
             * then the Crawl would create a record in the WebPages table, which would not be correct.
             * A CurrentDepth of '0' means a request was made for a File or an Image.
             *
             * Historical note: this situation used to be a candidate for throwing
             * "A CrawlRequest was created for a File or an Image, but the HttpWebResponse returned a WebPage.
             * This is typically indicative of invalid HTML." -- the request is silently skipped instead. */
            //ANODET: Improve parsing... check the regular expressions...
            bool skipWebPage = crawlRequest.Discovery.ExpectFileOrImage
                               || _discoveryManager.IsCrawlRestricted(crawlRequest, crawlRequest.Parent.Uri.AbsoluteUri);

            if (!skipWebPage)
            {
                ProcessWebPage(crawlRequest, webPageManager, arachnodeDAO);
            }

            // Skipped or processed, the discovery is released by the shared Remove call after the switch.
            break;
        }

#if !DEMO
        case DiscoveryType.None:
        {
            if (!ApplicationSettings.InsertDisallowedAbsoluteUris)
            {
                break;
            }

            // The DataType has no assigned DiscoveryType: record the AbsoluteUri as disallowed.
            arachnodeDAO.InsertDisallowedAbsoluteUri(
                crawlRequest.DataType.ContentTypeID,
                (int)crawlRequest.DataType.DiscoveryType,
                crawlRequest.Parent.Uri.AbsoluteUri,
                crawlRequest.Discovery.Uri.AbsoluteUri,
                "Disallowed by unassigned DataType. (" + crawlRequest.DataType.ContentType + ")",
                ApplicationSettings.ClassifyAbsoluteUris);
            break;
        }
#endif
    }

    // Every path through the switch ends here: remove the reference from the crawl.
    crawlRequest.Crawl.UnassignedDiscoveries.Remove(crawlRequest.Discovery.Uri.AbsoluteUri);
}
/// <summary>
/// Handles a crawl request whose discovery is an image: reports the
/// discovery to the console, increments the images-discovered counter, and
/// then either passes the request to the image manager or records the
/// discovery and reports a cache hit.
/// </summary>
/// <param name = "crawlRequest">The crawl request for the image.</param>
/// <param name = "imageManager">The image manager that manages the image when the discovery is in the <c>PostRequest</c> state.</param>
/// <param name = "arachnodeDAO">The arachnode DAO used to insert the image discovery.</param>
protected override void ProcessImage(CrawlRequest<TArachnodeDAO> crawlRequest, ImageManager<TArachnodeDAO> imageManager, IArachnodeDAO arachnodeDAO)
{
    _consoleManager.OutputImageDiscovered(crawlRequest.Crawl.CrawlInfo.ThreadNumber, crawlRequest, crawlRequest.Discovery);

    Counters.GetInstance().ImagesDiscovered(1);

    // A discovery in the PostRequest state is handed to the image manager; nothing else is done for it here.
    if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.PostRequest)
    {
        imageManager.ManageImage(crawlRequest);
        return;
    }

    // Any other state: optionally record the parent -> image discovery, then report a cache hit.
    if (ApplicationSettings.InsertImageDiscoveries)
    {
        arachnodeDAO.InsertImageDiscovery(
            crawlRequest.Parent.Uri.AbsoluteUri,
            crawlRequest.Discovery.Uri.AbsoluteUri);
    }

    _consoleManager.OutputCacheHit(crawlRequest.Crawl.CrawlInfo, crawlRequest, crawlRequest.Discovery);
}