/// <summary>
/// Processes a FilesRow after crawling: wraps the stored row in a CrawlRequest, restores its
/// response headers, determines its DataType, optionally re-inserts it, hands it to the
/// FileManager, performs the PostRequest crawl actions and finally closes the managed discovery.
/// </summary>
/// <param name="applicationSettings">Settings read for InsertFiles, InsertFileSource, ClassifyAbsoluteUris, ExtractFileMetaData, InsertFileMetaData and SaveDiscoveredFilesToDisk; also passed to the managers constructed here.</param>
/// <param name="webSettings">Passed to the managers constructed here.</param>
/// <param name="crawler">Passed to the Crawl constructed for the CrawlRequest.</param>
/// <param name="filesRow">The files row; supplies AbsoluteUri, ID, Source and ResponseHeaders.</param>
/// <param name="webClient">Assigned to the CrawlRequest; its HttpWebResponse.Headers are cleared and repopulated from the row.</param>
/// <param name="cache">Passed to the CrawlRequestManager and PolitenessManager constructed here.</param>
/// <param name="actionManager">Performs the PostRequest crawl actions.</param>
/// <param name="consoleManager">Passed to the managers constructed here.</param>
/// <param name="crawlerPeerManager">Not referenced by this method.</param>
/// <param name="discoveryManager">Closes and disposes the managed discovery once processing is complete.</param>
/// <param name="fileManager">The file manager.</param>
/// <param name="memoryManager">Not referenced by this method.</param>
/// <param name="ruleManager">Passed to the Crawl constructed for the CrawlRequest.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
public static void ProcessFile(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.FilesRow filesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, FileManager<TArachnodeDAO> fileManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, IArachnodeDAO arachnodeDAO)
{
    //NOTE(review): cacheManager is not referenced again in this method; the construction is kept in case the constructor has side effects - confirm before removing.
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);

    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    //create a CrawlRequest as this is what the internals of SiteCrawler.dll expect to operate on...
    CrawlRequest<TArachnodeDAO> crawlRequest = new CrawlRequest<TArachnodeDAO>(new Discovery<TArachnodeDAO>(filesRow.AbsoluteUri), 1, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    crawlRequest.Crawl = crawl;
    crawlRequest.Discovery.DiscoveryType = DiscoveryType.File;
    crawlRequest.Discovery.ID = filesRow.ID;
    crawlRequest.Data = filesRow.Source;
    crawlRequest.ProcessData = true;
    crawlRequest.WebClient = webClient;

    crawlRequest.WebClient.HttpWebResponse.Headers.Clear();

    //parse the ResponseHeaders from the FilesRow.ResponseHeaders string...
    //the separators are loop-invariant, so they are built once instead of once per header line.
    char[] lineSeparators = "\r\n".ToCharArray();
    char[] nameSeparators = ":".ToCharArray();

    foreach (string responseHeader in filesRow.ResponseHeaders.Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries))
    {
        //the header name is everything before the first ':'; the value is re-extracted from the full header string.
        string name = responseHeader.Split(nameSeparators)[0];
        string value = UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, name, true).Value;

        crawlRequest.WebClient.HttpWebResponse.Headers.Add(name, value);
    }

    //refresh the DataTypes in the DataTypeManager... (if necessary)...
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    crawlRequest.DataType = dataTypeManager.DetermineDataType(crawlRequest);

    if (applicationSettings.InsertFiles)
    {
        //the ID returned by the insert replaces the ID taken from the FilesRow above.
        crawlRequest.Discovery.ID = arachnodeDAO.InsertFile(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.WebClient.HttpWebResponse.Headers.ToString(), applicationSettings.InsertFileSource ? crawlRequest.Data : new byte[] { }, crawlRequest.DataType.FullTextIndexType, applicationSettings.ClassifyAbsoluteUris);
    }

    crawlRequest.ManagedDiscovery = fileManager.ManageFile(crawlRequest, crawlRequest.Discovery.ID.Value, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.Data, crawlRequest.DataType.FullTextIndexType, applicationSettings.ExtractFileMetaData, applicationSettings.InsertFileMetaData, applicationSettings.SaveDiscoveredFilesToDisk);

    actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(crawlRequest, arachnodeDAO);
}
/// <summary>
/// Processes the crawl request: politeness check, PreRequest discovery/rules/actions, the download
/// through the data manager, PostRequest discovery/encoding/rules, data processing, and the final
/// bookkeeping (database cleanup, PostRequest actions, console output and counters).
/// </summary>
/// <remarks>
/// Returns early, skipping the final bookkeeping, when the request is throttled, when it was
/// canceled while the Engine is not in the Start state, or when it is resubmitted after a failed connect.
/// </remarks>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "obeyCrawlRules">if set to <c>true</c> [obey crawl rules].</param>
/// <param name = "executeCrawlActions">if set to <c>true</c> [execute crawl actions].</param>
public void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
{
    //HACK:!!! Solve this!!!
    //#if DEMO
    //    return;
    //#endif

    bool wasACacheHit = false;

    try
    {
        crawlRequest.WebClient = WebClient;

        if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
        {
            if (!_politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestRequested, _arachnodeDAO))
            {
                Crawler.Engine.OnCrawlRequestThrottled(crawlRequest);

                return;
            }

            _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

            _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PreRequest, _arachnodeDAO);

            if (obeyCrawlRules)
            {
                _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreRequest, _arachnodeDAO);
            }

            if (executeCrawlActions)
            {
                _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreRequest, _arachnodeDAO);
            }

            if (!crawlRequest.IsDisallowed)
            {
                _stopwatch.Reset();
                _stopwatch.Start();

                try
                {
                    _dataManager.ProcessCrawlRequest(crawlRequest, obeyCrawlRules, executeCrawlActions);
                }
                catch (Exception exception2)
                {
                    //deliberately wrapped rather than rethrown: the outer catch inspects and records exception.InnerException.
                    throw new Exception(exception2.Message, exception2);
                }
                finally
                {
                    _stopwatch.Stop();

                    _crawlInfo.TotalHttpWebResponseTime += _stopwatch.Elapsed;
                    crawlRequest.HttpWebResponseTime = _stopwatch.Elapsed;

                    _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
                }

                Counters.GetInstance().TotalBytesDiscovered(crawlRequest.Data.LongLength);

                _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PostRequest, _arachnodeDAO);

                _encodingManager.ProcessCrawlRequest(crawlRequest, _arachnodeDAO);

                if (obeyCrawlRules)
                {
                    _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PostRequest, _arachnodeDAO);
                }

                //the CrawlRequest could be Disallowed by a PreGet CrawlRule - specifically DataType.cs.
                if (!crawlRequest.IsDisallowed)
                {
                    if (_processData)
                    {
                        _crawlRequestManager.ProcessCrawlRequest(crawlRequest, _fileManager, _imageManager, _webPageManager, _arachnodeDAO);
                    }
                }
                else
                {
                    InsertAndOutputDisallowedCrawlRequest(crawlRequest);
                }
            }
            else
            {
                //disallowed before any request was made: complete the politeness cycle started above.
                _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);

                InsertAndOutputDisallowedCrawlRequest(crawlRequest);
            }
        }
        else
        {
            wasACacheHit = true;

            //this should only occur when you submit a CR from a rule, or action...
            _consoleManager.OutputCacheHit(_crawlInfo, crawlRequest, crawlRequest.Discovery);
        }
    }
    catch (Exception exception)
    {
        _stopwatch.Stop();

        if (Crawler.Engine.State != EngineState.Start)
        {
            //the request was aborted as it was long running and Engine was requested to Stop.
            if ((crawlRequest.WebClient.WebException != null && crawlRequest.WebClient.WebException.Status == WebExceptionStatus.RequestCanceled) || (exception.InnerException != null && exception.InnerException.Message == "The request was aborted: The request was canceled."))
            {
                return;
            }
        }

        //a failed connect is retried while retries remain; the message prefix is matched ordinally as it is not linguistic text.
        if (crawlRequest.WebClient.WebException != null && crawlRequest.Discovery.HttpWebRequestRetriesRemaining != 0 && crawlRequest.WebClient.WebException.Message.StartsWith("Unable to connect to the remote server", StringComparison.Ordinal))
        {
            _politenessManager.ResubmitCrawlRequest(crawlRequest, false, _arachnodeDAO);

            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCanceled, _arachnodeDAO);

            return;
        }

        try
        {
            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
        }
        catch (Exception exception2)
        {
            //a failure here replaces the exception that is recorded below.
            exception = exception2;
        }

        //record the wrapped (inner) exception when there is one, otherwise the exception itself.
        _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.InnerException ?? exception, false);

        crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

        if (_applicationSettings.InsertDisallowedAbsoluteUris)
        {
            if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
            {
                _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.Message, _applicationSettings.ClassifyAbsoluteUris);
            }
            else
            {
                if (_applicationSettings.InsertDisallowedAbsoluteUriDiscoveries)
                {
                    _arachnodeDAO.InsertDisallowedAbsoluteUriDiscovery(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
                }
            }
        }

        _consoleManager.OutputException(_crawlInfo.ThreadNumber, crawlRequest, _arachnodeDAO.LastExceptionID, _arachnodeDAO.LastExceptionMessage);
    }

    if (crawlRequest.IsFromDatabase)
    {
        _arachnodeDAO.DeleteCrawlRequest(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
    }

    _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.Discovered, _arachnodeDAO);

    if (!wasACacheHit)
    {
        if (executeCrawlActions)
        {
            _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, _arachnodeDAO);
        }

        Crawler.Engine.OnCrawlRequestCompleted(crawlRequest);
    }

    _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

    Counters.GetInstance().ReportCurrentDepth(crawlRequest.CurrentDepth);
    Counters.GetInstance().CrawlRequestRemoved();
    Counters.GetInstance().CrawlRequestProcessed();

    _crawlInfo.TotalCrawlRequestsProcessed++;
}

/// <summary>
/// Handles a disallowed CrawlRequest: determines its DataType when no ContentType is set yet,
/// inserts the disallowed AbsoluteUri when InsertDisallowedAbsoluteUris is enabled, and outputs
/// the disallowed reason to the console.
/// </summary>
/// <param name = "crawlRequest">The disallowed crawl request.</param>
private void InsertAndOutputDisallowedCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest)
{
    if (crawlRequest.DataType.ContentType == null)
    {
        crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);
    }

    if (_applicationSettings.InsertDisallowedAbsoluteUris)
    {
        _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
    }

    _consoleManager.OutputIsDisallowedReason(_crawlInfo, crawlRequest);
}
/// <summary>
/// Processes a WebPagesRow after crawling: rebuilds a CrawlRequest from the stored row, restores
/// its response headers, determines its DataType and encoding, optionally re-inserts it, hands it
/// to the WebPageManager, processes its e-mail addresses and hyperlinks, performs the PostRequest
/// crawl actions and finally closes the managed discovery.
/// </summary>
/// <param name="applicationSettings">Settings read for the insert, meta-data and save-to-disk options; also passed to the managers constructed here.</param>
/// <param name="webSettings">Passed to the managers constructed here.</param>
/// <param name="crawler">Passed to the Crawl constructed for the CrawlRequest.</param>
/// <param name="webPagesRow">The web pages row; supplies AbsoluteUri, CrawlDepth, ID, Source, CodePage and ResponseHeaders.</param>
/// <param name="webClient">Assigned to the CrawlRequest; its HttpWebResponse.Headers are cleared and repopulated from the row.</param>
/// <param name="cache">Passed to the CrawlRequestManager and PolitenessManager constructed here.</param>
/// <param name="actionManager">Performs the PostRequest crawl actions.</param>
/// <param name="consoleManager">Passed to the managers constructed here.</param>
/// <param name="crawlerPeerManager">Not referenced by this method.</param>
/// <param name="discoveryManager">Closes and disposes the managed discovery once processing is complete.</param>
/// <param name="memoryManager">Not referenced by this method.</param>
/// <param name="ruleManager">Passed to the Crawl constructed for the CrawlRequest.</param>
/// <param name="webPageManager">The web page manager.</param>
/// <param name="arachnodeDAO">The arachnode DAO.</param>
public static void ProcessWebPage(ApplicationSettings applicationSettings, WebSettings webSettings, Crawler<TArachnodeDAO> crawler, ArachnodeDataSet.WebPagesRow webPagesRow, WebClient<TArachnodeDAO> webClient, Cache<TArachnodeDAO> cache, ActionManager<TArachnodeDAO> actionManager, ConsoleManager<TArachnodeDAO> consoleManager, CrawlerPeerManager<TArachnodeDAO> crawlerPeerManager, DiscoveryManager<TArachnodeDAO> discoveryManager, MemoryManager<TArachnodeDAO> memoryManager, RuleManager<TArachnodeDAO> ruleManager, WebPageManager<TArachnodeDAO> webPageManager, IArachnodeDAO arachnodeDAO)
{
    // Managers required to assemble the Crawl; construction order is kept as-is.
    CacheManager<TArachnodeDAO> cacheManager = new CacheManager<TArachnodeDAO>(applicationSettings, webSettings);
    CookieManager cookieManager = new CookieManager();
    CrawlRequestManager<TArachnodeDAO> crawlRequestManager = new CrawlRequestManager<TArachnodeDAO>(applicationSettings, webSettings, cache, consoleManager, discoveryManager);
    DataTypeManager<TArachnodeDAO> dataTypeManager = new DataTypeManager<TArachnodeDAO>(applicationSettings, webSettings);
    EncodingManager<TArachnodeDAO> encodingManager = new EncodingManager<TArachnodeDAO>(applicationSettings, webSettings);
    PolitenessManager<TArachnodeDAO> politenessManager = new PolitenessManager<TArachnodeDAO>(applicationSettings, webSettings, cache);
    ProxyManager<TArachnodeDAO> proxyManager = new ProxyManager<TArachnodeDAO>(applicationSettings, webSettings, consoleManager);
    HtmlManager<TArachnodeDAO> htmlManager = new HtmlManager<TArachnodeDAO>(applicationSettings, webSettings, discoveryManager);

    Crawl<TArachnodeDAO> crawl = new Crawl<TArachnodeDAO>(applicationSettings, webSettings, crawler, actionManager, consoleManager, cookieManager, crawlRequestManager, dataTypeManager, discoveryManager, encodingManager, htmlManager, politenessManager, proxyManager, ruleManager, true);

    // The internals of SiteCrawler.dll operate on CrawlRequests, so the stored row is wrapped in one.
    Discovery<TArachnodeDAO> discovery = new Discovery<TArachnodeDAO>(webPagesRow.AbsoluteUri);
    CrawlRequest<TArachnodeDAO> request = new CrawlRequest<TArachnodeDAO>(discovery, webPagesRow.CrawlDepth, UriClassificationType.Host, UriClassificationType.Host, 0, RenderType.None, RenderType.None);

    request.Crawl = crawl;
    request.Discovery.DiscoveryType = DiscoveryType.WebPage;
    request.Discovery.ID = webPagesRow.ID;
    request.Data = webPagesRow.Source;
    request.CurrentDepth = webPagesRow.CrawlDepth;
    request.Encoding = Encoding.GetEncoding(webPagesRow.CodePage);
    request.ProcessData = true;
    request.WebClient = webClient;

    request.WebClient.HttpWebResponse.Headers.Clear();

    // Rebuild the response headers from the WebPagesRow.ResponseHeaders string, one line at a time.
    string[] headerLines = webPagesRow.ResponseHeaders.Split("\r\n".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

    for (int i = 0; i < headerLines.Length; i++)
    {
        // The header name is the text before the first ':'.
        string headerName = headerLines[i].Split(":".ToCharArray())[0];
        string headerValue = UserDefinedFunctions.ExtractResponseHeader(webPagesRow.ResponseHeaders, headerName, true).Value;

        request.WebClient.HttpWebResponse.Headers.Add(headerName, headerValue);
    }

    // Load the DataTypes on demand before classifying the request.
    if (dataTypeManager.AllowedDataTypes.Count == 0)
    {
        dataTypeManager.RefreshDataTypes();
    }

    request.DataType = dataTypeManager.DetermineDataType(request);

    // Now, process the bytes.
    encodingManager.ProcessCrawlRequest(request, arachnodeDAO);

    if (applicationSettings.InsertWebPages)
    {
        // An empty source is stored when InsertWebPageSource is off.
        byte[] sourceToInsert;

        if (applicationSettings.InsertWebPageSource)
        {
            sourceToInsert = request.Data;
        }
        else
        {
            sourceToInsert = new byte[] { };
        }

        request.Discovery.ID = arachnodeDAO.InsertWebPage(request.Discovery.Uri.AbsoluteUri, request.WebClient.HttpWebResponse.Headers.ToString(), sourceToInsert, request.Encoding.CodePage, request.DataType.FullTextIndexType, request.CurrentDepth, applicationSettings.ClassifyAbsoluteUris);
    }

    request.ManagedDiscovery = webPageManager.ManageWebPage(request.Discovery.ID.Value, request.Discovery.Uri.AbsoluteUri, request.Data, request.Encoding, request.DataType.FullTextIndexType, applicationSettings.ExtractWebPageMetaData, applicationSettings.InsertWebPageMetaData, applicationSettings.SaveDiscoveredWebPagesToDisk);

    // Assigning FileAndImageDiscoveries isn't applicable: Files and Images need to be crawled to be
    // properly classified, and without classification we don't know whether they belong in dbo.Files or dbo.Images.
    crawlRequestManager.ProcessEmailAddresses(request, arachnodeDAO);
    crawlRequestManager.ProcessHyperLinks(request, arachnodeDAO);

    actionManager.PerformCrawlActions(request, CrawlActionType.PostRequest, arachnodeDAO);

    discoveryManager.CloseAndDisposeManagedDiscovery(request, arachnodeDAO);
}