/// <summary>
/// Processes the crawl request.
/// </summary>
/// <remarks>
/// For an undiscovered Discovery the request passes the politeness check, the pre-request
/// rules/actions, the download (via <c>_dataManager</c>), encoding detection, the post-request
/// rules and finally content processing.  A Discovery in any other state is reported as a
/// cache hit and is not downloaded.  Unless the request was throttled, canceled while the
/// Engine was stopping, or resubmitted for a retry, the method always finishes by marking the
/// Discovery as Discovered and updating the counters.
/// </remarks>
/// <param name="crawlRequest">The crawl request.</param>
/// <param name="obeyCrawlRules">if set to <c>true</c> the PreRequest and PostRequest crawl rules are evaluated.</param>
/// <param name="executeCrawlActions">if set to <c>true</c> the PreRequest and PostRequest crawl actions are performed.</param>
public void ProcessCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest, bool obeyCrawlRules, bool executeCrawlActions)
{
    //HACK:!!! Solve this!!!  (a DEMO build was intended to return immediately at this point.)

    bool wasACacheHit = false;

    try
    {
        crawlRequest.WebClient = WebClient;

        if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
        {
            if (!_politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestRequested, _arachnodeDAO))
            {
                //throttled: nothing was requested, so none of the completion bookkeeping below applies.
                Crawler.Engine.OnCrawlRequestThrottled(crawlRequest);

                return;
            }

            _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

            _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PreRequest, _arachnodeDAO);

            if (obeyCrawlRules)
            {
                _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PreRequest, _arachnodeDAO);
            }

            if (executeCrawlActions)
            {
                _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PreRequest, _arachnodeDAO);
            }

            if (!crawlRequest.IsDisallowed)
            {
                _stopwatch.Reset();
                _stopwatch.Start();

                try
                {
                    _dataManager.ProcessCrawlRequest(crawlRequest, obeyCrawlRules, executeCrawlActions);
                }
                catch (Exception exception2)
                {
                    //deliberately wrapped: the outer handler inspects and logs exception.InnerException,
                    //so the original download failure must travel as the InnerException.
                    throw new Exception(exception2.Message, exception2);
                }
                finally
                {
                    //timing and politeness are updated whether or not the download succeeded.
                    _stopwatch.Stop();

                    _crawlInfo.TotalHttpWebResponseTime += _stopwatch.Elapsed;
                    crawlRequest.HttpWebResponseTime = _stopwatch.Elapsed;

                    _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
                }

                Counters.GetInstance().TotalBytesDiscovered(crawlRequest.Data.LongLength);

                _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.PostRequest, _arachnodeDAO);

                _encodingManager.ProcessCrawlRequest(crawlRequest, _arachnodeDAO);

                if (obeyCrawlRules)
                {
                    _ruleManager.IsDisallowed(crawlRequest, CrawlRuleType.PostRequest, _arachnodeDAO);
                }

                //the CrawlRequest could be Disallowed by a PreGet CrawlRule - specifically DataType.cs.
                if (!crawlRequest.IsDisallowed)
                {
                    if (_processData)
                    {
                        _crawlRequestManager.ProcessCrawlRequest(crawlRequest, _fileManager, _imageManager, _webPageManager, _arachnodeDAO);
                    }
                }
                else
                {
                    RecordDisallowedCrawlRequest(crawlRequest);
                }
            }
            else
            {
                //disallowed before any request was made: close out politeness for the request that never ran.
                _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);

                RecordDisallowedCrawlRequest(crawlRequest);
            }
        }
        else
        {
            wasACacheHit = true;

            //this should only occur when you submit a CR from a rule, or action...
            _consoleManager.OutputCacheHit(_crawlInfo, crawlRequest, crawlRequest.Discovery);
        }
    }
    catch (Exception exception)
    {
        _stopwatch.Stop();

        if (Crawler.Engine.State != EngineState.Start)
        {
            //the request was aborted as it was long running and Engine was requested to Stop.
            if ((crawlRequest.WebClient.WebException != null && crawlRequest.WebClient.WebException.Status == WebExceptionStatus.RequestCanceled) ||
                (exception.InnerException != null && exception.InnerException.Message == "The request was aborted: The request was canceled."))
            {
                return;
            }
        }

        //connection failures are retried while retries remain.  The message is a fixed framework
        //string, so it is matched ordinally rather than with the current culture's rules.
        if (crawlRequest.WebClient.WebException != null &&
            crawlRequest.Discovery.HttpWebRequestRetriesRemaining != 0 &&
            crawlRequest.WebClient.WebException.Message.StartsWith("Unable to connect to the remote server", StringComparison.Ordinal))
        {
            _politenessManager.ResubmitCrawlRequest(crawlRequest, false, _arachnodeDAO);

            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCanceled, _arachnodeDAO);

            return;
        }

        try
        {
            _politenessManager.ManagePoliteness(crawlRequest, PolitenessState.HttpWebRequestCompleted, _arachnodeDAO);
        }
        catch (Exception exception2)
        {
            //a failure while closing out politeness replaces the exception that gets reported below.
            exception = exception2;
        }

        //log the innermost known cause: wrapped download failures carry the real exception as InnerException.
        if (exception.InnerException == null)
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception, false);
        }
        else
        {
            _arachnodeDAO.InsertException(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.InnerException, false);
        }

        crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);

        if (_applicationSettings.InsertDisallowedAbsoluteUris)
        {
            if (crawlRequest.Discovery.DiscoveryState == DiscoveryState.Undiscovered)
            {
                _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, exception.Message, _applicationSettings.ClassifyAbsoluteUris);
            }
            else
            {
                if (_applicationSettings.InsertDisallowedAbsoluteUriDiscoveries)
                {
                    _arachnodeDAO.InsertDisallowedAbsoluteUriDiscovery(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
                }
            }
        }

        _consoleManager.OutputException(_crawlInfo.ThreadNumber, crawlRequest, _arachnodeDAO.LastExceptionID, _arachnodeDAO.LastExceptionMessage);
    }

    //completion bookkeeping: reached on success, on a cache hit, and after a reported exception.
    if (crawlRequest.IsFromDatabase)
    {
        _arachnodeDAO.DeleteCrawlRequest(crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri);
    }

    _discoveryManager.ManageDiscovery(crawlRequest, DiscoveryState.Discovered, _arachnodeDAO);

    if (!wasACacheHit)
    {
        if (executeCrawlActions)
        {
            _actionManager.PerformCrawlActions(crawlRequest, CrawlActionType.PostRequest, _arachnodeDAO);
        }

        Crawler.Engine.OnCrawlRequestCompleted(crawlRequest);
    }

    _consoleManager.OutputProcessCrawlRequest(_crawlInfo.ThreadNumber, crawlRequest);

    Counters.GetInstance().ReportCurrentDepth(crawlRequest.CurrentDepth);
    Counters.GetInstance().CrawlRequestRemoved();
    Counters.GetInstance().CrawlRequestProcessed();

    _crawlInfo.TotalCrawlRequestsProcessed++;
}

/// <summary>
/// Records a crawl request whose IsDisallowed flag is set: determines its DataType when no
/// ContentType is known yet, inserts the disallowed AbsoluteUri when
/// <c>InsertDisallowedAbsoluteUris</c> is enabled, and outputs the disallowed reason.
/// </summary>
/// <param name="crawlRequest">The disallowed crawl request.</param>
private void RecordDisallowedCrawlRequest(CrawlRequest<TArachnodeDAO> crawlRequest)
{
    if (crawlRequest.DataType.ContentType == null)
    {
        crawlRequest.DataType = _dataTypeManager.DetermineDataType(crawlRequest);
    }

    if (_applicationSettings.InsertDisallowedAbsoluteUris)
    {
        _arachnodeDAO.InsertDisallowedAbsoluteUri(crawlRequest.DataType.ContentTypeID, (int)crawlRequest.DataType.DiscoveryType, crawlRequest.Parent.Uri.AbsoluteUri, crawlRequest.Discovery.Uri.AbsoluteUri, crawlRequest.IsDisallowedReason, _applicationSettings.ClassifyAbsoluteUris);
    }

    _consoleManager.OutputIsDisallowedReason(_crawlInfo, crawlRequest);
}