/// <summary>
/// Decides whether <paramref name="pageToCrawl"/> may be crawled in the current crawl.
/// </summary>
/// <param name="pageToCrawl">The page that is a candidate for crawling.</param>
/// <param name="crawlContext">The context of the running crawl, including its configuration.</param>
/// <returns>
/// A disallow decision when either argument is null, when the page's crawl depth exceeds
/// <c>MaxCrawlDepth</c>, when the URI scheme does not begin with "http", or when the page is
/// not a retry and the <c>MaxPagesToCrawl</c> limit (only enforced when greater than zero)
/// has been exceeded; otherwise an allow decision.
/// </returns>
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawled page");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
    {
        return CrawlDecision.DisallowCrawl("Crawl depth is above max");
    }

    // A URI scheme is a machine identifier, so it is compared ordinally instead of
    // with the current culture's linguistic rules.
    if (!pageToCrawl.Uri.Scheme.StartsWith("http", System.StringComparison.Ordinal))
    {
        return CrawlDecision.DisallowCrawl("Scheme does not begin with http");
    }

    //TODO Do we want to ignore redirect chains (i.e. do not treat them as separate page crawls)?
    // NOTE(review): the limit is checked with '>' against CrawledCount; whether CrawledCount
    // already includes the current page is not visible here - confirm this cannot let one
    // page beyond the limit through.
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return CrawlDecision.DisallowCrawl(string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl));
    }

    return CrawlDecision.AllowCrawl();
}
/// <summary>
/// Decides whether a page that has already been crawled should be crawled again.
/// </summary>
/// <param name="crawledPage">The page whose previous crawl attempt is being evaluated.</param>
/// <param name="crawlContext">The context of the running crawl, including its configuration.</param>
/// <returns>
/// A disallow decision when either argument is null, when the page has no recorded exception,
/// or when the page's retry count has reached <c>MaxRetryCount</c>. An allow decision when
/// <c>MaxRetryCount</c> is less than 1 (no retry limit is applied) or the retry count is
/// still below the limit.
/// </returns>
public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawled page");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    // Only pages whose crawl attempt recorded an exception are candidates for a retry.
    if (crawledPage.Exception == null)
    {
        return CrawlDecision.DisallowCrawl("WebException did not occur");
    }

    var retryLimit = crawlContext.CrawlConfiguration.MaxRetryCount;

    // A limit below 1 means retries are not capped.
    if (retryLimit < 1)
    {
        return CrawlDecision.AllowCrawl("无限次重试");
    }

    if (crawledPage.RetryCount >= retryLimit)
    {
        return CrawlDecision.DisallowCrawl("MaxRetryCount has been reached");
    }

    return CrawlDecision.AllowCrawl();
}
/// <summary>
/// Decides whether the content of <paramref name="crawledPage"/> should be downloaded.
/// </summary>
/// <param name="crawledPage">The page whose response is being evaluated.</param>
/// <param name="crawlContext">The context of the running crawl.</param>
/// <returns>
/// A disallow decision when either argument is null or the page's status code is not
/// <see cref="HttpStatusCode.OK"/>; otherwise an allow decision.
/// </returns>
public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawled page");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    // NOTE: the null-HttpWebResponse check and the filtering of the response content type
    // against CrawlConfiguration.DownloadableContentTypes are disabled in this version;
    // only the status code is considered.
    return crawledPage.StatusCode == HttpStatusCode.OK
        ? CrawlDecision.AllowCrawl()
        : CrawlDecision.DisallowCrawl("HttpStatusCode is not 200");
}
/// <summary>
/// Decides whether the links found on <paramref name="crawledPage"/> should be crawled.
/// </summary>
/// <param name="crawledPage">The page whose links are candidates for crawling.</param>
/// <param name="crawlContext">The context of the running crawl, including its configuration.</param>
/// <returns>
/// A disallow decision when either argument is null, when the page has no content
/// (missing content object, or null/empty/whitespace text), or when the page's crawl depth
/// is greater than or equal to <c>MaxCrawlDepth</c>; otherwise an allow decision.
/// </returns>
public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawled page");
    }

    if (crawlContext == null)
    {
        return CrawlDecision.DisallowCrawl("Null crawl context");
    }

    // A page without a content object is treated the same as a page with blank text,
    // instead of failing with a NullReferenceException.
    if (crawledPage.Content == null || string.IsNullOrWhiteSpace(crawledPage.Content.Text))
    {
        return CrawlDecision.DisallowCrawl("Page has no content");
    }

    // Links on a page already at the maximum depth would exceed it, hence '>='.
    if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
    {
        return CrawlDecision.DisallowCrawl("Crawl depth is above max");
    }

    return CrawlDecision.AllowCrawl();
}
/// <summary>
/// Determines whether <paramref name="pageToCrawl"/> should be crawled by consulting the
/// crawl decision maker first and, if it allows the crawl, the optional custom decision
/// delegate.
/// </summary>
/// <param name="pageToCrawl">The page that is a candidate for crawling.</param>
/// <returns><c>true</c> when the final decision allows the crawl; otherwise <c>false</c>.</returns>
protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    CrawlDecision decision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);

    if (!decision.Allow)
    {
        // Hitting the page limit is only logged; no disallowed event is fired for it.
        if (decision.Reason.Contains("MaxPagesToCrawl limit of"))
        {
            _logger.LogInformation("MaxPagesToCrawlLimit has been reached or scheduled. No more pages will be scheduled.");
            return false;
        }
    }
    else if (_shouldCrawlPageDecisionMaker != null)
    {
        // The custom delegate is consulted only after the decision maker has allowed the page.
        decision = _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext);
    }
    else
    {
        decision = CrawlDecision.AllowCrawl();
    }

    if (decision.Allow)
    {
        return true;
    }

    _logger.LogDebug("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, decision.Reason);
    FirePageCrawlDisallowedEventAsync(pageToCrawl, decision.Reason);
    return false;
}
/// <summary>
/// Determines whether the content of <paramref name="crawledPage"/> should be downloaded by
/// consulting the crawl decision maker first and, if it allows the download, the optional
/// custom decision delegate.
/// </summary>
/// <param name="crawledPage">The page whose content download is being decided.</param>
/// <returns>
/// The decision maker's decision when it disallows; otherwise the custom delegate's decision,
/// or an allow decision when no custom delegate is set.
/// </returns>
protected virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage)
{
    CrawlDecision builtInDecision = _crawlDecisionMaker.ShouldDownloadPageContent(crawledPage, _crawlContext);
    if (!builtInDecision.Allow)
    {
        return builtInDecision;
    }

    // NOTE: a SignalCrawlStopIfNeeded(decision) call is disabled in this version.
    if (_shouldDownloadPageContentDecisionMaker == null)
    {
        return CrawlDecision.AllowCrawl();
    }

    return _shouldDownloadPageContentDecisionMaker.Invoke(crawledPage, _crawlContext);
}
/// <summary>
/// Determines whether <paramref name="crawledPage"/> should be crawled again by consulting
/// the crawl decision maker first and, if it allows the recrawl, the optional custom
/// decision delegate.
/// </summary>
/// <param name="crawledPage">The page whose recrawl is being decided.</param>
/// <returns><c>true</c> when the final decision allows a recrawl; otherwise <c>false</c>.</returns>
protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
{
    //TODO No unit tests cover these lines
    CrawlDecision decision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);

    if (decision.Allow)
    {
        // The custom delegate is consulted only after the decision maker has allowed the recrawl.
        if (_shouldRecrawlPageDecisionMaker != null)
        {
            decision = _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext);
        }
        else
        {
            decision = CrawlDecision.AllowCrawl();
        }
    }

    // NOTE: logging of disallowed recrawls and parsing of the "Retry-After" response header
    // into crawledPage.RetryAfter are disabled in this version; this method does not set RetryAfter.
    return decision.Allow;
}
/// <summary>
/// Determines whether the links on <paramref name="crawledPage"/> should be crawled by
/// consulting the crawl decision maker first and, if it allows it, the optional custom
/// decision delegate.
/// </summary>
/// <param name="crawledPage">The page whose links are candidates for crawling.</param>
/// <returns><c>true</c> when the final decision allows crawling the links; otherwise <c>false</c>.</returns>
protected virtual bool ShouldCrawlPageLinks(CrawledPage crawledPage)
{
    CrawlDecision decision = _crawlDecisionMaker.ShouldCrawlPageLinks(crawledPage, _crawlContext);

    if (decision.Allow)
    {
        // The custom delegate is consulted only after the decision maker has allowed the links.
        if (_shouldCrawlPageLinksDecisionMaker != null)
        {
            decision = _shouldCrawlPageLinksDecisionMaker.Invoke(crawledPage, _crawlContext);
        }
        else
        {
            decision = CrawlDecision.AllowCrawl();
        }
    }

    if (decision.Allow)
    {
        return true;
    }

    // Disallowed: log the reason and raise the corresponding event.
    _logger.LogDebug("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, decision.Reason);
    FirePageLinksCrawlDisallowedEventAsync(crawledPage, decision.Reason);
    return false;
}
/// <summary>
/// Requests <paramref name="uri"/> by delegating to the two-argument <c>MakeRequest</c>
/// overload with a callback that always returns <c>CrawlDecision.AllowCrawl()</c>.
/// </summary>
/// <param name="uri">The URI to request.</param>
/// <returns>The <c>CrawledPage</c> produced by the two-argument overload.</returns>
public CrawledPage MakeRequest(Uri uri)
{
    return MakeRequest(uri, _ => CrawlDecision.AllowCrawl());
}