Ejemplo n.º 1
0
        /// <summary>
        /// Decides whether <paramref name="pageToCrawl"/> should be crawled at all,
        /// based on null checks, crawl depth, URI scheme, and the configured page limit.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page; a null value is disallowed.</param>
        /// <param name="crawlContext">The crawl-wide context holding configuration and counters.</param>
        /// <returns>An allow decision, or a disallow decision carrying the reason.</returns>
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl == null)
            {
                return CrawlDecision.DisallowCrawl("Null crawled page");
            }

            if (crawlContext == null)
            {
                return CrawlDecision.DisallowCrawl("Null crawl context");
            }

            if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
            {
                return CrawlDecision.DisallowCrawl("Crawl depth is above max");
            }

            // Uri.Scheme is normalized to lowercase, so an ordinal prefix check matches
            // both "http" and "https". StringComparison.Ordinal avoids the
            // culture-sensitive default comparison (CA1310) without changing behavior.
            if (!pageToCrawl.Uri.Scheme.StartsWith("http", StringComparison.Ordinal))
            {
                return CrawlDecision.DisallowCrawl("Scheme does not begin with http");
            }

            //TODO Do we want to ignore redirect chains (ie.. do not treat them as seperate page crawls)?
            // Retries do not count against MaxPagesToCrawl; a limit of 0 or less means unlimited.
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return CrawlDecision.DisallowCrawl(string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl));
            }

            return CrawlDecision.AllowCrawl();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Determines whether a previously crawled page that failed with an exception
        /// should be retried, honoring the configured retry limit.
        /// </summary>
        /// <param name="crawledPage">The page that was already crawled; null is disallowed.</param>
        /// <param name="crawlContext">The crawl-wide context holding configuration; null is disallowed.</param>
        /// <returns>An allow decision when a retry is warranted, otherwise a disallow decision with the reason.</returns>
        public virtual CrawlDecision ShouldRecrawlPage(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            // Guard against missing inputs first.
            if (crawledPage == null)
                return CrawlDecision.DisallowCrawl("Null crawled page");

            if (crawlContext == null)
                return CrawlDecision.DisallowCrawl("Null crawl context");

            // Only pages that actually failed with an exception are retry candidates.
            if (crawledPage.Exception == null)
                return CrawlDecision.DisallowCrawl("WebException did not occur");

            var maxRetries = crawlContext.CrawlConfiguration.MaxRetryCount;

            // A configured maximum below 1 is treated as "retry without limit".
            if (maxRetries < 1)
                return CrawlDecision.AllowCrawl("无限次重试");

            return crawledPage.RetryCount >= maxRetries
                ? CrawlDecision.DisallowCrawl("MaxRetryCount has been reached")
                : CrawlDecision.AllowCrawl();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Decides whether the body of <paramref name="crawledPage"/> should be downloaded.
        /// Only responses with HTTP status 200 (OK) are eligible.
        /// </summary>
        /// <param name="crawledPage">The page whose content is under consideration; null is disallowed.</param>
        /// <param name="crawlContext">The crawl-wide context; null is disallowed.</param>
        /// <returns>An allow decision for 200 responses, otherwise a disallow decision with the reason.</returns>
        public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return CrawlDecision.DisallowCrawl("Null crawled page");

            if (crawlContext == null)
                return CrawlDecision.DisallowCrawl("Null crawl context");

            // Anything other than a 200 response is not worth downloading.
            // (Content-type filtering against DownloadableContentTypes was removed here.)
            return crawledPage.StatusCode == HttpStatusCode.OK
                ? CrawlDecision.AllowCrawl()
                : CrawlDecision.DisallowCrawl("HttpStatusCode is not 200");
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Decides whether the hyperlinks found on <paramref name="crawledPage"/>
        /// should be scheduled for crawling.
        /// </summary>
        /// <param name="crawledPage">The page whose links are under consideration; null is disallowed.</param>
        /// <param name="crawlContext">The crawl-wide context holding configuration; null is disallowed.</param>
        /// <returns>An allow decision, or a disallow decision carrying the reason.</returns>
        public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
        {
            if (crawledPage == null)
                return CrawlDecision.DisallowCrawl("Null crawled page");

            if (crawlContext == null)
                return CrawlDecision.DisallowCrawl("Page has no content" == null ? null : "Null crawl context");

            // A page with no text content has no links worth extracting.
            if (string.IsNullOrWhiteSpace(crawledPage.Content.Text))
                return CrawlDecision.DisallowCrawl("Page has no content");

            // NOTE(review): this uses >= while ShouldCrawlPage uses > — presumably because
            // links discovered at MaxCrawlDepth would land at MaxCrawlDepth + 1. Confirm intended.
            if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return CrawlDecision.DisallowCrawl("Crawl depth is above max");

            return CrawlDecision.AllowCrawl();
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Combines the default crawl decision with the optional user-supplied delegate;
        /// logs and raises the page-crawl-disallowed event when the page is skipped.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <returns>True when the page should be crawled.</returns>
        protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            var decision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);

            // Once the page limit is hit, log once and bail out without firing the
            // per-page disallowed event.
            if (!decision.Allow && decision.Reason.Contains("MaxPagesToCrawl limit of"))
            {
                _logger.LogInformation("MaxPagesToCrawlLimit has been reached or scheduled. No more pages will be scheduled.");
                return false;
            }

            // The user delegate only gets a say when the default decision allows the crawl.
            if (decision.Allow)
            {
                decision = _shouldCrawlPageDecisionMaker == null
                    ? CrawlDecision.AllowCrawl()
                    : _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext);
            }

            if (!decision.Allow)
            {
                _logger.LogDebug("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, decision.Reason);
                FirePageCrawlDisallowedEventAsync(pageToCrawl, decision.Reason);
            }

            return decision.Allow;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Returns the default download decision, optionally replaced by the
        /// user-supplied delegate when the default allows the download.
        /// </summary>
        /// <param name="crawledPage">The page whose content may be downloaded.</param>
        /// <returns>The final download decision.</returns>
        protected virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage)
        {
            var decision = _crawlDecisionMaker.ShouldDownloadPageContent(crawledPage, _crawlContext);

            if (!decision.Allow)
                return decision;

            // The user delegate only runs when the default decision allowed the download.
            return _shouldDownloadPageContentDecisionMaker == null
                ? CrawlDecision.AllowCrawl()
                : _shouldDownloadPageContentDecisionMaker.Invoke(crawledPage, _crawlContext);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Determines whether a failed page should be retried, combining the default
        /// decision with the optional user-supplied delegate.
        /// </summary>
        /// <param name="crawledPage">The page that was already crawled.</param>
        /// <returns>True when the page should be recrawled.</returns>
        protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
        {
            //TODO No unit tests cover these lines
            var decision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);

            // The user delegate only gets a say when the default decision allows the retry.
            // (When the delegate is absent the default allow stands, so only the non-null
            // case needs to replace the decision.)
            if (decision.Allow && _shouldRecrawlPageDecisionMaker != null)
            {
                decision = _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext);
            }

            return decision.Allow;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Combines the default link-crawl decision with the optional user-supplied
        /// delegate; logs and raises the links-crawl-disallowed event when skipped.
        /// </summary>
        /// <param name="crawledPage">The page whose links are under consideration.</param>
        /// <returns>True when the page's links should be crawled.</returns>
        protected virtual bool ShouldCrawlPageLinks(CrawledPage crawledPage)
        {
            var decision = _crawlDecisionMaker.ShouldCrawlPageLinks(crawledPage, _crawlContext);

            // The user delegate only gets a say when the default decision allows the crawl.
            if (decision.Allow)
            {
                decision = _shouldCrawlPageLinksDecisionMaker == null
                    ? CrawlDecision.AllowCrawl()
                    : _shouldCrawlPageLinksDecisionMaker.Invoke(crawledPage, _crawlContext);
            }

            if (!decision.Allow)
            {
                _logger.LogDebug("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, decision.Reason);
                FirePageLinksCrawlDisallowedEventAsync(crawledPage, decision.Reason);
            }

            return decision.Allow;
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Requests <paramref name="uri"/> using a content-download decision delegate
 /// that always allows the page content to be downloaded.
 /// </summary>
 /// <param name="uri">The address to request.</param>
 /// <returns>The crawled page produced by the request.</returns>
 public CrawledPage MakeRequest(Uri uri)
 {
     return MakeRequest(uri, _ => CrawlDecision.AllowCrawl());
 }