Example #1
0
        /// <summary>
        /// Decides whether <paramref name="crawledPage"/> should be crawled again.
        /// </summary>
        /// <remarks>
        /// <c>_crawlDecisionMaker</c> is asked first. Only when it allows the recrawl is the optional
        /// <c>_shouldRecrawlPageDecisionMaker</c> delegate consulted; when that delegate is not set the
        /// recrawl stays allowed. If the final decision allows the recrawl,
        /// <c>crawledPage.RetryAfter</c> is reset to <c>null</c> and then filled from the response's
        /// <c>Retry-After</c> header when that header is present and parseable. The final decision is
        /// always passed to <c>SignalCrawlStopIfNeeded</c>.
        /// </remarks>
        /// <param name="crawledPage">The page whose recrawl is being considered.</param>
        /// <returns><c>true</c> when the final decision allows a recrawl; otherwise <c>false</c>.</returns>
        protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
        {
            //TODO No unit tests cover these lines
            var shouldRecrawlPageDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);

            if (shouldRecrawlPageDecision.Allow)
            {
                // The user-supplied delegate can only veto a recrawl the decision maker already allowed.
                shouldRecrawlPageDecision = (_shouldRecrawlPageDecisionMaker != null)
                    ? _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext)
                    : new CrawlDecision { Allow = true };
            }

            if (!shouldRecrawlPageDecision.Allow)
            {
                _logger.LogDebug($"Page [{crawledPage.Uri.AbsoluteUri}] not recrawled, [{shouldRecrawlPageDecision.Reason}]");
            }
            else
            {
                // Look for the Retry-After header in the response.
                crawledPage.RetryAfter = null;
                if (crawledPage.HttpWebResponse != null &&
                    crawledPage.HttpWebResponse.Headers != null)
                {
                    var value = crawledPage.HttpWebResponse.GetResponseHeader("Retry-After");
                    if (!String.IsNullOrEmpty(value))
                    {
                        // HTTP header values are culture-independent, so parse them with the invariant
                        // culture rather than whatever culture the current thread happens to use.
                        var invariant = System.Globalization.CultureInfo.InvariantCulture;

                        // Try to convert to DateTime first, then to double.
                        DateTime date;
                        double   seconds;
                        if (crawledPage.LastRequest.HasValue &&
                            DateTime.TryParse(value, invariant, System.Globalization.DateTimeStyles.None, out date))
                        {
                            // NOTE(review): a parsed "GMT" date is converted to local time here, so this
                            // presumably expects LastRequest to be a local timestamp too - confirm.
                            crawledPage.RetryAfter = (date - crawledPage.LastRequest.Value).TotalSeconds;
                        }
                        else if (double.TryParse(
                            value,
                            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                            invariant,
                            out seconds))
                        {
                            crawledPage.RetryAfter = seconds;
                        }
                    }
                }
            }

            SignalCrawlStopIfNeeded(shouldRecrawlPageDecision);
            return shouldRecrawlPageDecision.Allow;
        }
Example #2
0
        /// <summary>
        /// Decides whether <paramref name="crawledPage"/> should be crawled again.
        /// </summary>
        /// <remarks>
        /// <c>_crawlDecisionMaker</c> is asked first. Only when it allows the recrawl is the optional
        /// <c>ShouldRecrawlPageDecisionMaker</c> delegate consulted; when that delegate is not set the
        /// recrawl stays allowed. If the final decision allows the recrawl,
        /// <c>crawledPage.RetryAfter</c> is reset to <c>null</c> and then filled from the response's
        /// <c>Retry-After</c> header when that header is present and parseable. The final decision is
        /// always passed to <c>SignalCrawlStopIfNeeded</c>.
        /// </remarks>
        /// <param name="crawledPage">The page whose recrawl is being considered.</param>
        /// <returns><c>true</c> when the final decision allows a recrawl; otherwise <c>false</c>.</returns>
        protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
        {
            //TODO No unit tests cover these lines
            var shouldRecrawlPageDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);

            if (shouldRecrawlPageDecision.Allow)
            {
                // The user-supplied delegate can only veto a recrawl the decision maker already allowed.
                shouldRecrawlPageDecision = (ShouldRecrawlPageDecisionMaker != null)
                    ? ShouldRecrawlPageDecisionMaker(crawledPage, _crawlContext)
                    : new CrawlDecision { Allow = true };
            }

            if (!shouldRecrawlPageDecision.Allow)
            {
                Log.DebugFormat("Page [{0}] not recrawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldRecrawlPageDecision.Reason);
            }
            else
            {
                // Look for the Retry-After header in the response.
                crawledPage.RetryAfter = null;

                var value = crawledPage.HttpResponseMessage?.Headers?.RetryAfter?.ToString();
                if (!String.IsNullOrEmpty(value))
                {
                    // HTTP header values are culture-independent, so parse them with the invariant
                    // culture rather than whatever culture the current thread happens to use.
                    var invariant = System.Globalization.CultureInfo.InvariantCulture;

                    // Try to convert to DateTime first, then to double.
                    DateTime date;
                    double   seconds;
                    if (crawledPage.LastRequest.HasValue &&
                        DateTime.TryParse(value, invariant, System.Globalization.DateTimeStyles.None, out date))
                    {
                        // NOTE(review): a parsed "GMT" date is converted to local time here, so this
                        // presumably expects LastRequest to be a local timestamp too - confirm.
                        crawledPage.RetryAfter = (date - crawledPage.LastRequest.Value).TotalSeconds;
                    }
                    else if (double.TryParse(
                        value,
                        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                        invariant,
                        out seconds))
                    {
                        crawledPage.RetryAfter = seconds;
                    }
                }
            }

            SignalCrawlStopIfNeeded(shouldRecrawlPageDecision);
            return shouldRecrawlPageDecision.Allow;
        }
Example #3
0
        /// <summary>
        /// Decides whether <paramref name="crawledPage"/> should be crawled again.
        /// </summary>
        /// <remarks>
        /// <c>_crawlDecisionMaker</c> is asked first. Only when it allows the recrawl is the optional
        /// <c>_shouldRecrawlPageDecisionMaker</c> delegate consulted; when that delegate is not set the
        /// recrawl stays allowed. A denied recrawl is logged at debug level, and the final decision is
        /// always passed to <c>SignalCrawlStopIfNeeded</c>.
        /// </remarks>
        /// <param name="crawledPage">The page whose recrawl is being considered.</param>
        /// <returns><c>true</c> when the final decision allows a recrawl; otherwise <c>false</c>.</returns>
        protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
        {
            //TODO No unit tests cover these lines
            CrawlDecision shouldRecrawlPageDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);

            if (shouldRecrawlPageDecision.Allow)
            {
                // The user-supplied delegate can only veto a recrawl the decision maker already allowed.
                shouldRecrawlPageDecision = (_shouldRecrawlPageDecisionMaker != null)
                    ? _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext)
                    : new CrawlDecision { Allow = true };
            }

            if (!shouldRecrawlPageDecision.Allow)
            {
                _logger.DebugFormat("Page [{0}] not recrawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldRecrawlPageDecision.Reason);
            }

            SignalCrawlStopIfNeeded(shouldRecrawlPageDecision);
            return shouldRecrawlPageDecision.Allow;
        }
Example #4
0
        /// <summary>
        /// Decides whether <paramref name="crawledPage"/> should be crawled again.
        /// </summary>
        /// <remarks>
        /// <c>_crawlDecisionMaker</c> is asked first. Only when it allows the recrawl is the optional
        /// <c>_shouldRecrawlPageDecisionMaker</c> delegate consulted; when that delegate is not set,
        /// the result of <c>CrawlDecision.AllowCrawl()</c> is used.
        /// </remarks>
        /// <param name="crawledPage">The page whose recrawl is being considered.</param>
        /// <returns><c>true</c> when the final decision allows a recrawl; otherwise <c>false</c>.</returns>
        protected virtual bool ShouldRecrawlPage(CrawledPage crawledPage)
        {
            //TODO No unit tests cover these lines
            CrawlDecision initialDecision = _crawlDecisionMaker.ShouldRecrawlPage(crawledPage, _crawlContext);

            // The decision maker has the first word: a denial from it is final.
            if (!initialDecision.Allow)
            {
                return initialDecision.Allow;
            }

            // No user-supplied delegate configured: fall back to the stock "allow" decision.
            if (_shouldRecrawlPageDecisionMaker == null)
            {
                CrawlDecision defaultDecision = CrawlDecision.AllowCrawl();
                return defaultDecision.Allow;
            }

            // NOTE: This variant does not log denied recrawls and does not read the Retry-After
            // response header into crawledPage.RetryAfter; that logic existed here only as
            // commented-out code.
            CrawlDecision delegateDecision = _shouldRecrawlPageDecisionMaker.Invoke(crawledPage, _crawlContext);
            return delegateDecision.Allow;
        }