public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    int robotsDotTextCrawlDelayInMillisecs = 0;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = _robotsDotTextFinder.Find(uri);

        if (_robotsDotText != null)
        {
            FireRobotsDotTextParseCompletedAsync(_robotsDotText.Robots);
            FireRobotsDotTextParseCompleted(_robotsDotText.Robots);

            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Additional url patterns to exclude
    var urlPatternsToExclude = _crawlContext.CrawlConfiguration.UrlPatternsToExclude;
    if (urlPatternsToExclude != null && urlPatternsToExclude.Count > 0)
    {
        if (_robotsDotText == null)
            _robotsDotText = new RobotsDotText(uri, String.Empty);

        if (_robotsDotText.Robots is Robots.Robots)
        {
            foreach (var pattern in urlPatternsToExclude)
                ((Robots.Robots)_robotsDotText.Robots).AddDisallowEntry(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString, pattern);
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.",
                uri, _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

            robotsDotTextCrawlDelayInSecs = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    return base.Crawl(uri, cancellationTokenSource);
}
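// Usage sketch for the Crawl override above: the robots.txt handling, the extra disallow
// patterns and the per-domain delay are all driven by CrawlConfiguration. The property
// names come from the snippet; the namespaces and the PoliteWebCrawler constructor taking
// a config are assumptions and may differ in your Abot version.
using System;
using System.Collections.Generic;
using System.Threading;
using Abot.Crawler;   // assumed namespace for PoliteWebCrawler
using Abot.Poco;      // assumed namespace for CrawlConfiguration / CrawlResult

public static class PoliteCrawlExample
{
    public static void Run()
    {
        var config = new CrawlConfiguration
        {
            IsRespectRobotsDotTextEnabled = true,                    // triggers _robotsDotTextFinder.Find(uri)
            RobotsDotTextUserAgentString = "abot",                   // user agent matched against robots.txt groups
            MaxRobotsDotTextCrawlDelayInSeconds = 5,                 // caps an excessive Crawl-delay directive
            MinCrawlDelayPerDomainMilliSeconds = 1000,               // floor applied per domain
            UrlPatternsToExclude = new List<string> { "/private/" }  // merged in as extra Disallow entries
        };

        var crawler = new PoliteWebCrawler(config);
        CrawlResult result = crawler.Crawl(new Uri("http://example.com/"), new CancellationTokenSource());
        Console.WriteLine("Crawl completed.");
    }
}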
/// <summary>
/// Try to find and load the site's robots.txt file.
/// </summary>
/// <param name="uri">Root uri of the site whose robots.txt should be loaded</param>
/// <returns>True if a robots.txt file was found, otherwise false</returns>
protected bool TryLoadRobotsTxt(Uri uri)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    long robotsDotTextCrawlDelayInMillisecs = 0;

    // Load robots.txt
    if (CrawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        RobotsDotText = RobotsDotTextFinder.Find(uri);

        if (RobotsDotText != null)
        {
            Logger.InfoFormat("Robots.txt was found!");

            FireRobotsDotTextParseCompletedAsync(RobotsDotText.Robots);
            FireRobotsDotTextParseCompleted(RobotsDotText.Robots);

            robotsDotTextCrawlDelayInSecs = RobotsDotText.GetCrawlDelay(CrawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = TimeConverter.SecondsToMilliseconds(robotsDotTextCrawlDelayInSecs);
        }
        else
        {
            Logger.InfoFormat("Robots.txt was NOT found!");
        }
    }

    // Use whichever value is greater between the actual crawl delay value found,
    // the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > CrawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            Logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set " +
                              "in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.",
                uri, CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

            robotsDotTextCrawlDelayInSecs = CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = TimeConverter.SecondsToMilliseconds(robotsDotTextCrawlDelayInSecs);
        }

        Logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        DomainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    return RobotsDotText != null;
}
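// TimeConverter is referenced by TryLoadRobotsTxt above but not shown here. A minimal
// helper consistent with that usage (whole seconds in, milliseconds out as a long,
// matching the long robotsDotTextCrawlDelayInMillisecs variable) could look like the
// sketch below; the real implementation may differ.
internal static class TimeConverter
{
    // Widen to long before multiplying so an absurdly large Crawl-delay value cannot overflow.
    public static long SecondsToMilliseconds(int seconds)
    {
        return seconds * 1000L;
    }
}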
public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    int robotsDotTextCrawlDelayInMillisecs = 0;
    int maxRobotsDotTextCrawlDelayInSeconds = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
    int maxRobotsDotTextCrawlDelayInMilliSecs = maxRobotsDotTextCrawlDelayInSeconds * 1000;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = _robotsDotTextFinder.Find(uri);

        if (_robotsDotText != null)
        {
            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > maxRobotsDotTextCrawlDelayInSeconds)
        {
            _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.",
                uri, robotsDotTextCrawlDelayInSecs);

            robotsDotTextCrawlDelayInSecs = maxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
        PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    return base.Crawl(uri, cancellationTokenSource);
}
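// Both Crawl overrides rely on a domain rate limiter with two operations: AddDomain
// registers a per-host minimum delay (taken from the Crawl-delay directive), and
// RateLimit blocks in the PageCrawlStarting handler until that delay has elapsed since
// the host was last requested. The sketch below is a simplified stand-in to illustrate
// that contract; it is not Abot's implementation.
using System;
using System.Collections.Generic;
using System.Threading;

public class SimpleDomainRateLimiter
{
    private readonly long _defaultDelayInMillisecs;
    private readonly Dictionary<string, long> _delayPerHost = new Dictionary<string, long>();
    private readonly Dictionary<string, DateTime> _lastHitPerHost = new Dictionary<string, DateTime>();

    public SimpleDomainRateLimiter(long defaultDelayInMillisecs)
    {
        _defaultDelayInMillisecs = defaultDelayInMillisecs;
    }

    // Register (or overwrite) the minimum delay between requests to this host.
    public void AddDomain(Uri uri, long delayInMillisecs)
    {
        _delayPerHost[uri.Host] = delayInMillisecs;
    }

    // Block until the configured delay since the last request to this host has elapsed.
    public void RateLimit(Uri uri)
    {
        long delay = _delayPerHost.TryGetValue(uri.Host, out long configured) ? configured : _defaultDelayInMillisecs;

        if (_lastHitPerHost.TryGetValue(uri.Host, out DateTime lastHit))
        {
            TimeSpan wait = TimeSpan.FromMilliseconds(delay) - (DateTime.UtcNow - lastHit);
            if (wait > TimeSpan.Zero)
                Thread.Sleep(wait);
        }

        _lastHitPerHost[uri.Host] = DateTime.UtcNow;
    }
}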