/// <summary>
/// Begins a synchronous crawl rooted at <paramref name="uri"/>. Subscribe to the
/// crawler's events to process pages as they become available.
/// </summary>
/// <param name="uri">Root uri the crawl starts from.</param>
/// <param name="cancellationTokenSource">Token source used to cancel the crawl.</param>
/// <returns>The aggregate result of the crawl.</returns>
public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    // Fetch/parse robots.txt (best effort) before any pages are requested.
    TryLoadRobotsTxt(uri);

    // Throttle every page request through the per-domain rate limiter.
    PageCrawlStarting += (s, e) => DomainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    return base.Crawl(uri, cancellationTokenSource);
}
/// <summary>
/// Begins a synchronous crawl rooted at <paramref name="uri"/>, honoring robots.txt
/// (when enabled), applying configured url exclusion patterns, and registering a
/// per-domain crawl-delay with the rate limiter.
/// </summary>
/// <param name="uri">Root uri the crawl starts from.</param>
/// <param name="cancellationTokenSource">Token source used to cancel the crawl.</param>
/// <returns>The aggregate result of the crawl.</returns>
public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    int robotsDotTextCrawlDelayInMillisecs = 0;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = _robotsDotTextFinder.Find(uri);

        if (_robotsDotText != null)
        {
            // Raise both the async and sync parse-completed events, matching the
            // crawler's dual event model.
            FireRobotsDotTextParseCompletedAsync(_robotsDotText.Robots);
            FireRobotsDotTextParseCompleted(_robotsDotText.Robots);

            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    // Additional url patterns to exclude
    var urlPatternsToExclude = _crawlContext.CrawlConfiguration.UrlPatternsToExclude;
    if (urlPatternsToExclude != null && urlPatternsToExclude.Count > 0)
    {
        // No robots.txt was fetched (or the feature is disabled); create an empty
        // one to act as the container for the manual disallow entries below.
        if (_robotsDotText == null)
            _robotsDotText = new RobotsDotText(uri, String.Empty);

        // Pattern match replaces the original "is" check followed by a second cast.
        if (_robotsDotText.Robots is Robots.Robots robots)
        {
            foreach (var pattern in urlPatternsToExclude)
                robots.AddDisallowEntry(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString, pattern);
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            // BUGFIX: {1} is the "Crawl-delay" value found in robots.txt; the original
            // passed the configured maximum here, logging the wrong number.
            _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, robotsDotTextCrawlDelayInSecs);

            // Clamp to the configured maximum.
            robotsDotTextCrawlDelayInSecs = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    // Throttle every page request through the per-domain rate limiter.
    PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    return base.Crawl(uri, cancellationTokenSource);
}
/// <summary>
/// Begins a synchronous crawl rooted at <paramref name="uri"/>, honoring robots.txt
/// (when enabled) and registering a per-domain crawl-delay with the rate limiter.
/// The rate-limit event hook is only attached when a delay is actually in effect.
/// </summary>
/// <param name="uri">Root uri the crawl starts from.</param>
/// <param name="cancellationTokenSource">Token source used to cancel the crawl.</param>
/// <returns>The aggregate result of the crawl.</returns>
public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    int robotsDotTextCrawlDelayInMillisecs = 0;
    int maxRobotsDotTextCrawlDelayInSeconds = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
    // CLEANUP: the original also computed maxRobotsDotTextCrawlDelayInMilliSecs,
    // which was never read anywhere in the method — removed as dead code.

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = _robotsDotTextFinder.Find(uri);

        if (_robotsDotText != null)
        {
            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > maxRobotsDotTextCrawlDelayInSeconds)
        {
            _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, robotsDotTextCrawlDelayInSecs);

            // Clamp to the configured maximum.
            robotsDotTextCrawlDelayInSecs = maxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    // Only pay the rate-limit cost per page when some delay applies (robots.txt or config minimum).
    if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
    {
        PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);
    }

    return base.Crawl(uri, cancellationTokenSource);
}
/// <summary>
/// Begins an asynchronous crawl rooted at <paramref name="uri"/>, honoring robots.txt
/// (when enabled) and registering a per-domain crawl-delay with the rate limiter.
/// Subscribe to the crawler's events to process pages as they become available.
/// </summary>
/// <param name="uri">Root uri the crawl starts from.</param>
/// <param name="cancellationTokenSource">Token source used to cancel the crawl.</param>
/// <returns>A task producing the aggregate result of the crawl.</returns>
public override async Task<CrawlResult> CrawlAsync(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    var robotsDotTextCrawlDelayInSecs = 0;
    var robotsDotTextCrawlDelayInMillisecs = 0;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = await _robotsDotTextFinder.FindAsync(uri);

        if (_robotsDotText != null)
        {
            // Raise both the async and sync parse-completed events, matching the
            // crawler's dual event model.
            FireRobotsDotTextParseCompletedAsync(_robotsDotText.Robots);
            FireRobotsDotTextParseCompleted(_robotsDotText.Robots);

            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            // BUGFIX: the "Crawl-delay" shown must be the value found in robots.txt;
            // the original interpolated the configured maximum, logging the wrong number.
            _logger.LogWarning($"[{uri}] robot.txt file directive [Crawl-delay: {robotsDotTextCrawlDelayInSecs}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.");

            // Clamp to the configured maximum.
            robotsDotTextCrawlDelayInSecs = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.LogWarning($"[{uri}] robot.txt file directive [Crawl-delay: {robotsDotTextCrawlDelayInSecs}] will be respected.");
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    // Throttle every page request through the per-domain rate limiter.
    PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    return await base.CrawlAsync(uri, cancellationTokenSource);
}