protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    bool allowedByRobots = true;
    if (_robotsDotText != null)
    {
        allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
    }

    if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
    {
        string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
        _logger.DebugFormat(message);

        allowedByRobots = true;
        _robotsDotText = null;
    }
    else if (!allowedByRobots)
    {
        string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
        _logger.DebugFormat(message);

        FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
        FirePageCrawlDisallowedEvent(pageToCrawl, message);

        return false;
    }

    return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
}
public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    int robotsDotTextCrawlDelayInMillisecs = 0;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = _robotsDotTextFinder.Find(uri);

        if (_robotsDotText != null)
        {
            FireRobotsDotTextParseCompletedAsync(_robotsDotText.Robots);
            FireRobotsDotTextParseCompleted(_robotsDotText.Robots);

            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Additional url patterns to exclude
    var urlPatternsToExclude = _crawlContext.CrawlConfiguration.UrlPatternsToExclude;
    if (urlPatternsToExclude != null && urlPatternsToExclude.Count > 0)
    {
        if (_robotsDotText == null)
        {
            _robotsDotText = new RobotsDotText(uri, String.Empty);
        }

        if (_robotsDotText.Robots is Robots.Robots)
        {
            foreach (var pattern in urlPatternsToExclude)
            {
                ((Robots.Robots)_robotsDotText.Robots).AddDisallowEntry(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString, pattern);
            }
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

            robotsDotTextCrawlDelayInSecs = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    return base.Crawl(uri, cancellationTokenSource);
}
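A minimal sketch of how the override above might be driven from configuration. The property names (IsRespectRobotsDotTextEnabled, RobotsDotTextUserAgentString, MaxRobotsDotTextCrawlDelayInSeconds, MinCrawlDelayPerDomainMilliSeconds, UrlPatternsToExclude) are taken from the code above; the CrawlConfiguration and PoliteWebCrawler type names, the List<string> shape of UrlPatternsToExclude, the "abot" user agent string, and the example URL are assumptions and may need adjusting to the crawler version actually in use.

using System;
using System.Collections.Generic;
using System.Threading;

public static class RobotsRespectingCrawlExample
{
    public static void Run()
    {
        // Hypothetical wiring: concrete type names are assumed, property names come from the override above.
        var config = new CrawlConfiguration
        {
            IsRespectRobotsDotTextEnabled = true,          // makes the override load and honor robots.txt
            RobotsDotTextUserAgentString = "abot",         // user agent matched against robots.txt groups
            MaxRobotsDotTextCrawlDelayInSeconds = 10,      // caps an excessive Crawl-delay directive
            MinCrawlDelayPerDomainMilliSeconds = 1000,     // per-domain floor compared against Crawl-delay
            UrlPatternsToExclude = new List<string> { "/private/*" } // merged into the robots rules by the override
        };

        var crawler = new PoliteWebCrawler(config);        // assumed crawler type hosting the override
        CrawlResult result = crawler.Crawl(new Uri("http://a.com"), new CancellationTokenSource());

        Console.WriteLine("Crawl finished for http://a.com");
    }
}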
public void Find_RobotsExists_UriIsRootDomainNoSlash_ReturnsRobotsDotText()
{
    Uri _rootUri = new Uri("http://a.com");
    Uri expectedRobotsUri = new Uri("http://a.com/robots.txt");
    _fakePageRequester.Setup(f => f.MakeRequest(It.Is<Uri>(u => u == expectedRobotsUri))).Returns(_goodPageResult);

    IRobotsDotText result = _uut.Find(_rootUri);

    _fakePageRequester.Verify(f => f.MakeRequest(It.Is<Uri>(u => u == expectedRobotsUri)), Times.Exactly(1));
    Assert.IsNotNull(result);
}
public async Task Find_RobotsExists_UriIsNotRootDomain_ReturnsRobotsDotText()
{
    Uri _rootUri = new Uri("http://a.com/a/b/b.html");
    Uri expectedRobotsUri = new Uri("http://a.com/robots.txt");
    _fakePageRequester.Setup(f => f.MakeRequestAsync(It.Is<Uri>(u => u == expectedRobotsUri))).Returns(Task.FromResult(_goodPageResult));

    IRobotsDotText result = await _uut.FindAsync(_rootUri);

    _fakePageRequester.Verify(f => f.MakeRequestAsync(It.Is<Uri>(u => u == expectedRobotsUri)), Times.Exactly(1));
    Assert.IsNotNull(result);
}
/// <summary>
/// Try to find and load the site's robots.txt
/// </summary>
/// <param name="uri">Root uri of the site whose robots.txt should be loaded</param>
/// <returns>True if a robots.txt file was found and loaded</returns>
protected bool TryLoadRobotsTxt(Uri uri)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    long robotsDotTextCrawlDelayInMillisecs = 0;

    // Load robots.txt
    if (CrawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        RobotsDotText = RobotsDotTextFinder.Find(uri);

        if (RobotsDotText != null)
        {
            Logger.InfoFormat("Robots.txt was found!");

            FireRobotsDotTextParseCompletedAsync(RobotsDotText.Robots);
            FireRobotsDotTextParseCompleted(RobotsDotText.Robots);

            robotsDotTextCrawlDelayInSecs = RobotsDotText.GetCrawlDelay(CrawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = TimeConverter.SecondsToMilliseconds(robotsDotTextCrawlDelayInSecs);
        }
        else
        {
            Logger.InfoFormat("Robots.txt was NOT found!");
        }
    }

    // Use whichever value is greater between the actual crawl delay value found,
    // the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > CrawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            Logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set " +
                              "in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.",
                              uri, CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

            robotsDotTextCrawlDelayInSecs = CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = TimeConverter.SecondsToMilliseconds(robotsDotTextCrawlDelayInSecs);
        }

        Logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        DomainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    return RobotsDotText != null;
}
/// <summary>
/// Determines whether the given page should be crawled, taking the site's robots.txt rules into account
/// </summary>
/// <param name="pageToCrawl"></param>
/// <returns></returns>
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    bool allowedByRobots = true;
    if (_robotsDotText != null)
    {
        allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
    }

    //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
    var allPathsBelowRootAllowedByRobots = false;
    if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
    {
        var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa" : pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
        allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
    }

    if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)
    {
        if (!allowedByRobots)
        {
            string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
            _logger.DebugFormat(message);

            allowedByRobots = true;
            _robotsDotText = null;
        }
        else if (!allPathsBelowRootAllowedByRobots)
        {
            string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
            _logger.DebugFormat(message);

            allowedByRobots = true;
            _robotsDotText = null;
        }
    }
    else if (!allowedByRobots)
    {
        string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
        _logger.DebugFormat(message);

        FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
        FirePageCrawlDisallowedEvent(pageToCrawl, message);

        return false;
    }

    return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
}
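A small sketch of the issue #96 probe in isolation: build a robots.txt policy that disallows everything below the root and check the root url against a synthetic child path, exactly as the override's anyPathOffRoot trick does. The RobotsDotText(uri, content) constructor and IsUrlAllowed(url, userAgent) calls are taken from the code above; the robots.txt content, the "abot" user agent string, and the actual allowed/disallowed results (which depend on the robots.txt parser in use) are assumptions.

using System;

public static class RootOnlyAllowedProbe
{
    public static void Run()
    {
        // Hypothetical robots.txt for the issue #96 scenario: every path below the root is disallowed.
        const string robotsContent = "User-agent: *\nDisallow: /*";
        var rootUri = new Uri("http://a.com/");

        var robots = new RobotsDotText(rootUri, robotsContent);

        // Same probe as the override: append a throwaway segment to stand in for "any path below the root".
        string anyPathOffRoot = rootUri.AbsoluteUri + "aaaaa";

        bool rootAllowed = robots.IsUrlAllowed(rootUri.AbsoluteUri, "abot");
        bool pathsBelowRootAllowed = robots.IsUrlAllowed(anyPathOffRoot, "abot");

        // Per the scenario the override guards against, the root may be reported as allowed
        // while every child path is not; results depend on how the parser treats "Disallow: /*".
        Console.WriteLine($"Root allowed: {rootAllowed}, paths below root allowed: {pathsBelowRootAllowed}");
    }
}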
public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    int robotsDotTextCrawlDelayInSecs = 0;
    int robotsDotTextCrawlDelayInMillisecs = 0;
    int maxRobotsDotTextCrawlDelayInSeconds = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
    int maxRobotsDotTextCrawlDelayInMilliSecs = maxRobotsDotTextCrawlDelayInSeconds * 1000;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = _robotsDotTextFinder.Find(uri);

        if (_robotsDotText != null)
        {
            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > maxRobotsDotTextCrawlDelayInSeconds)
        {
            _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, robotsDotTextCrawlDelayInSecs);

            robotsDotTextCrawlDelayInSecs = maxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
    {
        PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);
    }

    return base.Crawl(uri, cancellationTokenSource);
}
public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    var robotsDotTextCrawlDelayInSecs = 0;
    var robotsDotTextCrawlDelayInMillisecs = 0;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = _robotsDotTextFinder.Find(uri);

        if (_robotsDotText != null)
        {
            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

            robotsDotTextCrawlDelayInSecs = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    //if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
    //    PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
        PageCrawlCompleted += (s, e) => _domainRateLimiter.RateLimit((WebProxy)e.CrawledPage.HttpWebRequest.Proxy, e.CrawledPage.Uri);

    return base.Crawl(uri, cancellationTokenSource);
}
public override async Task<CrawlResult> CrawlAsync(Uri uri, CancellationTokenSource cancellationTokenSource)
{
    var robotsDotTextCrawlDelayInSecs = 0;
    var robotsDotTextCrawlDelayInMillisecs = 0;

    //Load robots.txt
    if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
    {
        _robotsDotText = await _robotsDotTextFinder.FindAsync(uri);

        if (_robotsDotText != null)
        {
            FireRobotsDotTextParseCompletedAsync(_robotsDotText.Robots);
            FireRobotsDotTextParseCompleted(_robotsDotText.Robots);

            robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }
    }

    //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
    if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
    {
        if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
        {
            _logger.LogWarning($"[{uri}] robot.txt file directive [Crawl-delay: {_crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.");

            robotsDotTextCrawlDelayInSecs = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
        }

        _logger.LogWarning($"[{uri}] robot.txt file directive [Crawl-delay: {robotsDotTextCrawlDelayInSecs}] will be respected.");
        _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
    }

    PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

    return await base.CrawlAsync(uri, cancellationTokenSource);
}
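A usage sketch for the async variant above: subscribe to the PageCrawlStarting event (the same event the override uses to hook its rate limiter, with e.PageToCrawl.Uri as shown) and call CrawlAsync with a CancellationTokenSource, matching the override's signature. The PoliteWebCrawler and CrawlConfiguration type names, the external event subscription, the "abot" user agent string, and the five-minute timeout are assumptions rather than anything confirmed by the snippet.

using System;
using System.Threading;
using System.Threading.Tasks;

public static class AsyncPoliteCrawlExample
{
    public static async Task RunAsync()
    {
        var config = new CrawlConfiguration
        {
            IsRespectRobotsDotTextEnabled = true,         // makes the override load robots.txt before crawling
            RobotsDotTextUserAgentString = "abot",        // user agent matched against robots.txt groups
            MinCrawlDelayPerDomainMilliSeconds = 500      // floor compared against any Crawl-delay directive
        };

        var crawler = new PoliteWebCrawler(config);       // assumed crawler type hosting the override above

        // Same event and args shape the override wires its rate limiter to.
        crawler.PageCrawlStarting += (s, e) =>
            Console.WriteLine($"About to crawl {e.PageToCrawl.Uri}");

        // The override takes a CancellationTokenSource, so a timeout can be used to stop a long crawl.
        using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(5));
        CrawlResult result = await crawler.CrawlAsync(new Uri("http://a.com"), cts);

        Console.WriteLine("Async crawl finished for http://a.com");
    }
}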
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    bool allowedByRobots = true;
    if (_robotsDotText != null)
        allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

    if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
    {
        string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
        _logger.DebugFormat(message);

        allowedByRobots = true;
        _robotsDotText = null;
    }
    else if (!allowedByRobots)
    {
        string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
        _logger.DebugFormat(message);

        FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
        FirePageCrawlDisallowedEvent(pageToCrawl, message);

        return false;
    }

    return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
}
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    bool allowedByRobots = true;
    if (_robotsDotText != null)
        allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

    //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
    var allPathsBelowRootAllowedByRobots = false;
    if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
    {
        var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa" : pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
        allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
    }

    if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)
    {
        if (!allowedByRobots)
        {
            string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
            _logger.DebugFormat(message);

            allowedByRobots = true;
            _robotsDotText = null;
        }
        else if (!allPathsBelowRootAllowedByRobots)
        {
            string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
            _logger.DebugFormat(message);

            allowedByRobots = true;
            _robotsDotText = null;
        }
    }
    else if (!allowedByRobots)
    {
        string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
        _logger.DebugFormat(message);

        FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
        FirePageCrawlDisallowedEvent(pageToCrawl, message);

        return false;
    }

    return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
}