Example #1
        protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            bool allowedByRobots = true;

            if (_robotsDotText != null)
            {
                allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            }

            if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
            {
                string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);
                allowedByRobots = true;
                _robotsDotText  = null;
            }
            else if (!allowedByRobots)
            {
                string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);

                FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
                FirePageCrawlDisallowedEvent(pageToCrawl, message);

                return(false);
            }

            return(allowedByRobots && base.ShouldCrawlPage(pageToCrawl));
        }
        public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            int robotsDotTextCrawlDelayInSecs      = 0;
            int robotsDotTextCrawlDelayInMillisecs = 0;

            //Load robots.txt
            if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
            {
                _robotsDotText = _robotsDotTextFinder.Find(uri);

                if (_robotsDotText != null)
                {
                    FireRobotsDotTextParseCompletedAsync(_robotsDotText.Robots);
                    FireRobotsDotTextParseCompleted(_robotsDotText.Robots);

                    robotsDotTextCrawlDelayInSecs      = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
                    robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
                }
            }

            // Additional url patterns to exclude
            var urlPatternsToExclude = _crawlContext.CrawlConfiguration.UrlPatternsToExclude;

            if (urlPatternsToExclude != null && urlPatternsToExclude.Count > 0)
            {
                if (_robotsDotText == null)
                {
                    _robotsDotText = new RobotsDotText(uri, String.Empty);
                }
                if (_robotsDotText.Robots is Robots.Robots)
                {
                    foreach (var pattern in urlPatternsToExclude)
                    {
                        ((Robots.Robots)_robotsDotText.Robots).AddDisallowEntry(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString, pattern);
                    }
                }
            }

            //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
            if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
            {
                if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
                {
                    _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

                    robotsDotTextCrawlDelayInSecs      = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
                    robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
                }

                _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
                _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
            }

            PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

            return(base.Crawl(uri, cancellationTokenSource));
        }
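Besides robots.txt, the Crawl override above folds any configured URL patterns into the same disallow mechanism. A minimal sketch of how that might be set up; UrlPatternsToExclude appears only in this fork's CrawlConfiguration, and whether the collection is pre-initialized or must be assigned is an assumption here:

        var config = new CrawlConfiguration
        {
            IsRespectRobotsDotTextEnabled = true
        };

        // Assumption: UrlPatternsToExclude is a mutable string collection on this fork's config.
        // Each pattern is appended as a Disallow entry for RobotsDotTextUserAgentString,
        // even when the site publishes no robots.txt of its own.
        config.UrlPatternsToExclude.Add("/private/");
        config.UrlPatternsToExclude.Add("/tmp/");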
        public void Find_RobotsExists_UriIsRootDomainNoSlash_ReturnsRobotsDotText()
        {
            Uri _rootUri          = new Uri("http://a.com");
            Uri expectedRobotsUri = new Uri("http://a.com/robots.txt");

            _fakePageRequester.Setup(f => f.MakeRequest(It.Is<Uri>(u => u == expectedRobotsUri))).Returns(_goodPageResult);

            IRobotsDotText result = _uut.Find(_rootUri);

            _fakePageRequester.Verify(f => f.MakeRequest(It.Is<Uri>(u => u == expectedRobotsUri)), Times.Exactly(1));
            Assert.IsNotNull(result);
        }
        public async Task Find_RobotsExists_UriIsNotRootDomain_ReturnsRobotsDotText()
        {
            Uri _rootUri          = new Uri("http://a.com/a/b/b.html");
            Uri expectedRobotsUri = new Uri("http://a.com/robots.txt");

            _fakePageRequester.Setup(f => f.MakeRequestAsync(It.Is<Uri>(u => u == expectedRobotsUri))).Returns(Task.FromResult(_goodPageResult));

            IRobotsDotText result = await _uut.FindAsync(_rootUri);

            _fakePageRequester.Verify(f => f.MakeRequestAsync(It.Is<Uri>(u => u == expectedRobotsUri)), Times.Exactly(1));
            Assert.IsNotNull(result);
        }
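The two finder tests above rely on a fixture (_uut, _fakePageRequester, _goodPageResult) that is not shown. A minimal sketch of what that setup could look like with Moq and NUnit; the RobotsDotTextFinder constructor and the shape of the "good" page result are assumptions inferred from how the tests use them:

        private RobotsDotTextFinder _uut;
        private Mock<IPageRequester> _fakePageRequester;
        private CrawledPage _goodPageResult;

        [SetUp]
        public void SetUp()
        {
            _fakePageRequester = new Mock<IPageRequester>();

            // A successful robots.txt fetch: the finder only needs a non-null page whose
            // content it can hand to the robots.txt parser.
            _goodPageResult = new CrawledPage(new Uri("http://a.com/robots.txt"));

            // Assumption: the finder takes the page requester it uses to fetch /robots.txt.
            _uut = new RobotsDotTextFinder(_fakePageRequester.Object);
        }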
Example #5
        /// <summary>
        /// Try to find and load the site's robots.txt file.
        /// </summary>
        /// <param name="uri">Root URI of the site whose robots.txt should be located.</param>
        /// <returns>True if a robots.txt file was found and loaded; otherwise false.</returns>
        protected bool TryLoadRobotsTxt(Uri uri)
        {
            int  robotsDotTextCrawlDelayInSecs      = 0;
            long robotsDotTextCrawlDelayInMillisecs = 0;

            // Load robots.txt
            if (CrawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
            {
                RobotsDotText = RobotsDotTextFinder.Find(uri);

                if (RobotsDotText != null)
                {
                    Logger.InfoFormat("Robots.txt was found!");

                    FireRobotsDotTextParseCompletedAsync(RobotsDotText.Robots);
                    FireRobotsDotTextParseCompleted(RobotsDotText.Robots);

                    robotsDotTextCrawlDelayInSecs      = RobotsDotText.GetCrawlDelay(CrawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
                    robotsDotTextCrawlDelayInMillisecs = TimeConverter.SecondsToMilliseconds(robotsDotTextCrawlDelayInSecs);
                }
                else
                {
                    Logger.InfoFormat("Robots.txt was NOT found!");
                }
            }

            // Use whichever value is greater between the actual crawl delay value found,
            // the max allowed crawl delay value or the minimum crawl delay required for every domain
            if (robotsDotTextCrawlDelayInSecs > 0 &&
                robotsDotTextCrawlDelayInMillisecs > CrawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
            {
                if (robotsDotTextCrawlDelayInSecs > CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
                {
                    Logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set " +
                                      "in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.",
                                      uri,
                                      CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

                    robotsDotTextCrawlDelayInSecs      = CrawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
                    robotsDotTextCrawlDelayInMillisecs = TimeConverter.SecondsToMilliseconds(robotsDotTextCrawlDelayInSecs);
                }

                Logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.",
                                  uri,
                                  robotsDotTextCrawlDelayInSecs);

                DomainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
            }

            return(RobotsDotText != null);
        }
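TryLoadRobotsTxt and the Crawl overrides in this section are driven entirely by CrawlConfiguration. A minimal sketch of the settings they read, with property names taken from the code above and purely illustrative values:

        var config = new CrawlConfiguration
        {
            // Fetch and honor robots.txt before crawling a site.
            IsRespectRobotsDotTextEnabled = true,

            // User agent string matched against the User-agent sections of robots.txt.
            RobotsDotTextUserAgentString = "abot",

            // Never wait longer than this, even if robots.txt asks for a larger Crawl-delay.
            MaxRobotsDotTextCrawlDelayInSeconds = 5,

            // Minimum delay applied to every domain, regardless of robots.txt.
            MinCrawlDelayPerDomainMilliSeconds = 1000,

            // If the root URL itself is disallowed, ignore robots.txt for the whole site.
            IsIgnoreRobotsDotTextIfRootDisallowedEnabled = false
        };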
Example #6
        /// <summary>
        /// Determines whether the given page should be crawled, honoring the loaded robots.txt rules.
        /// </summary>
        /// <param name="pageToCrawl">The page whose URL is checked against robots.txt.</param>
        /// <returns>True if the page is allowed to be crawled; otherwise false.</returns>
        protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            bool allowedByRobots = true;

            if (_robotsDotText != null)
            {
                allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            }


            //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
            var allPathsBelowRootAllowedByRobots = false;

            if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
            {
                var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
                allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            }

            if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)
            {
                if (!allowedByRobots)
                {
                    string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    _robotsDotText  = null;
                }
                else if (!allPathsBelowRootAllowedByRobots)
                {
                    string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    _robotsDotText  = null;
                }
            }
            else if (!allowedByRobots)
            {
                string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);

                FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
                FirePageCrawlDisallowedEvent(pageToCrawl, message);

                return(false);
            }

            return(allowedByRobots && base.ShouldCrawlPage(pageToCrawl));
        }
        public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            int robotsDotTextCrawlDelayInSecs      = 0;
            int robotsDotTextCrawlDelayInMillisecs = 0;

            int maxRobotsDotTextCrawlDelayInSeconds   = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
            int maxRobotsDotTextCrawlDelayInMilliSecs = maxRobotsDotTextCrawlDelayInSeconds * 1000;

            //Load robots.txt
            if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
            {
                _robotsDotText = _robotsDotTextFinder.Find(uri);

                if (_robotsDotText != null)
                {
                    robotsDotTextCrawlDelayInSecs      = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
                    robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
                }
            }

            //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
            if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
            {
                if (robotsDotTextCrawlDelayInSecs > maxRobotsDotTextCrawlDelayInSeconds)
                {
                    _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, robotsDotTextCrawlDelayInSecs);

                    robotsDotTextCrawlDelayInSecs      = maxRobotsDotTextCrawlDelayInSeconds;
                    robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
                }

                _logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
                _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
            }

            if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
            {
                PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);
            }

            return(base.Crawl(uri, cancellationTokenSource));
        }
		public override CrawlResult Crawl(Uri uri, CancellationTokenSource cancellationTokenSource)
		{
			var robotsDotTextCrawlDelayInSecs = 0;
			var robotsDotTextCrawlDelayInMillisecs = 0;

			//Load robots.txt
			if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
			{
				_robotsDotText = _robotsDotTextFinder.Find(uri);

				if (_robotsDotText != null)
				{
					robotsDotTextCrawlDelayInSecs = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
					robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
				}
			}

			//Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
			if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
			{
				if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
				{
					_logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.", uri, _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds);

					robotsDotTextCrawlDelayInSecs = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
					robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
				}

				_logger.WarnFormat("[{0}] robot.txt file directive [Crawl-delay: {1}] will be respected.", uri, robotsDotTextCrawlDelayInSecs);
				_domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
			}

			//if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
			//	PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

			if (robotsDotTextCrawlDelayInSecs > 0 || _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds > 0)
				PageCrawlCompleted += (s, e) => _domainRateLimiter.RateLimit((WebProxy)e.CrawledPage.HttpWebRequest.Proxy, e.CrawledPage.Uri);

			return base.Crawl(uri, cancellationTokenSource);
		}
        public override async Task <CrawlResult> CrawlAsync(Uri uri, CancellationTokenSource cancellationTokenSource)
        {
            var robotsDotTextCrawlDelayInSecs      = 0;
            var robotsDotTextCrawlDelayInMillisecs = 0;

            //Load robots.txt
            if (_crawlContext.CrawlConfiguration.IsRespectRobotsDotTextEnabled)
            {
                _robotsDotText = await _robotsDotTextFinder.FindAsync(uri);

                if (_robotsDotText != null)
                {
                    FireRobotsDotTextParseCompletedAsync(_robotsDotText.Robots);
                    FireRobotsDotTextParseCompleted(_robotsDotText.Robots);

                    robotsDotTextCrawlDelayInSecs      = _robotsDotText.GetCrawlDelay(_crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
                    robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
                }
            }

            //Use whichever value is greater between the actual crawl delay value found, the max allowed crawl delay value or the minimum crawl delay required for every domain
            if (robotsDotTextCrawlDelayInSecs > 0 && robotsDotTextCrawlDelayInMillisecs > _crawlContext.CrawlConfiguration.MinCrawlDelayPerDomainMilliSeconds)
            {
                if (robotsDotTextCrawlDelayInSecs > _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds)
                {
                    _logger.LogWarning($"[{uri}] robot.txt file directive [Crawl-delay: {_crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds}] is above the value set in the config value MaxRobotsDotTextCrawlDelay, will use MaxRobotsDotTextCrawlDelay value instead.");

                    robotsDotTextCrawlDelayInSecs      = _crawlContext.CrawlConfiguration.MaxRobotsDotTextCrawlDelayInSeconds;
                    robotsDotTextCrawlDelayInMillisecs = robotsDotTextCrawlDelayInSecs * 1000;
                }

                _logger.LogWarning($"[{uri}] robot.txt file directive [Crawl-delay: {robotsDotTextCrawlDelayInSecs}] will be respected.");
                _domainRateLimiter.AddDomain(uri, robotsDotTextCrawlDelayInMillisecs);
            }

            PageCrawlStarting += (s, e) => _domainRateLimiter.RateLimit(e.PageToCrawl.Uri);

            return(await base.CrawlAsync(uri, cancellationTokenSource));
        }
Example #10
        protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            bool allowedByRobots = true;
            if (_robotsDotText != null)
                allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

            if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
            {
                string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);
                allowedByRobots = true;
                _robotsDotText = null;
            }
            else if (!allowedByRobots)
            {
                string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);

                FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
                FirePageCrawlDisallowedEvent(pageToCrawl, message);

                return false;
            }

            return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
        }
Example #11
        protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            bool allowedByRobots = true;
            if (_robotsDotText != null)
                allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);


            //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
            var allPathsBelowRootAllowedByRobots = false;
            if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
            {
                var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
                allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            }

            if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)    
            {
                if (!allowedByRobots)
                {
                    string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    _robotsDotText = null;
                }
                else if (!allPathsBelowRootAllowedByRobots)
                {
                    string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    _robotsDotText = null;
                }

            }
            else if (!allowedByRobots)
            {
                string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);

                FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
                FirePageCrawlDisallowedEvent(pageToCrawl, message);

                return false;
            }

            return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
        }
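All of the overrides in this section live in a polite-crawler subclass (PoliteWebCrawler in Abot). A hedged usage sketch, reusing the configuration sketched after Example #5 and only the members shown above; constructor overloads differ between Abot versions:

        var crawler = new PoliteWebCrawler(config);

        // The same event the Crawl overrides hook internally to rate-limit each domain.
        crawler.PageCrawlStarting += (s, e) =>
            Console.WriteLine("About to crawl " + e.PageToCrawl.Uri.AbsoluteUri);

        // Crawl loads robots.txt (when IsRespectRobotsDotTextEnabled), registers any
        // Crawl-delay with the domain rate limiter, then starts the crawl.
        var cts = new CancellationTokenSource();
        CrawlResult result = crawler.Crawl(new Uri("http://a.com/"), cts);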