예제 #1
0
        /// <summary>
        /// Checks the current step's URI against the robots.txt rules of its site. The robots
        /// file is downloaded the first time its URL is seen and cached in <c>_robotsInfo</c>.
        /// </summary>
        /// <param name="crawler">The crawler invoking this step; not used by this method.</param>
        /// <param name="propertyBag">
        /// Supplies the step URI and user agent, and receives the outcome under
        /// <c>RobotsIsPathAllowedPropertyName</c>.
        /// </param>
        /// <returns>
        /// <c>true</c> when the robots file has no rules (in which case no property is written)
        /// or when the rules allow the URI; otherwise <c>false</c>.
        /// </returns>
        public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // Lower-cased scheme + host (+ port): the prefix of the robots URL and of the cache key.
            string siteRoot = propertyBag.Step.Uri
                .GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped)
                .ToLowerInvariant();
            string robotsHttpUrl = string.IsNullOrEmpty(_searchPath)
                                ? siteRoot + "/robots.txt"
                                : siteRoot + _searchPath;

            RobotsTxt.Robots robots;
            if (!_robotsInfo.TryGetValue(robotsHttpUrl, out robots))
            {
                _logger.Verbose("Downloading robots.txt file from {@0}", robotsHttpUrl);
                string robotsContext = null;
                try
                {
                    robotsContext = await _httpClient.GetStringAsync(robotsHttpUrl);
                }
                catch (Exception ex) when (ex is WebException
                                           || ex is ProtocolViolationException
                                           || ex is HttpRequestException)
                {
                    // Best effort: a failed download is treated as an empty robots file,
                    // but the failure is recorded instead of being swallowed silently.
                    _logger.Verbose("Could not download robots.txt file from {@0}", robotsHttpUrl);
                }

                robots = new RobotsTxt.Robots(robotsContext ?? string.Empty);

                // Indexer rather than Add: another call for the same site may have stored an
                // entry while this one was awaiting the download, and Add would then throw on
                // the duplicate key (assuming a standard dictionary - TODO confirm field type).
                _robotsInfo[robotsHttpUrl] = robots;
            }

            if (!robots.HasRules)
            {
                return(true);
            }

            long crawlDelay = robots.CrawlDelay(propertyBag.UserAgent);

            if (crawlDelay > 0)
            {
                // Clamp before narrowing: an unchecked (int) cast of a large long can wrap to a
                // negative value, which Task.Delay rejects with ArgumentOutOfRangeException.
                // NOTE(review): the value is passed as milliseconds - confirm CrawlDelay's unit.
                await Task.Delay((int)Math.Min(crawlDelay, int.MaxValue));
            }

            // NOTE(review): the absolute URI string is passed here; confirm IsPathAllowed
            // accepts a full URL rather than only a path.
            bool result = robots.IsPathAllowed(propertyBag.UserAgent, propertyBag.Step.Uri.ToString());

            propertyBag[RobotsIsPathAllowedPropertyName].Name  = nameof(RobotsPipelineStep);
            propertyBag[RobotsIsPathAllowedPropertyName].Value = result;
            return(result);
        }
예제 #2
0
		/// <summary>
		/// Decides whether the current step's URI may be crawled according to the site's
		/// robots.txt. The file is fetched once per robots URL and kept in <c>_robotsInfo</c>.
		/// </summary>
		/// <param name="crawler">The crawler invoking this step; not used by this method.</param>
		/// <param name="propertyBag">
		/// Provides the step URI and user agent; the result is stored on it under
		/// <c>RobotsIsPathAllowedPropertyName</c>.
		/// </param>
		/// <returns>
		/// <c>true</c> if the robots file contains no rules (no property is written then) or
		/// the rules allow the URI; <c>false</c> otherwise.
		/// </returns>
		public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			// Scheme and server are computed once; they prefix the robots URL in both cases.
			string schemeAndServer = propertyBag.Step.Uri
				.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped)
				.ToLowerInvariant();
			string robotsHttpUrl = string.IsNullOrEmpty(_searchPath)
				? schemeAndServer + "/robots.txt"
				: schemeAndServer + _searchPath;

			RobotsTxt.Robots robots;
			if (!_robotsInfo.TryGetValue(robotsHttpUrl, out robots))
			{
				_logger.Verbose("Downloading robots.txt file from {@0}", robotsHttpUrl);
				string robotsContext = null;
				try
				{
					robotsContext = await _httpClient.GetStringAsync(robotsHttpUrl);
				}
				catch (Exception ex) when (ex is WebException
					|| ex is ProtocolViolationException
					|| ex is HttpRequestException)
				{
					// A missing or unreachable robots file is not fatal: fall through with
					// empty content, but leave a trace of the failure in the log.
					_logger.Verbose("Could not download robots.txt file from {@0}", robotsHttpUrl);
				}

				robots = new RobotsTxt.Robots(robotsContext ?? string.Empty);

				// Assign through the indexer so that a concurrent call which cached the same
				// URL during the await above does not make this one fail; Add would throw on
				// a duplicate key (assuming a standard dictionary - TODO confirm field type).
				_robotsInfo[robotsHttpUrl] = robots;
			}

			if (!robots.HasRules)
			{
				return true;
			}

			long crawlDelay = robots.CrawlDelay(propertyBag.UserAgent);
			if (crawlDelay > 0)
			{
				// Cap at int.MaxValue before the cast; an unchecked narrowing of a larger value
				// could turn negative and make Task.Delay throw ArgumentOutOfRangeException.
				// NOTE(review): the value is passed as milliseconds - confirm CrawlDelay's unit.
				await Task.Delay((int) Math.Min(crawlDelay, int.MaxValue));
			}

			// NOTE(review): the absolute URI string is passed here; confirm IsPathAllowed
			// accepts a full URL rather than only a path.
			bool result = robots.IsPathAllowed(propertyBag.UserAgent, propertyBag.Step.Uri.ToString());
			propertyBag[RobotsIsPathAllowedPropertyName].Name = nameof(RobotsPipelineStep);
			propertyBag[RobotsIsPathAllowedPropertyName].Value = result;
			return result;
		}