Ejemplo n.º 1
0
        /// <summary>
        /// Using provided RobotsData, determines if specified path is disallowed or allowed.
        /// </summary>
        /// <param name="data">RobotsData containing necessary lists to check if URL is excluded</param>
        /// <param name="path">URL request path used for matching excluded patterns</param>
        /// <returns></returns>
        bool isUrlExcluded(RobotsData data, string path)
        {
            // update last access
            data.LastAccess = DateTime.Now;

            // check if disallow list is empty, automatically accept url
            if (data.DisallowedList == null || data.DisallowedList.Count == 0)
            {
                return(false);
            }

            // THE MOST SPECIFIC RULE IS PRIORITIZED

            string matchedDisallowPattern = null;

            // check if URL is disallowed
            foreach (var pattern in data.DisallowedList)
            {
                if (Regex.IsMatch(path, pattern))
                {
                    matchedDisallowPattern = pattern;

                    // ONCE FIRST PATTERN IS MATCHED, DO NOT SEARCH FURTHER
                    break;
                }
            }

            // if no disallow was matched, automatically accept url
            if (matchedDisallowPattern == null)
            {
                return(false);
            }

            // if URL is disallowed, check if a more specific allowed rule exists
            foreach (var pattern in data.AllowedList)
            {
                if (Regex.IsMatch(path, pattern))
                {
                    // if matched pattern is more detailed than disallowed pattern, URL is allowed
                    if (pattern.Length > matchedDisallowPattern.Length)
                    {
                        return(false);
                    }
                }
            }

            // URL is excluded because it was matched by a disallow pattern
            return(true);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Checks if URL is excluded for specified domain. Specify new config to override default one.
        /// </summary>
        /// <returns></returns>
        public async Task <bool> IsUrlExcluded(string url, WorkerConfiguration config = null, bool getRobotsIfMissing = false)
        {
            config = config ?? this.config;

            // check if URL matches any blacklisted patterns
            if (Extensions.IsURLBlacklisted(url, config))
            {
                return(true);
            }

            // if robot standard is not being respected, return false
            if (config.RespectRobotsExclusionStandard == false)
            {
                return(false);
            }

            // otherwise check the exclusion list

            var domain = Extensions.GetDomainName(url, out string protocol, out string path, false);

            if (UrlExclusions.TryGetValue(domain, out RobotsData data))
            {
                return(isUrlExcluded(data, path));
            }
            else if (getRobotsIfMissing)
            {
                // send request for robot.txt
                data = null;
                try
                {
                    // send request
                    var response = await http.GetAsync($"{protocol}://{domain}/robots.txt");

                    response.EnsureSuccessStatusCode();

                    var robotsStream = await response.Content.ReadAsStreamAsync();

                    // process content
                    data = await processRobotsContent(robotsStream);

                    Logger.Log($"Got 'robots.txt' for '{domain}' (" +
                               $"Allowed: {data.AllowedList.Count}, " +
                               $"Disallowed: {data.DisallowedList.Count}, " +
                               $"Wait: {data.WaitTime})", Logger.LogSeverity.Debug);
                }
                catch (Exception ex)
                {
                    // leave data empty if request fails
                    Logger.Log($"Failed to get 'robots.txt' for '{domain}': {ex.Message}", Logger.LogSeverity.Debug);
                }
                finally
                {
                    if (data == null)
                    {
                        data = new RobotsData();
                    }

                    // add to url exclusions
                    UrlExclusions.TryAdd(domain, data);
                }

                return(isUrlExcluded(data, path));
            }

            return(false);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Processes "robots.txt" content and returns the data. Specify new config to override default one.
        /// </summary>
        /// <param name="stream">Stream containing "robots.txt" content</param>
        async Task <RobotsData> processRobotsContent(Stream stream, WorkerConfiguration config = null)
        {
            config = config ?? this.config;

            var data = new RobotsData();

            using (var reader = new StreamReader(stream))
            {
                bool         userAgentMatched = false;
                const string userAgentKey     = "User-agent: ";

                while (!reader.EndOfStream)
                {
                    var line = await reader.ReadLineAsync();

                    // ignore comments
                    if (line.StartsWith('#'))
                    {
                        continue;
                    }

                    // check if we match the user-agent
                    if (line.StartsWith(userAgentKey, StringComparison.OrdinalIgnoreCase))
                    {
                        if (userAgentMatched)
                        {
                            // our user-agent was already matched, this means we got OUT of our user-agent sector
                            // based on specification: Only first matched User-Agent section is respected - others are ignored
                            break;
                        }

                        var useragent = line.Substring(userAgentKey.Length);

                        // if useragent is identical to our useragent, it's a match
                        if (useragent.ToLower() == config.UserAgent.ToLower())
                        {
                            userAgentMatched = true;
                        }
                        else
                        {
                            // otherwise if useragent contains '*', treat it as regex pattern
                            var pattern = GetRegexPattern(useragent);
                            if (Regex.IsMatch(config.UserAgent, pattern, RegexOptions.IgnoreCase))
                            {
                                userAgentMatched = true;
                            }
                        }

                        continue;
                    }

                    // do not continue until user agent is matched
                    if (userAgentMatched == false)
                    {
                        continue;
                    }

                    const string allowKey    = "Allow: ";
                    const string disallowKey = "Disallow: ";
                    const string delayKey    = "Crawl-delay: ";

                    if (line.StartsWith(disallowKey, StringComparison.OrdinalIgnoreCase))
                    {
                        var disallowedPath = line.Substring(disallowKey.Length);
                        var pattern        = GetRegexPattern(disallowedPath);

                        // '/' is a special symbol to match everything
                        if (disallowedPath == "/")
                        {
                            pattern = ".*";
                        }

                        data.DisallowedList.Add(pattern);
                    }
                    else if (line.StartsWith(allowKey, StringComparison.OrdinalIgnoreCase))
                    {
                        var allowedPath = line.Substring(allowKey.Length);
                        var pattern     = GetRegexPattern(allowedPath);
                        data.AllowedList.Add(pattern);
                    }
                    else if (line.StartsWith(delayKey, StringComparison.OrdinalIgnoreCase))
                    {
                        var delay = line.Substring(delayKey.Length);
                        if (double.TryParse(delay, out double r))
                        {
                            data.WaitTime = r;
                        }
                    }
                }
            }

            return(data);
        }