Пример #1
0
        /// <summary>
        /// Classifies every robots.txt line and rebuilds the access-rule, crawl-delay and
        /// sitemap collections, raising the status flags along the way.
        /// </summary>
        /// <remarks>
        /// The four collections are replaced with fresh lists on every call. The boolean
        /// flags (<c>Malformed</c>, <c>HasRules</c>, <c>IsAnyPathDisallowed</c>,
        /// <c>HaveNoAllowRules</c>) are only ever set to <c>true</c> here; this method never
        /// resets them.
        /// </remarks>
        /// <param name="lines">Raw lines of the file; each one is wrapped in a <see cref="Line"/>.</param>
        private void ReadLines(string[] lines)
        {
            globalAccessRules = new List<AccessRule>();
            specificAccessRules = new List<AccessRule>();
            crawlDelayRules = new List<CrawlDelayRule>();
            Sitemaps = new List<Sitemap>();

            // Most recently seen User-agent value; rules that follow are attributed to it.
            string userAgent = String.Empty;

            // Running counter, pre-incremented and handed to each rule's constructor (first rule gets 1).
            int ruleCount = 0;

            foreach (var robotsLine in lines.Select(line => new Line(line)))
            {
                switch (robotsLine.Type)
                {
                    case LineType.Comment:
                        // Comments carry no directives.
                        continue;

                    case LineType.UserAgent:
                        userAgent = robotsLine.Value;
                        continue;

                    case LineType.Sitemap:
                    {
                        // Parse once and reuse the result, so the instance that was
                        // null-checked is the same instance that gets stored.
                        var sitemap = Sitemap.FromLine(robotsLine);
                        if (sitemap != null)
                        {
                            Sitemaps.Add(sitemap);
                        }
                        continue;
                    }

                    case LineType.AccessRule:
                    case LineType.CrawlDelayRule:
                        // A rule that appears before any User-agent declaration is
                        // dropped and the file is marked as malformed.
                        if (String.IsNullOrEmpty(userAgent))
                        {
                            this.Malformed = true;
                            continue;
                        }

                        if (robotsLine.Type == LineType.AccessRule)
                        {
                            var accessRule = new AccessRule(userAgent, robotsLine, ++ruleCount);

                            // Rules for "*" go to the global list, everything else to the specific list.
                            if (accessRule.For.Equals("*"))
                            {
                                this.globalAccessRules.Add(accessRule);
                            }
                            else
                            {
                                this.specificAccessRules.Add(accessRule);
                            }

                            // The path must be non-empty because the rule "Disallow: " means nothing is disallowed.
                            if (!accessRule.Allowed && !String.IsNullOrEmpty(accessRule.Path))
                            {
                                this.IsAnyPathDisallowed = true;
                            }
                        }
                        else
                        {
                            this.crawlDelayRules.Add(new CrawlDelayRule(userAgent, robotsLine, ++ruleCount));
                        }

                        this.HasRules = true;
                        continue;

                    case LineType.Unknown:
                    default:
                        // Anything unrecognised marks the file as malformed.
                        this.Malformed = true;
                        continue;
                }
            }

            // NOTE(review): despite its name, HaveNoAllowRules is raised when at least one
            // Disallow rule with a non-empty path exists in either list - confirm the intent.
            bool hasPathDisallow = this.globalAccessRules
                .Concat(this.specificAccessRules)
                .Any(rule => !rule.Allowed && !String.IsNullOrEmpty(rule.Path));

            if (hasPathDisallow)
            {
                this.HaveNoAllowRules = true;
            }
        }
Пример #2
0
        /// <summary>
        /// Classifies every robots.txt line and rebuilds the access-rule, crawl-delay and
        /// sitemap collections, raising the status flags along the way.
        /// </summary>
        /// <remarks>
        /// The four collections are replaced with fresh lists on every call. The boolean
        /// flags (<c>Malformed</c>, <c>HasRules</c>, <c>IsAnyPathDisallowed</c>) are only
        /// ever set to <c>true</c> here; this method never resets them.
        /// </remarks>
        /// <param name="lines">Raw lines of the file; each one is wrapped in a <see cref="Line"/>.</param>
        private void readLines(string[] lines)
        {
            globalAccessRules = new List<AccessRule>();
            specificAccessRules = new List<AccessRule>();
            crawlDelayRules = new List<CrawlDelayRule>();
            Sitemaps = new List<Sitemap>();

            // Most recently seen User-agent value; rules that follow are attributed to it.
            string userAgent = String.Empty;

            // Running counter, pre-incremented and handed to each rule's constructor (first rule gets 1).
            int ruleCount = 0;

            foreach (string line in lines)
            {
                var robotsLine = new Line(line);
                switch (robotsLine.Type)
                {
                    case LineType.Comment:
                        // Comments carry no directives.
                        continue;

                    case LineType.UserAgent:
                        userAgent = robotsLine.Value;
                        continue;

                    case LineType.Sitemap:
                    {
                        // Guard against a null result so the Sitemaps list never holds
                        // null entries (assumes FromLine returns null for a line it
                        // cannot turn into a sitemap - TODO confirm).
                        var sitemap = Sitemap.FromLine(robotsLine);
                        if (sitemap != null)
                        {
                            Sitemaps.Add(sitemap);
                        }
                        continue;
                    }

                    case LineType.AccessRule:
                    case LineType.CrawlDelayRule:
                        // A rule that appears before any User-agent declaration is
                        // dropped and the file is marked as malformed.
                        if (String.IsNullOrEmpty(userAgent))
                        {
                            Malformed = true;
                            continue;
                        }

                        if (robotsLine.Type == LineType.AccessRule)
                        {
                            var accessRule = new AccessRule(userAgent, robotsLine, ++ruleCount);

                            // Rules for "*" go to the global list, everything else to the specific list.
                            if (accessRule.For.Equals("*"))
                            {
                                globalAccessRules.Add(accessRule);
                            }
                            else
                            {
                                specificAccessRules.Add(accessRule);
                            }

                            // The path must be non-empty because the rule "Disallow: " means nothing is disallowed.
                            if (!accessRule.Allowed && !String.IsNullOrEmpty(accessRule.Path))
                            {
                                IsAnyPathDisallowed = true;
                            }
                        }
                        else
                        {
                            crawlDelayRules.Add(new CrawlDelayRule(userAgent, robotsLine, ++ruleCount));
                        }

                        HasRules = true;
                        continue;

                    case LineType.Unknown:
                        // Unrecognised lines mark the file as malformed.
                        Malformed = true;
                        continue;
                }
            }
        }