internal static Sitemap FromLine(Line line)
{
    Sitemap s = new Sitemap { Value = line.Value };
    try
    {
        s.Url = new Uri(line.Value);
    }
    catch (UriFormatException)
    {
        // Fail silently; we can't do anything about the URI being invalid.
    }
    return s;
}
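// A rough illustration (hypothetical inputs; assumes Line extracts the text after
// "Sitemap:" into Value) of how FromLine behaves for valid and invalid entries:
//
//   var ok = Sitemap.FromLine(new Line("Sitemap: https://example.com/sitemap.xml"));
//   //   ok.Url   -> https://example.com/sitemap.xml
//   //   ok.Value -> "https://example.com/sitemap.xml"
//
//   var bad = Sitemap.FromLine(new Line("Sitemap: not a valid uri"));
//   //   bad.Url   -> null (the UriFormatException above is swallowed)
//   //   bad.Value -> "not a valid uri" (the raw text is preserved)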
private void readLines(string[] lines)
{
    globalAccessRules = new List<AccessRule>();
    specificAccessRules = new List<AccessRule>();
    crawlDelayRules = new List<CrawlDelayRule>();
    Sitemaps = new List<Sitemap>();
    string userAgent = String.Empty;
    int ruleCount = 0;
    for (int i = 0; i < lines.Length; i++)
    {
        var line = lines[i];
        var robotsLine = new Line(line);
        switch (robotsLine.Type)
        {
            case LineType.Comment:
                // Ignore comments.
                continue;
            case LineType.UserAgent:
                userAgent = robotsLine.Value;
                continue;
            case LineType.Sitemap:
                Sitemaps.Add(Sitemap.FromLine(robotsLine));
                continue;
            case LineType.AccessRule:
            case LineType.CrawlDelayRule:
                // A rule that appears before any User-agent declaration is ignored.
                if (String.IsNullOrEmpty(userAgent))
                {
                    Malformed = true;
                    continue;
                }
                if (robotsLine.Type == LineType.AccessRule)
                {
                    var accessRule = new AccessRule(userAgent, robotsLine, ++ruleCount);
                    if (accessRule.For.Equals("*"))
                    {
                        globalAccessRules.Add(accessRule);
                    }
                    else
                    {
                        specificAccessRules.Add(accessRule);
                    }
                    if (!accessRule.Allowed && !String.IsNullOrEmpty(accessRule.Path))
                    {
                        // The path check matters because "Disallow: " (an empty path) means nothing is disallowed.
                        IsAnyPathDisallowed = true;
                    }
                }
                else
                {
                    crawlDelayRules.Add(new CrawlDelayRule(userAgent, robotsLine, ++ruleCount));
                }
                HasRules = true;
                continue;
            case LineType.Unknown:
                Malformed = true;
                continue;
        }
    }
}
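// A rough walkthrough (hypothetical robots.txt; assumes Line/LineType classify the standard
// directives as shown, and that AccessRule.For echoes the User-agent it was created for)
// of how readLines buckets each line:
//
//   # crawler hints                            -> LineType.Comment, skipped
//   User-agent: *                              -> userAgent = "*"
//   Disallow: /private/                        -> globalAccessRules (ruleCount 1), IsAnyPathDisallowed = true
//   Crawl-delay: 10                            -> crawlDelayRules (ruleCount 2)
//   User-agent: SomeBot                        -> userAgent = "SomeBot"
//   Allow: /public/                            -> specificAccessRules (ruleCount 3)
//   Sitemap: https://example.com/sitemap.xml   -> Sitemaps (does not reset userAgent)
//   Disallow: /tmp/                            -> specificAccessRules (ruleCount 4), still for "SomeBot"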