/// <summary>
/// Walks the token stream field by field and groups the recognised fields
/// (user-agent, allow, disallow, crawl-delay) into site access entries.
/// </summary>
/// <param name="tokens">The tokens to read. Fields not contained in <c>ExpectedFields</c> are skipped.</param>
/// <returns>
/// The collected entries. The parse state is converted with <c>AsEntry()</c> and appended once more
/// after the loop, so the result always contains at least one entry.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="tokens"/> is <c>null</c>.</exception>
public IEnumerable<SiteAccessEntry> GetSiteAccessEntries(IEnumerable<Token> tokens) {
    if (tokens == null) {
        throw new ArgumentNullException(nameof(tokens));
    }

    var result = new List<SiteAccessEntry>();
    var parseState = new SiteAccessParseState();
    var comparer = StringComparer.OrdinalIgnoreCase;

    using (var enumerator = tokens.GetEnumerator()) {
        string lastFieldValue = null;

        while (enumerator.MoveTo(TokenType.Field)) {
            var fieldValue = enumerator.Current.Value;
            if (!ExpectedFields.Contains(fieldValue)) {
                continue;
            }

            var isUserAgent = comparer.Equals(fieldValue, UserAgentField);
            var isFirstField = string.IsNullOrEmpty(lastFieldValue);

            //Reset the state when we have encountered a new "User-agent" field not immediately after another
            if (!isFirstField && !comparer.Equals(lastFieldValue, UserAgentField) && isUserAgent) {
                result.Add(parseState.AsEntry());
                parseState.Reset();
            }

            //When we have seen a field for the first time that isn't a User-agent, default to all User-agents (written as "*")
            if (isFirstField && !isUserAgent) {
                parseState.UserAgents.Add("*");
            }

            lastFieldValue = fieldValue;

            if (isUserAgent) {
                if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter)) {
                    parseState.UserAgents.Add(enumerator.Current.Value);
                }
            } else if (comparer.Equals(fieldValue, AllowField) || comparer.Equals(fieldValue, DisallowField)) {
                var pathRule = comparer.Equals(fieldValue, DisallowField)
                    ? PathRuleType.Disallow
                    : PathRuleType.Allow;

                var pathValue = string.Empty;
                if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter)) {
                    pathValue = enumerator.Current.Value;
                }

                if (pathRule == PathRuleType.Allow && string.IsNullOrEmpty(pathValue)) {
                    //Only disallow can be blank (no "Value" token) - See Section 4 of RFC
                    continue;
                }

                parseState.PathRules.Add(new SiteAccessPathRule {
                    RuleType = pathRule,
                    Path = pathValue
                });
            } else if (comparer.Equals(fieldValue, CrawlDelayField)) {
                if (enumerator.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter)) {
                    //The value is machine-readable, so parse it culture-independently rather than with the current culture.
                    //Values that are not integers are ignored and leave the crawl delay untouched.
                    if (int.TryParse(
                            enumerator.Current.Value,
                            System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture,
                            out var parsedCrawlDelay)) {
                        parseState.CrawlDelay = parsedCrawlDelay;
                    }
                }
            }
        }

        //Flush whatever state is left once the tokens are exhausted
        result.Add(parseState.AsEntry());
    }

    return result;
}
/// <summary>
/// Walks the token stream field by field and groups the recognised fields
/// ("User-agent", "Allow", "Disallow", "Crawl-delay") into site access entries.
/// Field names are matched case-insensitively.
/// </summary>
/// <param name="tokens">The tokens to read. Fields other than the four recognised names are skipped.</param>
/// <returns>
/// The collected entries. The parse state is converted with <c>AsEntry()</c> and appended once more
/// after the loop, so the result always contains at least one entry.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="tokens"/> is <c>null</c>.</exception>
public IEnumerable<SiteAccessEntry> GetSiteAccessEntries(IEnumerable<Token> tokens) {
    //TODO: Refactor the implementation to not be as nasty as it is :(
    if (tokens == null) {
        throw new System.ArgumentNullException(nameof(tokens));
    }

    const string userAgentField = "User-agent";
    const string allowField = "Allow";
    const string disallowField = "Disallow";
    const string crawlDelayField = "Crawl-delay";

    var result = new List<SiteAccessEntry>();
    var parseState = new SiteAccessParseState();
    //Field names are compared without regard to case, so "user-agent" and "DISALLOW" are recognised too
    var comparer = System.StringComparer.OrdinalIgnoreCase;
    var valueSteppingTokens = new[] { TokenType.FieldValueDelimiter };
    var expectedFields = new[] { userAgentField, allowField, disallowField, crawlDelayField };

    using (var enumerator = tokens.GetEnumerator()) {
        var lastFieldValue = string.Empty;

        while (enumerator.MoveTo(TokenType.Field)) {
            var fieldValue = enumerator.Current.Value;
            if (!expectedFields.Contains(fieldValue, comparer)) {
                continue;
            }

            var isUserAgent = comparer.Equals(fieldValue, userAgentField);
            var isFirstField = string.IsNullOrEmpty(lastFieldValue);

            //Reset the state when we have encountered a new "User-agent" field not immediately after another
            if (!isFirstField && !comparer.Equals(lastFieldValue, userAgentField) && isUserAgent) {
                result.Add(parseState.AsEntry());
                parseState.Reset();
            }

            //When we have seen a field for the first time that isn't a User-agent, default to all User-agents
            if (isFirstField && !isUserAgent) {
                parseState.UserAgents.Add("*");
            }

            lastFieldValue = fieldValue;

            if (isUserAgent) {
                if (enumerator.StepOverTo(TokenType.Value, valueSteppingTokens)) {
                    parseState.UserAgents.Add(enumerator.Current.Value);
                }
            } else if (comparer.Equals(fieldValue, allowField) || comparer.Equals(fieldValue, disallowField)) {
                var pathRule = comparer.Equals(fieldValue, disallowField)
                    ? PathRuleType.Disallow
                    : PathRuleType.Allow;

                var pathValue = string.Empty;
                if (enumerator.StepOverTo(TokenType.Value, valueSteppingTokens)) {
                    pathValue = enumerator.Current.Value;
                }

                //pathValue starts out as string.Empty, so a missing value must be detected
                //with IsNullOrEmpty; a plain null comparison would never match.
                if (pathRule == PathRuleType.Allow && string.IsNullOrEmpty(pathValue)) {
                    //Only disallow can be blank (no "Value" token) - See Section 4 of RFC
                    continue;
                }

                parseState.PathRules.Add(new SiteAccessPathRule {
                    RuleType = pathRule,
                    Path = pathValue
                });
            } else if (comparer.Equals(fieldValue, crawlDelayField)) {
                if (enumerator.StepOverTo(TokenType.Value, valueSteppingTokens)) {
                    //The value is machine-readable, so parse it culture-independently rather than with the current culture.
                    //Values that are not integers are ignored and leave the crawl delay untouched.
                    if (int.TryParse(
                            enumerator.Current.Value,
                            System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture,
                            out int parsedCrawlDelay)) {
                        parseState.CrawlDelay = parsedCrawlDelay;
                    }
                }
            }
        }

        //Flush whatever state is left once the tokens are exhausted
        result.Add(parseState.AsEntry());
    }

    return result;
}