/// <summary>
/// Walks the token stream field by field and groups the recognised fields
/// (User-agent, Allow, Disallow, Crawl-delay) into site access entries.
/// </summary>
/// <param name="tokens">The tokens to read; enumerated exactly once.</param>
/// <returns>
/// The accumulated entries. The state still being built when the tokens run out
/// is always appended, so the result contains at least one entry.
/// </returns>
public IEnumerable <SiteAccessEntry> GetSiteAccessEntries(IEnumerable <Token> tokens)
        {
            var entries      = new List <SiteAccessEntry>();
            var state        = new SiteAccessParseState();
            var nameComparer = StringComparer.OrdinalIgnoreCase;

            using (var cursor = tokens.GetEnumerator())
            {
                // Name of the most recent recognised field; null until one has been seen.
                string previousField = null;

                while (cursor.MoveTo(TokenType.Field))
                {
                    var field = cursor.Current.Value;

                    // Fields we do not know about are skipped without touching any state.
                    if (!ExpectedFields.Contains(field))
                    {
                        continue;
                    }

                    var isFirstField = string.IsNullOrEmpty(previousField);
                    var isUserAgent  = nameComparer.Equals(field, UserAgentField);

                    if (isUserAgent)
                    {
                        // A User-agent that follows some other field starts a new group;
                        // consecutive User-agent fields keep accumulating into the same group.
                        if (!isFirstField && !nameComparer.Equals(previousField, UserAgentField))
                        {
                            entries.Add(state.AsEntry());
                            state.Reset();
                        }
                    }
                    else if (isFirstField)
                    {
                        // Rules that appear before any User-agent apply to every agent ("*").
                        state.UserAgents.Add("*");
                    }

                    previousField = field;

                    if (isUserAgent)
                    {
                        if (cursor.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter))
                        {
                            state.UserAgents.Add(cursor.Current.Value);
                        }

                        continue;
                    }

                    var isDisallow = nameComparer.Equals(field, DisallowField);

                    if (isDisallow || nameComparer.Equals(field, AllowField))
                    {
                        var path = cursor.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter)
                            ? cursor.Current.Value
                            : string.Empty;

                        // Only disallow can be blank (no "Value" token) - See Section 4 of RFC
                        if (isDisallow || !string.IsNullOrEmpty(path))
                        {
                            var rule = new SiteAccessPathRule();
                            rule.RuleType = isDisallow ? PathRuleType.Disallow : PathRuleType.Allow;
                            rule.Path     = path;
                            state.PathRules.Add(rule);
                        }
                    }
                    else if (nameComparer.Equals(field, CrawlDelayField))
                    {
                        // A delay that is missing or not an integer leaves the current value untouched.
                        if (cursor.StepOverTo(TokenType.Value, TokenType.FieldValueDelimiter) &&
                            int.TryParse(cursor.Current.Value, out var delay))
                        {
                            state.CrawlDelay = delay;
                        }
                    }
                }

                // Flush whatever was being built when the tokens ran out.
                entries.Add(state.AsEntry());
            }

            return entries;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Walks the token stream field by field and groups the recognised fields
        /// (User-agent, Allow, Disallow, Crawl-delay) into site access entries.
        /// Field names are matched case-insensitively.
        /// </summary>
        /// <param name="tokens">The tokens to read; enumerated exactly once.</param>
        /// <returns>
        /// The accumulated entries. The state still being built when the tokens run out
        /// is always appended, so the result contains at least one entry.
        /// </returns>
        public IEnumerable <SiteAccessEntry> GetSiteAccessEntries(IEnumerable <Token> tokens)
        {
            //TODO: Refactor the implementation to not be as nasty as it is :(
            const string userAgentField  = "User-agent";
            const string allowField      = "Allow";
            const string disallowField   = "Disallow";
            const string crawlDelayField = "Crawl-delay";

            // Field names are compared ignoring case so that e.g. "user-agent" or "DISALLOW"
            // are not silently dropped. Fully qualified to avoid requiring an extra using directive.
            var comparer            = System.StringComparer.OrdinalIgnoreCase;
            var result              = new List <SiteAccessEntry>();
            var parseState          = new SiteAccessParseState();
            var valueSteppingTokens = new[] { TokenType.FieldValueDelimiter };
            var expectedFields      = new[] { userAgentField, allowField, disallowField, crawlDelayField };

            using (var enumerator = tokens.GetEnumerator())
            {
                // Name of the most recent recognised field; empty until one has been seen.
                var lastFieldValue = string.Empty;
                while (enumerator.MoveTo(TokenType.Field))
                {
                    var fieldValue = enumerator.Current.Value;

                    // Fields we do not know about are skipped without touching any state.
                    if (!expectedFields.Contains(fieldValue, comparer))
                    {
                        continue;
                    }

                    var isFirstField = string.IsNullOrEmpty(lastFieldValue);
                    var isUserAgent  = comparer.Equals(fieldValue, userAgentField);

                    //Reset the state when we have encountered a new "User-agent" field not immediately after another
                    if (!isFirstField && isUserAgent && !comparer.Equals(lastFieldValue, userAgentField))
                    {
                        result.Add(parseState.AsEntry());
                        parseState.Reset();
                    }

                    //When we have seen a field for the first time that isn't a User-agent, default to all User-agents
                    if (isFirstField && !isUserAgent)
                    {
                        parseState.UserAgents.Add("*");
                    }

                    lastFieldValue = fieldValue;

                    if (isUserAgent)
                    {
                        if (enumerator.StepOverTo(TokenType.Value, valueSteppingTokens))
                        {
                            parseState.UserAgents.Add(enumerator.Current.Value);
                        }
                    }
                    else if (comparer.Equals(fieldValue, allowField) || comparer.Equals(fieldValue, disallowField))
                    {
                        var pathRule  = comparer.Equals(fieldValue, disallowField) ? PathRuleType.Disallow : PathRuleType.Allow;
                        var pathValue = string.Empty;

                        if (enumerator.StepOverTo(TokenType.Value, valueSteppingTokens))
                        {
                            pathValue = enumerator.Current.Value;
                        }

                        // pathValue starts out as string.Empty, so a plain null check would never
                        // catch a missing value; test for null *or* empty instead.
                        if (pathRule == PathRuleType.Allow && string.IsNullOrEmpty(pathValue))
                        {
                            //Only disallow can be blank (no "Value" token) - See Section 4 of RFC
                            continue;
                        }

                        parseState.PathRules.Add(new SiteAccessPathRule
                        {
                            RuleType = pathRule,
                            Path     = pathValue
                        });
                    }
                    else if (comparer.Equals(fieldValue, crawlDelayField))
                    {
                        if (enumerator.StepOverTo(TokenType.Value, valueSteppingTokens))
                        {
                            // Parse with the invariant culture: the input is machine-readable text,
                            // so the result must not depend on the current thread culture.
                            // A delay that is not an integer leaves the current value untouched.
                            if (int.TryParse(
                                    enumerator.Current.Value,
                                    System.Globalization.NumberStyles.Integer,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out int parsedInt))
                            {
                                parseState.CrawlDelay = parsedInt;
                            }
                        }
                    }
                }

                // Flush whatever was being built when the tokens ran out.
                result.Add(parseState.AsEntry());
            }

            return(result);
        }