Ejemplo n.º 1
0
        /// <summary>
        /// Returns true if the uri is not allowed to be crawled
        /// </summary>
        /// <param name="uri">The URI to test; only its <see cref="Uri.AbsolutePath"/> is matched against the rules</param>
        /// <param name="userAgent">
        /// User agent whose disallow rules are consulted. When null or empty, the
        /// user agent from <c>FetchoConfiguration.Current</c> is used instead
        /// </param>
        /// <returns>
        /// True when at least one state produced for the path is
        /// <c>FiniteStateMachineBooleanState.Accept</c>; false otherwise, including
        /// when no matcher or no states are available
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null</exception>
        public bool IsDisallowed(Uri uri, string userAgent = "")
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            // A null userAgent is treated the same as the "" default rather than
            // failing on a member access. If no configured value exists either, fall
            // back to "" so that a null never reaches the matcher lookup.
            // NOTE(review): assumes Disallow.GetState("") is safe to call - confirm.
            if (string.IsNullOrEmpty(userAgent))
            {
                userAgent = FetchoConfiguration.Current?.UserAgent ?? string.Empty;
            }

            // get the FSM that matches this useragent
            var matcher = Disallow.GetState(userAgent);

            if (!matcher.Any()) // if none, fall back to the state held by the root node
            {
                matcher = Disallow.RootNode.State.ToArray();
            }
            // TODO: This fallback looks a bit weird - need to check

            // when there's no matchers in the file - like a blank robots file!
            if (!matcher.Any())
            {
                return false; // everything allowed by default
            }

            // get the states - Accept means its disallowed
            IEnumerable<FiniteStateMachineBooleanState> states = matcher.First().GetState(uri.AbsolutePath);

            // we dont get any, its not disallowed
            if (states == null)
            {
                return false;
            }

            return states.Any(x => x == FiniteStateMachineBooleanState.Accept);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses robots.txt content line by line, registering Allow/Disallow
        /// patterns for each user agent and collecting Sitemap entries
        /// </summary>
        /// <param name="reader">Source of the robots.txt text; read until it is exhausted</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null</exception>
        /// <exception cref="FetchoException">
        /// An allow/disallow line is encountered while no matcher is registered
        /// for the current user agent
        /// </exception>
        private void Process(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException(nameof(reader));
            }

            const string UserAgentPrefix = "user-agent:";
            const string DisallowPrefix = "disallow:";
            const string AllowPrefix = "allow:";
            const string SitemapPrefix = "sitemap:";

            string userAgent = "";

            // add the defaults
            addStringToMatcher(Disallow,
                               "*",
                               new FiniteStateMachine<FiniteStateMachineBooleanState, char>()
                               );
            addStringToMatcher(Allow,
                               "*",
                               new FiniteStateMachine<FiniteStateMachineBooleanState, char>()
                               );

            // ReadLine() returning null is the reliable end-of-input signal; Peek()
            // can report -1 on readers that cannot look ahead even though data remains.
            string rawLine;
            while ((rawLine = reader.ReadLine()) != null)
            {
                // Everything from '#' onwards is a comment. This covers whole-line
                // comments as well as comments trailing a rule.
                int commentStart = rawLine.IndexOf('#');
                string line = (commentStart >= 0 ? rawLine.Substring(0, commentStart) : rawLine).Trim();

                // blank lines and comment-only lines carry no directive
                if (line.Length == 0)
                {
                    continue;
                }

                // Keywords are protocol tokens, so compare ordinally; the fixed
                // prefix lengths used for Substring below rely on an exact match.
                if (line.StartsWith(UserAgentPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    // yields "" when nothing follows the colon
                    userAgent = line.Substring(UserAgentPrefix.Length).Trim();

                    if (userAgent == FetchoConfiguration.Current?.UserAgent)
                    {
                        log.Error(this.Uri + " has a specific restriction for our user-agent");
                    }

                    // register a fresh allow and disallow matcher for this user agent
                    addStringToMatcher(Disallow,
                                       userAgent,
                                       new FiniteStateMachine<FiniteStateMachineBooleanState, char>()
                                       );
                    addStringToMatcher(Allow,
                                       userAgent,
                                       new FiniteStateMachine<FiniteStateMachineBooleanState, char>()
                                       );
                    continue;
                }

                // a single trailing '*' is dropped before the pattern is stored
                // NOTE(review): presumably because patterns are matched as prefixes - confirm
                if (line.EndsWith("*", StringComparison.Ordinal))
                {
                    line = line.Substring(0, line.Length - 1);
                }

                if (line.StartsWith(DisallowPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    var disallowMatchers = Disallow.GetState(userAgent);
                    if (!disallowMatchers.Any())
                    {
                        throw new FetchoException("No default disallow matcher available for '" + userAgent + "' uri " + Uri);
                    }

                    // a bare "disallow:" carries no pattern and is skipped
                    if (line.Length > DisallowPrefix.Length)
                    {
                        addStringToMatcher(disallowMatchers.First(),
                                           line.Substring(DisallowPrefix.Length).Trim(),
                                           FiniteStateMachineBooleanState.Accept);
                    }
                }
                else if (line.StartsWith(AllowPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    var allowMatchers = Allow.GetState(userAgent);
                    if (!allowMatchers.Any())
                    {
                        throw new FetchoException("No default allow matcher available for '" + userAgent + "' uri " + Uri);
                    }

                    // a bare "allow:" carries no pattern and is skipped
                    if (line.Length > AllowPrefix.Length)
                    {
                        addStringToMatcher(allowMatchers.First(),
                                           line.Substring(AllowPrefix.Length).Trim(),
                                           FiniteStateMachineBooleanState.Accept);
                    }
                }
                else if (line.StartsWith(SitemapPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    if (line.Length > SitemapPrefix.Length)
                    {
                        SiteMaps.Add(line.Substring(SitemapPrefix.Length).Trim());
                    }
                }
            }
        }