Esempio n. 1
0
 /// <summary>
 /// Builds a hash code from UserAgent, Disallow and Allow, in that order.
 /// A null UserAgent contributes 0.
 /// </summary>
 /// <returns>The combined hash code</returns>
 public override int GetHashCode()
 {
     // Mixing is allowed to wrap around, so overflow checking is switched off.
     unchecked
     {
         int combined = 0;
         if (UserAgent != null)
         {
             combined = UserAgent.GetHashCode();
         }

         combined = (combined * 397) ^ Disallow.GetHashCode();
         combined = (combined * 397) ^ Allow.GetHashCode();

         return combined;
     }
 }
Esempio n. 2
0
 /// <summary>
 /// Compares this group with another one by UserAgent, Disallow and Allow.
 /// </summary>
 /// <param name="other">Group to compare against; null is never equal</param>
 /// <returns>True for the same instance or when all three members are equal</returns>
 public bool Equals(RobotsGroup other)
 {
     if (ReferenceEquals(other, null))
     {
         return false;
     }

     // Identity short-circuits the member-by-member comparison.
     return ReferenceEquals(this, other)
         || (string.Equals(UserAgent, other.UserAgent)
             && Disallow.Equals(other.Disallow)
             && Allow.Equals(other.Allow));
 }
Esempio n. 3
0
        /// <summary>
        /// Returns true if the uri is not allowed to be crawled
        /// </summary>
        /// <param name="uri">Uri whose absolute path is checked against the disallow rules</param>
        /// <param name="userAgent">
        /// User-agent used to pick the rule set. When null or empty, the value of
        /// FetchoConfiguration.Current?.UserAgent is used instead.
        /// </param>
        /// <returns>
        /// True when at least one state reached for the path is Accept; false when there are
        /// no matchers, no states, or no Accept state
        /// </returns>
        public bool IsDisallowed(Uri uri, string userAgent = "")
        {
            // Treat null the same as the empty default instead of failing on userAgent.Length
            if (string.IsNullOrEmpty(userAgent))
            {
                // NOTE(review): this can still be null when no configuration/user-agent is set;
                // confirm how Disallow.GetState handles a null key
                userAgent = FetchoConfiguration.Current?.UserAgent;
            }

            // get the FSM that matches this useragent
            var matcher = Disallow.GetState(userAgent);

            if (!matcher.Any()) // if none, fall back to the states held by the root node
            {
                matcher = Disallow.RootNode.State.ToArray();
            }
            // TODO: The fallback above looks a bit weird - need to check

            // when there's no matchers in the file - like a blank robots file -
            // everything is allowed by default
            if (!matcher.Any())
            {
                return false;
            }

            // get the states - Accept means it's disallowed
            IEnumerable<FiniteStateMachineBooleanState> states = matcher.First().GetState(uri.AbsolutePath);

            // if we don't get any states, it's not disallowed
            return states != null && states.Any(x => x == FiniteStateMachineBooleanState.Accept);
        }
        /// <summary>
        /// Decides whether a typo is suitable for this group. The typo is rejected when the
        /// Allow regex is set and does not match it, or when the Disallow regex is set and
        /// matches it.
        /// </summary>
        /// <param name="typo">Text to test against the group's rules</param>
        /// <returns>False when either rule rejects the typo, true otherwise</returns>
        public bool IsSuitableTypo(string typo)
        {
            // A missing Allow rule lets everything through.
            bool passesAllow = Allow == null || Allow.IsMatch(typo);
            if (!passesAllow)
            {
                return false;
            }

            // A missing Disallow rule blocks nothing.
            bool hitsDisallow = Disallow != null && Disallow.IsMatch(typo);
            return !hitsDisallow;
        }
Esempio n. 5
0
        /// <summary>
        /// Dispose the robots file: disposes every state keyed in Allow and Disallow, then the
        /// Disallow and Allow matchers themselves.
        /// </summary>
        /// <param name="disposable">
        /// True when managed resources should be released. When false nothing is touched,
        /// because everything this method releases is a managed object.
        /// </param>
        protected virtual void Dispose(bool disposable)
        {
            // Only managed objects are released here, so there is nothing to do otherwise
            if (!disposable)
            {
                return;
            }

            // Null guards keep Dispose from throwing on a partially initialised instance
            if (Allow != null)
            {
                foreach (var state in Allow.States.Keys)
                {
                    state.Dispose();
                }
            }

            if (Disallow != null)
            {
                foreach (var state in Disallow.States.Keys)
                {
                    state.Dispose();
                }
            }

            Disallow?.Dispose();
            Allow?.Dispose();
        }
 /// <summary>
 /// Folds extra paths into this instance: Disallow and Allow are each replaced by the
 /// set union of their current values and the supplied ones.
 /// </summary>
 /// <param name="disallowPaths">Paths to add to Disallow</param>
 /// <param name="allowPaths">Paths to add to Allow</param>
 internal void MergePaths(StringValues disallowPaths, StringValues allowPaths)
 {
     var combinedDisallow = Enumerable.Union(Disallow, disallowPaths);
     Disallow = combinedDisallow.ToArray();

     var combinedAllow = Enumerable.Union(Allow, allowPaths);
     Allow = combinedAllow.ToArray();
 }
Esempio n. 7
0
        /// <summary>
        /// Reads robots-file content line by line and fills the Disallow and Allow matchers
        /// and the SiteMaps list.
        /// </summary>
        /// <remarks>
        /// A "*" entry is registered in both matchers before reading. Lines starting with "#"
        /// are skipped. A "user-agent:" line sets the current user-agent and registers a new
        /// matcher for it in both Disallow and Allow. "disallow:" and "allow:" lines add their
        /// value to the first matcher found for the current user-agent. "sitemap:" lines add
        /// their value to SiteMaps. A single trailing "*" is removed from non user-agent lines.
        /// </remarks>
        /// <param name="reader">Source of the robots-file text</param>
        /// <exception cref="FetchoException">
        /// Thrown when a disallow/allow line is found and no matcher exists for the current user-agent
        /// </exception>
        private void Process(TextReader reader)
        {
            const string UserAgentPrefix = "user-agent:";
            const string DisallowPrefix = "disallow:";
            const string AllowPrefix = "allow:";
            const string SiteMapPrefix = "sitemap:";

            string userAgent = "";

            // add the defaults
            addStringToMatcher(Disallow,
                               "*",
                               new FiniteStateMachine<FiniteStateMachineBooleanState, char>());
            addStringToMatcher(Allow,
                               "*",
                               new FiniteStateMachine<FiniteStateMachineBooleanState, char>());

            // ReadLine returns null at end of input; Peek is avoided because it can report -1
            // before the end on readers over non-seekable or blocked streams
            string rawLine;
            while ((rawLine = reader.ReadLine()) != null)
            {
                string line = rawLine.Trim();

                // skip comments
                if (line.StartsWith("#", StringComparison.Ordinal))
                {
                    continue;
                }

                // Directive names are protocol tokens, so they are compared ordinally; this also
                // keeps the fixed-offset Substring calls below aligned with the matched prefix
                if (line.StartsWith(UserAgentPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    userAgent = line.Length > UserAgentPrefix.Length
                        ? line.Substring(UserAgentPrefix.Length).Trim()
                        : "";

                    if (userAgent == FetchoConfiguration.Current?.UserAgent)
                    {
                        log.Error(this.Uri + " has a specific restriction for our user-agent");
                    }

                    addStringToMatcher(Disallow,
                                       userAgent,
                                       new FiniteStateMachine<FiniteStateMachineBooleanState, char>());
                    addStringToMatcher(Allow,
                                       userAgent,
                                       new FiniteStateMachine<FiniteStateMachineBooleanState, char>());
                    continue;
                }

                if (line.EndsWith("*", StringComparison.Ordinal))
                {
                    line = line.Substring(0, line.Length - 1); // chop it
                }

                if (line.StartsWith(DisallowPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    var disallowMatcher = Disallow.GetState(userAgent);
                    if (!disallowMatcher.Any())
                    {
                        throw new FetchoException("No default disallow matcher available for '" + userAgent + "' uri " + Uri);
                    }

                    // a directive with no value adds nothing
                    if (line.Length > DisallowPrefix.Length)
                    {
                        addStringToMatcher(disallowMatcher.First(),
                                           line.Substring(DisallowPrefix.Length).Trim(),
                                           FiniteStateMachineBooleanState.Accept);
                    }
                }
                else if (line.StartsWith(AllowPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    var allowMatcher = Allow.GetState(userAgent);
                    if (!allowMatcher.Any())
                    {
                        throw new FetchoException("No default allow matcher available for '" + userAgent + "' uri " + Uri);
                    }

                    if (line.Length > AllowPrefix.Length)
                    {
                        addStringToMatcher(allowMatcher.First(),
                                           line.Substring(AllowPrefix.Length).Trim(),
                                           FiniteStateMachineBooleanState.Accept);
                    }
                }
                else if (line.StartsWith(SiteMapPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    if (line.Length > SiteMapPrefix.Length)
                    {
                        SiteMaps.Add(line.Substring(SiteMapPrefix.Length).Trim());
                    }
                }
            }
        }