/// <summary>
/// Returns true if the uri is not allowed to be crawled according to the
/// Disallow rules held for the given user-agent.
/// </summary>
/// <param name="uri">
/// The uri to test. Only its <c>AbsolutePath</c> is matched against the rules.
/// It is dereferenced whenever a matcher exists, so it must not be null.
/// </param>
/// <param name="userAgent">
/// User-agent whose rules should be applied. When null or empty, the
/// user-agent from <c>FetchoConfiguration.Current</c> is used instead.
/// </param>
/// <returns>
/// True when the selected Disallow matcher yields an Accept state for the
/// uri's path; false otherwise, including when no matcher is available.
/// </returns>
public bool IsDisallowed(Uri uri, string userAgent = "")
{
    // Treat null the same as "not specified" instead of failing on userAgent.Length.
    if (string.IsNullOrEmpty(userAgent))
    {
        // NOTE(review): GetState is not visible here; an empty string is assumed
        // to be a safer lookup key than null when no user-agent is configured - confirm.
        userAgent = FetchoConfiguration.Current?.UserAgent ?? string.Empty;
    }

    // get the FSM that matches this useragent
    var matcher = Disallow.GetState(userAgent);

    // if none, fall back to whatever is held on the root node
    if (!matcher.Any())
    {
        matcher = Disallow.RootNode.State.ToArray();
    }
    // TODO: The fallback above looks a bit weird - need to check what happens
    // when there are no matchers in the file, like a blank robots file!

    // no matcher at all: everything is allowed by default
    if (!matcher.Any())
    {
        return false;
    }

    // get the states - Accept means it's disallowed
    IEnumerable<FiniteStateMachineBooleanState> states = matcher.First().GetState(uri.AbsolutePath);

    // if we don't get any states back, it's not disallowed
    return states != null && states.Any(x => x == FiniteStateMachineBooleanState.Accept);
}
/// <summary>
/// Parses robots.txt content from the reader, registering Disallow and Allow
/// rules under the user-agent they were declared for and collecting any
/// sitemap entries into SiteMaps.
/// </summary>
/// <param name="reader">Source of the robots.txt text; read line by line until ReadLine returns null.</param>
/// <exception cref="FetchoException">
/// Thrown when an allow/disallow line is encountered and no matcher can be
/// found for the current user-agent.
/// </exception>
private void Process(TextReader reader)
{
    const string UserAgentPrefix = "user-agent:";
    const string DisallowPrefix = "disallow:";
    const string AllowPrefix = "allow:";
    const string SitemapPrefix = "sitemap:";

    string userAgent = "";

    // add the defaults
    addStringToMatcher(Disallow, "*", new FiniteStateMachine<FiniteStateMachineBooleanState, char>());
    addStringToMatcher(Allow, "*", new FiniteStateMachine<FiniteStateMachineBooleanState, char>());

    // ReadLine returning null is the reliable end-of-input signal; Peek can
    // report -1 on readers that cannot peek even though more text is available.
    string rawLine;
    while ((rawLine = reader.ReadLine()) != null)
    {
        string line = rawLine.Trim();

        // skip comments
        if (line.StartsWith("#", StringComparison.Ordinal))
        {
            continue;
        }

        // Directive names are protocol tokens, so they are matched ordinally.
        // That also guarantees the matched prefix has exactly the literal's
        // length, which the Substring offsets below depend on.
        if (line.StartsWith(UserAgentPrefix, StringComparison.OrdinalIgnoreCase))
        {
            userAgent = line.Substring(UserAgentPrefix.Length).Trim();

            if (userAgent == FetchoConfiguration.Current?.UserAgent)
            {
                log.Error(this.Uri + " has a specific restriction for our user-agent");
            }

            addStringToMatcher(Disallow, userAgent, new FiniteStateMachine<FiniteStateMachineBooleanState, char>());
            addStringToMatcher(Allow, userAgent, new FiniteStateMachine<FiniteStateMachineBooleanState, char>());
            continue;
        }

        // a trailing wildcard is dropped so the rule is stored without it
        if (line.EndsWith("*", StringComparison.Ordinal))
        {
            line = line.Substring(0, line.Length - 1); // chop it
        }

        if (line.StartsWith(DisallowPrefix, StringComparison.OrdinalIgnoreCase))
        {
            var disallowMatcher = Disallow.GetState(userAgent);
            if (!disallowMatcher.Any())
            {
                throw new FetchoException("No default disallow matcher available for '" + userAgent + "' uri " + Uri);
            }

            // a bare "disallow:" carries no path, so nothing is registered
            if (line.Length > DisallowPrefix.Length)
            {
                addStringToMatcher(
                    disallowMatcher.First(),
                    line.Substring(DisallowPrefix.Length).Trim(),
                    FiniteStateMachineBooleanState.Accept);
            }
        }
        else if (line.StartsWith(AllowPrefix, StringComparison.OrdinalIgnoreCase))
        {
            var allowMatcher = Allow.GetState(userAgent);
            if (!allowMatcher.Any())
            {
                throw new FetchoException("No default allow matcher available for '" + userAgent + "' uri " + Uri);
            }

            // a bare "allow:" carries no path, so nothing is registered
            if (line.Length > AllowPrefix.Length)
            {
                addStringToMatcher(
                    allowMatcher.First(),
                    line.Substring(AllowPrefix.Length).Trim(),
                    FiniteStateMachineBooleanState.Accept);
            }
        }
        else if (line.StartsWith(SitemapPrefix, StringComparison.OrdinalIgnoreCase))
        {
            // a bare "sitemap:" carries no location, so nothing is recorded
            if (line.Length > SitemapPrefix.Length)
            {
                SiteMaps.Add(line.Substring(SitemapPrefix.Length).Trim());
            }
        }
    }
}