/// <summary>
/// Reports whether the path of <paramref name="url"/> is permitted by the robot
/// rules associated with the URL's host.
/// </summary>
/// <remarks>
/// Rules are looked up in CACHE by host name. On a miss, "/robots.txt" is requested
/// relative to <paramref name="url"/>, the outcome is turned into a rule set, and that
/// rule set is stored in CACHE under the host name before the path is checked.
/// </remarks>
/// <param name="url">The URL whose path should be checked.</param>
/// <returns>The result of <c>IsAllowed(path)</c> on the rule set chosen for the host.</returns>
public static bool IsAllowed(System.Uri url)
{
    System.String hostName = url.Host;
    RobotRuleSet rules = (RobotRuleSet)CACHE[hostName];

    if (rules == null)
    {
        // cache miss
        //UPGRADE_TODO: Class 'java.net.URL' was converted to a 'System.Uri' which does not throw an exception if a URL specifies an unknown protocol. 'ms-help://MS.VSCC.2003/commoner/redir/redirect.htm?keyword="jlca1132_3"'
        System.Uri robotsUri = new System.Uri(url, "/robots.txt");
        HttpResponseMgr response = new HttpResponseMgr(robotsUri);

        if (response.Code == 200)
        {
            // found rules: parse them
            rules = new RobotRulesParser().ParseRules(response.Content);
        }
        else if ((response.Code != 403) || ALLOW_FORBIDDEN)
        {
            // use default rules
            rules = EMPTY_RULES;
        }
        else
        {
            // 403 and forbidden responses are not tolerated: use forbid all
            rules = FORBID_ALL_RULES;
        }

        // cache rules for host
        CACHE[hostName] = rules;
    }

    // check rules; an absent or empty path is treated as the root
    System.String requestPath = url.AbsolutePath;
    if ((requestPath == null) || (requestPath.Length == 0))
    {
        requestPath = "/";
    }

    return rules.IsAllowed(requestPath);
}
/// <summary>
/// Creates an entry that pairs a path prefix with an allowed/disallowed flag.
/// </summary>
/// <param name="enclosingInstance">The rule set handed to <c>InitBlock</c>.</param>
/// <param name="prefix">The path prefix stored in <c>m_strPrefix</c>.</param>
/// <param name="allowed">The flag stored in <c>m_bAllowed</c>.</param>
internal RobotsEntry(RobotRuleSet enclosingInstance, System.String prefix, bool allowed)
{
    InitBlock(enclosingInstance);

    m_strPrefix = prefix;
    m_bAllowed = allowed;
}
/// <summary>
/// Records the supplied rule set in <c>m_enclosingInstance</c>.
/// </summary>
/// <param name="enclosingInstance">The rule set to remember.</param>
private void InitBlock(RobotRuleSet enclosingInstance)
{
    m_enclosingInstance = enclosingInstance;
}
/// <summary>
/// Returns a <see cref="RobotRuleSet"/> object which encapsulates the
/// rules parsed from the supplied <code>robotContent</code>.
/// </summary>
/// <remarks>
/// The directive names (<c>User-agent:</c>, <c>Disallow:</c>, <c>Allow:</c>) are matched
/// with an ordinal, case-insensitive comparison, so recognition of a directive does not
/// depend on the current thread culture.
/// </remarks>
/// <param name="robotContent">Raw robots.txt content; may be <c>null</c>.</param>
/// <returns>
/// <c>EMPTY_RULES</c> when <paramref name="robotContent"/> is <c>null</c> or when no
/// stanza names an agent present in <c>m_robotNames</c>; otherwise the rules collected
/// from the stanza whose matching agent has the lowest precedence value.
/// </returns>
internal virtual RobotRuleSet ParseRules(byte[] robotContent)
{
    const System.String UserAgentDirective = "User-agent:";
    const System.String DisallowDirective = "Disallow:";
    const System.String AllowDirective = "Allow:";

    if (robotContent == null)
    {
        return EMPTY_RULES;
    }

    System.String content = new System.String(SupportMisc.ToCharArray(robotContent));
    Tokenizer lineParser = new Tokenizer(content, "\n\r");

    RobotRuleSet bestRulesSoFar = null;
    int bestPrecedenceSoFar = NO_PRECEDENCE;

    RobotRuleSet currentRules = new RobotRuleSet();
    int currentPrecedence = NO_PRECEDENCE;

    bool addRules = false;   // in stanza for our robot
    bool doneAgents = false; // detect multiple agent lines

    while (lineParser.HasMoreTokens())
    {
        System.String line = lineParser.NextToken();

        // trim out comments and whitespace
        int hashPos = line.IndexOf('#');
        if (hashPos >= 0)
        {
            line = line.Substring(0, hashPos);
        }
        line = line.Trim();

        if ((line.Length >= UserAgentDirective.Length)
            && (System.String.Compare(line, 0, UserAgentDirective, 0, UserAgentDirective.Length, System.StringComparison.OrdinalIgnoreCase) == 0))
        {
            if (doneAgents)
            {
                // A new stanza starts: keep the one just finished if it beats the best so far.
                if (currentPrecedence < bestPrecedenceSoFar)
                {
                    bestPrecedenceSoFar = currentPrecedence;
                    bestRulesSoFar = currentRules;
                    currentPrecedence = NO_PRECEDENCE;
                    currentRules = new RobotRuleSet();
                }
                addRules = false;
            }
            doneAgents = false;

            // The directive's only colon is its last character, so the value starts right after it.
            System.String agentNames = line.Substring(UserAgentDirective.Length).Trim();
            Tokenizer agentTokenizer = new Tokenizer(agentNames);

            while (agentTokenizer.HasMoreTokens())
            {
                // for each agent listed, see if it's us:
                // NOTE(review): ToLower() is culture-sensitive; it is left unchanged because the
                // normalization applied to the keys of m_robotNames is not visible here - confirm.
                System.String agentName = agentTokenizer.NextToken().ToLower();

                object obInt = m_robotNames[agentName];
                if (obInt != null)
                {
                    int precedence = (Int32)obInt;
                    if ((precedence < currentPrecedence) && (precedence < bestPrecedenceSoFar))
                    {
                        currentPrecedence = precedence;
                    }
                }
            }

            if (currentPrecedence < bestPrecedenceSoFar)
            {
                addRules = true;
            }
        }
        else if ((line.Length >= DisallowDirective.Length)
            && (System.String.Compare(line, 0, DisallowDirective, 0, DisallowDirective.Length, System.StringComparison.OrdinalIgnoreCase) == 0))
        {
            doneAgents = true;
            System.String path = line.Substring(DisallowDirective.Length).Trim();

            try
            {
                path = HttpUtility.UrlDecode(path, System.Text.Encoding.GetEncoding(CHARACTER_ENCODING.ToLower()));
            }
            catch (System.Exception)
            {
                // Best effort: keep the undecoded path and continue parsing.
                //LOG.warning("error parsing robots rules- can't decode path: " + path);
                Trace.WriteLine("error parsing robots rules- can't decode path: " + path);
            }

            if (path.Length == 0)
            {
                // "empty rule"
                if (addRules)
                {
                    currentRules.ClearPrefixes();
                }
            }
            else
            {
                // rule with path
                if (addRules)
                {
                    currentRules.AddPrefix(path, false);
                }
            }
        }
        else if ((line.Length >= AllowDirective.Length)
            && (System.String.Compare(line, 0, AllowDirective, 0, AllowDirective.Length, System.StringComparison.OrdinalIgnoreCase) == 0))
        {
            doneAgents = true;
            // NOTE(review): unlike Disallow paths, Allow paths are not URL-decoded - confirm this is intended.
            System.String path = line.Substring(AllowDirective.Length).Trim();

            if (path.Length == 0)
            {
                // "empty rule"- treat same as empty disallow
                if (addRules)
                {
                    currentRules.ClearPrefixes();
                }
            }
            else
            {
                // rule with path
                if (addRules)
                {
                    currentRules.AddPrefix(path, true);
                }
            }
        }
    }

    // The last stanza has no following User-agent line to close it, so compare it here.
    if (currentPrecedence < bestPrecedenceSoFar)
    {
        bestPrecedenceSoFar = currentPrecedence;
        bestRulesSoFar = currentRules;
    }

    if (bestPrecedenceSoFar == NO_PRECEDENCE)
    {
        return EMPTY_RULES;
    }
    return bestRulesSoFar;
}
/// <summary>
/// Returns a <see cref="RobotRuleSet"/> object which encapsulates the
/// rules parsed from the supplied <code>robotContent</code>.
/// </summary>
/// <remarks>
/// The directive names (<c>User-agent:</c>, <c>Disallow:</c>, <c>Allow:</c>) are matched
/// with an ordinal, case-insensitive comparison, so recognition of a directive does not
/// depend on the current thread culture.
/// </remarks>
/// <param name="robotContent">Raw robots.txt content; may be <c>null</c>.</param>
/// <returns>
/// <c>EMPTY_RULES</c> when <paramref name="robotContent"/> is <c>null</c> or when no
/// stanza names an agent present in <c>m_robotNames</c>; otherwise the rules collected
/// from the stanza whose matching agent has the lowest precedence value.
/// </returns>
internal virtual RobotRuleSet ParseRules(byte[] robotContent)
{
    const System.String UserAgentDirective = "User-agent:";
    const System.String DisallowDirective = "Disallow:";
    const System.String AllowDirective = "Allow:";

    if (robotContent == null)
    {
        return(EMPTY_RULES);
    }

    System.String content = new System.String(SupportMisc.ToCharArray(robotContent));
    Tokenizer lineParser = new Tokenizer(content, "\n\r");

    RobotRuleSet bestRulesSoFar = null;
    int bestPrecedenceSoFar = NO_PRECEDENCE;

    RobotRuleSet currentRules = new RobotRuleSet();
    int currentPrecedence = NO_PRECEDENCE;

    bool addRules = false;   // in stanza for our robot
    bool doneAgents = false; // detect multiple agent lines

    while (lineParser.HasMoreTokens())
    {
        System.String line = lineParser.NextToken();

        // trim out comments and whitespace
        int hashPos = line.IndexOf('#');
        if (hashPos >= 0)
        {
            line = line.Substring(0, hashPos);
        }
        line = line.Trim();

        if ((line.Length >= UserAgentDirective.Length)
            && (System.String.Compare(line, 0, UserAgentDirective, 0, UserAgentDirective.Length, System.StringComparison.OrdinalIgnoreCase) == 0))
        {
            if (doneAgents)
            {
                // A new stanza starts: keep the one just finished if it beats the best so far.
                if (currentPrecedence < bestPrecedenceSoFar)
                {
                    bestPrecedenceSoFar = currentPrecedence;
                    bestRulesSoFar = currentRules;
                    currentPrecedence = NO_PRECEDENCE;
                    currentRules = new RobotRuleSet();
                }
                addRules = false;
            }
            doneAgents = false;

            // The directive's only colon is its last character, so the value starts right after it.
            System.String agentNames = line.Substring(UserAgentDirective.Length).Trim();
            Tokenizer agentTokenizer = new Tokenizer(agentNames);

            while (agentTokenizer.HasMoreTokens())
            {
                // for each agent listed, see if it's us:
                // NOTE(review): ToLower() is culture-sensitive; it is left unchanged because the
                // normalization applied to the keys of m_robotNames is not visible here - confirm.
                System.String agentName = agentTokenizer.NextToken().ToLower();

                object obInt = m_robotNames[agentName];
                if (obInt != null)
                {
                    int precedence = (Int32)obInt;
                    if ((precedence < currentPrecedence) && (precedence < bestPrecedenceSoFar))
                    {
                        currentPrecedence = precedence;
                    }
                }
            }

            if (currentPrecedence < bestPrecedenceSoFar)
            {
                addRules = true;
            }
        }
        else if ((line.Length >= DisallowDirective.Length)
            && (System.String.Compare(line, 0, DisallowDirective, 0, DisallowDirective.Length, System.StringComparison.OrdinalIgnoreCase) == 0))
        {
            doneAgents = true;
            System.String path = line.Substring(DisallowDirective.Length).Trim();

            try
            {
                path = HttpUtility.UrlDecode(path, System.Text.Encoding.GetEncoding(CHARACTER_ENCODING.ToLower()));
            }
            catch (System.Exception)
            {
                // Best effort: keep the undecoded path and continue parsing.
                //LOG.warning("error parsing robots rules- can't decode path: " + path);
                Trace.WriteLine("error parsing robots rules- can't decode path: " + path);
            }

            if (path.Length == 0)
            {
                // "empty rule"
                if (addRules)
                {
                    currentRules.ClearPrefixes();
                }
            }
            else
            {
                // rule with path
                if (addRules)
                {
                    currentRules.AddPrefix(path, false);
                }
            }
        }
        else if ((line.Length >= AllowDirective.Length)
            && (System.String.Compare(line, 0, AllowDirective, 0, AllowDirective.Length, System.StringComparison.OrdinalIgnoreCase) == 0))
        {
            doneAgents = true;
            // NOTE(review): unlike Disallow paths, Allow paths are not URL-decoded - confirm this is intended.
            System.String path = line.Substring(AllowDirective.Length).Trim();

            if (path.Length == 0)
            {
                // "empty rule"- treat same as empty disallow
                if (addRules)
                {
                    currentRules.ClearPrefixes();
                }
            }
            else
            {
                // rule with path
                if (addRules)
                {
                    currentRules.AddPrefix(path, true);
                }
            }
        }
    }

    // The last stanza has no following User-agent line to close it, so compare it here.
    if (currentPrecedence < bestPrecedenceSoFar)
    {
        bestPrecedenceSoFar = currentPrecedence;
        bestRulesSoFar = currentRules;
    }

    if (bestPrecedenceSoFar == NO_PRECEDENCE)
    {
        return(EMPTY_RULES);
    }
    return(bestRulesSoFar);
}