/// <summary>
/// Checks the robots.txt rules for the host of <paramref name="url"/> and
/// reports whether fetching the URL's path is permitted. On a cache miss the
/// host's /robots.txt is downloaded, parsed, and the resulting rule set is
/// cached per host in CACHE.
/// </summary>
/// <param name="url">Absolute URL whose host and path are checked.</param>
/// <returns>true if the rules allow the path; otherwise false.</returns>
public static bool IsAllowed(System.Uri url)
{
    System.String host = url.Host;
    RobotRuleSet rules = (RobotRuleSet)CACHE[host];

    if (rules == null)
    {
        // Cache miss: fetch /robots.txt for this host.
        // NOTE: System.Uri (unlike java.net.URL) does not throw for unknown protocols.
        HttpResponseMgr response = new HttpResponseMgr(new System.Uri(url, "/robots.txt"));

        switch (response.Code)
        {
            case 200:
                // Rules found: parse them.
                rules = new RobotRulesParser().ParseRules(response.Content);
                break;
            case 403:
                // Forbidden robots.txt: forbid everything unless configured to ignore 403s.
                rules = ALLOW_FORBIDDEN ? EMPTY_RULES : FORBID_ALL_RULES;
                break;
            default:
                // Any other status: fall back to the permissive default rules.
                rules = EMPTY_RULES;
                break;
        }

        CACHE[host] = rules; // remember the result for this host
    }

    // Normalize an absent path to "/" before matching against the rules.
    System.String path = url.AbsolutePath;
    if (System.String.IsNullOrEmpty(path))
    {
        path = "/";
    }
    return rules.IsAllowed(path);
}
/// <summary>
/// Determines whether the robots.txt rules for the URL's host allow its path.
/// Rule sets are looked up in CACHE and lazily populated by downloading
/// /robots.txt on the first request for each host.
/// </summary>
/// <param name="url">Absolute URL whose host and path are checked.</param>
/// <returns>true if the host's rules permit the path; otherwise false.</returns>
public static bool IsAllowed(System.Uri url)
{
    System.String hostName = url.Host;
    RobotRuleSet hostRules = (RobotRuleSet)CACHE[hostName];

    if (hostRules == null)
    {
        // No cached entry: download the host's robots.txt.
        // System.Uri does not throw on unknown protocols (unlike java.net.URL).
        HttpResponseMgr robotsResponse = new HttpResponseMgr(new System.Uri(url, "/robots.txt"));
        int status = robotsResponse.Code;

        if (status == 200)
        {
            // Found a robots.txt: parse the downloaded rules.
            hostRules = new RobotRulesParser().ParseRules(robotsResponse.Content);
        }
        else if (status == 403 && !ALLOW_FORBIDDEN)
        {
            // Server forbids access to robots.txt: treat as "disallow everything".
            hostRules = FORBID_ALL_RULES;
        }
        else
        {
            // Missing or otherwise unavailable: use the permissive defaults.
            hostRules = EMPTY_RULES;
        }

        CACHE[hostName] = hostRules; // cache per host
    }

    System.String requestPath = url.AbsolutePath;
    if (requestPath == null || requestPath.Length == 0)
    {
        requestPath = "/"; // an empty path matches against the site root
    }
    return hostRules.IsAllowed(requestPath);
}
/// <summary> /// /// </summary> /// <param name="pageUri"></param> /// <returns></returns> public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri) { this.m_ProtocolOutput = null; HttpProtocolStatus obStatus = null; System.String urlString = pageUri.ToString(); try { System.Uri url = new System.Uri(urlString); int redirects = 0; while (true) { if (!RobotRulesParser.IsAllowed(url)) { if (HttpProtocol.HONOR_ROBOTSTEXT) { throw new RobotBlockedException(url); } } System.Net.IPAddress addr = BlockAddr(url); HttpResponseMgr response; try { response = new HttpResponseMgr(urlString, url); // make a request } finally { UnblockAddr(addr); } int code = response.Code; if (code == 200) { // got a good response obStatus = HttpProtocolStatus.STATUS_SUCCESS; m_ProtocolOutput = new HttpProtocolOutput(new HttpProtocolContent(response.Content, response.Headers), obStatus); // return it m_ProtocolOutput.Cookies = response.Cookies; m_ProtocolOutput.ProtocolVersion = response.ProtocolVersion; return(m_ProtocolOutput); } else if (code == 410) { // page is gone throw new ResourceGoneException(url, "Http: " + code); } else if (code >= 300 && code < 400) { // handle redirect if (redirects == MAX_REDIRECTS) { throw new System.Web.HttpException("Too many redirects: " + urlString); } url = new System.Uri(url, response.GetHeader("Location")); redirects++; System.Diagnostics.Trace.WriteLine("redirect to " + url); } else { // convert to exception throw new HttpError(code); } } } catch (RobotBlockedException ex) { System.Diagnostics.Trace.WriteLine(ex.Message); m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED); } catch (HttpError ex) { System.Diagnostics.Trace.WriteLine(ex.Message); obStatus = new HttpProtocolStatus(ex.Code); m_ProtocolOutput = new HttpProtocolOutput(null, obStatus); } catch (System.Exception e) { System.Diagnostics.Trace.WriteLine(e.Message); m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED); } 
return(m_ProtocolOutput); }