Example #1
0
        /// <summary>
        /// Checks the robots.txt rules of the site serving <paramref name="url"/> and reports
        /// whether the URL's path may be fetched.
        /// </summary>
        /// <remarks>
        /// Rule sets are looked up in CACHE. On a miss, "/robots.txt" is requested relative to
        /// <paramref name="url"/>: a 200 response is parsed into rules, a 403 response yields
        /// FORBID_ALL_RULES unless ALLOW_FORBIDDEN is set, and anything else yields EMPTY_RULES.
        /// The resulting rule set is stored in CACHE before the path is checked.
        /// </remarks>
        /// <param name="url">Absolute URL that is about to be fetched.</param>
        /// <returns>
        /// The answer of the rule set's IsAllowed for the URL's absolute path
        /// ("/" is used when the path is empty).
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
        public static bool IsAllowed(System.Uri url)
        {
            if (url == null)
            {
                throw new System.ArgumentNullException("url");
            }

            // robots.txt applies to one scheme + host + port combination, and the request
            // below already targets exactly that origin. Keying the cache on the bare host
            // would let e.g. http://example.com:8080 reuse the rules of http://example.com.
            System.String cacheKey = url.Scheme + "://" + url.Authority;

            RobotRuleSet robotRules = (RobotRuleSet)CACHE[cacheKey];

            if (robotRules == null)
            {
                // Cache miss: fetch robots.txt from the root of the same origin.
                // Note (from the Java conversion): System.Uri, unlike java.net.URL, does not
                // throw when the URL specifies an unknown protocol.
                HttpResponseMgr response = new HttpResponseMgr(new System.Uri(url, "/robots.txt"));

                if (response.Code == 200)
                {
                    // found rules: parse them
                    robotRules = new RobotRulesParser().ParseRules(response.Content);
                }
                else if ((response.Code == 403) && (!ALLOW_FORBIDDEN))
                {
                    // robots.txt itself is forbidden: use forbid all
                    robotRules = FORBID_ALL_RULES;
                }
                else
                {
                    // any other outcome: use default rules
                    robotRules = EMPTY_RULES;
                }

                CACHE[cacheKey] = robotRules;             // cache rules for this origin
            }

            // check rules against the path; an empty path means the site root
            System.String path = url.AbsolutePath;
            if ((path == null) || (path.Length == 0))
            {
                path = "/";
            }

            return robotRules.IsAllowed(path);
        }
		/// <summary>
		/// Reports whether the robots.txt rules that apply to <paramref name="url"/> allow
		/// its path to be fetched.
		/// </summary>
		/// <remarks>
		/// The rule set is taken from CACHE when present. Otherwise "/robots.txt" is requested
		/// relative to <paramref name="url"/> and the response code selects the rules:
		/// 200 parses the returned content, 403 selects FORBID_ALL_RULES unless ALLOW_FORBIDDEN
		/// is set, every other code selects EMPTY_RULES. The chosen rule set is then cached.
		/// </remarks>
		/// <param name="url">Absolute URL that is about to be fetched.</param>
		/// <returns>
		/// Whatever the rule set's IsAllowed returns for the URL's absolute path,
		/// with "/" substituted for an empty path.
		/// </returns>
		/// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
		public static bool IsAllowed(System.Uri url)
		{
			if (url == null)
			{
				throw new System.ArgumentNullException("url");
			}
			
			// The rules are fetched from this URL's own scheme/host/port, so they must be
			// cached per scheme/host/port as well; a host-only key would share one rule set
			// between different services running on the same machine.
			System.String origin = url.Scheme + "://" + url.Authority;
			
			RobotRuleSet robotRules = (RobotRuleSet) CACHE[origin];
			
			if (robotRules == null)
			{
				// cache miss
				// Conversion note: System.Uri (formerly java.net.URL) does not throw an
				// exception if the URL specifies an unknown protocol.
				HttpResponseMgr response = new HttpResponseMgr(new System.Uri(url, "/robots.txt"));
				
				if (response.Code == 200)
				{
					// found rules: parse them
					robotRules = new RobotRulesParser().ParseRules(response.Content);
				}
				else if ((response.Code == 403) && (!ALLOW_FORBIDDEN))
				{
					// use forbid all
					robotRules = FORBID_ALL_RULES;
				}
				else
				{
					// use default rules
					robotRules = EMPTY_RULES;
				}
				
				CACHE[origin] = robotRules; // cache rules for this origin
			}
			
			// check rules; treat a missing path as the site root
			System.String path = url.AbsolutePath;
			if ((path == null) || (path.Length == 0))
			{
				path = "/";
			}
			
			return robotRules.IsAllowed(path);
		}
        /// <summary>
        /// Fetches <paramref name="pageUri"/> over HTTP, following redirects, and wraps the
        /// outcome in an <c>HttpProtocolOutput</c> that is also stored in m_ProtocolOutput.
        /// </summary>
        /// <remarks>
        /// Each URL visited (the initial one and every redirect target) is first checked with
        /// RobotRulesParser.IsAllowed; a disallowed URL aborts the fetch only when
        /// HttpProtocol.HONOR_ROBOTSTEXT is set. Exceptions raised inside the fetch loop are
        /// not propagated: they are traced and converted into an output whose content is null.
        /// </remarks>
        /// <param name="pageUri">Address of the page to download.</param>
        /// <returns>
        /// On a 200 response: output carrying the content, headers, cookies and protocol
        /// version with STATUS_SUCCESS. On a robots block: STATUS_ROBOTS_DENIED. On an
        /// HttpError: a status built from the error's code. On any other exception
        /// (including too many redirects): STATUS_FAILED.
        /// </returns>
        public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri)
        {
            this.m_ProtocolOutput = null;

            System.String urlString = pageUri.ToString();
            try
            {
                System.Uri target = new System.Uri(urlString);

                // Every path through the loop body either leaves the method (return/throw)
                // or follows a redirect, so the counter advances once per redirect followed.
                for (int redirectCount = 0; ; redirectCount++)
                {
                    if (!RobotRulesParser.IsAllowed(target) && HttpProtocol.HONOR_ROBOTSTEXT)
                    {
                        throw new RobotBlockedException(target);
                    }

                    // NOTE(review): BlockAddr/UnblockAddr presumably serialize access per
                    // remote address - confirm against their definitions.
                    System.Net.IPAddress blockedAddr = BlockAddr(target);

                    HttpResponseMgr response;
                    try
                    {
                        // make a request
                        response = new HttpResponseMgr(urlString, target);
                    }
                    finally
                    {
                        // release the address whether or not the request succeeded
                        UnblockAddr(blockedAddr);
                    }

                    int statusCode = response.Code;

                    if (statusCode == 200)
                    {
                        // got a good response: package it and return it
                        HttpProtocolContent content = new HttpProtocolContent(response.Content, response.Headers);
                        this.m_ProtocolOutput = new HttpProtocolOutput(content, HttpProtocolStatus.STATUS_SUCCESS);
                        this.m_ProtocolOutput.Cookies = response.Cookies;
                        this.m_ProtocolOutput.ProtocolVersion = response.ProtocolVersion;
                        return this.m_ProtocolOutput;
                    }

                    if (statusCode == 410)
                    {
                        // page is gone
                        throw new ResourceGoneException(target, "Http: " + statusCode);
                    }

                    bool isRedirect = (statusCode >= 300) && (statusCode < 400);
                    if (!isRedirect)
                    {
                        // convert to exception
                        throw new HttpError(statusCode);
                    }

                    // handle redirect
                    if (redirectCount == MAX_REDIRECTS)
                    {
                        throw new System.Web.HttpException("Too many redirects: " + urlString);
                    }

                    // resolve the Location header against the URL that produced it
                    target = new System.Uri(target, response.GetHeader("Location"));
                    System.Diagnostics.Trace.WriteLine("redirect to " + target);
                }
            }
            catch (RobotBlockedException blocked)
            {
                System.Diagnostics.Trace.WriteLine(blocked.Message);
                this.m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED);
            }
            catch (HttpError httpError)
            {
                System.Diagnostics.Trace.WriteLine(httpError.Message);
                this.m_ProtocolOutput = new HttpProtocolOutput(null, new HttpProtocolStatus(httpError.Code));
            }
            catch (System.Exception failure)
            {
                System.Diagnostics.Trace.WriteLine(failure.Message);
                this.m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED);
            }

            return this.m_ProtocolOutput;
        }