Пример #1
0
        /// <summary>
        /// Requests <paramref name="pageUri"/>, following HTTP 3xx redirects, and wraps the
        /// outcome in an <see cref="HttpProtocolOutput"/>. The same object is also stored in
        /// <c>m_ProtocolOutput</c>.
        /// </summary>
        /// <param name="pageUri">URI of the page to request; must not be null.</param>
        /// <returns>
        /// On HTTP 200, an output carrying the response content, headers, cookies and protocol
        /// version with <c>STATUS_SUCCESS</c>. Otherwise an output with null content whose status
        /// is <c>STATUS_ROBOTS_DENIED</c> (blocked by robots rules while they are honored), a
        /// status built from the HTTP code (codes other than 200, 410 and 3xx), or
        /// <c>STATUS_FAILED</c> (any other exception, including HTTP 410 and too many redirects).
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="pageUri"/> is null.</exception>
        public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri)
        {
            // Drop the result of any previous call before doing anything else.
            this.m_ProtocolOutput = null;

            // Fail with a descriptive exception instead of a NullReferenceException from ToString().
            if (pageUri == null)
            {
                throw new System.ArgumentNullException("pageUri", "Null page URI specified");
            }

            System.String urlString = pageUri.ToString();
            try
            {
                System.Uri url       = new System.Uri(urlString);
                int        redirects = 0;
                while (true)
                {
                    // IsAllowed is evaluated on every hop; the verdict only blocks the request
                    // when robots.txt is being honored.
                    if (!RobotRulesParser.IsAllowed(url) && HttpProtocol.HONOR_ROBOTSTEXT)
                    {
                        throw new RobotBlockedException(url);
                    }

                    // NOTE(review): BlockAddr/UnblockAddr presumably reserve the target host for
                    // the duration of the request - confirm against their implementation.
                    System.Net.IPAddress addr = BlockAddr(url);

                    HttpResponseMgr response;
                    try
                    {
                        // urlString stays the originally requested URL even after redirects;
                        // only url follows the Location header.
                        response = new HttpResponseMgr(urlString, url);
                    }
                    finally
                    {
                        // Always release the address, even when the request throws.
                        UnblockAddr(addr);
                    }

                    int code = response.Code;

                    if (code == 200)
                    {
                        // Good response: package content and metadata and return immediately.
                        m_ProtocolOutput = new HttpProtocolOutput(
                            new HttpProtocolContent(response.Content, response.Headers),
                            HttpProtocolStatus.STATUS_SUCCESS);
                        m_ProtocolOutput.Cookies         = response.Cookies;
                        m_ProtocolOutput.ProtocolVersion = response.ProtocolVersion;
                        return m_ProtocolOutput;
                    }
                    else if (code == 410)
                    {
                        // Page is gone; handled by the generic catch below (STATUS_FAILED).
                        throw new ResourceGoneException(url, "Http: " + code);
                    }
                    else if (code >= 300 && code < 400)
                    {
                        // Redirect: give up once the limit is reached, otherwise resolve the
                        // Location header against the current URL and loop again.
                        if (redirects == MAX_REDIRECTS)
                        {
                            throw new System.Web.HttpException("Too many redirects: " + urlString);
                        }
                        url = new System.Uri(url, response.GetHeader("Location"));
                        redirects++;
                        System.Diagnostics.Trace.WriteLine("redirect to " + url);
                    }
                    else
                    {
                        // Any other code is surfaced through HttpError so the catch below can
                        // turn it into a status carrying that code.
                        throw new HttpError(code);
                    }
                }
            }
            catch (RobotBlockedException ex)
            {
                System.Diagnostics.Trace.WriteLine(ex.Message);
                m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED);
            }
            catch (HttpError ex)
            {
                System.Diagnostics.Trace.WriteLine(ex.Message);
                m_ProtocolOutput = new HttpProtocolOutput(null, new HttpProtocolStatus(ex.Code));
            }
            catch (System.Exception e)
            {
                // Everything else (410, redirect limit, network and URI errors) becomes a
                // generic failure status rather than propagating to the caller.
                System.Diagnostics.Trace.WriteLine(e.Message);
                m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED);
            }
            return m_ProtocolOutput;
        }
		/// <summary>
		/// Fetches <paramref name="pageUri"/>, following HTTP 3xx redirects up to
		/// <c>MAX_REDIRECTS</c>, and reports the outcome as an <see cref="HttpProtocolOutput"/>.
		/// The returned object is also kept in <c>m_ProtocolOutput</c>.
		/// </summary>
		/// <param name="pageUri">URI of the page to fetch; must not be null.</param>
		/// <returns>
		/// For HTTP 200: content, headers, cookies and protocol version with <c>STATUS_SUCCESS</c>.
		/// For every other outcome the content is null and the status is one of
		/// <c>STATUS_ROBOTS_DENIED</c> (robots rules disallow the URL and are honored), a status
		/// constructed from the HTTP code (codes outside 200, 410 and 3xx), or
		/// <c>STATUS_FAILED</c> (all remaining exceptions, e.g. HTTP 410 or too many redirects).
		/// </returns>
		/// <exception cref="System.ArgumentNullException"><paramref name="pageUri"/> is null.</exception>
		public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri)
		{
			// Clear the previous result first.
			this.m_ProtocolOutput = null;

			// Explicit argument check: the ToString() call below would otherwise fail with an
			// uninformative NullReferenceException.
			if (pageUri == null)
			{
				throw new System.ArgumentNullException("pageUri", "Null page URI specified");
			}

			System.String urlString = pageUri.ToString();
			try
			{
				System.Uri url = new System.Uri(urlString);
				int redirects = 0;
				while (true)
				{
					// The robots check runs for every hop; it only blocks when
					// HONOR_ROBOTSTEXT is set.
					if (!RobotRulesParser.IsAllowed(url) && HttpProtocol.HONOR_ROBOTSTEXT)
					{
						throw new RobotBlockedException(url);
					}

					// NOTE(review): presumably BlockAddr/UnblockAddr bracket the request to limit
					// concurrent access to one host - confirm against their implementation.
					System.Net.IPAddress addr = BlockAddr(url);

					HttpResponseMgr response;
					try
					{
						// urlString remains the original request string across redirects.
						response = new HttpResponseMgr(urlString, url);
					}
					finally
					{
						// Released even if the request throws.
						UnblockAddr(addr);
					}

					int code = response.Code;

					if (code == 200)
					{
						// Success: wrap content plus metadata and return right away.
						m_ProtocolOutput = new HttpProtocolOutput(
							new HttpProtocolContent(response.Content, response.Headers),
							HttpProtocolStatus.STATUS_SUCCESS);
						m_ProtocolOutput.Cookies = response.Cookies;
						m_ProtocolOutput.ProtocolVersion = response.ProtocolVersion;
						return m_ProtocolOutput;
					}
					else if (code == 410)
					{
						// Gone; ends up as STATUS_FAILED via the generic catch.
						throw new ResourceGoneException(url, "Http: " + code);
					}
					else if (code >= 300 && code < 400)
					{
						// Redirect: enforce the hop limit, then resolve Location relative to
						// the current URL and retry.
						if (redirects == MAX_REDIRECTS)
						{
							throw new System.Web.HttpException("Too many redirects: " + urlString);
						}
						url = new System.Uri(url, response.GetHeader("Location"));
						redirects++;
						System.Diagnostics.Trace.WriteLine("redirect to " + url);
					}
					else
					{
						// Remaining codes are converted to HttpError so the status can carry
						// the code.
						throw new HttpError(code);
					}
				}
			}
			catch (RobotBlockedException ex)
			{
				System.Diagnostics.Trace.WriteLine(ex.Message);
				m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED);
			}
			catch (HttpError ex)
			{
				System.Diagnostics.Trace.WriteLine(ex.Message);
				m_ProtocolOutput = new HttpProtocolOutput(null, new HttpProtocolStatus(ex.Code));
			}
			catch (System.Exception e)
			{
				// Catch-all: the caller receives a failure status instead of an exception.
				System.Diagnostics.Trace.WriteLine(e.Message);
				m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED);
			}
			return m_ProtocolOutput;
		}
Пример #3
0
		/// <summary>
		/// Loads the page content through <paramref name="obProtocol"/> unless content is
		/// already present and no refresh was requested. The page URL and index are
		/// (re)assigned whether or not the protocol reported success.
		/// </summary>
		/// <param name="obProtocol">Protocol object used to obtain the output; must not be null when a load is needed.</param>
		/// <param name="bIsRefresh">True to reload even if content has already been fetched.</param>
		/// <exception cref="ArgumentNullException">A load is needed and <paramref name="obProtocol"/> is null.</exception>
		/// <exception cref="ParserException">Any exception raised while obtaining or wrapping the content.</exception>
		private void GetPageContent(HttpProtocol obProtocol, bool bIsRefresh)
		{
			// Nothing to do when content is cached and the caller did not ask for a refresh.
			bool bUseCached = m_bHasContent && !bIsRefresh;
			if (bUseCached)
			{
				return;
			}

			if (null == obProtocol)
			{
				throw new ArgumentNullException("obProtocol", "Null HttpProtocol object specified");
			}

			lock (this)
			{
				try
				{
					m_obProtocolOutput = obProtocol.GetProtocolOutput();

					bool bFetched = (m_obProtocolOutput.Status.Code == HttpProtocolStatus.SUCCESS);
					if (bFetched)
					{
						m_bHasContent = true;
						this.m_HttpContentProperties = m_obProtocolOutput.Content.ContentProperties;

						// Charset is derived from the content type exposed by this object.
						System.String strContentType = this.ContentType;
						System.String strCharset = GetCharset(strContentType);

						// Wrap the raw content bytes in a parser stream and build the source on top of it.
						ParserStream obStream = new ParserStream(
							new System.IO.MemoryStream(m_obProtocolOutput.Content.ContentData));
						mSource = new InputStreamSource(
							obStream,
							strCharset,
							m_obProtocolOutput.Content.ContentData.Length);
					}
				}
				catch (System.Exception obCause)
				{
					// Surface every failure as a ParserException, keeping the original as the inner exception.
					throw new ParserException("Failed to get page content", obCause);
				}

				// Done regardless of the fetch status.
				mUrl = obProtocol.URL.ToString();
				mIndex = new PageIndex(this);
			}
		}