/// <summary>
/// Requests <paramref name="pageUri"/> and follows 3xx responses until a final answer
/// arrives or <c>MAX_REDIRECTS</c> hops have been followed. The outcome is stored in
/// <c>m_ProtocolOutput</c> and also returned.
/// </summary>
/// <param name="pageUri">Address of the page to request.</param>
/// <returns>
/// On HTTP 200: an output carrying the response content, headers, cookies and protocol
/// version with <c>STATUS_SUCCESS</c>. When robots rules block the URL and
/// <c>HONOR_ROBOTSTEXT</c> is set: a content-less output with <c>STATUS_ROBOTS_DENIED</c>.
/// For a status code that is not 200, 410 or 3xx: a content-less output whose status is
/// built from that code. For every other failure (410, too many redirects, any other
/// exception): a content-less output with <c>STATUS_FAILED</c>.
/// </returns>
public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri)
{
    this.m_ProtocolOutput = null;
    System.String urlString = pageUri.ToString();

    try
    {
        System.Uri target = new System.Uri(urlString);
        int hops = 0;

        for (;;)
        {
            // The robots rules are always consulted; they are only enforced when configured.
            if (!RobotRulesParser.IsAllowed(target) && HttpProtocol.HONOR_ROBOTSTEXT)
            {
                throw new RobotBlockedException(target);
            }

            // The address stays blocked only for the duration of the request itself.
            System.Net.IPAddress blockedAddress = BlockAddr(target);
            HttpResponseMgr reply;
            try
            {
                reply = new HttpResponseMgr(urlString, target);
            }
            finally
            {
                UnblockAddr(blockedAddress);
            }

            int statusCode = reply.Code;
            switch (statusCode)
            {
                case 200:
                {
                    // Success: wrap the payload and hand it back immediately.
                    HttpProtocolStatus okStatus = HttpProtocolStatus.STATUS_SUCCESS;
                    m_ProtocolOutput = new HttpProtocolOutput(
                        new HttpProtocolContent(reply.Content, reply.Headers),
                        okStatus);
                    m_ProtocolOutput.Cookies = reply.Cookies;
                    m_ProtocolOutput.ProtocolVersion = reply.ProtocolVersion;
                    return m_ProtocolOutput;
                }

                case 410:
                    // The page is gone for good.
                    throw new ResourceGoneException(target, "Http: " + statusCode);

                default:
                {
                    // Anything outside the 3xx range is reported as an HTTP error.
                    if (statusCode < 300 || statusCode >= 400)
                    {
                        throw new HttpError(statusCode);
                    }

                    // Redirect: give up once the hop budget has been used.
                    if (hops == MAX_REDIRECTS)
                    {
                        throw new System.Web.HttpException("Too many redirects: " + urlString);
                    }

                    // Location is resolved against the URL that was just requested.
                    target = new System.Uri(target, reply.GetHeader("Location"));
                    hops++;
                    System.Diagnostics.Trace.WriteLine("redirect to " + target);
                    break;
                }
            }
        }
    }
    catch (RobotBlockedException ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED);
    }
    catch (HttpError ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        HttpProtocolStatus errorStatus = new HttpProtocolStatus(ex.Code);
        m_ProtocolOutput = new HttpProtocolOutput(null, errorStatus);
    }
    catch (System.Exception e)
    {
        System.Diagnostics.Trace.WriteLine(e.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED);
    }

    return m_ProtocolOutput;
}
/// <summary>
/// Fetches the page at <paramref name="pageUri"/>, chasing redirect responses (status
/// 300-399) for at most <c>MAX_REDIRECTS</c> hops. The result is kept in
/// <c>m_ProtocolOutput</c> and returned to the caller.
/// </summary>
/// <param name="pageUri">The page address to fetch.</param>
/// <returns>
/// An output with content and <c>STATUS_SUCCESS</c> for an HTTP 200 answer; otherwise an
/// output without content whose status is <c>STATUS_ROBOTS_DENIED</c> (blocked by robots
/// rules while <c>HONOR_ROBOTSTEXT</c> is set), a status built from the HTTP error code
/// (codes other than 200, 410 and 3xx), or <c>STATUS_FAILED</c> (any other exception,
/// including 410 and exceeding the redirect limit).
/// </returns>
public HttpProtocolOutput GetProtocolOutput(System.Uri pageUri)
{
    this.m_ProtocolOutput = null;
    System.String urlString = pageUri.ToString();

    try
    {
        System.Uri current = new System.Uri(urlString);
        int redirectCount = 0;

        while (true)
        {
            // Robots rules are evaluated first and enforced only if the protocol honours them.
            bool allowed = RobotRulesParser.IsAllowed(current);
            if (!allowed && HttpProtocol.HONOR_ROBOTSTEXT)
            {
                throw new RobotBlockedException(current);
            }

            // Issue the request while the address is blocked; always unblock afterwards.
            HttpResponseMgr httpResponse;
            System.Net.IPAddress address = BlockAddr(current);
            try
            {
                httpResponse = new HttpResponseMgr(urlString, current);
            }
            finally
            {
                UnblockAddr(address);
            }

            int httpCode = httpResponse.Code;

            if (httpCode == 200)
            {
                // Good response: package it and return.
                HttpProtocolStatus successStatus = HttpProtocolStatus.STATUS_SUCCESS;
                HttpProtocolContent payload =
                    new HttpProtocolContent(httpResponse.Content, httpResponse.Headers);
                m_ProtocolOutput = new HttpProtocolOutput(payload, successStatus);
                m_ProtocolOutput.Cookies = httpResponse.Cookies;
                m_ProtocolOutput.ProtocolVersion = httpResponse.ProtocolVersion;
                return m_ProtocolOutput;
            }

            if (httpCode == 410)
            {
                // Resource has been removed.
                throw new ResourceGoneException(current, "Http: " + httpCode);
            }

            bool isRedirect = httpCode >= 300 && httpCode < 400;
            if (!isRedirect)
            {
                // Every remaining code is surfaced as an HTTP error.
                throw new HttpError(httpCode);
            }

            // Redirect handling: stop when the limit has already been reached.
            if (redirectCount == MAX_REDIRECTS)
            {
                throw new System.Web.HttpException("Too many redirects: " + urlString);
            }

            // The Location header is resolved relative to the URL just requested.
            current = new System.Uri(current, httpResponse.GetHeader("Location"));
            redirectCount++;
            System.Diagnostics.Trace.WriteLine("redirect to " + current);
        }
    }
    catch (RobotBlockedException ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_ROBOTS_DENIED);
    }
    catch (HttpError ex)
    {
        System.Diagnostics.Trace.WriteLine(ex.Message);
        HttpProtocolStatus failureStatus = new HttpProtocolStatus(ex.Code);
        m_ProtocolOutput = new HttpProtocolOutput(null, failureStatus);
    }
    catch (System.Exception e)
    {
        System.Diagnostics.Trace.WriteLine(e.Message);
        m_ProtocolOutput = new HttpProtocolOutput(null, HttpProtocolStatus.STATUS_FAILED);
    }

    return m_ProtocolOutput;
}
/// <summary>
/// Retrieves the page through <paramref name="obProtocol"/> and, when the protocol reports
/// success, records the content properties and builds the character source from the
/// returned bytes. Does nothing if content is already present and no refresh was requested.
/// The page URL and index are (re)initialised whether or not the fetch succeeded.
/// </summary>
/// <param name="obProtocol">Protocol object used to obtain the page; must not be null.</param>
/// <param name="bIsRefresh">True to fetch again even if content has already been loaded.</param>
/// <exception cref="ArgumentNullException"><paramref name="obProtocol"/> is null.</exception>
/// <exception cref="ParserException">
/// Any exception raised while fetching or wrapping the content; the original exception is
/// passed along as the inner exception.
/// </exception>
private void GetPageContent(HttpProtocol obProtocol, bool bIsRefresh)
{
    // Already loaded and the caller did not ask for a refresh: nothing to do.
    if (!bIsRefresh && m_bHasContent)
    {
        return;
    }

    if (obProtocol == null)
    {
        throw new ArgumentNullException("obProtocol", "Null HttpProtocol object specified");
    }

    // The whole load is serialised on this instance.
    lock (this)
    {
        try
        {
            m_obProtocolOutput = obProtocol.GetProtocolOutput();

            if (m_obProtocolOutput.Status.Code == HttpProtocolStatus.SUCCESS)
            {
                m_bHasContent = true;

                // Content properties are stored before ContentType is read.
                this.m_HttpContentProperties = m_obProtocolOutput.Content.ContentProperties;
                string charset = GetCharset(this.ContentType);

                ParserStream stream = new ParserStream(
                    new System.IO.MemoryStream(m_obProtocolOutput.Content.ContentData));

                mSource = new InputStreamSource(
                    stream,
                    charset,
                    m_obProtocolOutput.Content.ContentData.Length);
            }
        }
        catch (System.Exception e)
        {
            throw new ParserException("Failed to get page content", e);
        }

        mUrl = obProtocol.URL.ToString();
        mIndex = new PageIndex(this);
    }
}