/// <summary>
/// Creates a crawled page for <paramref name="uri"/>, wiring up the lazily
/// created document wrappers and an empty <see cref="PageContent"/>.
/// </summary>
/// <param name="uri">The uri handed to the base constructor.</param>
public CrawledPage(Uri uri)
    : base(uri)
{
    // The factories are only captured here; each Lazy wrapper invokes its
    // factory the first time its Value is read, not during construction.
    Func<HtmlDocument> htmlDocumentFactory = () => InitializeHtmlAgilityPackDocument();
    Func<CQ> csQueryDocumentFactory = () => InitializeCsQueryDocument();

    _htmlDocument = new Lazy<HtmlDocument>(htmlDocumentFactory);
    _csQueryDocument = new Lazy<CQ>(csQueryDocumentFactory);

    // Start with an empty content object.
    Content = new PageContent();
}
/// <summary>
/// Builds a <see cref="PageContent"/> from the stream returned by
/// <c>GetRawData</c> for the given response.
/// </summary>
/// <param name="response">The web response whose content is extracted.</param>
/// <returns>
/// A <see cref="PageContent"/> carrying the raw bytes, the cleaned charset name,
/// the encoding resolved from it and the decoded text.
/// </returns>
public virtual PageContent GetContent(WebResponse response)
{
    using (MemoryStream rawData = GetRawData(response))
    {
        string charsetName = GetCharsetFromHeaders(response);
        if (charsetName == null)
        {
            // No charset in the headers: read the body as ASCII and look for one there.
            rawData.Seek(0, SeekOrigin.Begin);

            // Deliberately NOT wrapped in a using statement: disposing this reader
            // would close the underlying stream, which is still needed below.
            StreamReader asciiReader = new StreamReader(rawData, Encoding.ASCII);
            charsetName = GetCharsetFromBody(asciiReader.ReadToEnd());
        }

        // Rewind so the final decode starts from the first byte.
        rawData.Seek(0, SeekOrigin.Begin);
        charsetName = CleanCharset(charsetName);
        Encoding pageEncoding = GetEncoding(charsetName);

        string decodedText;
        using (StreamReader decodingReader = new StreamReader(rawData, pageEncoding))
        {
            decodedText = decodingReader.ReadToEnd();
        }

        // The reader above has closed the stream; MemoryStream.ToArray() is
        // documented to keep working on a closed MemoryStream.
        return new PageContent
        {
            Bytes = rawData.ToArray(),
            Charset = charsetName,
            Encoding = pageEncoding,
            Text = decodedText
        };
    }
}
/// <summary>
/// Builds a <see cref="PageContent"/> from the stream returned by
/// <c>GetRawData</c> for the given response.
/// </summary>
/// <param name="response">The web response whose content is extracted.</param>
/// <returns>
/// A <see cref="PageContent"/> carrying the raw bytes, the charset name,
/// the encoding resolved from it and the decoded text.
/// </returns>
public PageContent GetContent(WebResponse response)
{
    using (MemoryStream rawData = GetRawData(response))
    {
        // Prefer the charset from the headers; otherwise ask the body-based lookup.
        string charsetName = GetCharsetFromHeaders(response);
        if (charsetName == null)
        {
            charsetName = GetCharsetFromBody(rawData);
        }

        // Rewind so the decode starts from the first byte.
        rawData.Seek(0, SeekOrigin.Begin);
        Encoding pageEncoding = GetEncoding(charsetName);

        string decodedText;
        using (StreamReader decodingReader = new StreamReader(rawData, pageEncoding))
        {
            decodedText = decodingReader.ReadToEnd();
        }

        // The reader above has closed the stream; MemoryStream.ToArray() is
        // documented to keep working on a closed MemoryStream.
        return new PageContent
        {
            Bytes = rawData.ToArray(),
            Charset = charsetName,
            Encoding = pageEncoding,
            Text = decodedText
        };
    }
}
/// <summary>
/// Loads the response's uri through the WebDriver and returns the resulting
/// page source as a <see cref="PageContent"/>.
/// </summary>
/// <param name="p_Response">The web response whose <c>ResponseUri</c> is navigated to.</param>
/// <returns>
/// A <see cref="PageContent"/> holding the page source, the charset name, the
/// encoding resolved from it and the page source encoded to bytes with that encoding.
/// </returns>
public override PageContent GetContent(WebResponse p_Response)
{
    // The WebDriver (PhantomJS) opens the page the way a normal browser would;
    // the resulting html is then available through its PageSource property.
    m_WebDriver.Navigate().GoToUrl(p_Response.ResponseUri);

    // If pages need time for async JavaScript calls, a wait could go here.
    // It is currently disabled:
    // Thread.Sleep(1000);

    string renderedSource = m_WebDriver.PageSource;

    // Charset comes from the response headers when present, otherwise from the body.
    string charsetName = GetCharsetFromHeaders(p_Response) ?? GetCharsetFromBody(renderedSource);
    Encoding pageEncoding = GetEncoding(charsetName);

    PageContent result = new PageContent();
    result.Encoding = pageEncoding;
    result.Charset = charsetName;
    result.Text = renderedSource;
    result.Bytes = pageEncoding.GetBytes(renderedSource);
    return result;
}
// Verifies the state of a freshly constructed PageContent: Bytes, Charset and
// Encoding are null, and Text is the empty string.
public void Constructor_CreatesInstance()
{
    // Arrange / Act
    var freshContent = new PageContent();

    // Assert
    Assert.IsNull(freshContent.Bytes);
    Assert.IsNull(freshContent.Charset);
    Assert.IsNull(freshContent.Encoding);
    Assert.AreEqual("", freshContent.Text);
}