Пример #1
0
 /// <summary>
 /// Creates a crawled page for the given address. Both parsed-document wrappers are
 /// set up lazily, so their initializer methods only run when the corresponding
 /// lazy value is first read, and <c>Content</c> starts out as a new <see cref="PageContent"/>.
 /// </summary>
 /// <param name="uri">Address of the page; passed through to the base constructor.</param>
 public CrawledPage(Uri uri)
     : base(uri)
 {
     // Named factories keep the Lazy<T> constructions short and make the deferred work explicit.
     Func<HtmlDocument> createHtmlDocument = () => InitializeHtmlAgilityPackDocument();
     Func<CQ> createCsQueryDocument = () => InitializeCsQueryDocument();

     _htmlDocument = new Lazy<HtmlDocument>(createHtmlDocument);
     _csQueryDocument = new Lazy<CQ>(createCsQueryDocument);

     Content = new PageContent();
 }
Пример #2
0
        /// <summary>
        /// Reads the raw body of <paramref name="response"/> and decodes it into a <see cref="PageContent"/>.
        /// </summary>
        /// <remarks>
        /// The charset reported by the response headers is used when there is one. Otherwise the
        /// body is first read as ASCII text and handed to <c>GetCharsetFromBody</c>. The resulting
        /// charset is passed through <c>CleanCharset</c> before the encoding is looked up.
        /// </remarks>
        /// <param name="response">The web response whose body should be read.</param>
        /// <returns>
        /// A <see cref="PageContent"/> carrying the raw bytes, the cleaned charset, the encoding
        /// used, and the decoded text.
        /// </returns>
        public virtual PageContent GetContent(WebResponse response)
        {
            using (MemoryStream rawData = GetRawData(response))
            {
                string charset = GetCharsetFromHeaders(response);

                if (charset == null)
                {
                    rawData.Seek(0, SeekOrigin.Begin);

                    // Intentionally not wrapped in a using block: disposing this reader would
                    // close rawData, which still has to be read again below.
                    StreamReader asciiReader = new StreamReader(rawData, Encoding.ASCII);
                    charset = GetCharsetFromBody(asciiReader.ReadToEnd());
                }

                // Rewind so the final decode starts at the first byte.
                rawData.Seek(0, SeekOrigin.Begin);

                charset = CleanCharset(charset);
                Encoding encoding = GetEncoding(charset);

                string text;
                using (StreamReader decodingReader = new StreamReader(rawData, encoding))
                {
                    text = decodingReader.ReadToEnd();
                }

                // MemoryStream.ToArray() is still usable after the reader above has closed the stream.
                return new PageContent
                {
                    Bytes = rawData.ToArray(),
                    Charset = charset,
                    Encoding = encoding,
                    Text = text
                };
            }
        }
Пример #3
0
        /// <summary>
        /// Reads the raw body of <paramref name="response"/> and decodes it into a <see cref="PageContent"/>.
        /// </summary>
        /// <remarks>
        /// The charset comes from the response headers when present. If the headers give none,
        /// it is looked up from the buffered body via <c>GetCharsetFromBody</c>.
        /// </remarks>
        /// <param name="response">The web response whose body should be read.</param>
        /// <returns>
        /// A <see cref="PageContent"/> carrying the raw bytes, the charset, the encoding used,
        /// and the decoded text.
        /// </returns>
        public PageContent GetContent(WebResponse response)
        {
            using (MemoryStream rawData = GetRawData(response))
            {
                string charset = GetCharsetFromHeaders(response);
                if (charset == null)
                {
                    charset = GetCharsetFromBody(rawData);
                }

                // Rewind before the final decode so reading starts at the first byte.
                rawData.Seek(0, SeekOrigin.Begin);

                Encoding encoding = GetEncoding(charset);

                string text;
                using (StreamReader decodingReader = new StreamReader(rawData, encoding))
                {
                    text = decodingReader.ReadToEnd();
                }

                // MemoryStream.ToArray() is still usable after the reader above has closed the stream.
                return new PageContent
                {
                    Bytes = rawData.ToArray(),
                    Charset = charset,
                    Encoding = encoding,
                    Text = text
                };
            }
        }
        /// <summary>
        /// Builds the page content from the source reported by the web driver after it has
        /// navigated to the response's URI, rather than from the response stream.
        /// </summary>
        /// <param name="p_Response">
        /// The web response; supplies the URI to navigate to and the headers used for charset lookup.
        /// </param>
        /// <returns>
        /// A <see cref="PageContent"/> whose text is the driver's page source and whose bytes are
        /// that text encoded with the resolved encoding.
        /// </returns>
        public override PageContent GetContent(WebResponse p_Response)
        {
            // Have the WebDriver load the page. PhantomJS navigates like a normal browser and
            // exposes the resulting html through the PageSource property.
            m_WebDriver.Navigate().GoToUrl(p_Response.ResponseUri);

            // No wait is performed here. If pages make async calls, a pause can be added to let
            // the JavaScript finish before the source is read.
            //Thread.Sleep(1000);

            string renderedSource = m_WebDriver.PageSource;

            // Prefer the charset from the response headers; fall back to the page body.
            string charset = GetCharsetFromHeaders(p_Response);
            if (charset == null)
            {
                charset = GetCharsetFromBody(renderedSource);
            }

            Encoding resolvedEncoding = GetEncoding(charset);

            PageContent result = new PageContent();
            result.Encoding = resolvedEncoding;
            result.Charset = charset;
            result.Text = renderedSource;
            result.Bytes = resolvedEncoding.GetBytes(renderedSource);
            return result;
        }
Пример #5
0
 /// <summary>
 /// Checks the state of a freshly constructed <see cref="PageContent"/>: bytes, charset
 /// and encoding are null, and the text is an empty string.
 /// </summary>
 public void Constructor_CreatesInstance()
 {
     var freshContent = new PageContent();

     // Nothing has been assigned yet, so the reference-typed members are unset.
     Assert.IsNull(freshContent.Bytes);
     Assert.IsNull(freshContent.Charset);
     Assert.IsNull(freshContent.Encoding);

     // Text is expected to be an empty string rather than null.
     Assert.AreEqual("", freshContent.Text);
 }