Ejemplo n.º 1
0
 /// <summary>
 /// Lazily yields the <c>href</c> attribute value of every anchor element
 /// found in the HTML content of <paramref name="page"/>.
 /// </summary>
 /// <param name="page">Page whose <c>Content</c> is parsed as HTML.</param>
 /// <returns>
 /// One string per <c>&lt;a&gt;</c> element carrying an <c>href</c> attribute;
 /// an empty sequence when the page contains no such element.
 /// </returns>
 private IEnumerable<string> ExtractUrls(WebPage page)
 {
     // Overhead, we are essentially parsing a web page twice (once for links, second time for content)
     HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
     doc.LoadHtml(page.Content);

     // SelectNodes returns null (not an empty collection) when nothing matches,
     // so a page without links must be handled explicitly.
     var links = doc.DocumentNode.SelectNodes("//a[@href]");
     if (links == null)
     {
         yield break;
     }

     foreach (HtmlNode link in links)
     {
         yield return link.GetAttributeValue("href", string.Empty);
     }
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Parses the HTML content of <paramref name="page"/> into a document and
 /// returns whatever <c>GetRawFromHTML</c> produces for that document.
 /// </summary>
 /// <param name="page">Page whose <c>Content</c> is loaded as HTML.</param>
 /// <returns>The string returned by <c>GetRawFromHTML</c> for the parsed document.</returns>
 public static string GetRawTextFromPage(WebPage page)
 {
     var parsed = new HtmlDocument();
     parsed.LoadHtml(page.Content);

     string rawText = GetRawFromHTML(parsed);
     return rawText;
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Looks up the document stored in <c>Cache</c> under the URL of
 /// <paramref name="webPage"/>.
 /// </summary>
 /// <param name="webPage">Page whose <c>Url</c> is used as the cache key.</param>
 /// <returns>The document the cache indexer yields for that URL.</returns>
 /// <remarks>
 /// NOTE(review): behavior for a URL that was never cached depends on the
 /// type of <c>Cache</c>, which is not visible here - confirm.
 /// </remarks>
 public HtmlDocument GetDocument(WebPage webPage)
 {
     var key = webPage.Url;
     HtmlDocument cached = Cache[key];
     return cached;
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Downloads <paramref name="url"/> with an HTTP GET and wraps the response
        /// body in a <c>WebPage</c>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// A page with <c>Content</c> and <c>Url</c> populated on success. When the
        /// server answers 404 Not Found, a freshly constructed page on which neither
        /// property has been assigned.
        /// </returns>
        /// <exception cref="WebException">
        /// The request failed for any reason other than an HTTP 404 response.
        /// </exception>
        private WebPage FetchPage(string url)
        {
            var page = new WebPage();
            var request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = DefaultUserAgent;

            try
            {
                using (var response = request.GetResponse())
                using (var reader = new StreamReader(response.GetResponseStream()))
                {
                    page.Content = reader.ReadToEnd();
                    page.Url = url;
                }
            }
            catch (WebException ex)
            {
                // ex.Response is null when no HTTP response was received at all
                // (DNS failure, connection refused, timeout), so it must not be
                // dereferenced unconditionally.
                var webResponse = ex.Response as HttpWebResponse;
                if (webResponse == null || webResponse.StatusCode != HttpStatusCode.NotFound)
                {
                    // Rethrow with "throw;" so the original stack trace is preserved.
                    // The response is left open here because the caller may inspect it.
                    throw;
                }

                // 404: deliberately swallowed; release the connection held by the
                // error response and fall through to return the empty page.
                webResponse.Close();
            }

            return page;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Parses the lower-cased HTML content of <paramref name="webPage"/> and
 /// stores the resulting document in <c>Cache</c> under the page's URL.
 /// </summary>
 /// <param name="webPage">Page supplying the <c>Content</c> to parse and the <c>Url</c> used as key.</param>
 /// <remarks>
 /// The content is lower-cased before parsing, so the cached document does not
 /// preserve the original casing of the page.
 /// NOTE(review): whether adding an already-cached URL throws depends on the
 /// type of <c>Cache</c>, which is not visible here - confirm.
 /// </remarks>
 public void Add(WebPage webPage)
 {
     var document = new HtmlDocument();

     // Use the invariant culture: a culture-sensitive ToLower() maps 'I' to a
     // dotless 'ı' under Turkish/Azeri cultures, corrupting names such as "IMG".
     document.LoadHtml(webPage.Content.ToLowerInvariant());
     Cache.Add(webPage.Url, document);
 }