private IEnumerable<string> ExtractUrls(WebPage page)
{
    // Overhead: we are essentially parsing a web page twice
    // (once for links, a second time for content).
    var doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(page.Content);

    // SelectNodes returns null when the page has no matching anchors,
    // so guard against it before iterating.
    var links = doc.DocumentNode.SelectNodes("//a[@href]");
    if (links == null)
        yield break;

    foreach (HtmlNode link in links)
    {
        yield return link.GetAttributeValue("href", string.Empty);
    }
}
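One way to remove the double-parse overhead the comment mentions is to extract links from an HtmlDocument that has already been parsed, for example the one stored by Add below. A minimal sketch of that variant (the overload itself is not part of the original code):

// Sketch: variant that accepts an already-parsed document, so each page
// is parsed only once (e.g. reuse the HtmlDocument cached by Add below).
private IEnumerable<string> ExtractUrls(HtmlAgilityPack.HtmlDocument doc)
{
    var links = doc.DocumentNode.SelectNodes("//a[@href]");
    if (links == null)
        yield break;

    foreach (HtmlNode link in links)
        yield return link.GetAttributeValue("href", string.Empty);
}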
public static string GetRawTextFromPage(WebPage page)
{
    var doc = new HtmlDocument();
    doc.LoadHtml(page.Content);
    return GetRawFromHTML(doc);
}
public HtmlDocument GetDocument(WebPage webPage)
{
    return Cache[webPage.Url];
}
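The Cache field itself is not shown. Given the indexer lookup here and the Cache.Add call in Add below, it is assumed to be a plain in-memory dictionary keyed by URL; a minimal sketch of that assumed declaration:

// Assumed backing store for the parsed-page cache: an in-memory dictionary
// keyed by page URL. Thread safety and eviction are not handled here.
private readonly Dictionary<string, HtmlDocument> Cache =
    new Dictionary<string, HtmlDocument>();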
private WebPage FetchPage(string url)
{
    var page = new WebPage();
    var request = (HttpWebRequest)WebRequest.Create(url);
    request.UserAgent = DefaultUserAgent;
    try
    {
        using (var response = request.GetResponse())
        using (var reader = new StreamReader(response.GetResponseStream()))
        {
            page.Content = reader.ReadToEnd();
            page.Url = url;
        }
    }
    catch (WebException ex)
    {
        // ex.Response is null for timeouts and connection failures,
        // so only inspect the status code when a response exists.
        var webResponse = ex.Response as HttpWebResponse;
        if (webResponse == null || webResponse.StatusCode != HttpStatusCode.NotFound)
        {
            throw; // rethrow without resetting the stack trace
        }
        // 404: fall through and return the empty page.
    }
    return page;
}
public void Add(WebPage webPage)
{
    // Parse the content as-is: HtmlAgilityPack matches tag names
    // case-insensitively, and lower-casing the whole document would
    // mangle the page text and the casing of href values.
    var document = new HtmlDocument();
    document.LoadHtml(webPage.Content);

    // Assign via the indexer so re-crawling the same URL overwrites
    // the cached document instead of throwing on a duplicate key.
    Cache[webPage.Url] = document;
}
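Taken together, the methods above compose into a simple crawl loop: fetch a page, cache its parsed document, then follow its links. A minimal usage sketch; the Crawl method, the queue-based traversal, and the visited set are illustrative assumptions, not part of the original code:

// Illustrative driver for the methods above. Relative-URL resolution,
// politeness delays, and domain filtering are omitted for brevity.
public void Crawl(string startUrl)
{
    var pending = new Queue<string>();
    var visited = new HashSet<string>();
    pending.Enqueue(startUrl);

    while (pending.Count > 0)
    {
        var url = pending.Dequeue();
        if (!visited.Add(url))
            continue; // already crawled

        var page = FetchPage(url);
        if (string.IsNullOrEmpty(page.Content))
            continue; // fetch failed (e.g. 404), nothing to parse

        Add(page); // cache the parsed document for later lookups

        foreach (var link in ExtractUrls(page))
            pending.Enqueue(link);
    }
}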