// Sets the _text field to the HTML content of the URL link
private void SetText(PageStructure page)
{
    try
    {
        // Create a web connection to the specific URL
        var req = (HttpWebRequest)WebRequest.Create(page.Url);

        // Get the response from the website, stream its content, and read it
        // TODO: handle ill-formatted or error responses from the website
        using (var res = req.GetResponse())
        using (var resStream = res.GetResponseStream())
        {
            if (resStream == null)
            {
                Text = string.Empty;
                return;
            }

            using (var readStream = new StreamReader(resStream))
            {
                // Turn the stream into a usable string
                Text = WebUtility.HtmlDecode(readStream.ReadToEnd());
            }
        }
    }
    catch (WebException)
    {
        Text = string.Empty;
    }
}
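The TODO above can be handled by checking the HTTP status before reading the body. A minimal sketch of one approach; the helper name FetchHtmlOrNull and the explicit 2xx check are illustrative assumptions, not part of the original class:

using System.IO;
using System.Net;

internal static class HttpFetcher
{
    // Fetch a URL and return its decoded body, or null on an error
    // response or network failure.
    public static string FetchHtmlOrNull(string url)
    {
        try
        {
            var req = (HttpWebRequest)WebRequest.Create(url);
            using (var res = (HttpWebResponse)req.GetResponse())
            {
                // Defensive check: GetResponse already throws WebException
                // for most 4xx/5xx statuses, but rejecting anything outside
                // the 2xx range makes the intent explicit.
                if ((int)res.StatusCode < 200 || (int)res.StatusCode >= 300)
                    return null;

                using (var stream = res.GetResponseStream())
                using (var reader = new StreamReader(stream))
                    return WebUtility.HtmlDecode(reader.ReadToEnd());
            }
        }
        catch (WebException)
        {
            // Covers DNS failures, timeouts, and error status codes
            return null;
        }
    }
}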
public WebPage(PageStructure pageStructure)
{
    PageStructure = pageStructure;
    SetText(PageStructure);  // sets the _text of the webpage, given the page URL, as raw HTML
    SetArticle(Text);        // sets the Boolean representing whether or not the page is an article page
    document.LoadHtml(Text);

    if (Article)
    {
        SetBlurb();  // sets the _blurb of the webpage given the text of the page
        SetTitle();  // sets the _title of the webpage given the text of the page
    }
    else
    {
        Title = "";
        Blurb = "";
    }
}
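The call to document.LoadHtml suggests the page is parsed with HtmlAgilityPack's HtmlDocument. The original SetTitle and SetBlurb bodies aren't shown in this section; a minimal sketch of how SetTitle could pull the title from the parsed document, assuming that library:

using System.Net;
using HtmlAgilityPack;

// Sketch only: one plausible implementation of SetTitle, assuming
// `document` is an HtmlAgilityPack HtmlDocument already populated
// by LoadHtml in the constructor above.
private void SetTitle()
{
    // SelectSingleNode returns null when the page has no <title> element
    var titleNode = document.DocumentNode.SelectSingleNode("//title");
    Title = titleNode != null
        ? WebUtility.HtmlDecode(titleNode.InnerText.Trim())
        : "";
}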
private void CrawlPage(WebPage webPage)
{
    _crawledPages.Add(webPage.PageStructure.Url, "");

    // Iterate over all the URL strings of the children, turn each one into a
    // page, and put it on the queue if it hasn't been crawled yet
    if (!TooDeep(webPage))
    {
        // All the "children" of the webpage: every link the webpage contains
        var children = webPage.SetUrls();
        foreach (var child in children)
        {
            if (!HasBeenCrawled(child) && IsDescendent(child, webPage) && !_childrenInQueue.ContainsKey(child))
            {
                _childrenInQueue.Add(child, "");
                var childPage = new PageStructure(child, webPage.PageStructure.Depth + 1,
                    webPage.PageStructure.ArticleStructure, webPage.PageStructure.BaseUri);
                _queue.Enqueue(childPage);
            }
        }
    }
}
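For context, CrawlPage is presumably driven by a loop that drains _queue, so the crawl proceeds breadth-first from a seed page. A minimal sketch of such a driver, using the fields CrawlPage already relies on; the method name Crawl and the seed parameter are assumptions:

// Sketch of a driver loop, assuming _queue is a Queue<PageStructure>
// and _crawledPages is a Dictionary<string, string> as used above.
private void Crawl(PageStructure seed)
{
    _queue.Enqueue(seed);
    while (_queue.Count > 0)
    {
        var next = _queue.Dequeue();
        if (_crawledPages.ContainsKey(next.Url))
            continue;  // already visited via another path

        // Building the WebPage fetches and parses the HTML (see the
        // constructor above); CrawlPage then enqueues its children.
        var page = new WebPage(next);
        CrawlPage(page);
    }
}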