示例#1
0
        /// <summary>
        /// Downloads the page at <c>page.Url</c> and stores its HTML-decoded body in <c>Text</c>.
        /// </summary>
        /// <param name="page">Page descriptor whose <c>Url</c> is requested.</param>
        /// <remarks>
        /// <c>Text</c> is set to an empty string when the request raises a
        /// <see cref="WebException"/> or when the response exposes no stream, so callers
        /// see the same "no content" value on both paths. Other exception types are not
        /// caught here and propagate to the caller.
        /// </remarks>
        private void SetText(PageStructure page)
        {
            try
            {
                // Create web connection to specific URL
                var req = (HttpWebRequest)WebRequest.Create(page.Url);

                // TODO: add handling for ill-formatted / error responses from the website.
                // The response and its stream hold the underlying connection, so both are
                // disposed deterministically once the body has been read.
                using (var res = req.GetResponse())
                using (var resStream = res.GetResponseStream())
                {
                    if (resStream == null)
                    {
                        // No body available: report it the same way as a failed request.
                        Text = string.Empty;
                        return;
                    }

                    using (var readStream = new StreamReader(resStream))
                    {
                        // Turn stream into a usable string
                        Text = WebUtility.HtmlDecode(readStream.ReadToEnd());
                    }
                }
            }
            catch (WebException)
            {
                // Request failed (e.g. error status or connectivity problem): treat as empty page.
                Text = string.Empty;
            }
        }
示例#2
0
 /// <summary>
 /// Builds a web page from its structure: loads the page text, determines whether it is
 /// an article, parses the HTML, and fills in the title and blurb.
 /// </summary>
 /// <param name="pageStructure">Structure describing the page; stored in <c>PageStructure</c>.</param>
 public WebPage(PageStructure pageStructure)
 {
     PageStructure = pageStructure;

     // Populate Text with the page's raw HTML, then derive the article flag from it.
     SetText(PageStructure);
     SetArticle(Text);

     // Parse the same text into the HTML document.
     document.LoadHtml(Text);

     if (!Article)
     {
         // Non-article pages get an empty title and blurb.
         Title = "";
         Blurb = "";
         return;
     }

     // Article pages: derive the blurb first, then the title, from the page text.
     SetBlurb();
     SetTitle();
 }
示例#3
0
        /// <summary>
        /// Records the page as crawled and queues every link on it that is eligible
        /// for crawling.
        /// </summary>
        /// <param name="webPage">The page whose links are examined.</param>
        private void CrawlPage(WebPage webPage)
        {
            var parent = webPage.PageStructure;
            _crawledPages.Add(parent.Url, "");

            // Pages beyond the depth limit are recorded but their links are not followed.
            if (TooDeep(webPage))
            {
                return;
            }

            // Every link found on the page is a candidate child.
            foreach (var link in webPage.SetUrls())
            {
                // Skip links that were already crawled ...
                if (HasBeenCrawled(link))
                {
                    continue;
                }

                // ... that are not descendants of this page ...
                if (!IsDescendent(link, webPage))
                {
                    continue;
                }

                // ... or that are already waiting in the queue.
                if (_childrenInQueue.ContainsKey(link))
                {
                    continue;
                }

                _childrenInQueue.Add(link, "");

                // The child sits one level deeper and inherits the parent's article
                // structure and base URI.
                var queuedChild = new PageStructure(
                    link,
                    parent.Depth + 1,
                    parent.ArticleStructure,
                    parent.BaseUri);
                _queue.Enqueue(queuedChild);
            }
        }