Exemplo n.º 1
0
        /// <summary>
        /// Loads a page through <c>_baseLoader</c> unless its url is already present in
        /// <c>_alreadyLoadedUrls</c>.
        /// </summary>
        /// <param name="url">The url of the page to load</param>
        /// <param name="parentPage">The page node passed through to the base loader as the parent</param>
        /// <returns>
        /// The page produced by the base loader, or <c>null</c> when the url was already loaded
        /// </returns>
        async Task<CrawlerPageNode> IPageLoader.LoadPageAsync(string url, CrawlerPageNode parentPage)
        {
            if (!_alreadyLoadedUrls.Contains(url))
            {
                var loadedPage = await _baseLoader.LoadPageAsync(url, parentPage);

                // The url is recorded only once the base loader has completed; if the load
                // throws, the url stays unrecorded.
                _alreadyLoadedUrls.Add(url);

                return loadedPage;
            }

            // Already loaded earlier: signal "nothing new" to the caller.
            return null;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Downloads the content at <paramref name="url"/> and builds a page node holding that
        /// content together with the links found in it. This method loads a single page only;
        /// it does not follow the links it finds.
        /// </summary>
        /// <param name="url">The url of the page to download</param>
        /// <param name="parentPage">The node stored as the new page's <c>Parent</c></param>
        /// <returns>The node describing the downloaded page</returns>
        async Task<CrawlerPageNode> IPageLoader.LoadPageAsync(string url, CrawlerPageNode parentPage)
        {
            var html = await _webClient.DownloadStringTaskAsync(url);

            // Initializer members are assigned in the order written, so the link parser
            // runs after the url, content and parent have been set on the node.
            var node = new CrawlerPageNode
            {
                PageUrl     = url,
                HTMLContent = html,
                Parent      = parentPage,
                LinksInPage = _navigationLinkParser.ParseHtml(html)
            };

            return node;
        }