Example #1
        /// <summary>
        /// Recursive method that goes through all the links on a page.
        /// </summary>
        /// <param name="url">The URL of the page to crawl.</param>
        public static void CrawlPage(Uri url)
        {
            if (!PageHasBeenCrawled(url))
            {
                var htmlText = getPageContent(url);

                var linkParser = new LinkParser();

                _pages.Add(url);

                linkParser.ParseLinks(htmlText, url);

                AddRangeButNoDuplicates(_sameDomainPage, linkParser.ValidUrls);
                AddRangeButNoDuplicates(_externalPages, linkParser.ExternalUrls);
                AddRangeButNoDuplicates(_staticContent, linkParser.OtherUrls);

                //Crawl all the links found on the page.
                foreach (Uri link in linkParser.ValidUrls)
                {
                    try
                    {
                        if (link.ToString() != String.Empty && link.ToString() != ConfigurationManager.AppSettings["url"] &&
                            link != url)
                        {
                            CrawlPage(link);
                        }
                    }
                    catch (Exception)
                    {
                        // TODO: record the broken URL here so a list of failed links can be built.
                    }
                }
            }
        }
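
Example #1 relies on a few members that are not shown: the PageHasBeenCrawled, getPageContent and AddRangeButNoDuplicates helpers and the _pages, _sameDomainPage, _externalPages and _staticContent collections (it also needs using System.Configuration; for ConfigurationManager). A minimal sketch of what those members might look like, assuming using System; and using System.Collections.Generic; are in scope, static List<Uri> fields, and a synchronous WebClient download (the original helpers may well differ), is:

        private static readonly List<Uri> _pages = new List<Uri>();
        private static readonly List<Uri> _sameDomainPage = new List<Uri>();
        private static readonly List<Uri> _externalPages = new List<Uri>();
        private static readonly List<Uri> _staticContent = new List<Uri>();

        /// <summary>
        /// Returns true if the URL has already been visited.
        /// </summary>
        private static bool PageHasBeenCrawled(Uri url)
        {
            return _pages.Contains(url);
        }

        /// <summary>
        /// Downloads the raw HTML of a page. WebClient is used here only
        /// as an illustration; the original helper may work differently.
        /// </summary>
        private static string getPageContent(Uri url)
        {
            using (var client = new System.Net.WebClient())
            {
                return client.DownloadString(url);
            }
        }

        /// <summary>
        /// Adds each item from newItems to target, skipping duplicates.
        /// </summary>
        private static void AddRangeButNoDuplicates(List<Uri> target, IEnumerable<Uri> newItems)
        {
            foreach (Uri item in newItems)
            {
                if (!target.Contains(item))
                {
                    target.Add(item);
                }
            }
        }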
Example #2
        /// <summary>
        /// Crawls a page.
        /// </summary>
        /// <param name="url">The URL to crawl.</param>
        private void CrawlPage(string url)
        {
            if (!PageHasBeenCrawled(url))
            {
                var htmlText = GetWebText(url);

                var linkParser = new LinkParser();

                var page = new Page();
                page.Text = htmlText;
                page.Url  = url;

                _pages.Add(page);

                linkParser.ParseLinks(page, url);

                //Add data to main data lists
                if (isCurrentPage)
                {
                    AddRangeButNoDuplicates(_currentPageUrlRepository.List, linkParser.ExternalUrls);
                }

                AddRangeButNoDuplicates(_externalUrlRepository.List, linkParser.ExternalUrls);
                AddRangeButNoDuplicates(_otherUrlRepository.List, linkParser.OtherUrls);
                AddRangeButNoDuplicates(_failedUrlRepository.List, linkParser.BadUrls);

                foreach (string exception in linkParser.Exceptions)
                {
                    _exceptions.Add(exception);
                }

                isCurrentPage = false;
                //Crawl all the links found on the page. Enumerate a snapshot,
                //because the recursive call below can add to this list while
                //it is being walked.
                foreach (string link in _externalUrlRepository.List.ToArray())
                {
                    string formattedLink = link;
                    try
                    {
                        formattedLink = FixPath(url, formattedLink);

                        if (formattedLink != String.Empty)
                        {
                            CrawlPage(formattedLink);
                        }
                    }
                    catch (Exception exc)
                    {
                        _failedUrlRepository.List.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
                    }
                }
            }
        }
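
Example #2 additionally assumes a Page class, URL repositories exposing a List, a GetWebText downloader and a FixPath helper that turns relative links into absolute ones. The following is only a sketch of plausible shapes for those pieces; in particular, FixPath is guessed to resolve relative links against the page they were found on via Uri.TryCreate:

        /// <summary>
        /// Minimal page holder, assuming the crawler only needs the raw
        /// HTML and the address it came from.
        /// </summary>
        public class Page
        {
            public string Text { get; set; }
            public string Url { get; set; }
        }

        /// <summary>
        /// Minimal URL repository, assuming it is just a named list.
        /// </summary>
        public class UrlRepository
        {
            private readonly List<string> _list = new List<string>();

            public List<string> List
            {
                get { return _list; }
            }
        }

        /// <summary>
        /// Downloads the raw HTML of a page; a WebClient stand-in for the
        /// original GetWebText helper.
        /// </summary>
        private string GetWebText(string url)
        {
            using (var client = new System.Net.WebClient())
            {
                return client.DownloadString(url);
            }
        }

        /// <summary>
        /// Resolves a possibly relative link against the page it was found
        /// on; returns an empty string for links that cannot be parsed.
        /// This is only a guess at what the original FixPath does.
        /// </summary>
        private static string FixPath(string pageUrl, string link)
        {
            Uri result;
            if (Uri.TryCreate(new Uri(pageUrl), link, out result))
            {
                return result.ToString();
            }
            return String.Empty;
        }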