/// <summary>
/// Recursive method to go through all the links found on a page.
/// </summary>
/// <param name="url">The url to crawl.</param>
public static void CrawlPage(Uri url)
{
    if (!PageHasBeenCrawled(url))
    {
        var htmlText = getPageContent(url);
        var linkParser = new LinkParser();

        _pages.Add(url);
        linkParser.ParseLinks(htmlText, url);

        // Sort the parsed links into the main data lists.
        AddRangeButNoDuplicates(_sameDomainPage, linkParser.ValidUrls);
        AddRangeButNoDuplicates(_externalPages, linkParser.ExternalUrls);
        AddRangeButNoDuplicates(_staticContent, linkParser.OtherUrls);

        // Crawl all the links found on the page.
        foreach (Uri link in linkParser.ValidUrls)
        {
            try
            {
                if (link.ToString() != String.Empty
                    && link.ToString() != ConfigurationManager.AppSettings["url"]
                    && link != url)
                {
                    CrawlPage(link);
                }
            }
            catch (Exception)
            {
                // TODO: add code here to record broken URLs.
            }
        }
    }
}
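This first version depends on several members that are not shown in the snippet: PageHasBeenCrawled, getPageContent, AddRangeButNoDuplicates, and the _pages / _sameDomainPage / _externalPages / _staticContent collections. Their real implementations are not part of this listing, so the following is only a minimal sketch of what they might look like, assuming the collections are plain List<Uri> fields and the HTML is fetched with WebClient (requires using System, System.Collections.Generic, and System.Net).

// Minimal sketch of the helpers assumed by CrawlPage(Uri); names and types are
// inferred from the calls above, and the real project may implement them differently.
private static readonly List<Uri> _pages = new List<Uri>();
private static readonly List<Uri> _sameDomainPage = new List<Uri>();
private static readonly List<Uri> _externalPages = new List<Uri>();
private static readonly List<Uri> _staticContent = new List<Uri>();

// A page counts as crawled once its url has been added to _pages.
private static bool PageHasBeenCrawled(Uri url)
{
    return _pages.Contains(url);
}

// Downloads the raw HTML of a page; returns an empty string on failure so the crawl can continue.
private static string getPageContent(Uri url)
{
    try
    {
        using (var client = new WebClient())
        {
            return client.DownloadString(url);
        }
    }
    catch (WebException)
    {
        return String.Empty;
    }
}

// Adds only those urls from the source that are not already in the target list.
private static void AddRangeButNoDuplicates(List<Uri> target, IEnumerable<Uri> source)
{
    foreach (Uri item in source)
    {
        if (!target.Contains(item))
        {
            target.Add(item);
        }
    }
}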
/// <summary>
/// Crawls a page.
/// </summary>
/// <param name="url">The url to crawl.</param>
private void CrawlPage(string url)
{
    if (!PageHasBeenCrawled(url))
    {
        var htmlText = GetWebText(url);
        var linkParser = new LinkParser();
        var page = new Page();
        page.Text = htmlText;
        page.Url = url;

        _pages.Add(page);
        linkParser.ParseLinks(page, url);

        // Add data to the main data lists.
        if (isCurrentPage)
        {
            AddRangeButNoDuplicates(_currentPageUrlRepository.List, linkParser.ExternalUrls);
        }

        AddRangeButNoDuplicates(_externalUrlRepository.List, linkParser.ExternalUrls);
        AddRangeButNoDuplicates(_otherUrlRepository.List, linkParser.OtherUrls);
        AddRangeButNoDuplicates(_failedUrlRepository.List, linkParser.BadUrls);

        foreach (string exception in linkParser.Exceptions)
        {
            _exceptions.Add(exception);
        }

        isCurrentPage = false;

        // Crawl all the links found on the page. Iterate over a snapshot of the
        // repository list, because the recursive calls below add to that same list
        // and modifying a collection while enumerating it throws an exception.
        foreach (string link in new List<string>(_externalUrlRepository.List))
        {
            string formattedLink = link;
            try
            {
                formattedLink = FixPath(url, formattedLink);

                if (formattedLink != String.Empty)
                {
                    CrawlPage(formattedLink);
                }
            }
            catch (Exception exc)
            {
                _failedUrlRepository.List.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
            }
        }
    }
}
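The second version relies on GetWebText and FixPath, neither of which is shown here. FixPath determines whether relative links get turned into crawlable absolute urls, so a sketch of one possible implementation follows. It assumes FixPath(url, link) should return an absolute url string, or String.Empty when the link cannot be resolved (the caller skips empty results). This is an illustration only, not the project's actual code.

// Hypothetical sketch of FixPath: resolve a possibly relative link against the
// page it was found on. The real implementation may differ.
private static string FixPath(string currentPageUrl, string link)
{
    if (String.IsNullOrWhiteSpace(link))
    {
        return String.Empty;
    }

    // Absolute links can be used as-is.
    Uri absolute;
    if (Uri.TryCreate(link, UriKind.Absolute, out absolute))
    {
        return absolute.ToString();
    }

    // Relative links are resolved against the url of the current page.
    Uri baseUri;
    Uri resolved;
    if (Uri.TryCreate(currentPageUrl, UriKind.Absolute, out baseUri)
        && Uri.TryCreate(baseUri, link, out resolved))
    {
        return resolved.ToString();
    }

    // Anything that cannot be resolved is reported as empty and skipped by the caller.
    return String.Empty;
}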