/// <summary>
/// Crawls the site rooted at <c>DomainUrl</c> for <c>DepthOfUrlsSearch</c> passes
/// and returns the accumulated <c>AllUrls</c> collection.
/// </summary>
/// <returns>The <c>AllUrls</c> set after the last pass has completed.</returns>
public HashSet<string> CollectAllUrls()
{
    // NOTE(review): presumably normalizes the domain address in place; the helper is not visible here - confirm.
    TrimUlrNameIfNeeded(DomainUrl);

    int pass = 0;
    while (pass < DepthOfUrlsSearch)
    {
        // Each pass: scan the pages of the current layer, then run the two save steps
        // (by their names they promote page links to the layer and the layer to AllUrls).
        ParseUrlsFromCurrentLayer();
        SaveCurrentPageUrlsToLayerUrlCollection();
        SaveCurrentLayerToAllUrlsCollection();
        pass++;
    }

    // The working collections are emptied before the result is handed back.
    AllUrlsOnCurrentPage.Clear();
    AllUrlsOnCurrentLayer.Clear();

    return AllUrls;
}
/// <summary>
/// Loads <paramref name="currentPage"/> and adds the same-site links found in its
/// anchor elements (those carrying an <c>href</c> attribute) to <c>AllUrlsOnCurrentPage</c>.
/// </summary>
/// <remarks>
/// A link is kept when it is root-relative (starts with a single <c>/</c>), in which case
/// <c>DomainUrl</c> is prepended, or when it already starts with <c>DomainUrl</c>.
/// Protocol-relative links (<c>//host/...</c>) and every other link are ignored.
/// </remarks>
/// <param name="currentPage">Address of the page to download and scan.</param>
private void GetCurrentPageUrls(string currentPage)
{
    var web = new HtmlWeb();
    HtmlDocument doc = web.Load(currentPage);

    // SelectNodes returns null, not an empty collection, when the page has no matching anchors.
    var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return;
    }

    foreach (HtmlNode node in anchors)
    {
        string hrefValue = node.GetAttributeValue("href", string.Empty);

        // Protocol-relative links name an explicit host; prepending DomainUrl to them
        // would produce a malformed address, so they are skipped before the "/" check.
        if (hrefValue.StartsWith("//"))
        {
            continue;
        }

        if (hrefValue.StartsWith("/"))
        {
            AllUrlsOnCurrentPage.Add(DomainUrl + hrefValue);
        }
        else if (hrefValue.StartsWith(DomainUrl))
        {
            AllUrlsOnCurrentPage.Add(hrefValue);
        }
    }
}
/// <summary>
/// Scans every address held in <c>AllUrlsOnCurrentLayer</c> for links, then empties that layer.
/// </summary>
/// <remarks>
/// <c>AllUrlsOnCurrentPage</c> is cleared first. When <c>AllUrls</c> is still empty,
/// <c>DomainUrl</c> is added to the layer so that the scan has a starting page.
/// </remarks>
private void ParseUrlsFromCurrentLayer()
{
    AllUrlsOnCurrentPage.Clear();

    // Nothing collected yet: seed the layer with the domain root.
    if (AllUrls.Count == 0)
    {
        AllUrlsOnCurrentLayer.Add(DomainUrl);
    }

    foreach (var layerUrl in AllUrlsOnCurrentLayer)
    {
        // Root-relative entries are expanded with the domain before being loaded.
        string pageAddress = layerUrl.StartsWith("/") ? DomainUrl + layerUrl : layerUrl;
        GetCurrentPageUrls(pageAddress);
    }

    AllUrlsOnCurrentLayer.Clear();
}