Пример #1
0
        /// <summary>
        /// Runs <c>DepthOfUrlsSearch</c> passes over the site and returns the accumulated URLs.
        /// </summary>
        /// <remarks>
        /// Each pass parses the current layer and then invokes the two save helpers, in that order.
        /// The per-page and per-layer collections are emptied before returning.
        /// </remarks>
        /// <returns>The <c>AllUrls</c> collection.</returns>
        public HashSet<string> CollectAllUrls()
        {
            TrimUlrNameIfNeeded(DomainUrl);

            int layer = 0;
            while (layer < DepthOfUrlsSearch)
            {
                ParseUrlsFromCurrentLayer();
                SaveCurrentPageUrlsToLayerUrlCollection();
                SaveCurrentLayerToAllUrlsCollection();
                layer++;
            }

            // Only the aggregated result is handed back; the working collections are reset.
            AllUrlsOnCurrentLayer.Clear();
            AllUrlsOnCurrentPage.Clear();

            return AllUrls;
        }
Пример #2
0
        /// <summary>
        /// Loads <paramref name="currentPage"/> and adds the same-domain links found on it
        /// to <c>AllUrlsOnCurrentPage</c>.
        /// </summary>
        /// <param name="currentPage">Address of the page to download and scan for anchors.</param>
        /// <remarks>
        /// Root-relative links ("/path") are prefixed with <c>DomainUrl</c>; links that already
        /// start with <c>DomainUrl</c> are stored unchanged. Protocol-relative links
        /// ("//host/path") and all other hrefs are skipped.
        /// </remarks>
        private void GetCurrentPageUrls(string currentPage)
        {
            var web = new HtmlWeb();
            HtmlDocument doc = web.Load(currentPage);

            // HtmlAgilityPack returns null (not an empty collection) when no node matches
            // the XPath, so a page without any <a href> must be handled explicitly.
            var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                return;
            }

            foreach (HtmlNode node in anchors)
            {
                string hrefValue = node.GetAttributeValue("href", string.Empty);

                // Protocol-relative links point at an arbitrary host. They must be rejected
                // before the "/" check, otherwise they would be treated as root-relative.
                if (hrefValue.StartsWith("//", System.StringComparison.Ordinal))
                {
                    continue;
                }

                if (hrefValue.StartsWith("/", System.StringComparison.Ordinal))
                {
                    AllUrlsOnCurrentPage.Add(DomainUrl + hrefValue);
                }
                else if (hrefValue.StartsWith(DomainUrl, System.StringComparison.Ordinal))
                {
                    AllUrlsOnCurrentPage.Add(hrefValue);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Visits every URL of the current layer and collects the links found on those pages.
        /// </summary>
        /// <remarks>
        /// <c>AllUrlsOnCurrentPage</c> is emptied once up front, so it accumulates links from all
        /// pages of the layer. While <c>AllUrls</c> is still empty, <c>DomainUrl</c> is added to the
        /// layer first. The layer collection is emptied after its pages have been processed.
        /// </remarks>
        private void ParseUrlsFromCurrentLayer()
        {
            AllUrlsOnCurrentPage.Clear();

            bool nothingCollectedYet = AllUrls.Count == 0;
            if (nothingCollectedYet)
            {
                AllUrlsOnCurrentLayer.Add(DomainUrl);
            }

            foreach (var layerUrl in AllUrlsOnCurrentLayer)
            {
                // Root-relative entries are resolved against the domain before loading.
                var pageAddress = layerUrl.StartsWith("/") ? DomainUrl + layerUrl : layerUrl;
                GetCurrentPageUrls(pageAddress);
            }

            AllUrlsOnCurrentLayer.Clear();
        }