Esempio n. 1
0
        /// <summary>
        /// Collects the distinct same-site links found in the anchor tags of a page,
        /// leaving out links that are already recorded in <c>_tempLinks</c>.
        /// </summary>
        /// <param name="page">Page whose parsed document is scanned. May be <c>null</c> or have no document.</param>
        /// <returns>
        /// A new list of distinct link values (as produced by <c>ToAbsoluteUrl</c>);
        /// empty when <paramref name="page"/> or its document is <c>null</c>.
        /// </returns>
        protected List <string> GetUniqueLinks(HtmlPageInfo page)
        {
            if (page?.Document == null)
            {
                return new List <string>();
            }

            // Read the site prefix once. It is null when Settings (or the prefix itself) is not set;
            // passing null to StartsWith would throw, so that case is handled explicitly below.
            string schemeAndHost = Settings?.UrlSchemeAndHost;

            return page.Document.All
                   // Keep only anchor tags (<a>)
                   .Where(tag => tag.LocalName == "a")
                   // Project each anchor to the converted value of its href attribute
                   .Select(tag => ToAbsoluteUrl(tag.GetAttribute("href")))
                   // Keep only links of this site: either prefixed with the configured scheme and host,
                   // or root-relative. URL prefixes are compared ordinally, not by culture rules.
                   .Where(href => !string.IsNullOrEmpty(href)
                                  && ((schemeAndHost != null && href.StartsWith(schemeAndHost, System.StringComparison.Ordinal))
                                      || href.StartsWith("/", System.StringComparison.Ordinal)))
                   // Drop links that were already collected, then remove repetitions within this page
                   .Where(href => !_tempLinks.Contains(href))
                   .Distinct()
                   .ToList();
        }
Esempio n. 2
0
        /// <summary>
        /// Runs the parse process: loads the page at the configured URL, registers it,
        /// walks its child links, and finally raises <c>OnCompleted</c>.
        /// </summary>
        /// <returns>A task that finishes after the walk is done and <c>OnCompleted</c> has been raised.</returns>
        public async Task Start()
        {
            HtmlPageInfo startPage = await GetPageInfo(Settings?.Url?.AbsoluteUri);

            // Without a start page there is nothing to walk; completion is still signalled below.
            if (startPage != null)
            {
                _tempLinks.Add(startPage.Url);
                Pages.Add(startPage);

                await FillChildPages(startPage);
            }

            OnCompleted?.Invoke(this);
        }
Esempio n. 3
0
        /// <summary>
        /// Recursively walks the link tree below a page: every not-yet-seen link of the page
        /// is loaded, added to <c>Pages</c>, announced through <c>OnNewPage</c>, and then walked in turn.
        /// </summary>
        /// <param name="parrentPage">Page whose links are followed.</param>
        /// <returns>A task that completes once all links below the page have been processed.</returns>
        protected async Task FillChildPages(HtmlPageInfo parrentPage)
        {
            List<string> newLinks = GetUniqueLinks(parrentPage);

            // Record the whole batch before loading anything, so deeper calls do not pick these links again.
            _tempLinks.AddRange(newLinks);

            foreach (string url in newLinks)
            {
                HtmlPageInfo loaded = await GetPageInfo(url);

                // Links whose page could not be obtained are skipped.
                if (loaded != null)
                {
                    Pages.Add(loaded);
                    OnNewPage?.Invoke(this, loaded);
                    await FillChildPages(loaded);
                }
            }
        }