/// <summary>
/// Collects the distinct links found in the &lt;a&gt; tags of a page that belong to
/// the configured site (or are site-relative) and are not yet recorded in the
/// visited-link list.
/// </summary>
/// <param name="page">Page whose parsed document is scanned; may be null.</param>
/// <returns>
/// A new list of unique link strings; empty when the page or its document is null.
/// </returns>
protected List<string> GetUniqueLinks(HtmlPageInfo page)
{
    // A missing page is treated the same way as a page without a parsed document.
    if (page?.Document == null)
    {
        return new List<string>();
    }

    // Read once: string.StartsWith throws ArgumentNullException for a null prefix,
    // so the host check must be skipped when no scheme/host is configured.
    string siteRoot = Settings?.UrlSchemeAndHost;

    return page.Document.All
        // Keep only anchor tags.
        .Where(tag => tag.LocalName == "a")
        // Project each anchor to its href, passed through ToAbsoluteUrl
        // (presumably resolves relative links; helper is defined elsewhere).
        .Select(tag => ToAbsoluteUrl(tag.GetAttribute("href")))
        // Keep links of this site, plus links that are still site-relative.
        // URL prefixes are not linguistic text, so compare ordinally.
        .Where(href => !string.IsNullOrEmpty(href)
            && ((siteRoot != null && href.StartsWith(siteRoot, System.StringComparison.Ordinal))
                || href.StartsWith("/", System.StringComparison.Ordinal)))
        // Drop links that were already recorded, then remove repetitions.
        .Where(href => !_tempLinks.Contains(href))
        .Distinct()
        .ToList();
}
/// <summary>
/// Runs the parse process: loads the root page from the configured URL, records it,
/// walks its child links recursively and finally raises <c>OnCompleted</c>.
/// </summary>
/// <returns>A task that completes after <c>OnCompleted</c> has been raised.</returns>
public async Task Start()
{
    var rootAddress = Settings?.Url?.AbsoluteUri;
    HtmlPageInfo rootPage = await GetPageInfo(rootAddress);

    // When the root page could not be obtained there is nothing to walk;
    // completion is still signalled below.
    if (rootPage != null)
    {
        _tempLinks.Add(rootPage.Url);
        Pages.Add(rootPage);
        await FillChildPages(rootPage);
    }

    OnCompleted?.Invoke(this);
}
/// <summary>
/// Recursively walks the link tree below a page: every new link of the page is
/// fetched, stored in <c>Pages</c>, announced through <c>OnNewPage</c> and then
/// walked itself before the next sibling link is processed.
/// </summary>
/// <param name="parrentPage">Page whose links are followed.</param>
/// <returns>A task that completes when the whole subtree has been processed.</returns>
protected async Task FillChildPages(HtmlPageInfo parrentPage)
{
    List<string> discovered = GetUniqueLinks(parrentPage);

    // Record all links up front: GetUniqueLinks filters against _tempLinks,
    // so deeper recursion levels will not return these links again.
    _tempLinks.AddRange(discovered);

    foreach (string address in discovered)
    {
        HtmlPageInfo page = await GetPageInfo(address);

        // Links whose page could not be obtained are skipped.
        if (page != null)
        {
            Pages.Add(page);
            OnNewPage?.Invoke(this, page);
            await FillChildPages(page);
        }
    }
}