public List<string> ParseLinks(Page page)
{
    all_links.Clear();
    relativeLinks.Clear();
    List<string> result_links = new List<string>();

    // Collect every href found on the page.
    html.LoadHtml(page.Text);
    var root_node = html.DocumentNode.SelectNodes("//a[@href]");
    if (root_node != null)
    {
        foreach (var link in root_node)
        {
            all_links.Add(link.Attributes["href"].Value);
        }
    }

    // Keep only internal, crawlable links: page-like URLs that are not mailto:
    // and that either start with this site's address or are relative (no "http").
    relativeLinks = (from link in all_links
                     where IsAWebPage(link)
                           && !link.StartsWith("mailto:")
                           && (link.StartsWith(SiteName(page.Url)) || !link.Contains("http"))
                     select link).ToList();

    // Resolve each surviving link against the current page's URL.
    foreach (string link in relativeLinks)
    {
        result_links.Add(FixPath(link, page.Url));
    }

    return result_links;
}
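ParseLinks relies on members that are not shown here: the html field (an HtmlAgilityPack HtmlDocument), the all_links and relativeLinks lists, and the IsAWebPage, SiteName and FixPath helpers. The names come from the call sites above; the bodies below are only a sketch of plausible implementations, not the original code.

using System;
using System.Collections.Generic;
using HtmlAgilityPack;

// Sketch of the LinkParser members that ParseLinks above depends on.
// (ParseLinks itself would sit alongside these members.)
public class LinkParser
{
    private readonly HtmlDocument html = new HtmlDocument();
    private readonly List<string> all_links = new List<string>();
    private List<string> relativeLinks = new List<string>();

    // Assumption: a link counts as "a web page" if it does not end in a common asset extension.
    private bool IsAWebPage(string link)
    {
        string[] skip = { ".jpg", ".jpeg", ".png", ".gif", ".css", ".js", ".pdf", ".zip" };
        foreach (string ext in skip)
        {
            if (link.EndsWith(ext, StringComparison.OrdinalIgnoreCase))
            {
                return false;
            }
        }
        return true;
    }

    // Assumption: scheme + host of the page, e.g. "https://example.com".
    private string SiteName(string url)
    {
        return new Uri(url).GetLeftPart(UriPartial.Authority);
    }

    // Assumption: resolve a (possibly relative) href against the page it was found on.
    private string FixPath(string link, string baseUrl)
    {
        return new Uri(new Uri(baseUrl), link).ToString();
    }
}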
public async Task CrawlPage(string site)
{
    Page fpage = new Page();
    LinkParser linkParser = new LinkParser();

    // Fetch and parse the start page.
    fpage.Url = site;
    fpage.Text = await GetPageText(fpage.Url);
    if (fpage.Text != string.Empty)
    {
        fpage.InternalLinks = linkParser.ParseLinks(fpage);
    }

    int lvl = DepthLevel;
    crawledUrls.Add(fpage.Url);
    pagesToCrawl.Enqueue(fpage);

    // Breadth-first crawl. Note: lvl drops once per dequeued page, so DepthLevel
    // caps how many pages get expanded rather than the true link depth.
    while (pagesToCrawl.Count != 0 && lvl >= 0)
    {
        Page p = pagesToCrawl.Dequeue();

        // Pages whose text could not be fetched never had InternalLinks assigned.
        if (p.InternalLinks != null)
        {
            foreach (string link in p.InternalLinks)
            {
                string url = link;
                if (!crawledUrls.Contains(url))
                {
                    Page spage = new Page();
                    spage.Url = url;
                    spage.Text = await GetPageText(url);
                    Console.WriteLine(url);
                    if (spage.Text != string.Empty)
                    {
                        spage.InternalLinks = linkParser.ParseLinks(spage);
                    }
                    pagesToCrawl.Enqueue(spage);
                    crawledUrls.Add(spage.Url);
                }
            }
        }

        lvl--;
    }
}
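CrawlPage awaits a GetPageText helper and uses a Page type plus crawledUrls, pagesToCrawl and DepthLevel members that are not shown. Below is a minimal sketch of how these might look, with GetPageText built on HttpClient and returning an empty string on any failure so the Text != string.Empty checks above skip unfetchable pages. The class name Crawler, the field types, the timeout and the error handling are assumptions, not the original implementation.

using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Threading.Tasks;

// Assumed shape of the page object, based on how CrawlPage and ParseLinks use it.
public class Page
{
    public string Url { get; set; }
    public string Text { get; set; }
    public List<string> InternalLinks { get; set; }
}

// Assumed name and members for the class that contains CrawlPage.
public class Crawler
{
    public int DepthLevel { get; set; } = 2;
    private readonly HashSet<string> crawledUrls = new HashSet<string>();
    private readonly Queue<Page> pagesToCrawl = new Queue<Page>();

    // Single shared client; the 10-second timeout is illustrative.
    private static readonly HttpClient httpClient = new HttpClient
    {
        Timeout = TimeSpan.FromSeconds(10)
    };

    // Hypothetical helper: download the page body, or return "" so the caller
    // treats the page as having no parseable content.
    private async Task<string> GetPageText(string url)
    {
        try
        {
            using (HttpResponseMessage response = await httpClient.GetAsync(url))
            {
                if (!response.IsSuccessStatusCode)
                {
                    return string.Empty;
                }
                return await response.Content.ReadAsStringAsync();
            }
        }
        catch (HttpRequestException)
        {
            return string.Empty;
        }
        catch (TaskCanceledException) // request timed out
        {
            return string.Empty;
        }
    }
}

With these pieces in place (and assuming the DepthLevel property above), a call such as await new Crawler { DepthLevel = 2 }.CrawlPage("https://example.com") would start the crawl from that address.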