/// <summary>
        /// Collects the <c>href</c> value of every anchor in the page's HTML, keeps the links
        /// that look like they belong to the page's own site, and returns them after passing
        /// each one through <c>FixPath</c>.
        /// </summary>
        /// <remarks>
        /// Side effects: reloads the shared <c>html</c> document with <c>page.Text</c> and
        /// repopulates the <c>all_links</c> (every href found) and <c>relativeLinks</c>
        /// (hrefs that passed the filter) fields.
        /// A link is kept when <c>IsAWebPage</c> accepts it, it is not a <c>mailto:</c> link,
        /// and it either starts with <c>SiteName(page.Url)</c> or is not an absolute
        /// http/https URL.
        /// NOTE(review): <c>IsAWebPage</c>, <c>SiteName</c> and <c>FixPath</c> are defined
        /// elsewhere; presumably <c>FixPath</c> resolves a relative link against the page URL —
        /// confirm.
        /// </remarks>
        /// <param name="page">Page whose <c>Text</c> is parsed as HTML and whose <c>Url</c>
        /// drives the same-site filter and the path fixing.</param>
        /// <returns>A new list with one <c>FixPath</c> result per kept link, in the order the
        /// links were found; empty when nothing qualifies.</returns>
        public List<string> ParseLinks(Page page)
        {
            all_links.Clear();
            relativeLinks.Clear();

            html.LoadHtml(page.Text);
            var anchors = html.DocumentNode.SelectNodes("//a[@href]");
            // No matching anchors is reported as null rather than an empty collection.
            if (anchors != null)
            {
                foreach (var anchor in anchors)
                {
                    all_links.Add(anchor.Attributes["href"].Value);
                }
            }

            // Loop-invariant: the site prefix depends only on the page being parsed.
            string sitePrefix = SiteName(page.Url);

            List<string> sameSiteLinks = new List<string>();
            foreach (string link in all_links)
            {
                if (!IsAWebPage(link))
                {
                    continue;
                }

                // URI schemes are case-insensitive, so "MAILTO:" must be rejected too.
                if (link.StartsWith("mailto:", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Only an http(s) scheme prefix makes a link absolute. A substring test
                // would discard relative links such as "/docs/http-caching" and would let
                // "HTTP://other.site" through as if it were relative.
                bool isAbsoluteHttp =
                    link.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
                    || link.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase);

                if (link.StartsWith(sitePrefix, System.StringComparison.Ordinal) || !isAbsoluteHttp)
                {
                    sameSiteLinks.Add(link);
                }
            }
            relativeLinks = sameSiteLinks;

            List<string> result_links = new List<string>(relativeLinks.Count);
            foreach (string link in relativeLinks)
            {
                result_links.Add(FixPath(link, page.Url));
            }
            return result_links;
        }
示例#2
0
        /// <summary>
        /// Crawls breadth-first starting at <paramref name="site"/>: downloads the start page,
        /// then repeatedly downloads every not-yet-seen internal link of the pages found on
        /// the previous level.
        /// </summary>
        /// <remarks>
        /// Levels are expanded while the counter started at <c>DepthLevel</c> is non-negative,
        /// i.e. at most <c>DepthLevel + 1</c> levels. The counter drops once per level, not
        /// once per page, so every page on a level has its links followed.
        /// Each downloaded URL is written to the console and recorded in <c>crawledUrls</c>;
        /// each downloaded page is queued in <c>pagesToCrawl</c>. Pages fetched on the last
        /// level stay in that queue when the method returns.
        /// </remarks>
        /// <param name="site">URL of the page the crawl starts from.</param>
        /// <returns>A task that completes when the crawl has finished.</returns>
        public async Task CrawlPage(string site)
        {
            LinkParser linkParser = new LinkParser();

            Page rootPage = new Page();
            rootPage.Url = site;
            rootPage.Text = await GetPageText(rootPage.Url);

            // Null text is treated like empty text so it never reaches the HTML parser.
            if (!string.IsNullOrEmpty(rootPage.Text))
            {
                rootPage.InternalLinks = linkParser.ParseLinks(rootPage);
            }

            crawledUrls.Add(rootPage.Url);
            pagesToCrawl.Enqueue(rootPage);

            for (int lvl = DepthLevel; lvl >= 0 && pagesToCrawl.Count != 0; lvl--)
            {
                // Snapshot the queue size: pages enqueued below belong to the next level.
                int pagesInLevel = pagesToCrawl.Count;
                for (int i = 0; i < pagesInLevel; i++)
                {
                    Page current = pagesToCrawl.Dequeue();

                    // A page whose text was empty never had its links parsed.
                    if (current.InternalLinks == null)
                    {
                        continue;
                    }

                    foreach (string url in current.InternalLinks)
                    {
                        if (crawledUrls.Contains(url))
                        {
                            continue;
                        }

                        Page child = new Page();
                        child.Url = url;
                        child.Text = await GetPageText(url);
                        Console.WriteLine(url);
                        if (!string.IsNullOrEmpty(child.Text))
                        {
                            child.InternalLinks = linkParser.ParseLinks(child);
                        }
                        pagesToCrawl.Enqueue(child);
                        crawledUrls.Add(child.Url);
                    }
                }
            }
        }