Exemplo n.º 1
0
        /// <summary>
        /// Collects the <c>href</c> value of every anchor element in the page's HTML and
        /// passes the collected values through <c>GetRightLinks</c>.
        /// </summary>
        /// <param name="html">Page whose <c>source</c> field holds the HTML text to parse.</param>
        /// <returns>
        /// The list produced by <c>GetRightLinks</c>, or an empty list when the page has no source.
        /// </returns>
        public static List <string> GetAllLinks(HtmlSource html)
        {
            // GetHtmlSource leaves source null when the request did not return 200 OK.
            // There is nothing to parse in that case, so report "no links" rather than
            // handing null to the HTML parser.
            if (string.IsNullOrEmpty(html.source))
            {
                return(new List <string>());
            }

            var doc = new HtmlDocument();
            doc.LoadHtml(html.source);

            // Anchors without an href attribute yield the placeholder string "null",
            // which is forwarded to GetRightLinks unchanged.
            var result = doc.DocumentNode.Descendants("a").Select(node => node.GetAttributeValue("href", "null")).ToList();
            var links  = GetRightLinks(result, html);

            return(links);
        }
Exemplo n.º 2
0
        // One client shared by all downloads: this method is called once per crawled link,
        // and creating and disposing an HttpClient per request can exhaust sockets under load.
        // NOTE(review): a shared client also shares its handler state (e.g. cookies) between
        // requests - confirm that is acceptable for this crawler.
        private static readonly HttpClient sharedClient = new HttpClient();

        /// <summary>
        /// Downloads the page at <c>html.url</c> and fills in the <c>source</c>, <c>body</c>
        /// and <c>domain</c> fields of <paramref name="html"/>.
        /// </summary>
        /// <param name="html">Page descriptor; its <c>url</c> is read and the other three fields are overwritten.</param>
        /// <returns>A task that completes once the fields have been assigned.</returns>
        /// <remarks>
        /// When the response status is anything other than 200 OK, <c>source</c> is set to
        /// null and <c>GetBody</c> is invoked with null.
        /// </remarks>
        public static async Task GetHtmlSource(HtmlSource html)
        {
            string result = null;

            // Dispose the response as soon as its content has been read.
            using (var response = await sharedClient.GetAsync(html.url))
            {
                if (response.StatusCode == HttpStatusCode.OK)
                {
                    result = await response.Content.ReadAsStringAsync();
                }
            }

            html.source = result;
            html.body   = GetBody(result);
            html.domain = GetDomain(html.url);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Visits each link in <paramref name="currentLinks"/> that is not yet in
        /// <c>addedLinks</c>: records it there, downloads the page, appends the page to
        /// <paramref name="resultLinks"/>, and recurses into the page's own links while
        /// <c>depth</c> is greater than <paramref name="currentDepth"/>.
        /// </summary>
        /// <param name="currentDepth">Depth of this call; each recursive call passes this value plus one.</param>
        /// <param name="currentLinks">Links to visit at this level.</param>
        /// <param name="resultLinks">Accumulator that receives every downloaded page.</param>
        /// <returns>The same <paramref name="resultLinks"/> instance that was passed in.</returns>
        public async Task <List <HtmlSource> > Walk(int currentDepth, List <string> currentLinks, List <HtmlSource> resultLinks)
        {
            foreach (var url in currentLinks)
            {
                // Skip links that were already visited.
                if (addedLinks.Contains(url))
                {
                    continue;
                }
                addedLinks.Add(url);

                var page = new HtmlSource();
                page.url = url;
                await HtmlSourceMethods.GetHtmlSource(page);
                resultLinks.Add(page);

                // Descend only while the configured depth has not been reached.
                if (depth > currentDepth)
                {
                    var childLinks = HtmlSourceMethods.GetAllLinks(page);
                    await Walk(currentDepth + 1, childLinks, resultLinks);
                }
            }

            return resultLinks;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Filters raw <c>href</c> values down to the ones this crawler follows and turns
        /// them into absolute URLs.
        /// </summary>
        /// <param name="links">Raw <c>href</c> values, in document order.</param>
        /// <param name="html">Page the links came from; its <c>domain</c> is prepended to links that start with '/'.</param>
        /// <returns>
        /// Links starting with "https://" unchanged, and links starting with '/' prefixed
        /// with <c>html.domain</c>, in their original order. All other entries are omitted.
        /// </returns>
        private static List <string> GetRightLinks(List <string> links, HtmlSource html)
        {
            var result = new List <string>();

            foreach (var link in links)
            {
                // Ordinal comparison: URL prefixes are not linguistic text.
                if (link.StartsWith("https://", StringComparison.Ordinal))
                {
                    result.Add(link);
                }
                else if (link.StartsWith('/'))
                {
                    // NOTE(review): protocol-relative links ("//host/...") also start with '/'
                    // and get the domain prepended here - confirm that is intended.
                    result.Add(html.domain + link);
                }

                // Any other value (fragment, mailto:, the "null" placeholder, ...) is skipped.
                // The loop must keep going: stopping here would discard every valid link
                // that appears after the first unsupported one.
            }
            return(result);
        }