/// <summary>
/// Collects the <c>href</c> value of every <c>&lt;a&gt;</c> element found in
/// <c>html.source</c> and hands them to <c>GetRightLinks</c> for filtering and
/// normalization against <c>html.domain</c>.
/// </summary>
/// <param name="html">Page whose <c>source</c> has already been downloaded.</param>
/// <returns>
/// The links accepted by <c>GetRightLinks</c>; an empty list when the page has no
/// source (for example because the download did not succeed).
/// </returns>
public static List<string> GetAllLinks(HtmlSource html)
{
    // GetHtmlSource leaves source null when the request did not return 200 OK,
    // and HtmlDocument.LoadHtml rejects a null argument.
    if (string.IsNullOrEmpty(html.source))
    {
        return new List<string>();
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(html.source);

    // Anchors without an href carry no link; skip them rather than reporting
    // a placeholder string as if it were a URL.
    var hrefs = doc.DocumentNode
        .Descendants("a")
        .Select(node => node.GetAttributeValue("href", string.Empty))
        .Where(href => !string.IsNullOrEmpty(href))
        .ToList();

    return GetRightLinks(hrefs, html);
}
/// <summary>
/// Downloads the page at <c>html.url</c> and populates <c>html.source</c>,
/// <c>html.body</c> and <c>html.domain</c>.
/// </summary>
/// <param name="html">
/// Page descriptor; <c>url</c> must be set by the caller. When the server does not
/// answer with 200 OK, <c>source</c> is set to <c>null</c> and <c>GetBody</c> is
/// called with <c>null</c>.
/// </param>
/// <returns>A task that completes once the fields have been populated.</returns>
public static async Task GetHtmlSource(HtmlSource html)
{
    // NOTE(review): a new HttpClient per call can exhaust sockets under load;
    // consider a shared instance if this is called frequently.
    using (var client = new HttpClient())
    using (var response = await client.GetAsync(html.url))
    {
        // The response owns the network stream, so it is disposed together with the client.
        string result = null;
        if (response.StatusCode == HttpStatusCode.OK)
        {
            result = await response.Content.ReadAsStringAsync();
        }

        html.source = result;
        html.body = GetBody(result);
        html.domain = GetDomain(html.url);
    }
}
/// <summary>
/// Recursively downloads every link in <paramref name="currentLinks"/> that has not
/// been recorded in <c>addedLinks</c> yet, appending each downloaded page to
/// <paramref name="resultLinks"/>.
/// </summary>
/// <param name="currentDepth">Depth of the current recursion level.</param>
/// <param name="currentLinks">Links to visit at this level.</param>
/// <param name="resultLinks">Accumulator that receives every downloaded page.</param>
/// <returns>The same <paramref name="resultLinks"/> instance that was passed in.</returns>
public async Task<List<HtmlSource>> Walk(int currentDepth, List<string> currentLinks, List<HtmlSource> resultLinks)
{
    foreach (var target in currentLinks)
    {
        // Each link is fetched at most once.
        if (addedLinks.Contains(target))
        {
            continue;
        }

        addedLinks.Add(target);

        var page = new HtmlSource() { url = target };
        await HtmlSourceMethods.GetHtmlSource(page);
        resultLinks.Add(page);

        // Descend into the page's own links only while the configured depth allows it.
        if (depth > currentDepth)
        {
            var childLinks = HtmlSourceMethods.GetAllLinks(page);
            await Walk(currentDepth + 1, childLinks, resultLinks);
        }
    }

    return resultLinks;
}
/// <summary>
/// Filters raw <c>href</c> values down to links that can be followed: absolute
/// <c>https://</c> URLs are kept as-is, and root-relative paths (starting with
/// <c>/</c>) are prefixed with <c>html.domain</c>.
/// </summary>
/// <param name="links">Raw <c>href</c> values, in document order.</param>
/// <param name="html">Page the links came from; its <c>domain</c> is used as the prefix.</param>
/// <returns>The accepted links, in their original order.</returns>
private static List<string> GetRightLinks(List<string> links, HtmlSource html)
{
    var result = new List<string>();

    foreach (var link in links)
    {
        if (string.IsNullOrEmpty(link))
        {
            continue;
        }

        if (link.StartsWith("https://", StringComparison.Ordinal))
        {
            result.Add(link);
        }
        else if (link.StartsWith('/'))
        {
            result.Add(html.domain + link);
        }

        // Any other form (fragments, mailto:, relative paths, ...) is skipped.
        // The loop must keep going: one unsupported link must not discard the
        // valid links that follow it.
    }

    return result;
}