/// <summary>
/// Collects the <c>href</c> targets of every anchor element found in the scraped
/// body and resolves each of them against <paramref name="url"/>.
/// </summary>
/// <param name="url">Base address used to resolve relative hrefs.</param>
/// <param name="scrapedUrl">Scraped page whose <c>Body</c> holds the HTML to parse.</param>
/// <returns>
/// A completed task holding the distinct resolved links. The set is empty when
/// there is no body, the body contains no anchors, or parsing fails; hrefs that
/// cannot be turned into a <see cref="Uri"/> are skipped individually.
/// </returns>
/// <remarks>
/// Best-effort by design: this method never throws. All work is synchronous and
/// CPU-only, so the result is returned through an already-completed task.
/// </remarks>
private static Task<IEnumerable<Uri>> GetLinksFromBody(Uri url, IScrapedUrl scrapedUrl)
{
    var result = new HashSet<Uri>();

    try
    {
        var body = scrapedUrl == null ? null : scrapedUrl.Body;
        if (!string.IsNullOrEmpty(body))
        {
            var document = new HtmlDocument();
            document.LoadHtml(body);

            // SelectNodes yields null (not an empty collection) when nothing matches.
            var anchors = document.DocumentNode.SelectNodes("//a");
            if (anchors != null)
            {
                foreach (var anchor in anchors)
                {
                    // Empty default: anchors without an href fall out through the length check below.
                    var href = anchor.GetAttributeValue("href", string.Empty);

                    // Single-character hrefs ("/", "#") are ignored, as are non-navigable schemes.
                    if (href == null
                        || href.Length <= 1
                        || href.StartsWith("javascript", StringComparison.OrdinalIgnoreCase)
                        || href.StartsWith("tel:", StringComparison.OrdinalIgnoreCase)
                        || href.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // TryCreate instead of the constructor: one malformed href must not
                    // cost us every other link on the page.
                    Uri absolute;
                    if (Uri.TryCreate(url, href, out absolute))
                    {
                        result.Add(absolute);
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: keep whatever was collected so far, but leave a
        // trace for debugging instead of hiding the failure completely.
        System.Diagnostics.Debug.WriteLine("GetLinksFromBody failed: " + ex);
    }

    return Task.FromResult<IEnumerable<Uri>>(result);
}
/// <summary>
/// Maps a scraped URL onto a new <c>ScrapedUrlResult</c>, copying its
/// <c>Title</c>, <c>Description</c> and <c>Url</c> values.
/// </summary>
/// <param name="scrapedUrl">The scraped URL to copy the values from.</param>
/// <returns>A freshly created result carrying the three copied values.</returns>
public static IScrapedUrlResult ToScrapedUrlResult(this IScrapedUrl scrapedUrl)
{
    return new ScrapedUrlResult
    {
        Title = scrapedUrl.Title,
        Description = scrapedUrl.Description,
        Url = scrapedUrl.Url
    };
}