Example #1
0
        /// <summary>
        /// Collects the <c>href</c> targets of every anchor element in the scraped body and
        /// resolves each of them against <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Base address used to resolve relative <c>href</c> values.</param>
        /// <param name="scrapedUrl">Scraped page whose <c>Body</c> is parsed as HTML.</param>
        /// <returns>
        /// A completed task holding the distinct resolved links. The set is empty when an
        /// argument or the body is null, when the body contains no usable anchors, or when
        /// parsing fails before any link was collected.
        /// </returns>
        /// <remarks>
        /// This method is best-effort and does not throw: an <c>href</c> that cannot be turned
        /// into a <see cref="Uri"/> is skipped instead of discarding the links already found.
        /// </remarks>
        private static Task<IEnumerable<Uri>> GetLinksFromBody(Uri url, IScrapedUrl scrapedUrl)
        {
            // Sentinel returned by GetAttributeValue for anchors that have no href attribute.
            const string missingHref = "not found";

            var result = new HashSet<Uri>();

            try
            {
                if (url == null || scrapedUrl == null || scrapedUrl.Body == null)
                {
                    return Task.FromResult<IEnumerable<Uri>>(result);
                }

                var document = new HtmlDocument();
                document.LoadHtml(scrapedUrl.Body);

                // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
                // when the XPath expression matches nothing.
                var anchors = document.DocumentNode.SelectNodes("//a");
                if (anchors == null)
                {
                    return Task.FromResult<IEnumerable<Uri>>(result);
                }

                foreach (var anchor in anchors)
                {
                    var href = anchor.GetAttributeValue("href", missingHref);

                    if (href.Length <= 1 || href == missingHref)
                    {
                        continue;
                    }

                    // Ordinal, case-insensitive prefix checks: scheme names are not
                    // linguistic text, so culture-specific casing rules must not apply.
                    if (href.StartsWith("javascript", StringComparison.OrdinalIgnoreCase) ||
                        href.StartsWith("tel:", StringComparison.OrdinalIgnoreCase) ||
                        href.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // TryCreate reports failure instead of throwing, so a single malformed
                    // href no longer wipes out every other link on the page.
                    Uri link;
                    if (Uri.TryCreate(url, href, out link))
                    {
                        result.Add(link);
                    }
                }
            }
            catch (Exception)
            {
                // Deliberately best-effort: a parser failure must not propagate to the
                // caller. Links gathered before the failure are still returned.
            }

            return Task.FromResult<IEnumerable<Uri>>(result);
        }
Example #2
0
        /// <summary>
        /// Builds a new <c>ScrapedUrlResult</c> carrying the <c>Title</c>, <c>Description</c>
        /// and <c>Url</c> values read from <paramref name="scrapedUrl"/>.
        /// </summary>
        /// <param name="scrapedUrl">Source object whose three properties are copied.</param>
        /// <returns>The newly created result object.</returns>
        public static IScrapedUrlResult ToScrapedUrlResult(this IScrapedUrl scrapedUrl)
        {
            return new ScrapedUrlResult
            {
                Title       = scrapedUrl.Title,
                Description = scrapedUrl.Description,
                Url         = scrapedUrl.Url
            };
        }