/// <summary>
        /// Downloads the target of every anchor (<c>a</c>) element into the local
        /// download directory and reports where each target was stored.
        /// </summary>
        /// <param name="crawlResult">Crawl result whose <c>RequestUrl</c> is used as the base for resolving relative links.</param>
        /// <param name="elements">Elements to scan; only those whose tag name is <c>a</c> (case-insensitive) are processed.</param>
        /// <returns>
        /// A <see cref="List{T}"/> of anonymous objects, one per processed anchor, each carrying
        /// <c>FileName</c> (local path) and <c>Title</c> (the anchor's text content).
        /// </returns>
        /// <remarks>
        /// Anchors that cannot be downloaded over HTTP(S) or that do not map to a file name
        /// (missing/invalid href, non-HTTP scheme, URL ending in a slash) are skipped.
        /// Files that already exist locally are not downloaded again.
        /// </remarks>
        public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection<IElement> elements)
        {
            const string downloadDirectory = @"c:\temp\WebScraper";

            Directory.CreateDirectory(downloadDirectory);

            var result = new List<object>();

            using (var client = new HttpClient())
            {
                foreach (var element in elements.Where(e => e.TagName.Equals("a", StringComparison.OrdinalIgnoreCase)))
                {
                    // Anchors without a usable href (e.g. named anchors) have nothing to download.
                    var rawHref = element.GetAttribute("href");
                    Uri href;
                    if (string.IsNullOrEmpty(rawHref) || !Uri.TryCreate(crawlResult.RequestUrl, rawHref, out href))
                    {
                        continue;
                    }

                    // HttpClient can only fetch http/https; mailto:, javascript: etc. would throw.
                    if (href.Scheme != Uri.UriSchemeHttp && href.Scheme != Uri.UriSchemeHttps)
                    {
                        continue;
                    }

                    // A URL such as "http://host/dir/" has no file name; combining it would yield the directory itself.
                    var name = Path.GetFileName(href.LocalPath);
                    if (string.IsNullOrEmpty(name))
                    {
                        continue;
                    }

                    var fileName = Path.Combine(downloadDirectory, name);

                    if (File.Exists(fileName) == false)
                    {
                        Console.WriteLine($"Downloading {href} to {fileName}");

                        // Issue the request before creating the file so that a failed request
                        // does not leave behind an empty file that later runs would skip.
                        using (var s = client.GetStreamAsync(href).Result)
                        using (var f = File.Create(fileName))
                        {
                            s.CopyTo(f);
                        }
                    }
                    else
                    {
                        Console.WriteLine($"Skipping download of {href} to {fileName}");
                    }

                    result.Add(new { FileName = fileName, Title = element.TextContent });
                }

                return result;
            }
        }
        /// <summary>
        /// Downloads the resource referenced by the <c>src</c> attribute of each element into the
        /// local download directory and records the local path on the element in a
        /// <c>data-local-src</c> attribute.
        /// </summary>
        /// <param name="crawlResult">Crawl result whose <c>RequestUrl</c> is used as the base for resolving relative sources.</param>
        /// <param name="elements">Elements to scan; only those that have a <c>src</c> attribute are processed.</param>
        /// <returns>
        /// The outer HTML of all <paramref name="elements"/>, joined with <c>"\n"</c>
        /// (an empty string when the collection is empty).
        /// </returns>
        /// <remarks>
        /// Elements whose source cannot be downloaded over HTTP(S) or does not map to a file name
        /// are left without a <c>data-local-src</c> attribute. Files that already exist locally
        /// are not downloaded again.
        /// </remarks>
        public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection<IElement> elements)
        {
            const string downloadDirectory = @"c:\temp\WebScraper";

            Directory.CreateDirectory(downloadDirectory);

            using (var client = new HttpClient())
            {
                foreach (var img in elements.Where(e => e.HasAttribute(AttributeNames.Src)))
                {
                    // Skip empty or malformed sources instead of aborting the whole extraction.
                    var rawSrc = img.GetAttribute(AttributeNames.Src);
                    Uri src;
                    if (string.IsNullOrEmpty(rawSrc) || !Uri.TryCreate(crawlResult.RequestUrl, rawSrc, out src))
                    {
                        continue;
                    }

                    // HttpClient can only fetch http/https; data: URIs etc. would throw.
                    if (src.Scheme != Uri.UriSchemeHttp && src.Scheme != Uri.UriSchemeHttps)
                    {
                        continue;
                    }

                    // A URL ending in a slash has no file name; combining it would yield the directory itself.
                    var name = Path.GetFileName(src.LocalPath);
                    if (string.IsNullOrEmpty(name))
                    {
                        continue;
                    }

                    var fileName = Path.Combine(downloadDirectory, name);

                    if (File.Exists(fileName) == false)
                    {
                        Console.WriteLine($"Downloading {src} to {fileName}");

                        // Issue the request before creating the file so that a failed request
                        // does not leave behind an empty file that later runs would skip.
                        using (var s = client.GetStreamAsync(src).Result)
                        using (var f = File.Create(fileName))
                        {
                            s.CopyTo(f);
                        }
                    }
                    else
                    {
                        Console.WriteLine($"Skipping download of {src} to {fileName}");
                    }

                    img.SetAttribute("data-local-src", fileName);
                }
            }

            // string.Join copes with an empty collection, whereas an unseeded Aggregate throws.
            return string.Join("\n", elements.Select(e => e.OuterHtml));
        }
        /// <summary>
        /// Starts loading the product card for every element for which <c>GetName</c> returns a non-null value.
        /// </summary>
        /// <param name="productElements">Candidate product elements.</param>
        /// <returns>
        /// A success result wrapping one task per named element; each task yields the product,
        /// or <c>null</c> when the card link or the card itself could not be obtained.
        /// A failed result with the code <c>GETTING_PRODUCTS_ERROR</c> is returned when an
        /// exception escapes while the task array is being built.
        /// </returns>
        public IResult<Task<Product>[]> GetProducts(IHtmlCollection<IElement> productElements)
        {
            try
            {
                var namedElements = productElements.Where(element => GetName(element) != null);

                // ToArray forces the lazy query, so every card request is started here.
                var pendingProducts = namedElements
                    .Select(async element =>
                    {
                        var hrefResult = GetProductCardHref(element);
                        if (hrefResult.Success)
                        {
                            var cardResult = await _puppeteerSharpParser.GetProductCard(hrefResult.Data);
                            if (cardResult.Success)
                            {
                                return cardResult.Data;
                            }
                        }

                        return null;
                    })
                    .ToArray();

                return Result<Task<Product>[]>.CreateSuccess(pendingProducts);
            }
            catch
            {
                return Result<Task<Product>[]>.CreateFailed("GETTING_PRODUCTS_ERROR");
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Reads venue and date details from the document's <c>eventinfo</c> section into
        /// <paramref name="crawledInfo"/>.
        /// </summary>
        /// <param name="document">Document to search.</param>
        /// <param name="crawledInfo">Receives the extracted city/venue and event date.</param>
        /// <remarks>
        /// Does nothing when the expected <c>eventinfo</c> / <c>g-ui-box-content</c> structure is absent,
        /// and skips child elements that have no nested element to read from.
        /// </remarks>
        private void GetVenueInformation(IDocument document, CrawledInfo crawledInfo)
        {
            // navigate down to the classes we want
            IHtmlCollection<IElement> elements = document.GetElementsByClassName("eventinfo");
            var element = elements.FirstOrDefault(e => e.Children.HasClass("g-ui-box-content"));
            if (element == null)
            {
                return;
            }

            var childElement = element.FirstElementChild;
            if (childElement == null)
            {
                return;
            }

            foreach (IElement child in childElement.Children)
            {
                // Both branches read from the first nested element; text-only children have none.
                var firstChild = child.FirstElementChild;
                if (firstChild == null)
                {
                    continue;
                }

                if (child.TagName == "H3")
                {
                    GetCityAndVenue(crawledInfo, firstChild.GetAttribute("title"));
                }
                else if (child.TagName == "P" && child.ClassName == null)
                {
                    crawledInfo.EventDate = this.dateTimeExtracter.Extract(firstChild.Text());
                }
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Builds the list of bus stops for <paramref name="track"/> from the raw stop elements.
        /// </summary>
        /// <param name="track">Track to populate; its <c>BusStops</c> list is replaced.</param>
        /// <param name="stopsRawHtmlCollection">
        /// Candidate stop elements; only those whose class name contains
        /// <c>zwyk</c>, <c>stre</c> or <c>wyj</c> are processed.
        /// </param>
        /// <returns>
        /// <paramref name="track"/> with its stops filled in, or <c>null</c> when a stop element lacks
        /// the expected nested elements or <c>GetStopHoursAsync</c> returns <c>null</c>.
        /// Note that in the <c>null</c> case <c>track.BusStops</c> has already been reset and may be partially filled.
        /// </returns>
        private static async ValueTask<Track> GetTrackStopsFromHtmlAsync(Track track, IHtmlCollection<IElement> stopsRawHtmlCollection)
        {
            // ClassName is null for elements without a class attribute; treat those as non-matching.
            var stopsHtmlCollection = stopsRawHtmlCollection.Where(p =>
            {
                string className = p.ClassName ?? string.Empty;
                return className.Contains("zwyk") || className.Contains("stre") || className.Contains("wyj");
            });

            track.BusStops = new List<BusStop>();
            foreach (IElement htmlRawStop in stopsHtmlCollection)
            {
                IElement htmlStop = htmlRawStop.LastElementChild;
                if (htmlStop == null)
                {
                    return null;
                }

                // The last nested element supplies both the stop URL and its name.
                IElement stopLink = htmlStop.LastElementChild;
                if (stopLink == null)
                {
                    return null;
                }

                string busHtmlContentStyle = htmlStop.GetAttribute("style");
                BusStop busStop = new BusStop()
                {
                    Id                = _StopId++,
                    IdOfLine          = track.IdOfLine,
                    IdOfSchedule      = track.IdOfSchedule,
                    IdOfTrack         = track.Id,
                    Url               = stopLink.GetAttribute("href"),
                    IsVariant         = (htmlStop.ClassName ?? string.Empty).Contains("wariant"),
                    IsLastStopOnTrack = busHtmlContentStyle != null && busHtmlContentStyle.Contains("bold"),
                    // Non-null here: the filter above only lets through elements with a matching class name.
                    IsBusStopZone     = htmlRawStop.ClassName.Contains("stref")
                };

                string busStopName = stopLink.TextContent;
                busStop = SetBusStopName(busStop, busStopName);

                busStop = await GetStopHoursAsync(track, busStop);

                if (busStop == null)
                {
                    return null;
                }

                track.BusStops.Add(busStop);
            }

            return track;
        }