/// <summary>
/// Downloads the target of every anchor (<c>a</c>) element in <paramref name="elements"/> into
/// <c>c:\temp\WebScraper</c>, skipping targets whose local file already exists, and reports the
/// local file name together with the anchor text.
/// </summary>
/// <param name="crawlResult">Crawl result whose <c>RequestUrl</c> is the base for resolving relative links.</param>
/// <param name="elements">Elements to scan; only elements whose tag name is <c>a</c> (case-insensitive) are used.</param>
/// <returns>
/// A list with one anonymous object (<c>FileName</c>, <c>Title</c>) per link that was downloaded or was
/// already present locally. Links without a usable HTTP(S) URL or file name are left out.
/// </returns>
public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection<IElement> elements)
{
    const string downloadDirectory = @"c:\temp\WebScraper";
    Directory.CreateDirectory(downloadDirectory);

    var result = new List<object>();
    using (var client = new HttpClient())
    {
        foreach (var element in elements.Where(e => e.TagName.Equals("a", StringComparison.OrdinalIgnoreCase)))
        {
            // Anchors without an href (e.g. named anchors) have nothing to download.
            var hrefValue = element.Attributes["href"]?.Value;
            if (hrefValue == null)
            {
                continue;
            }

            Uri relativeOrAbsolute;
            Uri href;
            if (!Uri.TryCreate(hrefValue, UriKind.RelativeOrAbsolute, out relativeOrAbsolute) ||
                !Uri.TryCreate(crawlResult.RequestUrl, relativeOrAbsolute, out href))
            {
                Console.WriteLine($"Skipping malformed link '{hrefValue}'");
                continue;
            }

            // HttpClient can only fetch http/https; mailto:, javascript: and similar links are ignored.
            if (href.Scheme != Uri.UriSchemeHttp && href.Scheme != Uri.UriSchemeHttps)
            {
                continue;
            }

            // A URL ending in '/' has no file name; combining it would yield the directory itself.
            var name = Path.GetFileName(href.LocalPath);
            if (string.IsNullOrEmpty(name))
            {
                Console.WriteLine($"Skipping {href}: URL has no file name");
                continue;
            }

            var fileName = Path.Combine(downloadDirectory, name);
            if (File.Exists(fileName) == false)
            {
                Console.WriteLine($"Downloading {href} to {fileName}");

                // Download into a side file and rename on success, so an interrupted or failed
                // transfer never leaves a truncated file that later runs would treat as complete.
                var partialFileName = fileName + ".part";
                try
                {
                    // The signature is synchronous, so the request is still awaited by blocking.
                    using (var s = client.GetStreamAsync(href).Result)
                    using (var f = File.Create(partialFileName))
                    {
                        s.CopyTo(f);
                    }

                    File.Move(partialFileName, fileName);
                }
                catch
                {
                    File.Delete(partialFileName);
                    throw;
                }
            }
            else
            {
                Console.WriteLine($"Skipping download of {href} to {fileName}");
            }

            result.Add(new { FileName = fileName, Title = element.TextContent });
        }

        return result;
    }
}
/// <summary>
/// Downloads the resource referenced by the <c>src</c> attribute of each element in
/// <paramref name="elements"/> into <c>c:\temp\WebScraper</c> (skipping files that already exist),
/// records the local path on the element as <c>data-local-src</c>, and returns the elements' markup.
/// </summary>
/// <param name="crawlResult">Crawl result whose <c>RequestUrl</c> is the base for resolving relative sources.</param>
/// <param name="elements">Elements to process; only those carrying a <c>src</c> attribute are downloaded.</param>
/// <returns>
/// The <c>OuterHtml</c> of every element in <paramref name="elements"/>, joined with <c>"\n"</c>;
/// an empty string when the collection is empty.
/// </returns>
public object ExtractProperties(CrawlResult crawlResult, IHtmlCollection<IElement> elements)
{
    const string downloadDirectory = @"c:\temp\WebScraper";
    Directory.CreateDirectory(downloadDirectory);

    using (var client = new HttpClient())
    {
        foreach (var img in elements.Where(e => e.HasAttribute(AttributeNames.Src)))
        {
            var srcValue = img.GetAttribute(AttributeNames.Src);

            Uri relativeOrAbsolute;
            Uri src;
            if (srcValue == null ||
                !Uri.TryCreate(srcValue, UriKind.RelativeOrAbsolute, out relativeOrAbsolute) ||
                !Uri.TryCreate(crawlResult.RequestUrl, relativeOrAbsolute, out src))
            {
                Console.WriteLine($"Skipping malformed source '{srcValue}'");
                continue;
            }

            // HttpClient can only fetch http/https; data: URIs and the like are left untouched.
            if (src.Scheme != Uri.UriSchemeHttp && src.Scheme != Uri.UriSchemeHttps)
            {
                continue;
            }

            // A URL ending in '/' has no file name; combining it would yield the directory itself.
            var name = Path.GetFileName(src.LocalPath);
            if (string.IsNullOrEmpty(name))
            {
                Console.WriteLine($"Skipping {src}: URL has no file name");
                continue;
            }

            var fileName = Path.Combine(downloadDirectory, name);
            if (File.Exists(fileName) == false)
            {
                Console.WriteLine($"Downloading {src} to {fileName}");

                // Download into a side file and rename on success, so an interrupted or failed
                // transfer never leaves a truncated file that later runs would treat as complete.
                var partialFileName = fileName + ".part";
                try
                {
                    // The signature is synchronous, so the request is still awaited by blocking.
                    using (var s = client.GetStreamAsync(src).Result)
                    using (var f = File.Create(partialFileName))
                    {
                        s.CopyTo(f);
                    }

                    File.Move(partialFileName, fileName);
                }
                catch
                {
                    File.Delete(partialFileName);
                    throw;
                }
            }
            else
            {
                Console.WriteLine($"Skipping download of {src} to {fileName}");
            }

            img.SetAttribute("data-local-src", fileName);
        }
    }

    // string.Join copes with an empty collection (Aggregate without a seed throws) and
    // avoids re-copying the accumulated string for every element.
    return string.Join("\n", elements.Select(e => e.OuterHtml));
}
/// <summary>
/// Starts loading a product card for every element in <paramref name="productElements"/>
/// for which <c>GetName</c> returns a non-null value.
/// </summary>
/// <param name="productElements">Candidate product elements.</param>
/// <returns>
/// A successful result holding one task per accepted element; each task yields the product card
/// data, or <c>null</c> when the card link lookup or the card retrieval reports no success.
/// A failed result with code <c>GETTING_PRODUCTS_ERROR</c> is returned when building the task
/// array throws.
/// </returns>
public IResult<Task<Product>[]> GetProducts(IHtmlCollection<IElement> productElements)
{
    try
    {
        var pendingProducts = productElements
            .Where(element => GetName(element) != null)
            .Select(async element =>
            {
                var cardLink = GetProductCardHref(element);
                if (cardLink.Success)
                {
                    var card = await _puppeteerSharpParser.GetProductCard(cardLink.Data);
                    return card.Success ? card.Data : null;
                }

                // No usable link to the product card.
                return null;
            })
            .ToArray();

        return Result<Task<Product>[]>.CreateSuccess(pendingProducts);
    }
    catch
    {
        return Result<Task<Product>[]>.CreateFailed("GETTING_PRODUCTS_ERROR");
    }
}
/// <summary>
/// Reads venue and date details from the document's "eventinfo" section into
/// <paramref name="crawledInfo"/>.
/// </summary>
/// <param name="document">Parsed page to read from.</param>
/// <param name="crawledInfo">Receives the city/venue (via <c>GetCityAndVenue</c>) and <c>EventDate</c>.</param>
private void GetVenueInformation(IDocument document, CrawledInfo crawledInfo)
{
    // Pick the first "eventinfo" element whose children report the "g-ui-box-content" class.
    var eventInfoBox = document
        .GetElementsByClassName("eventinfo")
        .First(candidate => candidate.Children.HasClass("g-ui-box-content"));

    // The details live in the children of the box's first child element.
    foreach (IElement entry in eventInfoBox.FirstElementChild.Children)
    {
        switch (entry.TagName)
        {
            case "H3":
                // The heading's first child carries the city/venue text in its title attribute.
                GetCityAndVenue(crawledInfo, entry.FirstElementChild.GetAttribute("title"));
                break;

            case "P":
                // Only paragraphs without a class hold the event date.
                if (entry.ClassName == null)
                {
                    crawledInfo.EventDate = this.dateTimeExtracter.Extract(entry.FirstElementChild.Text());
                }

                break;
        }
    }
}
/// <summary>
/// Builds the list of bus stops for <paramref name="track"/> from the raw stop elements and
/// stores it in <c>track.BusStops</c>.
/// </summary>
/// <param name="track">Track to populate; its <c>BusStops</c> list is replaced.</param>
/// <param name="stopsRawHtmlCollection">
/// Raw elements; only those whose class name contains "zwyk", "stre" or "wyj" are treated as stops.
/// </param>
/// <returns>
/// The same <paramref name="track"/> with its stops filled in, or <c>null</c> when a stop element lacks
/// the expected nested elements or <c>GetStopHoursAsync</c> returns <c>null</c>. In the <c>null</c> case
/// <c>track.BusStops</c> may already contain the stops processed before the failure.
/// </returns>
private static async ValueTask<Track> GetTrackStopsFromHtmlAsync(Track track, IHtmlCollection<IElement> stopsRawHtmlCollection)
{
    // Elements without a class attribute can have a null ClassName; they are not stops.
    var stopsHtmlCollection = stopsRawHtmlCollection.Where(p =>
        p.ClassName != null &&
        (p.ClassName.Contains("zwyk") || p.ClassName.Contains("stre") || p.ClassName.Contains("wyj")));

    track.BusStops = new List<BusStop>();

    foreach (IElement htmlRawStop in stopsHtmlCollection)
    {
        IElement htmlStop = htmlRawStop.LastElementChild;

        // The stop's URL and name are read from the last child of the stop cell; treat a
        // missing cell or missing inner element the same way: the track cannot be parsed.
        IElement stopLink = htmlStop?.LastElementChild;
        if (stopLink == null)
        {
            return null;
        }

        string busHtmlContentStyle = htmlStop.GetAttribute("style");
        string stopClassName = htmlStop.ClassName;

        BusStop busStop = new BusStop()
        {
            Id = _StopId++,
            IdOfLine = track.IdOfLine,
            IdOfSchedule = track.IdOfSchedule,
            IdOfTrack = track.Id,
            Url = stopLink.GetAttribute("href"),
            IsVariant = stopClassName != null && stopClassName.Contains("wariant"),
            IsLastStopOnTrack = busHtmlContentStyle != null && busHtmlContentStyle.Contains("bold"),
            // ClassName is non-null here: the filter above already required it.
            IsBusStopZone = htmlRawStop.ClassName.Contains("stref")
        };

        string busStopName = stopLink.TextContent;
        busStop = SetBusStopName(busStop, busStopName);
        busStop = await GetStopHoursAsync(track, busStop);

        if (busStop == null)
        {
            return null;
        }

        track.BusStops.Add(busStop);
    }

    return track;
}