/// <summary>
/// Scrapes one page of day-care search results, appending a <see cref="ScrapeSource"/>
/// per unseen detail link, and follows "next page" links until none remain.
/// </summary>
/// <param name="query">Relative query string appended to <c>home</c> to form the page URL.</param>
/// <param name="list">Accumulator for discovered sources; duplicates (by DetailUrl) are skipped.</param>
/// <param name="county">County tag stamped onto each discovered source.</param>
/// <param name="zipCode">Zip-code tag stamped onto each discovered source.</param>
public void ExtractDayCareList(string query, List<ScrapeSource> list, string county, string zipCode)
{
    // Iterative pagination (was tail-recursive): one stack frame regardless of page count,
    // so a large result set cannot overflow the stack.
    var url = home + query;
    while (true)
    {
        HtmlDocument doc = web.Load(url);

        // Detail links are anchors whose href carries the license-number query parameter.
        HtmlNodeCollection dayCareCenterNodes = doc.DocumentNode.SelectNodes("//a[contains(@href,'CDC_LIC_NBR')]");
        if (dayCareCenterNodes != null)
        {
            foreach (HtmlNode node in dayCareCenterNodes)
            {
                string href = node.Attributes["href"].Value;
                // The same center can appear on multiple pages; keep the first occurrence only.
                if (list.Any(x => x.DetailUrl.Equals(href)))
                {
                    continue;
                }
                var model = new ScrapeSource();
                model.DetailUrl = href;
                model.County = county;
                model.ZipCode = zipCode;
                list.Add(model);
            }
        }

        // The "next page" control is an <img src="...next.gif"> wrapped in an anchor;
        // its parent anchor holds the relative URL of the following results page.
        var nextPageNode = doc.DocumentNode.SelectSingleNode("//img[contains(@src,'next.gif')]");
        if (nextPageNode == null)
        {
            break; // last page reached
        }
        var nextUrl = nextPageNode.ParentNode.Attributes["href"].Value;
        LogHelper.log.Info("next page:" + nextUrl);
        // Matches the original recursive call, which re-prefixed the relative URL with home.
        url = home + nextUrl;
    }
}
/// <summary>
/// Builds a log line identifying a scrape source by county, zip code and detail URL.
/// </summary>
/// <param name="model">Source to describe; <c>County</c> and <c>ZipCode</c> may be null.</param>
/// <returns>A message of the form <c>County:X-ZipCode:Y-Detail:Z </c>.</returns>
public static string BuildLogMessage(ScrapeSource model)
{
    // ?? "" is equivalent to the previous IsNullOrEmpty ternary:
    // null maps to "", and an empty string already maps to itself.
    var county = model.County ?? "";
    var zipCode = model.ZipCode ?? "";
    // Fixed: the original format string omitted the ':' after "ZipCode" and "Detail",
    // running the labels into their values (e.g. "ZipCode12345").
    return $"County:{county}-ZipCode:{zipCode}-Detail:{model.DetailUrl} ";
}
/// <summary>
/// Runs every registered scraper against the scrape record identified by <paramref name="id"/>,
/// recording a per-scraper <see cref="ScrapeSource"/> (timings, counts, success/error),
/// then scrapes images and persists everything in a single save.
/// </summary>
/// <param name="id">Primary key of the Scrape record to populate.</param>
/// <param name="cancellationToken">Propagated to all database and scraper calls.</param>
public async Task ScrapeAsync(int id, CancellationToken cancellationToken = default)
{
    Scrape scrape;
    try
    {
        scrape = await _context.Scrapes().FirstOrDefaultAsync(x => x.Id == id, cancellationToken);
        if (scrape is null)
        {
            throw new Exception($"cannot find scrape record {id}");
        }
    }
    catch (Exception e)
    {
        // A missing record and a DB failure are handled the same way: log and bail out.
        _logger.LogError(e, "failed when getting scrape record {id}", id);
        // TODO requeue on db fail
        return;
    }

    scrape.ScrapeSources = new List<ScrapeSource>();

    // The session is threaded through the scrapers: each GetScrapeSessionAsync call receives
    // the previous session, and the final session is reused for image scraping below.
    IScrapeSession session = null;
    foreach (var scraper in _scrapers)
    {
        var source = new ScrapeSource
        {
            Source = scraper.Source,
            Type = scraper.Type,
            StartDate = _clock.UtcNow
        };
        // Added before running so a failed scraper still leaves an audit row (with Error set).
        scrape.ScrapeSources.Add(source);
        try
        {
            session = await _movieService.GetScrapeSessionAsync(scraper.Source, scraper.Type, session, cancellationToken);
            var result = await scraper.ScrapeAsync(session, cancellationToken);
            source.Success = true;
            source.EndDate = _clock.UtcNow;
            source.MovieCount = result.MovieCount;
            source.TorrentCount = result.TorrentCount;
            // Aggregate counters are split by scraper type: local movies vs. torrent movies/torrents.
            switch (scraper.Type)
            {
                case ScraperType.Local:
                    scrape.LocalMovieCount += result.MovieCount;
                    break;
                case ScraperType.Torrent:
                    scrape.MovieCount += result.MovieCount;
                    scrape.TorrentCount += result.TorrentCount;
                    break;
                default:
                    throw new ArgumentOutOfRangeException();
            }
        }
        catch (Exception e)
        {
            // One scraper failing must not abort the rest; the failure is captured on its source row.
            _logger.LogError(e, "failed to scrape {source}", scraper.Source);
            source.Success = false;
            source.EndDate = _clock.UtcNow;
            source.Error = e.ToString();
        }
    }

    try
    {
        // Uses the session left over from the last scraper iteration (may be null if _scrapers is empty
        // or every GetScrapeSessionAsync call failed — presumably ScrapeImagesAsync tolerates that; TODO confirm).
        scrape.ImageCount = await ScrapeImagesAsync(session, cancellationToken);
    }
    catch (Exception e)
    {
        // Image scraping is best-effort: a failure here does not affect scrape.Success below.
        _logger.LogError(e, "failed to scrape images");
    }

    // Overall success requires every scraper to have succeeded (vacuously true when there are none).
    scrape.Success = scrape.ScrapeSources.All(x => x.Success);
    scrape.EndDate = _clock.UtcNow;
    await _context.SaveChangesAsync(cancellationToken);
}