예제 #1
0
        /// <summary>
        /// Scrapes day-care centre links from the result page for <paramref name="query"/>,
        /// appending one <see cref="ScrapeSource"/> per previously-unseen detail URL, and
        /// follows "next page" links until no further page exists.
        /// </summary>
        /// <param name="query">Query string appended to the site's home URL to form the first page URL.</param>
        /// <param name="list">Accumulator; entries whose DetailUrl is already present are skipped.</param>
        /// <param name="county">County value copied onto every new <see cref="ScrapeSource"/>.</param>
        /// <param name="zipCode">Zip code value copied onto every new <see cref="ScrapeSource"/>.</param>
        public void ExtractDayCareList(string query, List<ScrapeSource> list, string county, string zipCode)
        {
            // Seed a set with the URLs already in the list so the duplicate check is
            // O(1) per anchor instead of re-scanning the list every time (was O(n^2)).
            var seenUrls = new HashSet<string>(list.Select(x => x.DetailUrl));

            // Paginate with a loop instead of recursion: a site with many result pages
            // cannot overflow the call stack this way. Behavior is otherwise identical —
            // the original recursion also rebuilt each page URL as home + nextUrl.
            var url = home + query;
            while (url != null)
            {
                HtmlDocument       doc = web.Load(url);
                HtmlNodeCollection dayCareCenterNodes = doc.DocumentNode.SelectNodes("//a[contains(@href,'CDC_LIC_NBR')]");

                // SelectNodes returns null (not an empty collection) when nothing matches.
                if (dayCareCenterNodes != null)
                {
                    foreach (HtmlNode node in dayCareCenterNodes)
                    {
                        // The XPath filters on @href, but guard against a missing
                        // attribute anyway rather than risk a NullReferenceException.
                        var hrefAttribute = node.Attributes["href"];
                        if (hrefAttribute == null || !seenUrls.Add(hrefAttribute.Value))
                        {
                            continue; // duplicate or malformed anchor
                        }
                        list.Add(new ScrapeSource
                        {
                            DetailUrl = hrefAttribute.Value,
                            County    = county,
                            ZipCode   = zipCode
                        });
                    }
                }

                var nextPageNode = doc.DocumentNode.SelectSingleNode("//img[contains(@src,'next.gif')]");
                if (nextPageNode != null)
                {
                    var nextUrl = nextPageNode.ParentNode.Attributes["href"].Value;
                    LogHelper.log.Info("next page:" + nextUrl);
                    // The original recursive call passed nextUrl back in as `query`,
                    // which was then prefixed with `home` — preserve that exactly.
                    url = home + nextUrl;
                }
                else
                {
                    url = null; // no more pages
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Builds a one-line log message describing a <see cref="ScrapeSource"/>.
        /// </summary>
        /// <param name="model">Source whose County, ZipCode and DetailUrl are logged;
        /// County and ZipCode may be null.</param>
        /// <returns>A string of the form "County:x-ZipCode:y-Detail:z ".</returns>
        public static string BuildLogMessage(ScrapeSource model)
        {
            // Null-coalescing replaces the IsNullOrEmpty ternaries: an empty string
            // maps to itself either way, so output is unchanged for null/empty input.
            var county  = model.County ?? "";
            var zipCode = model.ZipCode ?? "";

            // Fix: the original format string omitted the ':' after "ZipCode" and
            // "Detail" (e.g. "ZipCode12345-Detailhttp://…"), inconsistent with "County:".
            return string.Format("County:{0}-ZipCode:{1}-Detail:{2} ", county, zipCode, model.DetailUrl);
        }
예제 #3
0
        /// <summary>
        /// Runs every registered scraper against the scrape record <paramref name="id"/>,
        /// recording per-source success/failure and aggregate movie/torrent counts, then
        /// scrapes images and persists all results.
        /// </summary>
        /// <param name="id">Primary key of the Scrape row to process.</param>
        /// <param name="cancellationToken">Propagated to every async call.</param>
        /// <remarks>
        /// Failures are contained: a missing/unreadable record logs and returns early;
        /// a failing scraper is marked unsuccessful but does not stop the others.
        /// </remarks>
        public async Task ScrapeAsync(int id, CancellationToken cancellationToken = default)
        {
            Scrape scrape;

            try
            {
                scrape = await _context.Scrapes().FirstOrDefaultAsync(x => x.Id == id, cancellationToken);

                if (scrape is null)
                {
                    // CA2201: throw a specific exception type rather than bare Exception.
                    // It is caught by the handler just below, so callers see no change.
                    throw new InvalidOperationException($"cannot find scrape record {id}");
                }
            }
            catch (Exception e)
            {
                _logger.LogError(e, "failed when getting scrape record {id}", id);
                // TODO requeue on db fail
                return;
            }

            scrape.ScrapeSources = new List<ScrapeSource>();

            // The session is threaded from one scraper to the next, and the final
            // session is reused for image scraping below.
            IScrapeSession session = null;

            foreach (var scraper in _scrapers)
            {
                var source = new ScrapeSource
                {
                    Source    = scraper.Source,
                    Type      = scraper.Type,
                    StartDate = _clock.UtcNow
                };
                scrape.ScrapeSources.Add(source);

                try
                {
                    session = await _movieService.GetScrapeSessionAsync(scraper.Source, scraper.Type, session, cancellationToken);

                    var result = await scraper.ScrapeAsync(session, cancellationToken);

                    source.Success      = true;
                    source.EndDate      = _clock.UtcNow;
                    source.MovieCount   = result.MovieCount;
                    source.TorrentCount = result.TorrentCount;

                    // Local scrapers only contribute movies; torrent scrapers
                    // contribute both movie and torrent totals.
                    switch (scraper.Type)
                    {
                    case ScraperType.Local:
                        scrape.LocalMovieCount += result.MovieCount;
                        break;

                    case ScraperType.Torrent:
                        scrape.MovieCount   += result.MovieCount;
                        scrape.TorrentCount += result.TorrentCount;
                        break;

                    default:
                        // Caught below: the source is marked failed and the loop continues.
                        throw new ArgumentOutOfRangeException();
                    }
                }
                catch (Exception e)
                {
                    // A failing scraper must not abort the remaining scrapers.
                    _logger.LogError(e, "failed to scrape {source}", scraper.Source);
                    source.Success = false;
                    source.EndDate = _clock.UtcNow;
                    source.Error   = e.ToString();
                }
            }

            try
            {
                // Uses the session from the last scraper iteration (may be null when
                // _scrapers is empty or every GetScrapeSessionAsync call failed).
                scrape.ImageCount = await ScrapeImagesAsync(session, cancellationToken);
            }
            catch (Exception e)
            {
                // Best-effort: image failures are logged but do not fail the scrape.
                _logger.LogError(e, "failed to scrape images");
            }

            scrape.Success = scrape.ScrapeSources.All(x => x.Success);
            scrape.EndDate = _clock.UtcNow;
            await _context.SaveChangesAsync(cancellationToken);
        }