/// <summary>
/// Scrapes the requested pages (or every known page when no ids are supplied),
/// records the new fan count on each page's metadata, and saves a
/// <see cref="PageScrapeHistory"/> entry describing the whole run.
/// </summary>
/// <param name="request">
/// Ids of the pages to scrape, each resolved through <c>PageMetadataRepository.Get</c>.
/// When <c>null</c>, all pages returned by <c>PageMetadataRepository.All()</c> are scraped.
/// </param>
/// <returns>The value returned by <c>PageScrapeHistoryRepository.Save</c> for the new history entry.</returns>
/// <exception cref="InvalidOperationException">
/// The scraper returned no result whose <c>FacebookId</c> matches one of the pages to scrape.
/// </exception>
public PageScrapeHistory ScrapePages([FromBody] IEnumerable<string> request)
{
    // If no pages were specified, scrape them all.
    PageMetadata[] pagesToScrape;
    if (request == null)
    {
        pagesToScrape = PageMetadataRepository.All().Data.ToArray();
    }
    else
    {
        pagesToScrape = request.Select(id => PageMetadataRepository.Get(id)).ToArray();
    }

    // NOTE(review): timestamps use local time (DateTime.Now); confirm whether UTC is expected by consumers.
    DateTime scrapeStart = DateTime.Now;
    ScrapedPage[] pages = PageScraper.Scrape(pagesToScrape, scrapeStart).ToArray();

    // Index the scrape results once instead of scanning the whole array for every page.
    // A lookup keeps results in their original order per key, so taking the first entry
    // of a group selects the same element a linear First(...) search would.
    var scrapedByFacebookId = pages.ToLookup(p => p.FacebookId);

    // Now update the per-page list of all scraped pages.
    foreach (PageMetadata pageMetadata in pagesToScrape)
    {
        if (!scrapedByFacebookId.Contains(pageMetadata.FacebookId))
        {
            throw new InvalidOperationException(
                "No scrape result was returned for the page with FacebookId '" + pageMetadata.FacebookId + "'.");
        }

        ScrapedPage scrapedPage = scrapedByFacebookId[pageMetadata.FacebookId].First();

        // Newest entry goes to the front of the history.
        pageMetadata.FanCountHistory.Insert(0, new DatedFanCount
        {
            Date = scrapedPage.Date,
            FanCount = scrapedPage.FanCount,
        });
        pageMetadata.LatestScrape = scrapeStart;

        // Save is called while the history still holds every entry.
        PageMetadataRepository.Save(pageMetadata, Refresh.False);

        // Only save the fan count on this date: the history is trimmed after the
        // repository save, so the objects referenced by the scrape history below
        // carry just the entry added by this run.
        pageMetadata.FanCountHistory = pageMetadata.FanCountHistory.Take(1).ToList();
    }

    // Now update the total-page list of the scrape.
    var pageScrapeHistory = new PageScrapeHistory
    {
        Id = Guid.NewGuid().ToString(),
        ImportStart = scrapeStart,
        ImportEnd = DateTime.Now,
        Pages = pagesToScrape
    };

    return PageScrapeHistoryRepository.Save(pageScrapeHistory);
}
/// <summary>
/// Runs a <c>PageScraper</c> built from the fixture's page settings and checks,
/// through handlers attached to its events, the data handed to each event.
/// The assertions only execute if the scraper actually raises the corresponding event.
/// </summary>
public async Task DetailPageScraper_ShouldBeOk()
{
    // Arrange
    var settings = this.GetPageSettings();
    var pageScraper = new PageScraper(settings, this._loggerMock.Object);

    pageScraper.BeforeDocumentOpen += (source, args) =>
    {
        args.BrowsingContext.Should().NotBeNull();
        args.Url.Should().NotBeNullOrEmpty();
    };

    pageScraper.AfterDocumentOpen += (source, args) => args.Document.Should().NotBeNull();

    pageScraper.PropertyScraped += (source, args) =>
    {
        args.ScrapedProperty.Name.Should().Be("Title");
        args.ScrapedProperty.Element.Should().NotBeNull();
        args.ScrapedProperty.Element.TextContent.Should().Be("Tomorrowland – Official Aftermovie – 30-JUL-2018");

        // NOTE(review): Equals here is object.Equals invoked on the assertion object and its
        // result is discarded, so this is not an assertion method of the assertion library
        // (presumably FluentAssertions). Confirm the intended comparison and replace it.
        args.Settings.Should().Equals(settings.Properties);
    };

    pageScraper.PageScraped += (source, args) =>
    {
        args.Url.Should().NotBeNullOrEmpty();

        // NOTE(review): same non-asserting Equals call as in the PropertyScraped handler.
        args.Settings.Should().Equals(settings.Properties);

        this.AssertProperties(args.Properties.ToList(), settings.Properties);
    };

    // Act
    await pageScraper.Scrape();
}