        public PageScrapeHistory ScrapePages([FromBody] IEnumerable<string> request)
        {
            // If no pages were specified, scrape them all.
            PageMetadata[] pagesToScrape;
            if (request == null)
            {
                pagesToScrape = PageMetadataRepository.All().Data.ToArray();
            }
            else
            {
                pagesToScrape = request.Select(id => PageMetadataRepository.Get(id)).ToArray();
            }

            DateTime scrapeStart = DateTime.Now;

            ScrapedPage[] pages = PageScraper.Scrape(pagesToScrape, scrapeStart).ToArray();

            // Update each page's metadata with the newly scraped fan count.
            foreach (PageMetadata pageMetadata in pagesToScrape)
            {
                ScrapedPage scrapedPage = pages.First(p => p.FacebookId == pageMetadata.FacebookId);
                pageMetadata.FanCountHistory.Insert(0, new DatedFanCount
                {
                    Date     = scrapedPage.Date,
                    FanCount = scrapedPage.FanCount,
                });
                pageMetadata.LatestScrape = scrapeStart;
                PageMetadataRepository.Save(pageMetadata, Refresh.False);

                // Trim the history so the scrape-history record below only carries this scrape's fan count.
                pageMetadata.FanCountHistory = pageMetadata.FanCountHistory.Take(1).ToList();
            }

            // Record the overall scrape in the scrape history.
            var pageScrapeHistory = new PageScrapeHistory
            {
                Id          = Guid.NewGuid().ToString(),
                ImportStart = scrapeStart,
                ImportEnd   = DateTime.Now,
                Pages       = pagesToScrape
            };

            return PageScrapeHistoryRepository.Save(pageScrapeHistory);
        }
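
For reference, here is a minimal sketch of the domain types the action above appears to depend on, inferred only from how they are used in the snippet (the member names come from the code; the chosen types, such as int for FanCount, are assumptions rather than the project's real definitions):

using System;
using System.Collections.Generic;

// Hypothetical shapes inferred from ScrapePages; the real project may differ.
public class DatedFanCount
{
    public DateTime Date     { get; set; }
    public int      FanCount { get; set; }   // numeric type is assumed
}

public class PageMetadata
{
    public string              FacebookId      { get; set; }
    public DateTime            LatestScrape    { get; set; }
    public List<DatedFanCount> FanCountHistory { get; set; } = new List<DatedFanCount>();
}

public class PageScrapeHistory
{
    public string                    Id          { get; set; }
    public DateTime                  ImportStart { get; set; }
    public DateTime                  ImportEnd   { get; set; }
    public IEnumerable<PageMetadata> Pages       { get; set; }
}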
Example #2
        public async Task DetailPageScraper_ShouldBeOk()
        {
            //Arrange
            var pageSettings = this.GetPageSettings();
            var scraper      = new PageScraper(
                pageSettings,
                this._loggerMock.Object);

            scraper.BeforeDocumentOpen += (sender, e) =>
            {
                e.BrowsingContext.Should().NotBeNull();
                e.Url.Should().NotBeNullOrEmpty();
            };
            scraper.AfterDocumentOpen += (sender, e) =>
            {
                e.Document.Should().NotBeNull();
            };
            scraper.PropertyScraped += (sender, e) =>
            {
                e.ScrapedProperty.Name.Should().Be("Title");
                e.ScrapedProperty.Element.Should().NotBeNull();
                e.ScrapedProperty.Element.TextContent.Should().Be("Tomorrowland – Official Aftermovie – 30-JUL-2018");
                e.Settings.Should().BeEquivalentTo(pageSettings.Properties);
            };
            scraper.PageScraped += (sender, e) =>
            {
                e.Url.Should().NotBeNullOrEmpty();
                e.Settings.Should().BeEquivalentTo(pageSettings.Properties);
                this.AssertProperties(
                    e.Properties.ToList(),
                    pageSettings.Properties);
            };

            //Act
            await scraper.Scrape();
        }
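
Outside a test, the same events can be wired to plain logging instead of FluentAssertions checks. This sketch reuses only members that already appear in the test above (the constructor, BeforeDocumentOpen, PropertyScraped, PageScraped and Scrape); the pageSettings and logger variables, and the enclosing async method, are placeholders for whatever the surrounding project provides:

            // Hypothetical non-test usage; pageSettings and logger are assumed to come from the caller.
            var scraper = new PageScraper(pageSettings, logger);

            scraper.BeforeDocumentOpen += (sender, e) =>
                Console.WriteLine($"Opening {e.Url}");

            scraper.PropertyScraped += (sender, e) =>
                Console.WriteLine($"Scraped property: {e.ScrapedProperty.Name}");

            scraper.PageScraped += (sender, e) =>
                Console.WriteLine($"Finished scraping {e.Url}");

            await scraper.Scrape();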