// Dependency-injected wiring for the post-scrape endpoints: stores the scrapers
// and repositories this controller uses for its actions.
public PostScrapeController(
    PostScraper postScraper,
    CommentScraper commentScraper,
    PageScraper pageScraper,
    ElasticSearchRepository<PageMetadata> pageMetadataRepository,
    ElasticSearchRepository<PostScrapeHistory> postScrapeHistoryRepository)
{
    PostScraper = postScraper;
    CommentScraper = commentScraper;
    PageScraper = pageScraper;
    PageMetadataRepository = pageMetadataRepository;
    PostScrapeHistoryRepository = postScrapeHistoryRepository;
}
Example #2
0
 // Fetches the scraped items and binds them to the results grid, or tells
 // the user when there is nothing to show.
 private void cmdGet_Click(object sender, EventArgs e)
 {
     oItems = PageScraper.GetItems();

     bool hasResults = oItems != null && oItems.Count > 0;
     if (!hasResults)
     {
         MessageBox.Show("No data to display");
         return;
     }

     dgvResults.DataSource = oItems;
 }
        /// <summary>
        /// Runs a page scrape, updates each page's stored fan-count history, and
        /// persists a history entry describing the whole run.
        /// </summary>
        /// <param name="request">Ids of the pages to scrape, or null to scrape every
        /// page known to the metadata repository.</param>
        /// <returns>The saved <see cref="PageScrapeHistory"/> for this run.</returns>
        public PageScrapeHistory ScrapePages([FromBody] IEnumerable <string> request)
        {
            // If no pages were specified, scrape them all.
            PageMetadata[] pagesToScrape;
            if (request == null)
            {
                pagesToScrape = PageMetadataRepository.All().Data.ToArray();
            }
            else
            {
                // NOTE(review): presumably Get() returns null for an unknown id, which
                // would fault below — confirm ids are validated upstream.
                pagesToScrape = request.Select(id => PageMetadataRepository.Get(id)).ToArray();
            }

            DateTime scrapeStart = DateTime.Now;

            ScrapedPage[] pages = PageScraper.Scrape(pagesToScrape, scrapeStart).ToArray();

            // Now update the per-page list of all scraped pages.
            foreach (PageMetadata pageMetadata in pagesToScrape)
            {
                // First() throws if the scraper produced no result for this page.
                ScrapedPage scrapedPage = pages.First(p => p.FacebookId == pageMetadata.FacebookId);
                // Newest observation goes at the front of the history.
                pageMetadata.FanCountHistory.Insert(0, new DatedFanCount
                {
                    Date     = scrapedPage.Date,
                    FanCount = scrapedPage.FanCount,
                });
                pageMetadata.LatestScrape = scrapeStart;
                // Persist the metadata with its FULL history (Refresh.False: don't
                // force an index refresh on every save).
                PageMetadataRepository.Save(pageMetadata, Refresh.False);

                // Only save the fan count on this date: AFTER the metadata document has
                // been saved, trim the in-memory history to its newest entry so the
                // scrape-history document built below stores just this run's value.
                pageMetadata.FanCountHistory = pageMetadata.FanCountHistory.Take(1).ToList();
            }

            // Now update the total-page list of the scrape.
            var pageScrapeHistory = new PageScrapeHistory
            {
                Id          = Guid.NewGuid().ToString(),
                ImportStart = scrapeStart,
                ImportEnd   = DateTime.Now,
                Pages       = pagesToScrape
            };

            return(PageScrapeHistoryRepository.Save(pageScrapeHistory));
        }
Example #4
0
        /// <summary>
        /// GetLinksFor should return the expected number of links (82) for the
        /// realistic canned page content in TestData.
        /// </summary>
        public void when_gettingLinksFor_with_real_looking_data_expect_correct_number_returned()
        {
            // arrange
            var pageDataProvider = MockRepository.GenerateStub <IPageDataProvider>();

            // Any requested URL returns the canned realistic page data.
            pageDataProvider.Stub(x => x.GetPageFor(string.Empty))
            .Return(Task.Factory.StartNew(() => TestData))
            .IgnoreArguments();

            var linkhelper = MockRepository.GenerateStub <ILinkHelper>();

            // Every parsed link resolves to the same absolute URL.
            linkhelper.Stub(x => x.ParseLink(string.Empty, string.Empty))
            .Return("http://google.com")
            .IgnoreArguments();
            // act
            var result = new PageScraper(pageDataProvider, linkhelper).GetLinksFor(string.Empty);

            // assert
            Assert.That(result.Result.Count(), Is.EqualTo(82));
        }
Example #5
0
        /// <summary>
        /// End-to-end check: searches Amazon for "Java", scrapes the result books,
        /// and asserts the comparable book matches the scraped book list.
        /// </summary>
        public void EqualBook()
        {
            // Fixed window position/size so the page renders deterministically.
            ChromeOptions options = new ChromeOptions();

            options.AddArguments("--window-position=0,0");
            options.AddArguments("--window-size=1920,1080");

            // BUG FIX: the options were built but never passed to ChromeDriver, so the
            // window arguments had no effect.
            SeleneDriver driver = new SeleneDriver(new ChromeDriver(options));

            try
            {
                driver.Open("https://amazon.com");

                HomePage initPage = new HomePage(driver);

                initPage.FillField("Java");

                SearchPage search = initPage.Submit();

                PageScraper scraper = search.Confirm();

                CheckedBook checkedBook = scraper.InitComparableBook(search.Urls());

                Assert.True(checkedBook.IsEqual(scraper.allBooks));
            }
            finally
            {
                // BUG FIX: always shut the browser down; previously a failed assertion
                // skipped Close() and leaked the Chrome process.
                driver.Close();
            }
        }
Example #6
0
        /// <summary>
        /// Scrapes the configured detail page and verifies every lifecycle event
        /// (document open, property scraped, page scraped) fires with sane payloads.
        /// </summary>
        public async Task DetailPageScraper_ShouldBeOk()
        {
            //Arrange
            var pageSettings = this.GetPageSettings();
            var scraper      = new PageScraper(
                pageSettings,
                this._loggerMock.Object);

            scraper.BeforeDocumentOpen += (sender, e) =>
            {
                e.BrowsingContext.Should().NotBeNull();
                e.Url.Should().NotBeNullOrEmpty();
            };
            scraper.AfterDocumentOpen += (sender, e) =>
            {
                e.Document.Should().NotBeNull();
            };
            scraper.PropertyScraped += (sender, e) =>
            {
                e.ScrapedProperty.Name.Should().Be("Title");
                e.ScrapedProperty.Element.Should().NotBeNull();
                e.ScrapedProperty.Element.TextContent.Should().Be("Tomorrowland – Official Aftermovie – 30-JUL-2018");
                // BUG FIX: ".Should().Equals(...)" invokes object.Equals on the assertion
                // wrapper and never asserts anything; use a real FluentAssertions check.
                e.Settings.Should().BeEquivalentTo(pageSettings.Properties);
            };
            scraper.PageScraped += (sender, e) =>
            {
                e.Url.Should().NotBeNullOrEmpty();
                // BUG FIX: same silent non-assertion as above.
                e.Settings.Should().BeEquivalentTo(pageSettings.Properties);
                this.AssertProperties(
                    e.Properties.ToList(),
                    pageSettings.Properties);
            };

            //Act
            await scraper.Scrape();
        }
Example #7
0
        /// <summary>
        /// Registers MVC, CORS, the Facebook Graph client, the ElasticSearch-backed
        /// repositories, and the scraper singletons with the DI container.
        /// </summary>
        public void ConfigureServices(IServiceCollection services)
        {
            // Boilerplate: add service and create Policy with options
            // NOTE(review): AllowAnyOrigin() combined with AllowCredentials() is
            // disallowed by the CORS spec and rejected at runtime by ASP.NET Core 2.2+ —
            // confirm the target framework, or restrict origins explicitly.
            services.AddCors(options =>
            {
                options.AddPolicy("CorsPolicy",
                                  builder => builder.AllowAnyOrigin()
                                  .AllowAnyMethod()
                                  .AllowAnyHeader()
                                  .AllowCredentials());
            });

            services.AddMvc();
            services.AddSingleton(Configuration);

            // Register our repositories with ASP.NET Core to allow them to be injected
            // into our controllers. This preserves the same state between the controllers.

            Version facebookGraphAPIVersion = new Version(Configuration["facebook:graphAPIVersion"]);
            string  facebookAppId           = Configuration["facebook:appId"];
            string  facebookAppSecret       = Configuration["facebook:appSecret"];
            var     graphClient             = new GraphClient(facebookGraphAPIVersion, facebookAppId, facebookAppSecret);

            services.AddSingleton(graphClient);

            string elasticSearchUrl          = Configuration["elasticsearch:url"];
            string elasticSearchDefaultIndex = Configuration["elasticsearch:defaultIndex"];

            string elasticSearchUserName = Configuration["elasticsearch:user"];
            string elasticSearchPassword = Configuration["elasticsearch:password"];

            var node = new Uri(elasticSearchUrl);
            // Factory rather than a shared instance: each repository/scraper below gets
            // its own ConnectionSettings. Basic auth is applied only when a user is set.
            Func <ConnectionSettings> settings = () =>
            {
                var connectionSettings = new ConnectionSettings(node);
                if (string.IsNullOrEmpty(elasticSearchUserName))
                {
                    return(connectionSettings);
                }

                return(connectionSettings.BasicAuthentication(elasticSearchUserName, elasticSearchPassword));
            };

            // One ElasticSearch index per document type, all sharing the configured prefix.
            var pageMetadataRepository = new ElasticSearchRepository <PageMetadata>(settings(), elasticSearchDefaultIndex + "-metadata-page");

            services.AddSingleton(pageMetadataRepository);

            var pageScrapeHistoryRepository = new ElasticSearchRepository <PageScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-pagescrape");

            services.AddSingleton(pageScrapeHistoryRepository);

            var postScrapeRepository = new ElasticSearchRepository <PostScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-postscrape");

            services.AddSingleton(postScrapeRepository);

            var pageScraper = new PageScraper(settings(), elasticSearchDefaultIndex + "-page", graphClient);

            services.AddSingleton(pageScraper);

            // The post scraper takes the page scraper so posts can be tied to their pages.
            var postScraper = new PostScraper(settings(), elasticSearchDefaultIndex + "-post", pageScraper, graphClient);

            services.AddSingleton(postScraper);

            var commentScraper = new CommentScraper(settings(), elasticSearchDefaultIndex + "-comment", graphClient);

            services.AddSingleton(commentScraper);
        }
Example #8
0
        // Builds the scraper under test against a stubbed HTTP client factory
        // seeded with the canned markup from PageSource().
        public void Setup()
        {
            MockHttpClientFactory httpClientFactory = new MockHttpClientFactory(PageSource());
            pageScraper = new PageScraper(httpClientFactory);
        }
Example #9
0
        /// <summary>
        /// GetLinksFor should return the expected number of links (2) for the
        /// minimal canned page content in SimpleTestData.
        /// </summary>
        public void when_gettingLinksFor_with_simple_data_expect_correct_number_returned()
        {
            // arrange
            var pageDataProvider = MockRepository.GenerateStub<IPageDataProvider>();
            // Any requested URL returns the simple canned page data.
            pageDataProvider.Stub(x => x.GetPageFor(string.Empty))
                            .Return(Task.Factory.StartNew(() => SimpleTestData))
                            .IgnoreArguments();

            var linkhelper = MockRepository.GenerateStub<ILinkHelper>();
            // Every parsed link resolves to the same absolute URL.
            linkhelper.Stub(x => x.ParseLink(string.Empty, string.Empty))
                .Return("http://google.com")
                .IgnoreArguments();

            // act
            var result = new PageScraper(pageDataProvider, linkhelper).GetLinksFor(string.Empty);

            // assert
            Assert.That(result.Result.Count(), Is.EqualTo(2));
        }
 /// <summary>
 /// Returns one page of scraped-page documents matching the supplied query/sort.
 /// </summary>
 public PagedResponse AllScrapes([FromBody] ElasticSearchRequest request) =>
     PageScraper.Paged(request.PageNumber, request.PageSize, request.Query, request.Sort);
 /// <summary>Looks up a single scraped page by its document id.</summary>
 public ScrapedPage GetScrape(string id) => PageScraper.Get(id);
 // Dependency-injected wiring for the page-scrape endpoints: stores the scraper
 // and repositories this controller uses for its actions.
 public PageScrapeController(
     PageScraper pageScraper,
     ElasticSearchRepository<PageMetadata> pageMetadataRepository,
     ElasticSearchRepository<PageScrapeHistory> pageScrapeRepository)
 {
     PageScraper = pageScraper;
     PageMetadataRepository = pageMetadataRepository;
     PageScrapeHistoryRepository = pageScrapeRepository;
 }
Example #13
0
 // Stores the scrapers and metadata repository the importer reads and writes.
 public ScrapeImporter(
     PageScraper pageScraper,
     ElasticSearchRepository<PageMetadata> pageMetadataRepository,
     PostScraper postScraper)
 {
     PageScraper = pageScraper;
     PageMetadataRepository = pageMetadataRepository;
     PostScraper = postScraper;
 }
Example #14
0
        /// <summary>
        /// Imports per-page fan counts from one or more CSV exports and saves each
        /// (page, date, fan count) observation as a <see cref="ScrapedPage"/>.
        /// </summary>
        /// <param name="fanCountCSVs">CSV contents whose "Dates" column holds a
        /// yyyy-MM-dd date and whose remaining columns are per-page fan counts.</param>
        /// <returns>Every page document that was saved, in processing order.</returns>
        public IEnumerable <ScrapedPage> ImportPages(IEnumerable <string> fanCountCSVs)
        {
            var pages       = new List <ScrapedPage>();
            int numberSaved = 0;

            Read(fanCountCSVs, record =>
            {
                // The date is a string in a 2016-12-25 format.
                string dateString = (string)record["Dates"];
                if (dateString == "Date" || dateString == "")
                {
                    // Skip the header if it isn't parsed.
                    return;
                }

                // FIX (CA1305): parse machine-readable CSV data with the invariant
                // culture instead of the machine's current culture.
                DateTime date = DateTime.ParseExact(dateString, "yyyy-MM-dd", CultureInfo.InvariantCulture);

                // Now get the list of all the pages.
                foreach (string pageName in record.Keys)
                {
                    // Skip all columns that are empty or are the "Dates" field.
                    if (pageName == "" || pageName == "Dates")
                    {
                        continue;
                    }

                    // Yuck: page names have varying degrees of leading and trailing whitespace.
                    // Yuck: page names for the same page vary between instances.
                    // NOTE(review): an unmapped page name throws KeyNotFoundException here —
                    // confirm Mappings is guaranteed to cover every column.
                    PageMetadata mappedPage = Mappings[pageName.Trim()];

                    // Now get the number of likes from the table.
                    // Yuck: some data is missing, or contains letters in.
                    // Yuck: some full numbers have decimal points in.
                    string numberOfLikesAsString = (string)record[pageName];
                    if (!int.TryParse(numberOfLikesAsString, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out int numberOfLikes))
                    {
                        // If we can't parse the number of likes as an actual number, skip it.
                        Console.WriteLine("Can't parse number of likes");
                        continue;
                    }

                    // Add this to the fan count history: reuse the page's existing
                    // document for this date when one exists, otherwise create one.
                    ScrapedPage savedPage = PageScraper.Closest(p => p.Name, mappedPage.Name, date);
                    if (savedPage == null || savedPage.Date != date)
                    {
                        // Page doesn't have this date already. Add it.
                        savedPage = new ScrapedPage
                        {
                            Id         = Guid.NewGuid().ToString(),
                            Name       = mappedPage.Name,
                            Category   = mappedPage.Category,
                            FacebookId = mappedPage.FacebookId,
                            Date       = date,
                            FanCount   = numberOfLikes
                        };
                    }
                    else
                    {
                        // Page already has this date. Update it.
                        savedPage.FanCount = numberOfLikes;
                    }

                    // Save the page (Refresh.False: don't force an index refresh per save).
                    numberSaved++;
                    Console.WriteLine(numberSaved);
                    pages.Add(PageScraper.Save(savedPage, Refresh.False));
                }
            });

            return(pages);
        }