public PostScrapeController(PostScraper postScraper, CommentScraper commentScraper, PageScraper pageScraper,
    ElasticSearchRepository<PageMetadata> pageMetadataRepository,
    ElasticSearchRepository<PostScrapeHistory> postScrapeHistoryRepository)
{
    PostScraper = postScraper;
    CommentScraper = commentScraper;
    PageScraper = pageScraper;
    PageMetadataRepository = pageMetadataRepository;
    PostScrapeHistoryRepository = postScrapeHistoryRepository;
}
private void cmdGet_Click(object sender, EventArgs e)
{
    oItems = PageScraper.GetItems();

    if (oItems != null && oItems.Count > 0)
    {
        dgvResults.DataSource = oItems;
    }
    else
    {
        MessageBox.Show("No data to display");
    }
}
public PageScrapeHistory ScrapePages([FromBody] IEnumerable<string> request)
{
    // If no pages were specified, scrape them all.
    PageMetadata[] pagesToScrape;
    if (request == null)
    {
        pagesToScrape = PageMetadataRepository.All().Data.ToArray();
    }
    else
    {
        pagesToScrape = request.Select(id => PageMetadataRepository.Get(id)).ToArray();
    }

    DateTime scrapeStart = DateTime.Now;
    ScrapedPage[] pages = PageScraper.Scrape(pagesToScrape, scrapeStart).ToArray();

    // Now update the per-page list of all scraped pages.
    foreach (PageMetadata pageMetadata in pagesToScrape)
    {
        ScrapedPage scrapedPage = pages.First(p => p.FacebookId == pageMetadata.FacebookId);
        pageMetadata.FanCountHistory.Insert(0, new DatedFanCount
        {
            Date = scrapedPage.Date,
            FanCount = scrapedPage.FanCount
        });
        pageMetadata.LatestScrape = scrapeStart;
        PageMetadataRepository.Save(pageMetadata, Refresh.False);

        // Keep only this date's fan count on the scrape-history record built below.
        pageMetadata.FanCountHistory = pageMetadata.FanCountHistory.Take(1).ToList();
    }

    // Now record the whole scrape in the history index.
    var pageScrapeHistory = new PageScrapeHistory
    {
        Id = Guid.NewGuid().ToString(),
        ImportStart = scrapeStart,
        ImportEnd = DateTime.Now,
        Pages = pagesToScrape
    };

    return PageScrapeHistoryRepository.Save(pageScrapeHistory);
}
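// A hypothetical client-side sketch of calling the ScrapePages action above. The route
// ("api/pagescrape") and host are assumptions, since no [Route] attribute is shown;
// posting null (or omitting the body) scrapes every known page.
// Requires System.Net.Http and System.Net.Http.Json.
using var client = new HttpClient { BaseAddress = new Uri("http://localhost:5000/") };

// "<facebook-page-id>" is a placeholder for a real page id stored in the metadata index.
HttpResponseMessage response = await client.PostAsJsonAsync("api/pagescrape", new[] { "<facebook-page-id>" });
PageScrapeHistory history = await response.Content.ReadFromJsonAsync<PageScrapeHistory>();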
public void when_gettingLinksFor_with_real_looking_data_expect_correct_number_returned()
{
    // arrange
    var pageDataProvider = MockRepository.GenerateStub<IPageDataProvider>();
    pageDataProvider.Stub(x => x.GetPageFor(string.Empty))
        .Return(Task.Factory.StartNew(() => TestData))
        .IgnoreArguments();

    var linkHelper = MockRepository.GenerateStub<ILinkHelper>();
    linkHelper.Stub(x => x.ParseLink(string.Empty, string.Empty))
        .Return("http://google.com")
        .IgnoreArguments();

    // act
    var result = new PageScraper(pageDataProvider, linkHelper).GetLinksFor(string.Empty);

    // assert
    Assert.That(result.Result.Count(), Is.EqualTo(82));
}
public void EqualBook()
{
    ChromeOptions options = new ChromeOptions();
    options.AddArguments("--window-position=0,0");
    options.AddArguments("--window-size=1920,1080");

    // Apply the window options when constructing the driver.
    SeleneDriver driver = new SeleneDriver(new ChromeDriver(options));

    driver.Open("https://amazon.com");
    HomePage initPage = new HomePage(driver);
    initPage.FillField("Java");
    SearchPage search = initPage.Submit();
    PageScraper scraper = search.Confirm();
    CheckedBook checkedBook = scraper.InitComparableBook(search.Urls());
    Assert.True(checkedBook.IsEqual(scraper.allBooks));
    driver.Close();
}
public async Task DetailPageScraper_ShouldBeOk()
{
    //Arrange
    var pageSettings = this.GetPageSettings();
    var scraper = new PageScraper(pageSettings, this._loggerMock.Object);

    scraper.BeforeDocumentOpen += (sender, e) =>
    {
        e.BrowsingContext.Should().NotBeNull();
        e.Url.Should().NotBeNullOrEmpty();
    };

    scraper.AfterDocumentOpen += (sender, e) =>
    {
        e.Document.Should().NotBeNull();
    };

    scraper.PropertyScraped += (sender, e) =>
    {
        e.ScrapedProperty.Name.Should().Be("Title");
        e.ScrapedProperty.Element.Should().NotBeNull();
        e.ScrapedProperty.Element.TextContent.Should().Be("Tomorrowland – Official Aftermovie – 30-JUL-2018");

        // Should().Equals(...) performs no assertion; BeSameAs verifies the reference.
        e.Settings.Should().BeSameAs(pageSettings.Properties);
    };

    scraper.PageScraped += (sender, e) =>
    {
        e.Url.Should().NotBeNullOrEmpty();
        e.Settings.Should().BeSameAs(pageSettings.Properties);
        this.AssertProperties(e.Properties.ToList(), pageSettings.Properties);
    };

    //Act
    await scraper.Scrape();
}
public void ConfigureServices(IServiceCollection services)
{
    // Boilerplate: add service and create Policy with options
    services.AddCors(options =>
    {
        options.AddPolicy("CorsPolicy",
            builder => builder.AllowAnyOrigin()
                .AllowAnyMethod()
                .AllowAnyHeader()
                .AllowCredentials());
    });

    services.AddMvc();
    services.AddSingleton(Configuration);

    // Register our repositories with ASP.NET Core to allow them to be injected
    // into our controllers. This preserves the same state between the controllers.
    Version facebookGraphAPIVersion = new Version(Configuration["facebook:graphAPIVersion"]);
    string facebookAppId = Configuration["facebook:appId"];
    string facebookAppSecret = Configuration["facebook:appSecret"];
    var graphClient = new GraphClient(facebookGraphAPIVersion, facebookAppId, facebookAppSecret);
    services.AddSingleton(graphClient);

    string elasticSearchUrl = Configuration["elasticsearch:url"];
    string elasticSearchDefaultIndex = Configuration["elasticsearch:defaultIndex"];
    string elasticSearchUserName = Configuration["elasticsearch:user"];
    string elasticSearchPassword = Configuration["elasticsearch:password"];
    var node = new Uri(elasticSearchUrl);

    Func<ConnectionSettings> settings = () =>
    {
        var connectionSettings = new ConnectionSettings(node);
        if (string.IsNullOrEmpty(elasticSearchUserName))
        {
            return connectionSettings;
        }
        return connectionSettings.BasicAuthentication(elasticSearchUserName, elasticSearchPassword);
    };

    var pageMetadataRepository = new ElasticSearchRepository<PageMetadata>(settings(), elasticSearchDefaultIndex + "-metadata-page");
    services.AddSingleton(pageMetadataRepository);

    var pageScrapeHistoryRepository = new ElasticSearchRepository<PageScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-pagescrape");
    services.AddSingleton(pageScrapeHistoryRepository);

    var postScrapeRepository = new ElasticSearchRepository<PostScrapeHistory>(settings(), elasticSearchDefaultIndex + "-metadata-postscrape");
    services.AddSingleton(postScrapeRepository);

    var pageScraper = new PageScraper(settings(), elasticSearchDefaultIndex + "-page", graphClient);
    services.AddSingleton(pageScraper);

    var postScraper = new PostScraper(settings(), elasticSearchDefaultIndex + "-post", pageScraper, graphClient);
    services.AddSingleton(postScraper);

    var commentScraper = new CommentScraper(settings(), elasticSearchDefaultIndex + "-comment", graphClient);
    services.AddSingleton(commentScraper);
}
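// A minimal sketch of the configuration keys ConfigureServices reads above, supplied
// through an in-memory provider for local testing. All values here are placeholders,
// not settings from the original project. Requires Microsoft.Extensions.Configuration.
IConfiguration configuration = new ConfigurationBuilder()
    .AddInMemoryCollection(new Dictionary<string, string>
    {
        ["facebook:graphAPIVersion"] = "2.11",
        ["facebook:appId"] = "<app-id>",
        ["facebook:appSecret"] = "<app-secret>",
        ["elasticsearch:url"] = "http://localhost:9200",
        ["elasticsearch:defaultIndex"] = "<default-index>",
        ["elasticsearch:user"] = "",        // empty user skips BasicAuthentication above
        ["elasticsearch:password"] = ""
    })
    .Build();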
public void Setup()
{
    var mockHttpClientFactory = new MockHttpClientFactory(PageSource());
    pageScraper = new PageScraper(mockHttpClientFactory);
}
public void when_gettingLinksFor_with_simple_data_expect_correct_number_returned()
{
    // arrange
    var pageDataProvider = MockRepository.GenerateStub<IPageDataProvider>();
    pageDataProvider.Stub(x => x.GetPageFor(string.Empty))
        .Return(Task.Factory.StartNew(() => SimpleTestData))
        .IgnoreArguments();

    var linkHelper = MockRepository.GenerateStub<ILinkHelper>();
    linkHelper.Stub(x => x.ParseLink(string.Empty, string.Empty))
        .Return("http://google.com")
        .IgnoreArguments();

    // act
    var result = new PageScraper(pageDataProvider, linkHelper).GetLinksFor(string.Empty);

    // assert
    Assert.That(result.Result.Count(), Is.EqualTo(2));
}
public PagedResponse AllScrapes([FromBody] ElasticSearchRequest request)
{
    return PageScraper.Paged(request.PageNumber, request.PageSize, request.Query, request.Sort);
}
public ScrapedPage GetScrape(string id) => PageScraper.Get(id);
public PageScrapeController(PageScraper pageScraper,
    ElasticSearchRepository<PageMetadata> pageMetadataRepository,
    ElasticSearchRepository<PageScrapeHistory> pageScrapeRepository)
{
    PageScraper = pageScraper;
    PageMetadataRepository = pageMetadataRepository;
    PageScrapeHistoryRepository = pageScrapeRepository;
}
public ScrapeImporter(PageScraper pageScraper, ElasticSearchRepository<PageMetadata> pageMetadataRepository, PostScraper postScraper)
{
    PageScraper = pageScraper;
    PageMetadataRepository = pageMetadataRepository;
    PostScraper = postScraper;
}
public IEnumerable<ScrapedPage> ImportPages(IEnumerable<string> fanCountCSVs)
{
    var pages = new List<ScrapedPage>();
    DateTime now = DateTime.Now;
    int numberSaved = 0;

    Read(fanCountCSVs, record =>
    {
        // The date is a string in 2016-12-25 format.
        string dateString = (string)record["Dates"];
        if (dateString == "Date" || dateString == "")
        {
            // Skip the header row and any rows without a date.
            return;
        }

        DateTime date = DateTime.ParseExact(dateString, "yyyy-MM-dd", null);

        // Now walk the list of all the pages in this row.
        foreach (string pageName in record.Keys)
        {
            // Skip all columns that are empty or are the "Dates" field.
            if (pageName == "" || pageName == "Dates")
            {
                continue;
            }

            // Yuck: page names have varying degrees of leading and trailing whitespace.
            // Yuck: page names for the same page vary between instances.
            PageMetadata mappedPage = Mappings[pageName.Trim()];

            // Now get the number of likes from the table.
            // Yuck: some data is missing or contains letters.
            // Yuck: some whole numbers have decimal points in them.
            string numberOfLikesAsString = (string)record[pageName];
            if (!int.TryParse(numberOfLikesAsString, NumberStyles.AllowDecimalPoint, null, out int numberOfLikes))
            {
                // If we can't parse the number of likes as an actual number, skip it.
                Console.WriteLine("Can't parse number of likes");
                continue;
            }

            // Add this to the fan count history.
            ScrapedPage savedPage = PageScraper.Closest(p => p.Name, mappedPage.Name, date);
            if (savedPage == null || savedPage.Date != date)
            {
                // The page doesn't have an entry for this date yet; create one.
                savedPage = new ScrapedPage
                {
                    Id = Guid.NewGuid().ToString(),
                    Name = mappedPage.Name,
                    Category = mappedPage.Category,
                    FacebookId = mappedPage.FacebookId,
                    Date = date,
                    FanCount = numberOfLikes
                };
            }
            else
            {
                // The page already has an entry for this date; update it.
                savedPage.FanCount = numberOfLikes;
            }

            // Save the page.
            numberSaved++;
            Console.WriteLine(numberSaved);
            pages.Add(PageScraper.Save(savedPage, Refresh.False));
        }
    });

    return pages;
}
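// A usage sketch for the importer above. Whether each string is a file path or the raw
// CSV content depends on Read(...), which isn't shown; here each string is assumed to be
// a file's contents, and "fan-counts.csv" is a placeholder name. Requires System.IO.
var importer = new ScrapeImporter(pageScraper, pageMetadataRepository, postScraper);
IEnumerable<ScrapedPage> imported = importer.ImportPages(new[] { File.ReadAllText("fan-counts.csv") });
Console.WriteLine($"Imported {imported.Count()} page snapshots.");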