// Checks that PageListPaginationService.GetLastPageNumber returns 13 for the HTML
// fixture supplied by MightyAppePageListScrapperTest.LoadHtmlFromText() when the mocked
// settings say the last page is NOT given, the total number of products IS given,
// and 40 products are shown per page.
// NOTE(review): the scrapper is constructed with a real SettingRepository while the
// pagination service receives the mocked one - confirm this split is intentional.
public void CanGetTheLastPageNumberIfTotalProductIsGiven()
{
    const string IsLastPageGivenKey = "PageListScrapper.Pagination.IsLastPageGiven";
    const string IsTotalGivenKey = "PageListScrapper.Pagination.IsTotalNumberOfProductsGiven";
    const string ProductsPerPageKey = "PageListScrapper.Pagination.ProductsPerPage";
    const string TotalSelectorKey = "PageListScrapper.Pagination.TotalNumberOfProductsSelector";

    // Arrange - pagination settings served by the mocked repository.
    _settingRepository
        .Setup(repo => repo.GetSettingValue<bool>(IsLastPageGivenKey))
        .Returns(false);
    _settingRepository
        .Setup(repo => repo.GetSettingValue<bool>(IsTotalGivenKey))
        .Returns(true);
    _settingRepository
        .Setup(repo => repo.GetSettingValue<int>(ProductsPerPageKey))
        .Returns(40);
    _settingRepository
        .Setup(repo => repo.GetSettingValue<string>(TotalSelectorKey))
        .Returns(".products .gallery-header .summary .results .total");

    // Arrange - scrapper preloaded with the fixture HTML, and the service under test.
    var listScrapper = new PageListScrapper(
        new SettingRepository(),
        _loggingService.Object,
        _webQueryService.Object,
        MightyAppePageListScrapperTest.LoadHtmlFromText())
    {
        PaginationSettings = new PageListPagination
        {
            PaginationSelector = ".pagination li active span"
        }
    };
    var paginationService = new PageListPaginationService(
        _settingRepository.Object,
        _loggingService.Object,
        listScrapper);
    var loadedNode = listScrapper.GetLoadedHtmlNode();

    // Act
    var computedLastPage = paginationService.GetLastPageNumber(loadedNode);

    // Assert
    Assert.That(computedLastPage, Is.EqualTo(13));
}
// Scrapes the initially loaded page, stores the result through _resultCollectionService,
// and - when the pagination service reports that the loaded page is the first page and
// that the last page number is greater than 1 - hands the remaining pages to ScrapeOtherPages.
public virtual async Task Scrape()
{
    // Load and scrape the initial page, then persist what was found.
    var initialPageData = await _scrapper.Scrape();
    _resultCollectionService.SaveScrappedData(initialPageData);

    var loadedNode = _scrapper.GetLoadedHtmlNode();

    // Only the first page triggers scraping of the rest of the list.
    if (!_pageListPaginationService.IsFirstPage(loadedNode))
    {
        return;
    }

    var lastPageNumber = _pageListPaginationService.GetLastPageNumber(loadedNode);
    if (lastPageNumber > 1)
    {
        // NOTE(review): this call is not awaited. If ScrapeOtherPages returns a Task,
        // Scrape() may complete before the other pages are done - confirm intent.
        ScrapeOtherPages(lastPageNumber, _scrapper);
    }

    // Design notes
    // ------------
    // Sample list URL: https://www.mightyape.co.nz/games/ps4/best-sellers?page=1
    // Scrape the whole list? See PageListScrapper.Strategy.AutomaticallyScrapeAllInCategory.
    //
    // This flow is only for pages with normal pagination. If the page has a "Load more"
    // button (like ComputerLounge or Carousell), use SmartScrapper instead.
    //
    // The goal is to find the last page number. If it is not given:
    //   * Determine whether there is a pagination summary, i.e. how many items are
    //     displayed per page and the total number of products in the category.
    //   * Check whether the total number of products is given.
    //     Example: "1 to 40 of 601" -> total products is 601.
    //   * Determine how many products are displayed in the list.
    //     Example: "1 to 40 of 601" -> products in list are 40.
    //   * If all of these are available it is a little easier. If not (like on the
    //     Maxshop site) it is a little more complicated.
    //   * Check whether there is a "View all" button. In most cases there is not;
    //     in Maxshop's case there is.
    //
    // If the last page is given, none of the complicated steps above are needed.
    // Examples of sites that already give the last page:
    //   https://www.glassons.com/clothing/tops
    //   https://www.numberoneshoes.co.nz/womens
    //
    // If on page 1: determine the last page. Given the total number of products and
    // the products displayed per page, the last page (if not given) can be computed
    // as Math.Ceil(totalproducts/productperpage).
    // If not on page 1: just scrape the current page.
    //
    // After determining the pages, decide how to scrape the products on the other
    // pages: in parallel or linearly (Scrapping.PageListScrapper.IsParallelScrapping).
    // Maximum of 4 threads running.
}
// Checks that PageListPaginationService.IsFirstPage returns false for the HTML fixture
// supplied by MightyAppePageListScrapperTest.LoadHtmlFromText() when the scrapper is
// configured with the pagination selector ".pagination li active span".
public void CanDetermineIfFirstPageByLookingAtThePaginationDOM_ShouldNOTPass_InvalidDOMPage()
{
    // Arrange - scrapper preloaded with the fixture HTML, and the service under test.
    var listScrapper = new PageListScrapper(
        new SettingRepository(),
        _loggingService.Object,
        _webQueryService.Object,
        MightyAppePageListScrapperTest.LoadHtmlFromText())
    {
        PaginationSettings = new PageListPagination
        {
            PaginationSelector = ".pagination li active span"
        }
    };
    var paginationService = new PageListPaginationService(
        _settingRepository.Object,
        _loggingService.Object,
        listScrapper);

    // Act
    var loadedNode = listScrapper.GetLoadedHtmlNode();
    var isFirstPage = paginationService.IsFirstPage(loadedNode);

    // Assert
    Assert.That(isFirstPage, Is.False);
}