public async Task CanScrapeDataFromTable()
{
    // Arrange: point the scrapper at the sample data page and describe one
    // ScrapeItemSetting per table column we want extracted from each row.
    _scrapper.ItemXPath = ".body-content table:first-child > tbody > tr";
    _scrapper.Url = "http://lychee.scrapper.localhost/Home/Data";
    _scrapper.Items = new List<ScrapeItemSetting>
    {
        new ScrapeItemSetting { Key = "Name", Selector = "td:nth-child(1)" },
        new ScrapeItemSetting { Key = "Price", Selector = "td:nth-child(2) input", AttributeName = "value" },
        new ScrapeItemSetting { Key = "PlatformValue", Selector = "td:nth-child(4) select option[selected=\"selected\"]", AttributeName = "value" },
        new ScrapeItemSetting { Key = "PlatformText", Selector = "td:nth-child(4) select option[selected=\"selected\"]" },
        new ScrapeItemSetting { Key = "IsActive", Selector = "td:nth-child(5) input", AttributeName = "checked" },
    };

    // Act
    var scrapedRows = await _scrapper.Scrape();

    // Assert
    Assert.That(scrapedRows, Is.Not.Null);
}
/// <summary>
/// If the page list has more than 1 page, we scrape all the other pages on the
/// same category too. Pages 2..lastPage are scraped in parallel (at most 8 at a
/// time), each with its own scrapper cloned from the first-page scrapper, and
/// each page's results are persisted as its action completes. Failures are
/// aggregated and logged rather than rethrown.
/// </summary>
/// <param name="lastPage">Last page number of the category, inclusive.</param>
/// <param name="firstPageScrapper">The scrapper that handled page 1; its configuration is cloned for every subsequent page.</param>
public virtual void ScrapeOtherPages(int lastPage, IPageListScrapper firstPageScrapper)
{
    var actions = new List<Action>();
    for (var i = 2; i <= lastPage; i++)
    {
        // One scrapper instance per page so the parallel actions share no mutable state.
        var scrapper = new PageListScrapper(_settingRepository, _loggingService, _webQueryService);
        firstPageScrapper.Clone(scrapper);
        scrapper.Url = GetNextUrl(i, firstPageScrapper.Url);
        actions.Add(() =>
        {
            // NOTE(review): blocking on .Result inside Parallel.Invoke is the
            // existing design; the actions run on dedicated pool threads.
            var data = scrapper.Scrape();
            _resultCollectionService.SaveScrappedData(data.Result);
        });
    }

    //Invoke all the tasks
    try
    {
        Parallel.Invoke(new ParallelOptions { MaxDegreeOfParallelism = 8 }, actions.ToArray());
    }
    catch (AggregateException ex)
    {
        // Fix: the original swapped string.Join's arguments —
        // string.Join(ex.InnerExceptions.ToString(), ",") used the collection's
        // type name as the separator and logged only ",". Join the individual
        // exceptions with "," so every failure is actually recorded.
        var exceptions = string.Join(",", ex.InnerExceptions);
        _loggingService.Logger.Error(exceptions);
    }
}
public async Task ScrapeTest()
{
    // Arrange: one ScrapeItemSetting per product attribute to pull out of the listing grid.
    _scrapper.ItemXPath = "div.product-list div.product";
    _scrapper.Items = new List<ScrapeItemSetting>
    {
        new ScrapeItemSetting { Key = "Url", AttributeName = "href", Selector = "div.details div.title a" },
        new ScrapeItemSetting { Key = "ProductName", Selector = "div.details div.title a", IsIdentifier = true },
        new ScrapeItemSetting { Key = "Price", Selector = "div.product-price span.price" },
        new ScrapeItemSetting { Key = "Image", AttributeName = "src", ValueRequired = true, Selector = "div.image img" }
    };

    // Act
    var products = await _scrapper.Scrape();

    // Assert: every scraped product carries a non-empty value for the given key.
    bool EveryProductHas(string key) =>
        products.All(p => p.Items.Exists(i => i.Name == key && !string.IsNullOrEmpty(i.Value.ToString())));

    Assert.That(products, Is.Not.Null);
    Assert.That(products, Is.All.Not.Null);
    Assert.That(EveryProductHas("ProductName"), Is.True);
    Assert.That(EveryProductHas("Price"), Is.True);
    Assert.That(EveryProductHas("Url"), Is.True);
}
public async Task ScrapeTest()
{
    // Arrange: product attributes to extract; Image and ColourSwatch may yield
    // multiple values per product (MultipleValue = true).
    _scrapper.ItemXPath = ".js-productContent article";
    _scrapper.Items = new List<ScrapeItemSetting>
    {
        new ScrapeItemSetting { Key = "Url", AttributeName = "href", Selector = "a.js-imagehover" },
        new ScrapeItemSetting { Key = "ProductName", Selector = "div:nth-child(2) a", AttributeName = "title", IsIdentifier = true },
        new ScrapeItemSetting { Key = "Price", Selector = "div.price" },
        new ScrapeItemSetting { Key = "Image", AttributeName = "src", ValueRequired = true, Selector = "img", MultipleValue = true },
        new ScrapeItemSetting { Key = "ColourSwatch", AttributeName = "title", MultipleValue = true, Selector = ".swatchContainer a" }
    };

    // Act
    var products = await _scrapper.Scrape();

    // Assert: every scraped product carries a non-empty value for the given key.
    bool EveryProductHas(string key) =>
        products.All(p => p.Items.Exists(i => i.Name == key && !string.IsNullOrEmpty(i.Value.ToString())));

    Assert.That(products, Is.Not.Null);
    Assert.That(products, Is.All.Not.Null);
    Assert.That(EveryProductHas("ProductName"), Is.True);
    Assert.That(EveryProductHas("Price"), Is.True);
    Assert.That(EveryProductHas("Url"), Is.True);
}
/// <summary>
/// Scrapes the initial page of the list, persists the results, and — when the
/// loaded page is page 1 of a multi-page category — triggers scraping of the
/// remaining pages via <c>ScrapeOtherPages</c>.
/// </summary>
public virtual async Task Scrape()
{
    // Loads the initial page.
    var scrappedData = await _scrapper.Scrape();

    // Save to db.
    _resultCollectionService.SaveScrappedData(scrappedData);

    var htmlNode = _scrapper.GetLoadedHtmlNode();
    if (_pageListPaginationService.IsFirstPage(htmlNode))
    {
        var lastPage = _pageListPaginationService.GetLastPageNumber(htmlNode);
        if (lastPage > 1)
        {
            ScrapeOtherPages(lastPage, _scrapper);
        }
    }

    // Design notes (original author's plan, kept for reference):
    // e.g. https://www.mightyape.co.nz/games/ps4/best-sellers?page=1
    // - Scrape the whole list? PageListScrapper.Strategy.AutomaticallyScrapeAllInCategory.
    // - This handles only normal pagination; pages with a LoadMore button
    //   (ComputerLounge, Carousell) should use SmartScrapper instead.
    // - Goal is to find the last page number. If not given directly:
    //   * Look for a pagination summary (items per page + total items in the
    //     category, e.g. "1 to 40 of 601" => total 601, 40 per page), then
    //     compute lastPage = Math.Ceiling(totalProducts / productsPerPage).
    //   * If those numbers are unavailable (e.g. maxshop) it gets more
    //     complicated; check for a ViewAll button (Maxshop has one).
    // - Some sites expose the last page directly (glassons.com/clothing/tops,
    //   numberoneshoes.co.nz/womens) — then skip the computation above.
    // - After determining the pages, decide parallel vs linear scraping:
    //   Scrapping.PageListScrapper.IsParallelScrapping (maximum of 4 threads running).
}
public async Task ScrapeTest()
{
    // Arrange: scrape the movie list and enable page-download logging via the
    // mocked setting repository.
    _scrapper.Url = "http://mymovies.localhost/";
    _scrapper.ItemXPath = "#content .container .row:first-child .col-sm-6";
    _scrapper.Items = new List<ScrapeItemSetting>
    {
        new ScrapeItemSetting { Key = "MovieName", Selector = "h3" }
    };
    _settingRepository.Setup(x => x.GetSettingValue<bool>("Core.Logger.LogDownloadedPage")).Returns(true);

    // Act
    var movies = await _scrapper.Scrape();

    // Assert: every scraped item has a non-empty MovieName.
    Assert.That(movies, Is.Not.Null);
    Assert.That(movies, Is.All.Not.Null);
    Assert.That(
        movies.All(m => m.Items.Exists(i => i.Name == "MovieName" && !string.IsNullOrEmpty(i.Value.ToString()))),
        Is.True);
}