Exemple #1
0
        /// <summary>
        /// Scrapes the sample data table page and checks that a result is returned.
        /// </summary>
        public async Task CanScrapeDataFromTable()
        {
            // Arrange
            // The selected platform option is read twice: once for its value attribute, once for its text.
            const string selectedPlatformOption = "td:nth-child(4) select option[selected=\"selected\"]";

            _scrapper.ItemXPath = ".body-content table:first-child > tbody > tr";
            _scrapper.Url = "http://lychee.scrapper.localhost/Home/Data";

            var columnSettings = new List<ScrapeItemSetting>();
            columnSettings.Add(new ScrapeItemSetting { Key = "Name", Selector = "td:nth-child(1)" });
            columnSettings.Add(new ScrapeItemSetting { Key = "Price", Selector = "td:nth-child(2) input", AttributeName = "value" });
            columnSettings.Add(new ScrapeItemSetting { Key = "PlatformValue", Selector = selectedPlatformOption, AttributeName = "value" });
            columnSettings.Add(new ScrapeItemSetting { Key = "PlatformText", Selector = selectedPlatformOption });
            columnSettings.Add(new ScrapeItemSetting { Key = "IsActive", Selector = "td:nth-child(5) input", AttributeName = "checked" });
            _scrapper.Items = columnSettings;

            // Act
            var scraped = await _scrapper.Scrape();

            // Assert
            Assert.That(scraped, Is.Not.Null);
        }
        /// <summary>
        /// If the page list has more than 1 page, we scrape all the other pages on the same category too.
        /// Pages 2 through <paramref name="lastPage"/> are scraped in parallel and each page's data is saved
        /// through the result collection service. Failures are logged, not rethrown.
        /// </summary>
        /// <param name="lastPage">Number of the last page to scrape (inclusive). Values below 2 result in no work.</param>
        /// <param name="firstPageScrapper">
        /// Scrapper used for the first page. It is cloned onto every per-page scrapper and its Url is
        /// passed to GetNextUrl to build each page's URL.
        /// </param>
        public virtual void ScrapeOtherPages(int lastPage, IPageListScrapper firstPageScrapper)
        {
            var actions = new List<Action>();

            for (var i = 2; i <= lastPage; i++)
            {
                // One scrapper per page so the parallel actions do not share a Url.
                var scrapper = new PageListScrapper(_settingRepository, _loggingService, _webQueryService);
                firstPageScrapper.Clone(scrapper);
                scrapper.Url = GetNextUrl(i, firstPageScrapper.Url);

                actions.Add(() =>
                {
                    // Parallel.Invoke only accepts synchronous delegates, so the scrape is awaited by blocking.
                    var data = scrapper.Scrape();
                    _resultCollectionService.SaveScrappedData(data.Result);
                });
            }

            // The loop starts at page 2, so there is nothing to run when lastPage < 2.
            if (actions.Count == 0)
            {
                return;
            }

            //Invoke all the tasks
            try
            {
                Parallel.Invoke(new ParallelOptions {
                    MaxDegreeOfParallelism = 8
                }, actions.ToArray());
            }
            catch (AggregateException ex)
            {
                // Blocking on Task.Result wraps failures in their own AggregateException, which
                // Parallel.Invoke wraps again; Flatten() exposes the underlying exceptions.
                // The separator is the FIRST argument of string.Join; the previous call had the
                // arguments swapped and therefore always logged just ",".
                var exceptions = string.Join(",", ex.Flatten().InnerExceptions);
                _loggingService.Logger.Error(exceptions);
            }
        }
Exemple #3
0
        /// <summary>
        /// Scrapes a product list and checks that every product has a non-empty
        /// ProductName, Price and Url item.
        /// </summary>
        public async Task ScrapeTest()
        {
            // Arrange
            _scrapper.ItemXPath = "div.product-list div.product";

            var settings = new List<ScrapeItemSetting>();
            settings.Add(new ScrapeItemSetting { Key = "Url", AttributeName = "href", Selector = "div.details div.title a" });
            settings.Add(new ScrapeItemSetting { Key = "ProductName", Selector = "div.details div.title a", IsIdentifier = true });
            settings.Add(new ScrapeItemSetting { Key = "Price", Selector = "div.product-price span.price" });
            settings.Add(new ScrapeItemSetting { Key = "Image", AttributeName = "src", ValueRequired = true, Selector = "div.image img" });
            _scrapper.Items = settings;

            // Act
            var scrapedProducts = await _scrapper.Scrape();

            // Assert
            Assert.That(scrapedProducts, Is.Not.Null);
            Assert.That(scrapedProducts, Is.All.Not.Null);

            // Every product must carry a non-empty value for each of these keys.
            foreach (var requiredKey in new[] { "ProductName", "Price", "Url" })
            {
                var everyProductHasValue = scrapedProducts.All(product =>
                    product.Items.Exists(item => item.Name == requiredKey && !string.IsNullOrEmpty(item.Value.ToString())));

                Assert.That(everyProductHasValue, Is.True);
            }
        }
        /// <summary>
        /// Scrapes a product list whose items are article elements, including multi-valued
        /// image and colour swatch settings, and checks that every product has a non-empty
        /// ProductName, Price and Url item.
        /// </summary>
        public async Task ScrapeTest()
        {
            // Arrange
            _scrapper.ItemXPath = ".js-productContent article";

            var url = new ScrapeItemSetting { Key = "Url", AttributeName = "href", Selector = "a.js-imagehover" };
            var productName = new ScrapeItemSetting { Key = "ProductName", Selector = "div:nth-child(2) a", AttributeName = "title", IsIdentifier = true };
            var price = new ScrapeItemSetting { Key = "Price", Selector = "div.price" };
            var image = new ScrapeItemSetting { Key = "Image", AttributeName = "src", ValueRequired = true, Selector = "img", MultipleValue = true };
            var colourSwatch = new ScrapeItemSetting { Key = "ColourSwatch", AttributeName = "title", MultipleValue = true, Selector = ".swatchContainer a" };

            _scrapper.Items = new List<ScrapeItemSetting> { url, productName, price, image, colourSwatch };

            // Act
            var scrapedProducts = await _scrapper.Scrape();

            // Assert
            Assert.That(scrapedProducts, Is.Not.Null);
            Assert.That(scrapedProducts, Is.All.Not.Null);

            // Each listed key must be present with a non-empty value on every product.
            foreach (var requiredKey in new[] { "ProductName", "Price", "Url" })
            {
                var presentOnAll = scrapedProducts.All(product =>
                    product.Items.Exists(item => item.Name == requiredKey && !string.IsNullOrEmpty(item.Value.ToString())));

                Assert.That(presentOnAll, Is.True);
            }
        }
        /// <summary>
        /// Scrapes the initially loaded page and saves its data. When that page is the first page of
        /// a list with more than one page, the remaining pages are scraped as well.
        /// </summary>
        public virtual async Task Scrape()
        {
            // Load the initial page and persist what was scraped from it
            var firstPageData = await _scrapper.Scrape();
            _resultCollectionService.SaveScrappedData(firstPageData);

            var loadedNode = _scrapper.GetLoadedHtmlNode();

            // Pagination is only followed when the loaded page is the first page
            if (!_pageListPaginationService.IsFirstPage(loadedNode))
            {
                return;
            }

            var lastPageNumber = _pageListPaginationService.GetLastPageNumber(loadedNode);
            if (lastPageNumber > 1)
            {
                ScrapeOtherPages(lastPageNumber, _scrapper);
            }

            // Design notes
            // ------------
            // Example list URL: https://www.mightyape.co.nz/games/ps4/best-sellers?page=1
            // Scrape the whole list? PageListScrapper.Strategy.AutomaticallyScrapeAllInCategory
            //
            // This is only for scraping pages with normal pagination.
            // If the page has a LoadMore button, like ComputerLounge or Carousell, then use SmartScrapper.
            //
            // The goal is to find the last page number. If it is not given:
            //   - Determine whether there is a pagination summary, i.e. how many items are displayed
            //     per page and the total number of products in the category.
            //   - Check whether the total number of products in the list is given.
            //     Example: "1 to 40 of 601" - the total is 601.
            //   - Determine how many products are displayed in the list.
            //     Example: "1 to 40 of 601" - the list shows 40.
            //
            // If these are all available then it is a little bit easier.
            // If they are not available, like on the maxshop site, it is a little more complicated.
            // Check whether there is a ViewAll button. In most cases there is not; in Maxshop's case there is.
            //
            // If the last page is given then the complicated steps above are not needed. Examples of
            // websites that already give the last page:
            // https://www.glassons.com/clothing/tops, https://www.numberoneshoes.co.nz/womens
            //
            // If on page 1:
            //   - Determine the last page. Given the total number of products and the products displayed
            //     per page, the last page (if not given) is Math.Ceil(totalproducts/productperpage).
            //   - Otherwise just execute the code above to scrape the current page.
            //
            // After determining the pages, decide how to scrape the products on the other pages:
            // in parallel or linearly (Scrapping.PageListScrapper.IsParallelScrapping).
            // Maximum of 4 threads running.
        }
        /// <summary>
        /// Scrapes the local movies page with the "Core.Logger.LogDownloadedPage" setting stubbed
        /// to true and checks that every scraped entry has a non-empty MovieName item.
        /// </summary>
        public async Task ScrapeTest()
        {
            // Arrange
            _scrapper.Url = "http://mymovies.localhost/";
            _scrapper.ItemXPath = "#content .container .row:first-child .col-sm-6";

            var movieNameSetting = new ScrapeItemSetting { Key = "MovieName", Selector = "h3" };
            _scrapper.Items = new List<ScrapeItemSetting> { movieNameSetting };

            _settingRepository
                .Setup(repository => repository.GetSettingValue<bool>("Core.Logger.LogDownloadedPage"))
                .Returns(true);

            // Act
            var movies = await _scrapper.Scrape();

            // Assert
            Assert.That(movies, Is.Not.Null);
            Assert.That(movies, Is.All.Not.Null);

            var everyMovieIsNamed = movies.All(movie =>
                movie.Items.Exists(item => item.Name == "MovieName" && !string.IsNullOrEmpty(item.Value.ToString())));
            Assert.That(everyMovieIsNamed, Is.True);
        }