Exemple #1
0
        /// <summary>
        /// Opens the search page for <paramref name="productName"/>, picks the listing whose heading has the
        /// smallest Levenshtein distance to the requested name and returns its price details.
        /// </summary>
        /// <param name="productName">
        /// Free-text product name. It is URL-escaped before being appended to <c>SearchUrlSelector</c> and
        /// lower-cased before being compared with the (also lower-cased) headings.
        /// </param>
        /// <returns>
        /// A <see cref="Hashtable"/> with the keys "Price", "Formatted Price", "Similarity", "Product Heading"
        /// and "Product Link". When the price element contains exactly one '£', "Price" is the raw price text
        /// and "Formatted Price" is that text with '£' replaced by a space (both strings). Otherwise both
        /// entries hold the average of the '-' separated values as a <see cref="double"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="productName"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// The page contains no product headings, or the best matching heading has no price/link element at
        /// the same index.
        /// </exception>
        /// <exception cref="FormatException">The price text cannot be parsed as a number.</exception>
        public async Task <Hashtable> ScrapePricesForProduct(string productName)
        {
            if (productName == null)
            {
                throw new ArgumentNullException("productName");
            }

            var configuration = Configuration.Default.WithDefaultLoader().WithCookies().WithMetaRefresh();
            var context       = BrowsingContext.New(configuration);

            // Escape the name so characters such as '&', '#' or '+' cannot change the meaning of the search URL.
            await context.OpenAsync(SearchUrlSelector + Uri.EscapeDataString(productName));

            // get headings
            productHeadings = context.Active.QuerySelectorAll(ProductHeadingSelector);
            // get prices container
            productPrices = context.Active.QuerySelectorAll(ProductPriceSelector);
            // get product links
            productLinks = context.Active.QuerySelectorAll(ProductLinkSelector);

            if (productHeadings.Length == 0)
            {
                throw new InvalidOperationException("No product headings were found for '" + productName + "'.");
            }

            // 3000 is kept as the starting "worst" distance so the reported similarity stays capped as before.
            var bestSimilarityCoefficient = 3000;
            var headingIndex              = 0;

            // Normalise the case of both sides; previously only the heading was lower-cased.
            var normalizedName = productName.ToLowerInvariant();

            for (int i = 0; i < productHeadings.Length; i++)
            {
                var similarity = StringSimilarity.ComputeLevenshteinDistance(productHeadings[i].Text().ToLowerInvariant(), normalizedName);
                if (similarity < bestSimilarityCoefficient)
                {
                    bestSimilarityCoefficient = similarity;
                    headingIndex = i;
                }
            }

            // Headings, prices and links are matched purely by position, so the chosen index must exist in all three.
            if (headingIndex >= productPrices.Length || headingIndex >= productLinks.Length)
            {
                throw new InvalidOperationException("No price or link element was found for the best matching product of '" + productName + "'.");
            }

            Console.WriteLine("Amazon -------------");
            Console.WriteLine("Amazon best similarity " + bestSimilarityCoefficient);
            Console.WriteLine("Found product: " + productHeadings[headingIndex].Text() + "\n" + "price: " + productPrices[headingIndex].Text());
            Console.WriteLine("");

            // Prices are written with '.' as the decimal separator, so parsing must not depend on the current culture.
            var priceCulture = System.Globalization.CultureInfo.InvariantCulture;

            // check if there is more than one price in the price element
            var pricesCount = productPrices[headingIndex].TextContent.Count(x => x == '£');

            object price;
            object formattedPrice;

            if (pricesCount != 1)
            {
                // price range ("£a - £b"): report the average of the listed values
                // NOTE(review): a price element with no '£' at all also takes this branch - confirm that is intended.
                var pricesAverage = productPrices[headingIndex].TextContent
                                    .Replace('£', ' ')
                                    .SplitWithTrimming('-')
                                    .Select(x => double.Parse(x, priceCulture))
                                    .Average();

                price          = pricesAverage;
                formattedPrice = pricesAverage;
            }
            else
            {
                // only one price
                var singlePriceText = productPrices[headingIndex].TextContent.Replace('£', ' ');

                // The parsed value is not returned; the call is kept so an unparsable price still fails here.
                Convert.ToDouble(singlePriceText, priceCulture);

                price          = productPrices[headingIndex].Text();
                formattedPrice = singlePriceText;
            }

            var anchorElement = (IHtmlAnchorElement)productLinks[headingIndex];

            return(new Hashtable
            {
                { "Price", price },
                { "Formatted Price", formattedPrice },
                { "Similarity", bestSimilarityCoefficient },
                { "Product Heading", productHeadings[headingIndex].Text() },
                { "Product Link", anchorElement.Href }
            });
        }
        /// <summary>
        /// Opens the search page for <paramref name="productName"/>, picks the listing whose heading has the
        /// smallest Levenshtein distance to the requested name and returns its price details. If the page has
        /// no listing headings or prices, it is read as a single product page instead.
        /// </summary>
        /// <param name="productName">
        /// Free-text product name. It is URL-escaped before replacing the <c>{productPlaceHolder}</c> token in
        /// <c>SearchUrlSelector</c> and lower-cased before being compared with the (also lower-cased) headings.
        /// </param>
        /// <returns>
        /// A <see cref="Hashtable"/> with the keys "Price", "Formatted Price", "Product Heading" and
        /// "Product Link"; results taken from a search listing also contain "Similarity". For a listing whose
        /// price element does not contain exactly one '£', "Price" and "Formatted Price" hold the average of
        /// the '-' separated values as a <see cref="double"/>; otherwise they are strings.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="productName"/> is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// Neither a search listing nor a single product heading/price could be found, or the best matching
        /// heading has no price/link element at the same index.
        /// </exception>
        /// <exception cref="FormatException">The price text of a listing cannot be parsed as a number.</exception>
        public async Task <Hashtable> ScrapePricesForProduct(string productName)
        {
            if (productName == null)
            {
                throw new ArgumentNullException("productName");
            }

            var configuration = Configuration.Default.WithDefaultLoader().WithCookies().WithMetaRefresh();
            var context       = BrowsingContext.New(configuration);

            // Escape the name so characters such as '&', '#' or '/' cannot change the meaning of the search URL.
            var searchPath = SearchUrlSelector.Replace("{productPlaceHolder}", Uri.EscapeDataString(productName));
            await context.OpenAsync(searchPath);

            // get headings
            productHeadings = context.Active.QuerySelectorAll(ProductHeadingSelector);
            // get prices container
            productPrices = context.Active.QuerySelectorAll(ProductPriceSelector);
            // get product links
            productLinks = context.Active.QuerySelectorAll(ProductLinkSelector);

            // in case the url redirects to a single product page (no need to calculate string similarity as only one result in the page)
            if (productHeadings.Length == 0 || productPrices.Length == 0)
            {
                var singleProductHeading = context.Active.QuerySelectorAll(".product-header__title").FirstOrDefault();
                var singleProductPrice   = context.Active.QuerySelectorAll(".price--large").FirstOrDefault();

                if (singleProductHeading == null || singleProductPrice == null)
                {
                    throw new InvalidOperationException("No products were found for '" + productName + "'.");
                }

                var singleProductLink      = context.Active.Url;
                var singleProductPriceText = singleProductPrice.Text().Replace('£', ' ');

                // NOTE(review): unlike the listing results below, this result carries no "Similarity" entry.
                return(new Hashtable
                {
                    { "Price", singleProductPriceText },
                    { "Formatted Price", singleProductPriceText },
                    { "Product Link", singleProductLink },
                    { "Product Heading", singleProductHeading.Text() }
                });
            }

            // 3000 is kept as the starting "worst" distance so the reported similarity stays capped as before.
            var bestSimilarityCoefficient = 3000;
            var headingIndex              = 0;

            // Normalise the case of both sides; previously only the heading was lower-cased.
            var normalizedName = productName.ToLowerInvariant();

            for (int i = 0; i < productHeadings.Length; i++)
            {
                var similarity = StringSimilarity.ComputeLevenshteinDistance(productHeadings[i].Text().ToLowerInvariant(), normalizedName);
                if (similarity < bestSimilarityCoefficient)
                {
                    bestSimilarityCoefficient = similarity;
                    headingIndex = i;
                }
            }

            // Headings, prices and links are matched purely by position, so the chosen index must exist in all three.
            if (headingIndex >= productPrices.Length || headingIndex >= productLinks.Length)
            {
                throw new InvalidOperationException("No price or link element was found for the best matching product of '" + productName + "'.");
            }

            Console.WriteLine("John Lewis -------------");
            Console.WriteLine("John Lewis best similarity " + bestSimilarityCoefficient);
            Console.WriteLine("Found product: " + productHeadings[headingIndex].Text() + "\n" + "price: " + productPrices[headingIndex].Text());
            Console.WriteLine("");

            // Prices are written with '.' as the decimal separator, so parsing must not depend on the current culture.
            var priceCulture = System.Globalization.CultureInfo.InvariantCulture;

            // check if there is more than one price in the price element
            var pricesCount = productPrices[headingIndex].TextContent.Count(x => x == '£');

            object price;
            object formattedPrice;

            if (pricesCount != 1)
            {
                // price range ("£a - £b"): report the average of the listed values
                // NOTE(review): a price element with no '£' at all also takes this branch - confirm that is intended.
                var pricesAverage = productPrices[headingIndex].TextContent
                                    .Replace('£', ' ')
                                    .SplitWithTrimming('-')
                                    .Select(x => double.Parse(x, priceCulture))
                                    .Average();

                price          = pricesAverage;
                formattedPrice = pricesAverage;
            }
            else
            {
                // only one price
                var singlePriceText = productPrices[headingIndex].TextContent.Replace('£', ' ');

                // The parsed value is not returned; the call is kept so an unparsable price still fails here.
                Convert.ToDouble(singlePriceText, priceCulture);

                price          = productPrices[headingIndex].Text();
                formattedPrice = singlePriceText;
            }

            // The href attribute is prefixed with the site root to build the returned link.
            var productLink = "https://www.johnlewis.com" + productLinks[headingIndex].GetAttribute("href");

            return(new Hashtable
            {
                { "Price", price },
                { "Formatted Price", formattedPrice },
                { "Similarity", bestSimilarityCoefficient },
                { "Product Heading", productHeadings[headingIndex].Text() },
                { "Product Link", productLink }
            });
        }