示例#1
0
        /// <summary>
        /// Reads the product name from the first element matched by the
        /// <c>parserSettings.Name</c> selector.
        /// </summary>
        /// <param name="inputData">Document to query.</param>
        /// <param name="parserSettings">Settings carrying the name selector.</param>
        /// <returns>
        /// A completed task holding the trimmed text content of the first match,
        /// or <c>null</c> when nothing matches the selector.
        /// </returns>
        protected override Task<string> ExtractName(IDocument inputData, ExtractorSettings parserSettings)
        {
            var matches = inputData.QuerySelectorAll(parserSettings.Name);
            var firstMatch = matches.FirstOrDefault();

            // Log the raw markup so a wrong selector can be diagnosed from the logs.
            logger.LogInformation($"The processed part of the document by product name: {firstMatch?.OuterHtml}");

            string productName = firstMatch?.TextContent.Trim();

            return Task.FromResult(productName);
        }
示例#2
0
        /// <summary>
        /// Reports whether the loaded document is a CAPTCHA page, recognised by
        /// its title being exactly "Ой!".
        /// </summary>
        /// <param name="inputData">Document whose title is inspected.</param>
        /// <param name="parserSettings">Not used by this check.</param>
        /// <returns><c>true</c> when the title matches the CAPTCHA marker; otherwise <c>false</c>.</returns>
        protected override bool IsCaughtByCaptcha(IDocument inputData, ExtractorSettings parserSettings)
        {
            bool captchaShown = inputData.Title == "Ой!";

            if (!captchaShown)
            {
                return false;
            }

            // Dump the page source so the blocked response can be examined later.
            logger.LogError($"Попали на капчу {inputData.Source.Text}");

            return true;
        }
示例#3
0
        /// <summary>
        /// Runs the extraction pipeline over <paramref name="inputData"/>: CAPTCHA check,
        /// name, prices, additional information, and (only when no price was found)
        /// out-of-stock information.
        /// </summary>
        /// <param name="inputData">Source to extract from.</param>
        /// <param name="parserSettings">Settings forwarded to every extraction step.</param>
        /// <returns>A <see cref="ProductData"/> stamped with the current local time.</returns>
        /// <exception cref="AggregateException">The CAPTCHA check reported a hit.</exception>
        /// <exception cref="FormatException">
        /// Neither a price nor out-of-stock information could be extracted.
        /// </exception>
        public async Task<ProductData> Extract(T inputData, ExtractorSettings parserSettings)
        {
            bool blockedByCaptcha = IsCaughtByCaptcha(inputData, parserSettings);

            if (blockedByCaptcha)
            {
                throw new AggregateException("Caught by CAPTCHA");
            }

            string productName = await ExtractName(inputData, parserSettings);
            (decimal? regularPrice, decimal? reducedPrice) = await ExtractPrice(inputData, parserSettings);
            var details = await ExtractAdditionalInformation(inputData, parserSettings);

            bool nothingPriced = regularPrice == null && reducedPrice == null;

            if (nothingPriced)
            {
                // Without any price, the out-of-stock text replaces the additional information.
                details = await ExtractOutofstockInformation(inputData, parserSettings);

                if (details is null)
                {
                    throw new FormatException("Unknown error while extract product data");
                }

                logger.LogInformation($"The item may be out of stock or has been removed. Info: {details}");
            }

            var product = new ProductData
            {
                Name = productName,
                Price = regularPrice,
                DiscountPrice = reducedPrice,
                AdditionalInformation = details,
                Date = DateTime.Now
            };

            return product;
        }
示例#4
0
        /// <summary>
        /// Collects the text of every element configured in
        /// <c>parserSettings.AdditionalInformation</c> and serializes the result to JSON.
        /// </summary>
        /// <param name="htmlDocument">Document to query.</param>
        /// <param name="parserSettings">
        /// Settings whose <c>AdditionalInformation</c> entries pair a label (key) with a selector (value).
        /// </param>
        /// <returns>
        /// A completed task holding the JSON object of label/text pairs, or <c>null</c>
        /// when no additional-information selectors are configured.
        /// </returns>
        protected override Task<string> ExtractAdditionalInformation(IDocument htmlDocument, ExtractorSettings parserSettings)
        {
            if (parserSettings.AdditionalInformation == null)
            {
                return Task.FromResult<string>(null);
            }

            var collected = new Dictionary<string, string>();

            foreach (var entry in parserSettings.AdditionalInformation)
            {
                var match = htmlDocument.QuerySelectorAll(entry.Value).FirstOrDefault();
                string rawText;

                if (match == null)
                {
                    // A missing element is not fatal: the label is still emitted with empty text.
                    logger.LogWarning($"Не удалось извлечь информацию о {entry.Key} по пути {entry.Value}");
                    rawText = String.Empty;
                }
                else
                {
                    rawText = match.TextContent ?? String.Empty;
                }

                collected.Add(entry.Key, TransformAdditionalInformation(rawText));
            }

            // The relaxed encoder keeps non-ASCII characters unescaped in the JSON output.
            var jsonOptions = new JsonSerializerOptions
            {
                Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping
            };
            string json = JsonSerializer.Serialize(collected, jsonOptions);

            logger.LogInformation($"Найденная дополнительная информация {json}");

            return Task.FromResult(json);
        }
示例#5
0
        /// <summary>
        /// Extracts the regular and discount prices from the document using the
        /// selectors in <paramref name="parserSettings"/>.
        /// </summary>
        /// <param name="inputData">Document to query.</param>
        /// <param name="parserSettings">
        /// Settings providing <c>DiscountHtmlPath</c> and the ordered <c>PriceHtmlPath</c> selectors.
        /// </param>
        /// <returns>
        /// A completed task with <c>(null, null)</c> when neither element is found;
        /// otherwise the parsed price and, when present, the parsed discount price.
        /// If only the discount element is found, its value is returned as the price.
        /// </returns>
        /// <exception cref="InvalidCastException">A found price text cannot be parsed as a decimal.</exception>
        protected override Task<(decimal? price, decimal? discountPrice)> ExtractPrice(IDocument inputData, ExtractorSettings parserSettings)
        {
            var discountNode = inputData.QuerySelectorAll(parserSettings.DiscountHtmlPath).FirstOrDefault();

            logger.LogInformation($"The processed part of the document by discount: {discountNode?.OuterHtml}");

            string discountText = discountNode?.TextContent;
            string priceText = null;

            // Try the price selectors in order and stop at the first one that yields non-empty text.
            foreach (var selector in parserSettings.PriceHtmlPath)
            {
                var priceNode = inputData.QuerySelectorAll(selector).FirstOrDefault();
                logger.LogInformation($"The processed part of the document by price {priceNode?.OuterHtml}");

                priceText = priceNode?.TextContent;

                if (!string.IsNullOrEmpty(priceText))
                {
                    break;
                }
            }

            if (priceText == null)
            {
                if (discountText == null)
                {
                    return Task.FromResult<(decimal? price, decimal? discountPrice)>((null, null));
                }

                // Only the discount element exists: treat its value as the price.
                priceText = discountText;
                discountText = null;
            }

            // Normalize through the TransformPrice / ExtractPrice(string) helpers defined elsewhere.
            if (discountText != null)
            {
                discountText = ExtractPrice(TransformPrice(discountText));
            }

            priceText = ExtractPrice(TransformPrice(priceText));

            if (!decimal.TryParse(priceText, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal parsedPrice))
            {
                throw new InvalidCastException($"Can not convert price={priceText} to {typeof(decimal)}");
            }

            decimal? parsedDiscount = null;

            // Checked after normalization: a discount text that became null is silently dropped.
            if (discountText != null)
            {
                if (!decimal.TryParse(discountText, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal discountNumber))
                {
                    throw new InvalidCastException($"Can not convert discountPrice={discountText} to {typeof(decimal)}");
                }

                parsedDiscount = discountNumber;
            }

            return Task.FromResult(((decimal?)parsedPrice, parsedDiscount));
        }
示例#6
0
 /// <summary>
 /// Reads the out-of-stock text from the first element matched by
 /// <c>parserSettings.OutOfStockHtmlPath</c>.
 /// </summary>
 /// <returns>A completed task with the element's text content, or <c>null</c> when nothing matches.</returns>
 protected override Task<string> ExtractOutofstockInformation(IDocument inputData, ExtractorSettings parserSettings)
 {
     var outOfStockNode = inputData.QuerySelectorAll(parserSettings.OutOfStockHtmlPath).FirstOrDefault();
     var outOfStockText = outOfStockNode?.TextContent;

     return Task.FromResult(outOfStockText);
 }
示例#7
0
        /// <summary>
        /// Extracts a price from a screenshot: the "CVPriceDetectionModel" prediction locates
        /// the price region, the region is cropped from the image resized to 800x600, and
        /// Tesseract reads the text of the crop.
        /// </summary>
        /// <param name="inputData">Path of the screenshot image file.</param>
        /// <param name="parserSettings">Not used by this implementation.</param>
        /// <returns>A completed task with the parsed price; the discount price is always <c>null</c>.</returns>
        /// <exception cref="InvalidOperationException">The model returned no bounding box labelled "price".</exception>
        /// <exception cref="AggregateException">OCR returned no text for the cropped region.</exception>
        /// <exception cref="InvalidCastException">The recognised text cannot be parsed as a decimal.</exception>
        protected override Task<(decimal? price, decimal? discountPrice)> ExtractPrice(string inputData, ExtractorSettings parserSettings)
        {
            //TODO Cut the price region out of the screenshot
            ModelInput sampleData = new ModelInput {
                ImageSource = inputData
            };

            // Make a single prediction on the sample data
            var prediction = predictionEnginePool.Predict(modelName: "CVPriceDetectionModel", example: sampleData);

            // Pick the highest-scoring "price" box; null when there are no boxes at all
            // or none carries the "price" label.
            var priceBox = prediction.BoundingBoxes
                ?.Where(p => p.Label == "price")
                .OrderByDescending(p => p.Score)
                .FirstOrDefault();

            if (priceBox is null)
            {
                // Fail with a descriptive message instead of a NullReferenceException /
                // "Sequence contains no elements".
                throw new InvalidOperationException($"Can not find a price bounding box on image {inputData}");
            }

            Rectangle section = new Rectangle(
                new Point((int)priceBox.Left, (int)priceBox.Top),
                new Size((int)priceBox.Right - (int)priceBox.Left, (int)priceBox.Bottom - (int)priceBox.Top));

            var priceImagePath = Path.Combine(configuration.GetValue<string>("ImagesFolder"), $"{Path.GetFileNameWithoutExtension(inputData)}-price.png");

            // The bitmaps are only needed to produce the crop file, so release them right after saving.
            using (Bitmap source = new Bitmap(sampleData.ImageSource))
            using (Bitmap resizeImage = ResizeBitmap(source, 800, 600))
            using (Bitmap priceImage = CropImage(resizeImage, section))
            {
                priceImage.Save(priceImagePath, System.Drawing.Imaging.ImageFormat.Png);
            }

            string price;

            try
            {
                var root = configuration.GetValue<string>(WebHostDefaults.ContentRootKey);

                using var tesseractEngine = new TesseractEngine($"{root}/CV/Tesseract", "eng+rus", EngineMode.Default);
                using var image           = Pix.LoadFromFile(priceImagePath);
                using var page            = tesseractEngine.Process(image);
                price = page.GetText();
            }
            finally
            {
                // Remove the temporary crop even when OCR throws, so failed runs do not leave files behind.
                File.Delete(priceImagePath);
            }

            if (price is null)
            {
                throw new AggregateException($"Can not find content from image {inputData}");
            }

            // Normalize through the TransformPrice / ExtractPrice(string) helpers defined elsewhere.
            price = TransformPrice(price);
            price = ExtractPrice(price);

            if (!decimal.TryParse(price, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal priceValue))
            {
                throw new InvalidCastException($"Can not convert {nameof(price)}={price} to {typeof(decimal)}");
            }

            return Task.FromResult(((decimal?)priceValue, (decimal?)null));
        }
示例#8
0
 /// <summary>
 /// This extractor provides no out-of-stock information.
 /// </summary>
 /// <returns>A completed task whose result is always <c>null</c>.</returns>
 protected override Task<string> ExtractOutofstockInformation(string inputData, ExtractorSettings parserSettings)
 {
     string nothing = null;

     return Task.FromResult(nothing);
 }
示例#9
0
 /// <summary>
 /// This extractor does not extract a product name.
 /// </summary>
 /// <returns>A completed task whose result is always <c>null</c>.</returns>
 protected override Task<string> ExtractName(string inputData, ExtractorSettings parserSettings)
 {
     string nothing = null;

     return Task.FromResult(nothing);
 }
示例#10
0
 /// <summary>
 /// This extractor does not extract additional information.
 /// </summary>
 /// <returns>A completed task whose result is always <c>null</c>.</returns>
 protected override Task<string> ExtractAdditionalInformation(IDocument inputData, ExtractorSettings parserSettings)
 {
     string nothing = null;

     return Task.FromResult(nothing);
 }
示例#11
0
        /// <summary>
        /// Detects price elements by running every candidate from <c>GetPriceData</c> through
        /// the "MLPriceDetectionModel" prediction engine, then parses the detected values.
        /// </summary>
        /// <param name="inputData">Document whose candidate elements are classified.</param>
        /// <param name="parserSettings">Not used by this implementation.</param>
        /// <returns>
        /// A completed task with <c>(null, null)</c> when no price element is detected.
        /// Otherwise the first detected value is the price; when a second value is detected
        /// and is lower than the first, the two are swapped (the lower becomes the price,
        /// the higher the discount price).
        /// </returns>
        /// <exception cref="InvalidCastException">
        /// A model prediction is not a boolean, or a detected value cannot be parsed as a decimal.
        /// </exception>
        protected override Task<(decimal? price, decimal? discountPrice)> ExtractPrice(IDocument inputData, ExtractorSettings parserSettings)
        {
            string priceHtmlElement         = null;
            string discountPriceHtmlElement = null;
            int    count = 0; // candidates seen since the last detected price element

            foreach (var priceData in GetPriceData(inputData))
            {
                var pricePrediction = predictionEnginePool.Predict(modelName: "MLPriceDetectionModel", example: priceData);

                if (!bool.TryParse(pricePrediction.Prediction, out bool isPrice))
                {
                    throw new InvalidCastException($"Can not convert {pricePrediction.Prediction} to {typeof(bool)}");
                }

                if (isPrice)
                {
                    // A further hit close enough to the previous one is taken as the second price.
                    if (priceHtmlElement != null && count < MaxPriceElementsInterval)
                    {
                        discountPriceHtmlElement = priceData.HtmlElement;
                        // nameof: the label was previously the interpolated value itself.
                        logger.LogInformation($"Detect {nameof(discountPriceHtmlElement)}={discountPriceHtmlElement} with {nameof(pricePrediction.Score)}=[{string.Join(';', pricePrediction.Score)}]");
                    }

                    if (priceHtmlElement == null)
                    {
                        priceHtmlElement = priceData.HtmlElement;
                        logger.LogInformation($"Detect {nameof(priceHtmlElement)}={priceHtmlElement} with {nameof(pricePrediction.Score)}=[{string.Join(';', pricePrediction.Score)}]");
                    }

                    count = 0;
                }

                count++;
            }

            if (priceHtmlElement is null && discountPriceHtmlElement is null)
            {
                return Task.FromResult(((decimal?)null, (decimal?)null));
            }

            string price         = ExtractPrice(priceHtmlElement);
            string discountPrice = ExtractPrice(discountPriceHtmlElement);

            if (!decimal.TryParse(price, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal priceValue))
            {
                throw new InvalidCastException($"Can not convert {nameof(price)}={price} to {typeof(decimal)}");
            }

            if (!decimal.TryParse(discountPrice, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal discountPriceTemp) && discountPrice != null)
            {
                throw new InvalidCastException($"Can not convert {nameof(discountPrice)}={discountPrice} to {typeof(decimal)}");
            }

            decimal? discountPriceValue = null;

            if (discountPrice != null && discountPriceTemp < priceValue)
            {
                // Swap using the parsed discount. The previous code cast the still-null
                // discountPriceValue to decimal here, which always threw InvalidOperationException.
                discountPriceValue = priceValue;
                priceValue         = discountPriceTemp;
            }

            // NOTE(review): when the second value is not lower than the first it is discarded
            // (discount stays null), as before — confirm this is intended.
            return Task.FromResult(((decimal?)priceValue, discountPriceValue));
        }
示例#12
0
 /// <summary>
 /// Default CAPTCHA check: reports that no CAPTCHA was hit. Derived extractors may override it.
 /// </summary>
 /// <returns>Always <c>false</c>.</returns>
 protected virtual bool IsCaughtByCaptcha(T inputData, ExtractorSettings parserSettings)
 {
     return false;
 }
示例#13
0
 /// <summary>
 /// Extracts out-of-stock information from <paramref name="inputData"/>.
 /// <c>Extract</c> calls this only when neither a price nor a discount price was found,
 /// and throws <see cref="FormatException"/> when the result is <c>null</c>.
 /// </summary>
 /// <param name="inputData">Source to extract from.</param>
 /// <param name="parserSettings">Extraction settings.</param>
 /// <returns>A task producing the out-of-stock text, or <c>null</c> when none is available.</returns>
 protected abstract Task <string> ExtractOutofstockInformation(T inputData, ExtractorSettings parserSettings);
示例#14
0
 /// <summary>
 /// Extracts additional product information from <paramref name="inputData"/>.
 /// <c>Extract</c> stores the result in <c>ProductData.AdditionalInformation</c> unless
 /// no price was found, in which case the out-of-stock text is stored instead.
 /// </summary>
 /// <param name="inputData">Source to extract from.</param>
 /// <param name="parserSettings">Extraction settings.</param>
 /// <returns>A task producing the additional information string; may be <c>null</c>.</returns>
 protected abstract Task <string> ExtractAdditionalInformation(T inputData, ExtractorSettings parserSettings);
示例#15
0
 /// <summary>
 /// Extracts the price and the discount price from <paramref name="inputData"/>.
 /// <c>Extract</c> treats a result in which both values are <c>null</c> as "no price found"
 /// and falls back to the out-of-stock information.
 /// </summary>
 /// <param name="inputData">Source to extract from.</param>
 /// <param name="parserSettings">Extraction settings.</param>
 /// <returns>A task producing the <c>(price, discountPrice)</c> pair; either value may be <c>null</c>.</returns>
 protected abstract Task <(decimal?price, decimal?discountPrice)> ExtractPrice(T inputData, ExtractorSettings parserSettings);
示例#16
0
 /// <summary>
 /// Extracts the product name from <paramref name="inputData"/>.
 /// <c>Extract</c> stores the result in <c>ProductData.Name</c> without further validation.
 /// </summary>
 /// <param name="inputData">Source to extract from.</param>
 /// <param name="parserSettings">Extraction settings.</param>
 /// <returns>A task producing the product name; may be <c>null</c>.</returns>
 protected abstract Task <string> ExtractName(T inputData, ExtractorSettings parserSettings);