/// <summary>
/// Reads the product name from the first element matched by the configured name selector.
/// </summary>
/// <param name="inputData">Parsed document to search.</param>
/// <param name="parserSettings">Settings whose <c>Name</c> value is used as the selector.</param>
/// <returns>
/// A completed task holding the trimmed text content of the first match,
/// or <c>null</c> when the selector matches nothing.
/// </returns>
protected override Task<string> ExtractName(IDocument inputData, ExtractorSettings parserSettings)
{
    var matches = inputData.QuerySelectorAll(parserSettings.Name);
    var firstMatch = matches.FirstOrDefault();

    logger.LogInformation($"The processed part of the document by product name: {firstMatch?.OuterHtml}");

    string productName = firstMatch?.TextContent.Trim();
    return Task.FromResult(productName);
}
/// <summary>
/// Reports whether the document is a CAPTCHA page, recognised by its title being exactly "Ой!".
/// </summary>
/// <param name="inputData">Parsed document to inspect.</param>
/// <param name="parserSettings">Extractor settings (not used by this check).</param>
/// <returns><c>true</c> when the title matches; otherwise <c>false</c>.</returns>
protected override bool IsCaughtByCaptcha(IDocument inputData, ExtractorSettings parserSettings)
{
    if (inputData.Title != "Ой!")
    {
        return false;
    }

    // The full page source is logged for later diagnosis.
    logger.LogError($"Попали на капчу {inputData.Source.Text}");
    return true;
}
/// <summary>
/// Runs the extraction pipeline: CAPTCHA check, name, prices, additional information,
/// and — when no price at all was found — the out-of-stock fallback.
/// </summary>
/// <param name="inputData">Source to extract from.</param>
/// <param name="parserSettings">Settings passed through to every extraction step.</param>
/// <returns>The assembled <c>ProductData</c>, stamped with <see cref="DateTime.Now"/>.</returns>
/// <exception cref="AggregateException">The CAPTCHA check reported <c>true</c>.</exception>
/// <exception cref="FormatException">
/// Neither a price nor a discount price was found and the out-of-stock step returned <c>null</c>.
/// </exception>
public async Task<ProductData> Extract(T inputData, ExtractorSettings parserSettings)
{
    bool blockedByCaptcha = IsCaughtByCaptcha(inputData, parserSettings);
    if (blockedByCaptcha)
    {
        throw new AggregateException("Caught by CAPTCHA");
    }

    var name = await ExtractName(inputData, parserSettings);
    var (price, discountPrice) = await ExtractPrice(inputData, parserSettings);
    var additionalInformation = await ExtractAdditionalInformation(inputData, parserSettings);

    bool noPriceFound = discountPrice == null && price == null;
    if (noPriceFound)
    {
        // With no price at all, the out-of-stock text replaces the regular additional information.
        additionalInformation = await ExtractOutofstockInformation(inputData, parserSettings);
        if (additionalInformation is null)
        {
            throw new FormatException("Unknown error while extract product data");
        }

        logger.LogInformation($"The item may be out of stock or has been removed. Info: {additionalInformation}");
    }

    var product = new ProductData
    {
        Name = name,
        Price = price,
        DiscountPrice = discountPrice,
        AdditionalInformation = additionalInformation,
        Date = DateTime.Now
    };
    return product;
}
/// <summary>
/// Collects the text of every configured additional-information selector and serializes
/// the resulting key/text map to a JSON string.
/// </summary>
/// <param name="htmlDocument">Parsed document to search.</param>
/// <param name="parserSettings">
/// Settings whose <c>AdditionalInformation</c> entries map a key to a selector.
/// </param>
/// <returns>
/// A completed task holding the JSON string, or <c>null</c> when
/// <c>AdditionalInformation</c> is not configured.
/// </returns>
protected override Task<string> ExtractAdditionalInformation(IDocument htmlDocument, ExtractorSettings parserSettings)
{
    if (parserSettings.AdditionalInformation == null)
    {
        return Task.FromResult<string>(null);
    }

    var collected = new Dictionary<string, string>();
    foreach (var entry in parserSettings.AdditionalInformation)
    {
        var match = htmlDocument.QuerySelectorAll(entry.Value).FirstOrDefault();

        string rawText;
        if (match == null)
        {
            // A missing element is only warned about; the key is still emitted with empty text.
            logger.LogWarning($"Не удалось извлечь информацию о {entry.Key} по пути {entry.Value}");
            rawText = String.Empty;
        }
        else
        {
            rawText = match.TextContent ?? String.Empty;
        }

        collected.Add(entry.Key, TransformAdditionalInformation(rawText));
    }

    var serializerOptions = new JsonSerializerOptions
    {
        Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping
    };
    var json = JsonSerializer.Serialize(collected, serializerOptions);

    logger.LogInformation($"Найденная дополнительная информация {json}");
    return Task.FromResult(json);
}
/// <summary>
/// Extracts the price and the optional discount price from the document using the configured selectors.
/// </summary>
/// <param name="inputData">Parsed document to search.</param>
/// <param name="parserSettings">
/// Settings providing <c>DiscountHtmlPath</c> and the ordered <c>PriceHtmlPath</c> selectors.
/// </param>
/// <returns>
/// A completed task holding <c>(price, discountPrice)</c>. Both are <c>null</c> when neither
/// text was found; <c>discountPrice</c> is <c>null</c> when only one of the two was found.
/// </returns>
/// <exception cref="InvalidCastException">A found text could not be parsed as a decimal.</exception>
protected override Task<(decimal? price, decimal? discountPrice)> ExtractPrice(IDocument inputData, ExtractorSettings parserSettings)
{
    var discountNode = inputData.QuerySelectorAll(parserSettings.DiscountHtmlPath).FirstOrDefault();
    logger.LogInformation($"The processed part of the document by discount: {discountNode?.OuterHtml}");
    string discountText = discountNode?.TextContent;

    // Try the price selectors in order and stop at the first one with non-empty text.
    string priceText = null;
    foreach (var selector in parserSettings.PriceHtmlPath)
    {
        var priceNode = inputData.QuerySelectorAll(selector).FirstOrDefault();
        logger.LogInformation($"The processed part of the document by price {priceNode?.OuterHtml}");
        priceText = priceNode?.TextContent;
        if (!string.IsNullOrEmpty(priceText))
        {
            break;
        }
    }

    if (priceText == null)
    {
        if (discountText == null)
        {
            return Task.FromResult<(decimal? price, decimal? discountPrice)>((null, null));
        }

        // Only the discount text was found: report it as the price, with no discount.
        priceText = discountText;
        discountText = null;
    }

    // The discount text is normalised before the price text, as in the original statement order.
    if (discountText != null)
    {
        string transformedDiscount = TransformPrice(discountText);
        discountText = ExtractPrice(transformedDiscount);
    }

    string transformedPrice = TransformPrice(priceText);
    priceText = ExtractPrice(transformedPrice);

    bool priceParsed = decimal.TryParse(priceText, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal parsedPrice);
    if (!priceParsed)
    {
        throw new InvalidCastException($"Can not convert price={priceText} to {typeof(decimal)}");
    }

    decimal? parsedDiscount = null;
    if (discountText != null)
    {
        bool discountParsed = decimal.TryParse(discountText, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal discountValue);
        if (!discountParsed)
        {
            throw new InvalidCastException($"Can not convert discountPrice={discountText} to {typeof(decimal)}");
        }

        parsedDiscount = discountValue;
    }

    return Task.FromResult(((decimal?)parsedPrice, parsedDiscount));
}
/// <summary>
/// Reads the text of the first element matched by the configured out-of-stock selector.
/// </summary>
/// <param name="inputData">Parsed document to search.</param>
/// <param name="parserSettings">Settings whose <c>OutOfStockHtmlPath</c> value is used as the selector.</param>
/// <returns>A completed task holding the text content, or <c>null</c> when nothing matches.</returns>
protected override Task<string> ExtractOutofstockInformation(IDocument inputData, ExtractorSettings parserSettings)
{
    var marker = inputData.QuerySelectorAll(parserSettings.OutOfStockHtmlPath).FirstOrDefault();
    return Task.FromResult(marker?.TextContent);
}
/// <summary>
/// Detects the price region on a screenshot with the "CVPriceDetectionModel" model, crops it,
/// runs Tesseract OCR on the crop and parses the recognised text as a decimal price.
/// </summary>
/// <param name="inputData">Path of the screenshot image file.</param>
/// <param name="parserSettings">Extractor settings (not used by this implementation).</param>
/// <returns>
/// A completed task holding <c>(price, null)</c>, or <c>(null, null)</c> when the model
/// reports no bounding box labelled "price".
/// </returns>
/// <exception cref="AggregateException">OCR returned no text for the cropped image.</exception>
/// <exception cref="InvalidCastException">The recognised text could not be parsed as a decimal.</exception>
protected override Task<(decimal? price, decimal? discountPrice)> ExtractPrice(string inputData, ExtractorSettings parserSettings)
{
    // TODO: cut the price region out of the screenshot
    ModelInput sampleData = new ModelInput { ImageSource = inputData };

    // Make a single prediction on the sample data and keep the highest-scoring "price" box.
    var prediction = predictionEnginePool.Predict(modelName: "CVPriceDetectionModel", example: sampleData);
    var priceBox = prediction.BoundingBoxes?
        .Where(p => p.Label == "price")
        .OrderByDescending(p => p.Score)
        .FirstOrDefault();

    // No detection is reported as "no price found", the same way the sibling extractors do,
    // instead of failing with NullReferenceException / InvalidOperationException.
    if (priceBox is null)
    {
        return Task.FromResult(((decimal?)null, (decimal?)null));
    }

    using Bitmap source = new Bitmap(sampleData.ImageSource);
    using Bitmap resizeImage = ResizeBitmap(source, 800, 600);

    Rectangle section = new Rectangle(
        new Point((int)priceBox.Left, (int)priceBox.Top),
        new Size((int)priceBox.Right - (int)priceBox.Left, (int)priceBox.Bottom - (int)priceBox.Top));
    using Bitmap priceImage = CropImage(resizeImage, section);

    // The crop is written to a temporary file because the OCR image is loaded from disk.
    var priceImagePath = Path.Combine(
        configuration.GetValue<string>("ImagesFolder"),
        $"{Path.GetFileNameWithoutExtension(inputData)}-price.png");
    priceImage.Save(priceImagePath, System.Drawing.Imaging.ImageFormat.Png);

    string price;
    try
    {
        var root = configuration.GetValue<string>(WebHostDefaults.ContentRootKey);
        using var tesseractEngine = new TesseractEngine($"{root}/CV/Tesseract", "eng+rus", EngineMode.Default);
        using var image = Pix.LoadFromFile(priceImagePath);
        using var page = tesseractEngine.Process(image);
        price = page.GetText();
    }
    finally
    {
        // Remove the temporary crop even when OCR throws, so failed runs do not leave files behind.
        File.Delete(priceImagePath);
    }

    if (price is null)
    {
        throw new AggregateException($"Can not find content from image {inputData}");
    }

    price = TransformPrice(price);
    price = ExtractPrice(price);
    if (!decimal.TryParse(price, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal priceValue))
    {
        throw new InvalidCastException($"Can not convert {nameof(price)}={price} to {typeof(decimal)}");
    }

    return Task.FromResult(((decimal?)priceValue, (decimal?)null));
}
/// <summary>
/// Out-of-stock information is not extracted by this implementation.
/// </summary>
/// <param name="inputData">Ignored.</param>
/// <param name="parserSettings">Ignored.</param>
/// <returns>A completed task holding <c>null</c>.</returns>
protected override Task<string> ExtractOutofstockInformation(string inputData, ExtractorSettings parserSettings)
{
    string nothing = null;
    return Task.FromResult(nothing);
}
/// <summary>
/// The product name is not extracted by this implementation.
/// </summary>
/// <param name="inputData">Ignored.</param>
/// <param name="parserSettings">Ignored.</param>
/// <returns>A completed task holding <c>null</c>.</returns>
protected override Task<string> ExtractName(string inputData, ExtractorSettings parserSettings)
{
    string nothing = null;
    return Task.FromResult(nothing);
}
/// <summary>
/// Additional information is not extracted by this implementation.
/// </summary>
/// <param name="inputData">Ignored.</param>
/// <param name="parserSettings">Ignored.</param>
/// <returns>A completed task holding <c>null</c>.</returns>
protected override Task<string> ExtractAdditionalInformation(IDocument inputData, ExtractorSettings parserSettings)
{
    string nothing = null;
    return Task.FromResult(nothing);
}
/// <summary>
/// Finds the price and an optional second price in the document by classifying candidate
/// elements with the "MLPriceDetectionModel" model, then parses both as decimals.
/// </summary>
/// <param name="inputData">Parsed document whose candidate elements come from <c>GetPriceData</c>.</param>
/// <param name="parserSettings">Extractor settings (not used by this implementation).</param>
/// <returns>
/// A completed task holding <c>(price, discountPrice)</c>. Both are <c>null</c> when no element
/// was classified as a price; <c>discountPrice</c> is <c>null</c> when only one was found.
/// </returns>
/// <exception cref="InvalidCastException">
/// The model prediction is not a boolean string, or an extracted text is not a decimal.
/// </exception>
protected override Task<(decimal? price, decimal? discountPrice)> ExtractPrice(IDocument inputData, ExtractorSettings parserSettings)
{
    string priceHtmlElement = null;
    string discountPriceHtmlElement = null;

    // Counts candidates seen since the last element classified as a price.
    int count = 0;
    foreach (var priceData in GetPriceData(inputData))
    {
        var pricePrediction = predictionEnginePool.Predict(modelName: "MLPriceDetectionModel", example: priceData);
        if (!bool.TryParse(pricePrediction.Prediction, out bool isPrice))
        {
            throw new InvalidCastException($"Can not convert {pricePrediction.Prediction} to {typeof(bool)}");
        }

        if (isPrice)
        {
            // A later price element counts as the second price only when it is close enough
            // to the previous one.
            if (priceHtmlElement != null && count < MaxPriceElementsInterval)
            {
                discountPriceHtmlElement = priceData.HtmlElement;
                logger.LogInformation($"Detect {nameof(discountPriceHtmlElement)}={discountPriceHtmlElement} with {pricePrediction.Score}=[{string.Join(';', pricePrediction.Score)}]");
            }

            if (priceHtmlElement == null)
            {
                priceHtmlElement = priceData.HtmlElement;
                logger.LogInformation($"Detect {nameof(priceHtmlElement)}={priceHtmlElement} with {pricePrediction.Score}=[{string.Join(';', pricePrediction.Score)}]");
            }

            count = 0;
        }

        count++;
    }

    if (priceHtmlElement is null && discountPriceHtmlElement is null)
    {
        return Task.FromResult(((decimal?)null, (decimal?)null));
    }

    string price = ExtractPrice(priceHtmlElement);
    string discountPrice = ExtractPrice(discountPriceHtmlElement);

    if (!decimal.TryParse(price, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal priceValue))
    {
        throw new InvalidCastException($"Can not convert {nameof(price)}={price} to {typeof(decimal)}");
    }

    if (!decimal.TryParse(discountPrice, NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture, out decimal discountPriceTemp) && discountPrice != null)
    {
        throw new InvalidCastException($"Can not convert {nameof(discountPrice)}={discountPrice} to {typeof(decimal)}");
    }

    // Fix: the parsed discount is now carried into the result. Previously this variable stayed
    // null, so the swap below threw InvalidOperationException on the null cast and the discount
    // was dropped in every other case.
    decimal? discountPriceValue = discountPrice == null ? null : (decimal?)discountPriceTemp;

    // NOTE(review): the original condition is kept, so after the swap `price` holds the smaller
    // of the two values and `discountPrice` the larger. Confirm this is the intended meaning.
    if (discountPriceValue.HasValue && discountPriceValue.Value < priceValue)
    {
        decimal temp = priceValue;
        priceValue = discountPriceValue.Value;
        discountPriceValue = temp;
    }

    return Task.FromResult(((decimal?)priceValue, discountPriceValue));
}
/// <summary>
/// Default CAPTCHA check: always reports that no CAPTCHA was hit.
/// Derived extractors override this to inspect the input.
/// </summary>
/// <param name="inputData">Source being extracted (ignored by the default).</param>
/// <param name="parserSettings">Extractor settings (ignored by the default).</param>
/// <returns>Always <c>false</c>.</returns>
protected virtual bool IsCaughtByCaptcha(T inputData, ExtractorSettings parserSettings)
{
    return false;
}
/// <summary>
/// Extracts out-of-stock information from the input. <c>Extract</c> calls this only when
/// both the price and the discount price came back <c>null</c>.
/// </summary>
/// <param name="inputData">Source to extract from.</param>
/// <param name="parserSettings">Extractor settings.</param>
/// <returns>
/// The out-of-stock text, or <c>null</c>; a <c>null</c> result makes <c>Extract</c>
/// throw <see cref="System.FormatException"/>.
/// </returns>
protected abstract Task <string> ExtractOutofstockInformation(T inputData, ExtractorSettings parserSettings);
/// <summary>
/// Extracts additional product information from the input. <c>Extract</c> stores the result in
/// <c>ProductData.AdditionalInformation</c>, unless no price was found, in which case it is
/// replaced by the out-of-stock information.
/// </summary>
/// <param name="inputData">Source to extract from.</param>
/// <param name="parserSettings">Extractor settings.</param>
/// <returns>The additional information string; may be <c>null</c>.</returns>
protected abstract Task <string> ExtractAdditionalInformation(T inputData, ExtractorSettings parserSettings);
/// <summary>
/// Extracts the price and the discount price from the input.
/// </summary>
/// <param name="inputData">Source to extract from.</param>
/// <param name="parserSettings">Extractor settings.</param>
/// <returns>
/// The <c>(price, discountPrice)</c> pair. When both values are <c>null</c>, <c>Extract</c>
/// falls back to <c>ExtractOutofstockInformation</c>.
/// </returns>
protected abstract Task <(decimal?price, decimal?discountPrice)> ExtractPrice(T inputData, ExtractorSettings parserSettings);
/// <summary>
/// Extracts the product name from the input. <c>Extract</c> stores the result in
/// <c>ProductData.Name</c> without further validation.
/// </summary>
/// <param name="inputData">Source to extract from.</param>
/// <param name="parserSettings">Extractor settings.</param>
/// <returns>The product name; may be <c>null</c>.</returns>
protected abstract Task <string> ExtractName(T inputData, ExtractorSettings parserSettings);