/// <summary>
/// Builds data-set rows from the leaf elements of <paramref name="document"/>.
/// </summary>
/// <remarks>
/// Only elements that have no child elements and a non-empty outer HTML are considered, and of those
/// only the ones kept by <c>OfTypes</c> for the span, div, meta and list-item element interfaces.
/// An element is flagged as containing a price when its outer HTML contains any entry of
/// <c>PriceTags</c> or matches any pattern of <c>Regex</c> in the settings.
/// For each flagged element, <paramref name="priceElementRepeatCount"/> extra rows produced by
/// <c>GenerateRandomPriceElement</c> are added before the row for the element itself.
/// </remarks>
/// <param name="document">Parsed document whose elements are scanned.</param>
/// <param name="dataSetGeneratorSettings">Supplies the price tags and regex patterns used for detection.</param>
/// <param name="priceElementRepeatCount">Number of generated rows to add per flagged element; values of zero or less add none.</param>
/// <returns>The collected rows, in document-query order.</returns>
static IEnumerable<HtmlDataSet> ParseDocument(IDocument document, DataSetGeneratorSettings dataSetGeneratorSettings, int priceElementRepeatCount)
{
    var htmlElements = document.QuerySelectorAll("*")
        .Where(el => el.ChildElementCount == 0 && !String.IsNullOrEmpty(el.OuterHtml));
    htmlElements = htmlElements.OfTypes(new Type[]
    {
        typeof(IHtmlSpanElement),
        typeof(IHtmlDivElement),
        typeof(IHtmlMetaElement),
        typeof(IHtmlListItemElement)
    });

    var list = new List<HtmlDataSet>();
    foreach (var element in htmlElements)
    {
        // Read the markup once per element instead of once per tag/pattern check.
        // NOTE(review): the getter presumably re-serializes the node on every access - confirm;
        // either way a single read is sufficient because the element is not modified here.
        var outerHtml = element.OuterHtml;

        // Stop at the first hit: price tags are checked first, regex patterns only if no tag matched.
        bool isContainsPrice =
            dataSetGeneratorSettings.PriceTags.Any(priceTag => outerHtml.Contains(priceTag))
            || dataSetGeneratorSettings.Regex.Any(pattern => Regex.IsMatch(outerHtml, pattern));

        var htmlElement = Transform(outerHtml);

        if (isContainsPrice)
        {
            // Generated variants go in before the original element's own row.
            for (int i = 0; i < priceElementRepeatCount; i++)
            {
                var randomGeneratedHtmlElement = GenerateRandomPriceElement(htmlElement);
                list.Add(new HtmlDataSet
                {
                    IsContainsPrice = isContainsPrice,
                    HtmlElement = randomGeneratedHtmlElement,
                    ClassName = element.ClassName,
                    HtmlElementName = element.LocalName
                });
            }
        }

        list.Add(new HtmlDataSet
        {
            IsContainsPrice = isContainsPrice,
            HtmlElement = htmlElement,
            ClassName = element.ClassName,
            HtmlElementName = element.LocalName
        });
    }

    return list;
}
/// <summary>
/// Parses every <c>*.txt</c> file returned by <c>Directory.GetFiles</c> for <paramref name="folderPath"/>
/// as HTML and collects the data-set rows that <c>ParseDocument</c> produces for each file.
/// </summary>
/// <param name="folderPath">Folder whose <c>*.txt</c> files are read.</param>
/// <param name="serviceProvider">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="dataSetSettings">Settings forwarded to <c>ParseDocument</c>.</param>
/// <returns>The rows of all files, appended in the order the files are returned by <c>Directory.GetFiles</c>.</returns>
static async Task<IEnumerable<HtmlDataSet>> FileStorageDataSetGenerate(string folderPath, ServiceProvider serviceProvider, DataSetGeneratorSettings dataSetSettings)
{
    var context = BrowsingContext.New(Configuration.Default);
    var parser = context.GetService<IHtmlParser>();
    List<HtmlDataSet> list = new List<HtmlDataSet>();

    foreach (var filePath in Directory.GetFiles(folderPath, "*.txt"))
    {
        var html = await File.ReadAllTextAsync(filePath);

        // Release each document as soon as its rows are collected so that a large folder
        // does not keep every parsed DOM alive until the method returns.
        // NOTE(review): assumes the rows returned by ParseDocument hold no reference to the
        // document's nodes (they are built from the element's markup and names) - confirm.
        using (var document = parser.ParseDocument(html))
        {
            list.AddRange(ParseDocument(document, dataSetSettings, PriceElementRepeatCount));
        }
    }

    return list;
}