예제 #1
0
        /// <summary>
        /// Builds <see cref="HtmlDataSet"/> rows from the leaf elements of a parsed document.
        /// </summary>
        /// <remarks>
        /// Only elements that have no child elements and a non-empty outer HTML are considered, and of those
        /// only span, div, meta and list-item elements. An element is flagged as containing a price when its
        /// outer HTML contains any of the configured price tags or matches any of the configured regular
        /// expressions. A flagged element yields <paramref name="priceElementRepeatCount"/> randomly generated
        /// variants followed by its transformed original; any other element yields only its transformed original.
        /// </remarks>
        /// <param name="document">Parsed document whose elements are scanned.</param>
        /// <param name="dataSetGeneratorSettings">Supplies the price tags and regular expressions used for flagging.</param>
        /// <param name="priceElementRepeatCount">Number of random variants added for each flagged element.</param>
        /// <returns>The generated rows, in document order of their source elements.</returns>
        static IEnumerable <HtmlDataSet> ParseDocument(IDocument document, DataSetGeneratorSettings dataSetGeneratorSettings, int priceElementRepeatCount)
        {
            var htmlElements = document.QuerySelectorAll("*").Where(el => el.ChildElementCount == 0 && !String.IsNullOrEmpty(el.OuterHtml));

            htmlElements = htmlElements.OfTypes(new Type[]
            {
                typeof(IHtmlSpanElement),
                typeof(IHtmlDivElement),
                typeof(IHtmlMetaElement),
                typeof(IHtmlListItemElement)
            });

            var list = new List <HtmlDataSet>();

            foreach (var element in htmlElements)
            {
                // Read the markup once per element instead of once per tag / pattern / transform call.
                var outerHtml = element.OuterHtml;

                bool isContainsPrice = false;

                foreach (var priceTag in dataSetGeneratorSettings.PriceTags)
                {
                    if (outerHtml.Contains(priceTag))
                    {
                        isContainsPrice = true;
                        break; // one hit is enough; further tags cannot change the flag
                    }
                }

                // The regular expressions only matter when no price tag matched.
                if (!isContainsPrice)
                {
                    foreach (var regex in dataSetGeneratorSettings.Regex)
                    {
                        if (Regex.IsMatch(outerHtml, regex))
                        {
                            isContainsPrice = true;
                            break;
                        }
                    }
                }

                var htmlElement = Transform(outerHtml);

                if (isContainsPrice)
                {
                    // Add the random variants first so they precede the original row, as before.
                    for (int i = 0; i < priceElementRepeatCount; i++)
                    {
                        var randomGeneratedHtmlElement = GenerateRandomPriceElement(htmlElement);
                        list.Add(new HtmlDataSet {
                            IsContainsPrice = isContainsPrice, HtmlElement = randomGeneratedHtmlElement, ClassName = element.ClassName, HtmlElementName = element.LocalName
                        });
                    }
                }

                list.Add(new HtmlDataSet {
                    IsContainsPrice = isContainsPrice, HtmlElement = htmlElement, ClassName = element.ClassName, HtmlElementName = element.LocalName
                });
            }

            return(list);
        }
예제 #2
0
        /// <summary>
        /// Parses every <c>*.txt</c> file in a folder as HTML and collects the data-set rows produced for each file.
        /// </summary>
        /// <param name="folderPath">Directory whose <c>*.txt</c> files are read.</param>
        /// <param name="serviceProvider">Not used by this method; kept so existing callers continue to compile.</param>
        /// <param name="dataSetSettings">Settings forwarded to <c>ParseDocument</c> for every file.</param>
        /// <returns>The rows of all files, appended in the order the files are returned by <c>Directory.GetFiles</c>.</returns>
        static async Task <IEnumerable <HtmlDataSet> > FileStorageDataSetGenerate(string folderPath, ServiceProvider serviceProvider, DataSetGeneratorSettings dataSetSettings)
        {
            var context = BrowsingContext.New(Configuration.Default);
            var parser  = context.GetService <IHtmlParser>();

            List <HtmlDataSet> list = new List <HtmlDataSet>();

            foreach (var filePath in Directory.GetFiles(folderPath, "*.txt"))
            {
                var html = await File.ReadAllTextAsync(filePath);

                // Release each parsed document as soon as its rows are collected, so a large folder
                // does not keep one DOM per file alive. ParseDocument hands back a fully built list,
                // so nothing is read from the document after this scope ends.
                using (var document = parser.ParseDocument(html))
                {
                    list.AddRange(ParseDocument(document, dataSetSettings, PriceElementRepeatCount));
                }
            }

            return(list);
        }