コード例 #1
0
        /// <summary>
        /// Loads the bundled booking HTML asset, checks that it is usable, and extracts the
        /// hotel information described by <c>HotelExtractorNodes</c>.
        /// </summary>
        /// <returns>
        /// The extraction result converted to a string (JSON shaped by the node configuration).
        /// </returns>
        /// <exception cref="UserFriendlyException">
        /// Thrown when the file content is null, empty or whitespace only, or when node
        /// validation returns a non-null result.
        /// </exception>
        public string ExtractHotelInformationFromHtml()
        {
            // The sample page lives under "Assets", relative to the calling assembly directory.
            var assetPath = Path.Combine(FilePathHelper.CallingAssemblyDirectoryPath, "Assets", "extraction.booking.html");
            var rawHtml = _fileService.ReadFileContent(assetPath);

            // Nothing can be parsed out of a missing or blank document.
            if (string.IsNullOrWhiteSpace(rawHtml))
            {
                throw new UserFriendlyException(HotelExtractorResourceKeys.HtmlContentEmptyMessage);
            }

            // Parse the markup with Html Agility Pack.
            // https://html-agility-pack.net/documentation
            var document = new HtmlDocument();
            document.LoadHtml(rawHtml);

            var rootNode = document.DocumentNode;

            // A non-null validation result is treated as a failure and is embedded in the error message.
            var validationResult = _htmlExtractorService.ValidateNodes(HotelExtractorNodes, rootNode);
            if (validationResult != null)
            {
                var message = string.Format(HotelExtractorResourceKeys.InvalidHtmlFormat, validationResult);
                throw new UserFriendlyException(message);
            }

            // Extract the configured values and hand them back in string form.
            var extracted = _htmlExtractorService.GetValuesFromNodes(HotelExtractorNodes, rootNode);
            return extracted.ToString();
        }
コード例 #2
0
        // Verifies that GetValuesFromNodes extracts flat values, a typed ("float") value and
        // nested child collections from the HtmlNode fixture markup.
        public void GetValuesFromNodes_Test()
        {
            // Arrange: parse the fixture markup.
            var document = new HtmlDocument();
            document.LoadHtml(HtmlNode);

            // Node definitions are built bottom-up so each nesting level has a name.
            var carModelChildren = new[]
            {
                new HtmlExtractorNode { Name = "Name", ShouldShowInOutPut = true, XPath = "./text()" },
            };

            var carChildren = new[]
            {
                new HtmlExtractorNode { Name = "Name", ShouldShowInOutPut = true, XPath = "./span[1]" },
                new HtmlExtractorNode { Name = "Year", ShouldShowInOutPut = true, XPath = "./span[2]" },
                new HtmlExtractorNode
                {
                    Name = "CarModels",
                    ShouldShowInOutPut = true,
                    XPath = "./div/span",
                    Childs = carModelChildren
                },
            };

            var extractorNodes = new[]
            {
                new HtmlExtractorNode { Name = "Head", ShouldShowInOutPut = true, XPath = "html/head" },
                new HtmlExtractorNode { Name = "H1Text", ShouldShowInOutPut = true, XPath = "html/body/h1" },
                new HtmlExtractorNode { Name = "H2Text", ShouldShowInOutPut = true, XPath = "html/body/h2", Type = "float" },
                new HtmlExtractorNode
                {
                    Name = "Cars",
                    ShouldShowInOutPut = true,
                    XPath = "./html/body/div/div",
                    Childs = carChildren
                }
            };

            // Act: run the extraction and read its string form back as a typed object.
            var extraction = _htmlExtractorService.GetValuesFromNodes(extractorNodes, document.DocumentNode);
            var serialized = extraction.ToString();
            var output = Newtonsoft.Json.JsonConvert.DeserializeObject<GetValuesFromNodesOutPut>(serialized);

            // Assert: top-level values.
            Assert.AreEqual("test", output.Head);
            StringAssert.Contains("Mojtaba", output.H1Text);
            Assert.AreEqual(12.1f, output.H2Text);

            // Assert: three cars; the first has no models, the others have two and four.
            var cars = output.Cars;
            Assert.That(cars, Has.Count.EqualTo(3));
            Assert.That(cars[0].CarModels, Is.Null);
            Assert.That(cars[1].CarModels, Has.Count.EqualTo(2));
            Assert.That(cars[2].CarModels, Has.Count.EqualTo(4));
        }