/// <summary>
/// Loads the bundled "extraction.booking.html" asset, checks it against the
/// configured <c>HotelExtractorNodes</c>, and extracts the configured values from it.
/// </summary>
/// <returns>
/// The string form of the extraction result produced by the extractor service
/// (a JSON string, according to the original documentation of this method).
/// </returns>
/// <exception cref="UserFriendlyException">
/// Thrown when the file content is null, empty or whitespace, or when node
/// validation reports a problem (a non-null validation result).
/// </exception>
public string ExtractHotelInformationFromHtml()
{
    // Resolve the asset location relative to the calling-assembly directory path helper.
    var assetPath = Path.Combine(
        FilePathHelper.CallingAssemblyDirectoryPath,
        "Assets",
        "extraction.booking.html");

    var rawHtml = _fileService.ReadFileContent(assetPath);

    // Guard: nothing useful can be parsed from blank content.
    if (string.IsNullOrWhiteSpace(rawHtml))
    {
        throw new UserFriendlyException(HotelExtractorResourceKeys.HtmlContentEmptyMessage);
    }

    // Parse with HtmlAgilityPack (https://html-agility-pack.net/documentation).
    var document = new HtmlDocument();
    document.LoadHtml(rawHtml);

    // A non-null validation result is treated as a failure and is embedded in the error message.
    var validationFailure = _htmlExtractorService.ValidateNodes(HotelExtractorNodes, document.DocumentNode);
    if (validationFailure != null)
    {
        throw new UserFriendlyException(
            string.Format(HotelExtractorResourceKeys.InvalidHtmlFormat, validationFailure));
    }

    // Extract the configured values from the parsed document and return them as a string.
    var extracted = _htmlExtractorService.GetValuesFromNodes(HotelExtractorNodes, document.DocumentNode);
    return extracted.ToString();
}
/// <summary>
/// Runs <c>GetValuesFromNodes</c> over the <c>HtmlNode</c> fixture with a nested node
/// configuration (head, two headings, and a "Cars" list whose items carry a nested
/// "CarModels" list), deserializes the result, and checks the extracted values.
/// </summary>
public void GetValuesFromNodes_Test()
{
    // Arrange: parse the fixture markup.
    var document = new HtmlDocument();
    document.LoadHtml(HtmlNode);

    // Innermost level: the text of each car-model span.
    var carModelFields = new[]
    {
        new HtmlExtractorNode { Name = "Name", ShouldShowInOutPut = true, XPath = "./text()" },
    };

    // Fields read from each car element, including its nested model list.
    var carFields = new[]
    {
        new HtmlExtractorNode { Name = "Name", ShouldShowInOutPut = true, XPath = "./span[1]" },
        new HtmlExtractorNode { Name = "Year", ShouldShowInOutPut = true, XPath = "./span[2]" },
        new HtmlExtractorNode
        {
            Name = "CarModels",
            ShouldShowInOutPut = true,
            XPath = "./div/span",
            Childs = carModelFields
        },
    };

    // Top-level configuration handed to the extractor.
    var rootNodes = new[]
    {
        new HtmlExtractorNode { Name = "Head", ShouldShowInOutPut = true, XPath = "html/head" },
        new HtmlExtractorNode { Name = "H1Text", ShouldShowInOutPut = true, XPath = "html/body/h1" },
        new HtmlExtractorNode { Name = "H2Text", ShouldShowInOutPut = true, XPath = "html/body/h2", Type = "float" },
        new HtmlExtractorNode
        {
            Name = "Cars",
            ShouldShowInOutPut = true,
            XPath = "./html/body/div/div",
            Childs = carFields
        }
    };

    // Act: extract, then round-trip the string output into the typed result.
    var extracted = _htmlExtractorService.GetValuesFromNodes(rootNodes, document.DocumentNode);
    var output = Newtonsoft.Json.JsonConvert.DeserializeObject<GetValuesFromNodesOutPut>(extracted.ToString());

    // Assert: scalar values.
    Assert.AreEqual("test", output.Head);
    StringAssert.Contains("Mojtaba", output.H1Text);
    Assert.AreEqual(12.1f, output.H2Text);

    // Assert: three cars; the first has no model list, the others have 2 and 4 models.
    Assert.That(output.Cars, Has.Count.EqualTo(3));
    Assert.That(output.Cars[0].CarModels, Is.Null);
    Assert.That(output.Cars[1].CarModels, Has.Count.EqualTo(2));
    Assert.That(output.Cars[2].CarModels, Has.Count.EqualTo(4));
}