/// <summary>
/// Reads the bundled booking HTML asset, verifies that it is non-empty and passes
/// node validation against <c>HotelExtractorNodes</c>, and then extracts the
/// configured values from the parsed document.
/// </summary>
/// <returns>
/// The string representation of the extraction result (described by the original
/// author as a JSON string shaped by the config file).
/// </returns>
/// <exception cref="UserFriendlyException">
/// Thrown when the file content is null, empty or whitespace, or when node
/// validation returns a non-null result.
/// </exception>
public string ExtractHotelInformationFromHtml()
{
    // The asset lives in the "Assets" folder under the directory reported by FilePathHelper.
    var assetPath = Path.Combine(FilePathHelper.CallingAssemblyDirectoryPath, "Assets", "extraction.booking.html");
    var rawHtml = _fileService.ReadFileContent(assetPath);

    // Nothing to parse: report it with the dedicated "empty content" message.
    if (string.IsNullOrWhiteSpace(rawHtml))
    {
        throw new UserFriendlyException(HotelExtractorResourceKeys.HtmlContentEmptyMessage);
    }

    // Parse the markup with Html Agility Pack.
    // https://html-agility-pack.net/documentation
    var document = new HtmlDocument();
    document.LoadHtml(rawHtml);

    // A non-null validation result is treated as a failure and is embedded in the error message.
    var validationFailure = _htmlExtractorService.ValidateNodes(HotelExtractorNodes, document.DocumentNode);
    if (validationFailure != null)
    {
        var message = string.Format(HotelExtractorResourceKeys.InvalidHtmlFormat, validationFailure);
        throw new UserFriendlyException(message);
    }

    // Pull the configured values out of the document and hand back their string form.
    var extractedValues = _htmlExtractorService.GetValuesFromNodes(HotelExtractorNodes, document.DocumentNode);
    return extractedValues.ToString();
}
/// <summary>
/// Builds a three-level node configuration (top-level nodes, "Cars" children,
/// "CarModels" children), validates it against the document parsed from
/// <c>HtmlNode</c>, and expects <c>ValidateNodes</c> to return null.
/// </summary>
public void ValidateNodes_ValidData_Test()
{
    // Arrange: definitions are assembled bottom-up so each level reads on its own.

    // Children of "CarModels"; neither entry opts in to validation.
    var carModelChildren = new[]
    {
        new HtmlExtractorNode { Name = "Name", ShouldShowInOutPut = true, XPath = "./text()" },
        new HtmlExtractorNode { Name = "Model", ShouldShowInOutPut = true, XPath = "./span/div", ShouldCheckInValidation = false }
    };

    // Children of "Cars"; "Name" and "Year" are flagged for validation.
    var carChildren = new[]
    {
        new HtmlExtractorNode { Name = "Name", ShouldShowInOutPut = true, XPath = "./span[1]", ShouldCheckInValidation = true },
        new HtmlExtractorNode { Name = "Year", ShouldShowInOutPut = true, XPath = "./span[2]", ShouldCheckInValidation = true },
        new HtmlExtractorNode { Name = "CarModels", ShouldShowInOutPut = true, XPath = "./div/span", Childs = carModelChildren }
    };

    // Top-level definitions handed to the service.
    var nodeDefinitions = new[]
    {
        new HtmlExtractorNode { Name = "Head", ShouldShowInOutPut = true, XPath = "html/head", ShouldCheckInValidation = true },
        new HtmlExtractorNode { Name = "H1Text", ShouldShowInOutPut = true, XPath = "html/body/h1", ShouldCheckInValidation = true },
        new HtmlExtractorNode { Name = "H2Text", ShouldShowInOutPut = true, XPath = "html/body/h2", Type = "float", ShouldCheckInValidation = true },
        new HtmlExtractorNode { Name = "Cars", ShouldShowInOutPut = true, XPath = "./html/body/div/div", Childs = carChildren }
    };

    var document = new HtmlDocument();
    document.LoadHtml(HtmlNode);

    // Act
    var validationResult = _htmlExtractorService.ValidateNodes(nodeDefinitions, document.DocumentNode);

    // Assert: a null result is the success signal this test expects.
    Assert.IsNull(validationResult);
}