/// <summary>
/// Scans the test data file for the first &lt;page&gt; element whose XML contains an
/// infobox, geobox or citation separator, runs <c>WordUtils.TrimBoxes</c> over the
/// pieces found, and asserts that at least one data item was collected.
/// </summary>
public void ContainsSections()
{
    var data = new List<KnowlegeData>();
    var inputFilePath = GetDataPath();

    using (var reader = XmlReader.Create(inputFilePath))
    {
        // ReadOuterXml() already leaves the reader positioned after the element it
        // consumed, so Read() is only called when the current node was NOT consumed
        // that way. Calling Read() unconditionally would skip a <page> that directly
        // follows the previous one without an intervening whitespace node.
        while (!reader.EOF)
        {
            if (reader.NodeType != XmlNodeType.Element || !reader.Name.Equals("page"))
            {
                reader.Read();
                continue;
            }

            var outerXml = reader.ReadOuterXml();

            // Skip(1) drops the text that precedes the first separator.
            var infoboxes = outerXml.Split(WordUtils.Infobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
            var geoboxes = outerXml.Split(WordUtils.Geobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
            var citations = outerXml.Split(WordUtils.Citacia, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);

            if (!infoboxes.Any() && !geoboxes.Any() && !citations.Any())
            {
                continue;
            }

            WordUtils.TrimBoxes<Infobox>(infoboxes, ref data);
            WordUtils.TrimBoxes<Geobox>(geoboxes, ref data);
            WordUtils.TrimBoxes<Citation>(citations, ref data);

            // Only the first page with any section is needed for this check.
            break;
        }
    }

    Assert.IsTrue(data.Any());
}
/// <summary>
/// Reads the XML input and builds the page structure: for every &lt;page&gt; element
/// that contains at least one infobox, geobox or citation piece, the extracted data
/// is wrapped in a <c>Page</c> and added to <c>Pages</c>. Pages that yield no data
/// are not added.
/// </summary>
/// <param name="inputFilePath">Location of the XML file, passed to <c>XmlReader.Create</c>.</param>
public void SetPagesFromInputFile(string inputFilePath)
{
    using (var reader = XmlReader.Create(inputFilePath))
    {
        // ReadOuterXml() already leaves the reader positioned after the element it
        // consumed, so Read() is only called when the current node was NOT consumed
        // that way. Calling Read() unconditionally would silently drop a <page> that
        // directly follows the previous one without an intervening whitespace node.
        while (!reader.EOF)
        {
            if (reader.NodeType != XmlNodeType.Element || !reader.Name.Equals("page"))
            {
                reader.Read();
                continue;
            }

            var outerXml = reader.ReadOuterXml();

            // NOTE(review): Parallel.Invoke receives a single action and does not
            // return until that action has completed, so pages are still processed
            // one at a time. It is kept so that failures keep surfacing to callers
            // wrapped in an AggregateException, exactly as before.
            Parallel.Invoke(() =>
            {
                // Get all infoboxes from the page (Skip(1) drops the text before the first separator).
                var infoboxes = outerXml.Split(WordUtils.Infobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);

                // Get all geoboxes from the page.
                var geoboxes = outerXml.Split(WordUtils.Geobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);

                // Get all citations from the page.
                var citations = outerXml.Split(WordUtils.Citacia, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);

                if (!infoboxes.Any() && !geoboxes.Any() && !citations.Any())
                {
                    return;
                }

                var data = new List<KnowlegeData>();
                WordUtils.TrimBoxes<Infobox>(infoboxes, ref data);
                WordUtils.TrimBoxes<Geobox>(geoboxes, ref data);
                WordUtils.TrimBoxes<Citation>(citations, ref data);

                if (data.Any())
                {
                    Pages.Add(new Page(data));
                }
            });
        }
    }
}
/// <summary>
/// Gets the Isaac Newton page from the test data file.
/// </summary>
/// <returns>
/// A <c>Page</c> built from the infobox pieces of the first &lt;page&gt; element whose
/// XML contains <c>&lt;title&gt;Isaac Newton&lt;/title&gt;</c>, with
/// <c>SetRegexAttributes</c> already applied to each of its infoboxes.
/// </returns>
/// <exception cref="AssertFailedException">No such page exists in the data file.</exception>
private Page GetPage()
{
    var inputFilePath = GetDataPath();

    using (var reader = XmlReader.Create(inputFilePath))
    {
        // ReadOuterXml() already leaves the reader positioned after the element it
        // consumed, so Read() is only called when the current node was NOT consumed
        // that way. Calling Read() unconditionally would skip a <page> that directly
        // follows the previous one without an intervening whitespace node.
        while (!reader.EOF)
        {
            if (reader.NodeType != XmlNodeType.Element || !reader.Name.Equals("page"))
            {
                reader.Read();
                continue;
            }

            var outerXml = reader.ReadOuterXml();
            if (!outerXml.Contains("<title>Isaac Newton</title>"))
            {
                continue;
            }

            // Skip(1) drops the text that precedes the first infobox separator.
            var infobox = outerXml.Split(WordUtils.Infobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
            var data = new List<KnowlegeData>();
            WordUtils.TrimBoxes<Infobox>(infobox, ref data);

            var page = new Page(data);
            page.Infoboxes.ForEach(x => x.SetRegexAttributes());
            return page;
        }
    }

    // The original constructed this exception without throwing it, so a missing
    // page silently produced a null return value instead of failing the test.
    throw new AssertFailedException("Page not found");
}