public void CanExportAltoXmlFormat()
{
    // Export page 4 of the test document through the ALTO XML exporter and
    // check that the result is non-null and can be loaded as an XDocument.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var wordExtractor = new NearestNeighbourWordExtractor();
        var pageSegmenter = new DocstrumBoundingBoxes();
        var altoExporter = new AltoXmlTextExporter(wordExtractor, pageSegmenter);

        var page = pdf.GetPage(4);
        var altoXml = altoExporter.Get(page, true);

        Assert.NotNull(altoXml);

        // Round-trip the exported string through a UTF-8 stream; XDocument.Load
        // throws if the exporter produced XML the reader cannot parse.
        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);
        using (var buffer = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(buffer))
        {
            var parsed = XDocument.Load(reader);

            Assert.NotNull(parsed);
        }
    }
}
public void CanExportAltoXmlFormatPage16()
{
    // Page 16 contains an unprintable string and a single line of text which causes problems for Docstrum.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var altoExporter = new AltoXmlTextExporter(
            new NearestNeighbourWordExtractor(),
            new DocstrumBoundingBoxes());

        var altoXml = altoExporter.Get(pdf.GetPage(16), true);

        Assert.NotNull(altoXml);

        // Feed the exported text back through an XML reader: loading it into
        // an XDocument throws if the reader cannot parse the output.
        // NOTE(review): XmlTextReader is kept on purpose; other reader types may
        // apply different character checks to this page's output - confirm before swapping.
        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);
        using (var buffer = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(buffer))
        {
            var parsed = XDocument.Load(reader);

            Assert.NotNull(parsed);
        }
    }
}