/// <summary>
/// Exports one page of a PDF for layout evaluation: the XML produced by
/// <see cref="PageXmlTextExporter"/>, a rendered image of the page, and an
/// empty 8bpp indexed image of the same dimensions.
/// </summary>
/// <remarks>
/// Reads <c>pdfPath</c>, <c>pageNo</c> and <c>zoom</c>, which are declared outside
/// this method. All outputs are written next to the PDF, using the PDF's path with
/// its extension replaced by <c>{pageNo}.xml</c>, <c>{pageNo}_raw.png</c> and
/// <c>{pageNo}_bw_raw.png</c>.
/// </remarks>
public static void Run()
{
    var exporter = new PageXmlTextExporter(
        NearestNeighbourWordExtractor.Instance,
        RecursiveXYCut.Instance,
        UnsupervisedReadingOrderDetector.Instance,
        scale: zoom);

    using (var converter = new PdfImageConverter(pdfPath))
    using (PdfDocument document = PdfDocument.Open(pdfPath))
    {
        var page = document.GetPage(pageNo);

        // Write the exporter's XML output for the page.
        var xml = exporter.Get(page);
        File.WriteAllText(Path.ChangeExtension(pdfPath, pageNo + ".xml"), xml);

        using (var bitmap = converter.GetPage(page.Number, zoom))
        {
            // save pdf page as image
            bitmap.Save(Path.ChangeExtension(pdfPath, pageNo + "_raw.png"));

            // save empty image for LayoutEvalGUI
            // The bitmap wraps a native GDI+ handle, so it is disposed as soon as it is saved.
            using (var blackAndWhite = new Bitmap(bitmap.Width, bitmap.Height, PixelFormat.Format8bppIndexed))
            {
                blackAndWhite.Save(Path.ChangeExtension(pdfPath, pageNo + "_bw_raw.png"));
            }
        }
    }
}
/// <summary>
/// Opens the document returned by <c>GetFilename()</c> and returns the exporter's
/// output for page 1.
/// </summary>
/// <param name="pageXmlTextExporter">
/// Exporter to use. When <c>null</c>, one is created from
/// <c>DefaultWordExtractor</c>, <c>RecursiveXYCut</c> and
/// <c>UnsupervisedReadingOrderDetector</c>.
/// </param>
/// <returns>The string returned by the exporter's <c>Get</c> call for page 1.</returns>
private static string GetXml(PageXmlTextExporter pageXmlTextExporter = null)
{
    // Fall back to the default exporter configuration when the caller supplies none.
    var exporter = pageXmlTextExporter
        ?? new PageXmlTextExporter(
            DefaultWordExtractor.Instance,
            RecursiveXYCut.Instance,
            UnsupervisedReadingOrderDetector.Instance);

    using (var pdf = PdfDocument.Open(GetFilename()))
    {
        var firstPage = pdf.GetPage(1);
        return exporter.Get(firstPage);
    }
}