/// <summary>
        /// Exports a single page of the PDF at <c>pdfPath</c> and writes four files next to it:
        /// the exporter's XML output (<c>.xml</c>), the rendered page image (<c>_raw.png</c>)
        /// and an undrawn 8bpp indexed image of the same size (<c>_bw_raw.png</c>).
        /// </summary>
        /// <remarks>
        /// Reads <c>pdfPath</c>, <c>pageNo</c> and <c>zoom</c>, which are declared outside this method.
        /// Each output name is built with <see cref="Path.ChangeExtension(string, string)"/>, so the
        /// page number becomes part of the new extension (e.g. <c>name.3.xml</c>).
        /// </remarks>
        public static void Run()
        {
            var exporter = new PageXmlTextExporter(NearestNeighbourWordExtractor.Instance,
                                                   RecursiveXYCut.Instance,
                                                   UnsupervisedReadingOrderDetector.Instance,
                                                   scale: zoom);

            using (var converter = new PdfImageConverter(pdfPath))
            using (PdfDocument document = PdfDocument.Open(pdfPath))
            {
                var page = document.GetPage(pageNo);

                // Layout analysis result for the page, serialised by the exporter.
                var xml = exporter.Get(page);
                File.WriteAllText(Path.ChangeExtension(pdfPath, pageNo + ".xml"), xml);

                using (var bitmap = converter.GetPage(page.Number, zoom))
                {
                    // save pdf page as image
                    bitmap.Save(Path.ChangeExtension(pdfPath, pageNo + "_raw.png"));

                    // save empty image for LayoutEvalGUI
                    // Bitmap wraps a native GDI+ handle, so it is disposed deterministically
                    // instead of being left for the finalizer.
                    using (var blackAndWhite = new Bitmap(bitmap.Width, bitmap.Height, PixelFormat.Format8bppIndexed))
                    {
                        blackAndWhite.Save(Path.ChangeExtension(pdfPath, pageNo + "_bw_raw.png"));
                    }
                }
            }
        }
示例#2
0
        /// <summary>
        /// Opens the document returned by <c>GetFilename()</c> and returns the exporter's
        /// output for its first page.
        /// </summary>
        /// <param name="pageXmlTextExporter">
        /// Exporter to use. When <c>null</c>, one is built from <c>DefaultWordExtractor</c>,
        /// <c>RecursiveXYCut</c> and <c>UnsupervisedReadingOrderDetector</c>.
        /// </param>
        /// <returns>The string produced by the exporter's <c>Get</c> call for page 1.</returns>
        private static string GetXml(PageXmlTextExporter pageXmlTextExporter = null)
        {
            // Resolve the exporter up front so the rest of the method works with a non-null instance.
            PageXmlTextExporter exporter = pageXmlTextExporter ?? new PageXmlTextExporter(
                DefaultWordExtractor.Instance,
                RecursiveXYCut.Instance,
                UnsupervisedReadingOrderDetector.Instance);

            using (PdfDocument pdf = PdfDocument.Open(GetFilename()))
            {
                var firstPage = pdf.GetPage(1);

                // The document is disposed when control leaves this block, after the export has run.
                return exporter.Get(firstPage);
            }
        }