/// <summary>
        /// Exports page <c>pageNo</c> of the PDF at <c>pdfPath</c> to XML using
        /// <c>PageXmlTextExporter</c>, and writes two companion PNG files: the rendered page
        /// and a blank 8bpp indexed image of the same size.
        /// </summary>
        /// <remarks>
        /// All output files are written next to the PDF: the PDF's extension is replaced by
        /// <c>pageNo</c> followed by <c>.xml</c>, <c>_raw.png</c> and <c>_bw_raw.png</c>.
        /// </remarks>
        public static void Run()
        {
            var exporter = new PageXmlTextExporter(NearestNeighbourWordExtractor.Instance,
                                                   RecursiveXYCut.Instance,
                                                   UnsupervisedReadingOrderDetector.Instance,
                                                   scale: zoom);

            using (var converter = new PdfImageConverter(pdfPath))
            using (PdfDocument document = PdfDocument.Open(pdfPath))
            {
                var page = document.GetPage(pageNo);

                var xml = exporter.Get(page);
                File.WriteAllText(Path.ChangeExtension(pdfPath, pageNo + ".xml"), xml);

                // The same zoom is passed to the exporter (scale) and to the page renderer.
                using (var bitmap = converter.GetPage(page.Number, zoom))
                {
                    // save pdf page as image
                    bitmap.Save(Path.ChangeExtension(pdfPath, pageNo + "_raw.png"));

                    // save empty image for LayoutEvalGUI
                    // The bitmap wraps a native GDI+ handle, so dispose it as soon as it is saved.
                    using (var blackAndWhite = new Bitmap(bitmap.Width, bitmap.Height, PixelFormat.Format8bppIndexed))
                    {
                        blackAndWhite.Save(Path.ChangeExtension(pdfPath, pageNo + "_bw_raw.png"));
                    }
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Checks that the XML produced by an exporter constructed with
        /// <c>UnsupervisedReadingOrderDetector.Instance</c> contains the
        /// <c>&lt;ReadingOrder&gt;</c> opening tag and the <c>&lt;/OrderedGroup&gt;</c> closing tag.
        /// </summary>
        public void WhenReadingOrder_ContainsReadingOrderXmlElements()
        {
            var exporter = new PageXmlTextExporter(DefaultWordExtractor.Instance,
                                                   RecursiveXYCut.Instance,
                                                   UnsupervisedReadingOrderDetector.Instance);

            string exported = GetXml(exporter);

            // Fragments are checked in the same order as before: opening tag first.
            foreach (var expectedFragment in new[] { "<ReadingOrder>", "</OrderedGroup>" })
            {
                Assert.Contains(expectedFragment, exported);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Checks the strings returned by <c>PageXmlTextExporter.PointToString</c> for a
        /// 100 x 200 page: three points lying on the page edges and one interior point.
        /// </summary>
        /// <remarks>
        /// The expected values show the edge points reported one unit inside the page and the
        /// y value measured from the opposite edge (60 becomes 140 for a height of 200).
        /// </remarks>
        public void NoPointsAreOnThePageBoundary()
        {
            int width = 100;
            int height = 200;

            // Input at (0, 0).
            var atOrigin = new PdfPoint(0, 0);
            var atOriginText = PageXmlTextExporter.PointToString(atOrigin, width, height);
            Assert.Equal("1,199", atOriginText);

            // Input at (0, height).
            var atFullHeight = new PdfPoint(0, height);
            var atFullHeightText = PageXmlTextExporter.PointToString(atFullHeight, width, height);
            Assert.Equal("1,1", atFullHeightText);

            // Input at (width, height).
            var atFarCorner = new PdfPoint(width, height);
            var atFarCornerText = PageXmlTextExporter.PointToString(atFarCorner, width, height);
            Assert.Equal("99,1", atFarCornerText);

            // Interior point: no clamping expected, only the y flip.
            var interior = new PdfPoint(60, 60);
            var interiorText = PageXmlTextExporter.PointToString(interior, width, height);
            Assert.Equal("60,140", interiorText);
        }
Beispiel #4
0
        /// <summary>
        /// Opens the document named by <c>GetFilename()</c> and returns the XML the exporter
        /// produces for page 1.
        /// </summary>
        /// <param name="pageXmlTextExporter">
        /// Exporter to use. When <c>null</c>, one is created from
        /// <c>DefaultWordExtractor</c>, <c>RecursiveXYCut</c> and
        /// <c>UnsupervisedReadingOrderDetector</c>.
        /// </param>
        /// <returns>The exported XML for page 1.</returns>
        private static string GetXml(PageXmlTextExporter pageXmlTextExporter = null)
        {
            // Fall back to the default exporter configuration when the caller supplies none.
            var exporter = pageXmlTextExporter ?? new PageXmlTextExporter(DefaultWordExtractor.Instance,
                                                                          RecursiveXYCut.Instance,
                                                                          UnsupervisedReadingOrderDetector.Instance);

            using (var pdf = PdfDocument.Open(GetFilename()))
            {
                var firstPage = pdf.GetPage(1);

                // The result is obtained before the using block disposes the document.
                return exporter.Get(firstPage);
            }
        }