Ejemplo n.º 1
0
        /// <summary>
        /// Round-trips a sample PAGE XML file: loads it with <c>PageXmlDocument.Deserialize</c>,
        /// serializes the resulting document again and writes the text next to the input,
        /// with the input's extension replaced by "new.xml".
        /// </summary>
        public static void Run()
        {
            // The trailing comment in the original suggests "Glyph_Sample01_General.xml" as an
            // alternative sample file name (presumably in the same folder).
            const string sourcePath = @"D:\MachineLearning\Document Layout Analysis\hocr\PAGE samples\aletheiaexamplepage_2019.xml";

            string outputPath = Path.ChangeExtension(sourcePath, "new.xml");

            PageXmlDocument document = PageXmlDocument.Deserialize(sourcePath);
            string serialized = document.Serialize();

            File.WriteAllText(outputPath, serialized);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Converts the pages of the PDF at <paramref name="path"/> to PAGE XML and writes the
        /// result next to the PDF.
        /// </summary>
        /// <remarks>
        /// The document object holds a single <c>Page</c>, so each PDF page is converted with
        /// <c>FromPdfPage</c> and serialized on its own instead of overwriting the previous one.
        /// A single-page PDF is written to "&lt;name&gt;.pagexml.xml"; a PDF with several pages
        /// produces "&lt;name&gt;.page1.pagexml.xml", "&lt;name&gt;.page2.pagexml.xml", and so on.
        /// All files written by one call share the same metadata and <c>PcGtsId</c>.
        /// </remarks>
        /// <param name="path">Path of the PDF file to open.</param>
        /// <exception cref="ArgumentNullException"><paramref name="path"/> is <c>null</c>.</exception>
        public static void Run(string path)
        {
            if (path == null)
            {
                throw new ArgumentNullException(nameof(path));
            }

            PageXmlDocument pageXmlDocument = new PageXmlDocument()
            {
                Metadata = new PageXmlMetadata()
                {
                    Created    = DateTime.UtcNow,
                    LastChange = DateTime.UtcNow,
                    Creator    = "PdfPig",
                    Comments   = "", // placeholder: meant to name the algorithm used
                },
                // Empty placeholder; replaced by the converted page before each write.
                Page = new PageXmlPage(),
                // NOTE(review): string.GetHashCode() is not guaranteed to be stable across
                // processes or runtimes, so this id may differ between runs for the same path.
                PcGtsId = "pc-" + path.GetHashCode()
            };

            using (PdfDocument document = PdfDocument.Open(path))
            {
                int pageCount = document.NumberOfPages;

                if (pageCount == 0)
                {
                    // Nothing to convert: still emit the document with its empty page,
                    // as happened before pages were written individually.
                    File.WriteAllText(Path.ChangeExtension(path, "pagexml.xml"), pageXmlDocument.Serialize());
                }

                // GetPage is called with 1-based page numbers.
                for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    Page pagePdf = document.GetPage(pageNumber);
                    pageXmlDocument.Page = FromPdfPage(pagePdf);

                    // Keep the original file name for single-page PDFs; otherwise number the
                    // outputs so that no page overwrites another.
                    string extension = pageCount == 1
                        ? "pagexml.xml"
                        : "page" + pageNumber + ".pagexml.xml";

                    // Serialize inside the loop: Page is replaced on the next iteration.
                    File.WriteAllText(Path.ChangeExtension(path, extension), pageXmlDocument.Serialize());
                }
            }
        }