Ejemplo n.º 1
0
        /// <summary>
        /// Round-trips a sample ALTO file: deserializes it, serializes it again,
        /// and writes the resulting XML next to the input with a "new.xml" extension.
        /// </summary>
        public static void Run()
        {
            // Hard-coded sample input; the output location is derived from it.
            const string sourcePath = @"D:\MachineLearning\Document Layout Analysis\hocr\Glyph_Sample01_General.xml";
            string targetPath = Path.ChangeExtension(sourcePath, "new.xml");

            AltoDocument document = AltoDocument.Deserialize(sourcePath);
            var serialized = document.Serialize();

            File.WriteAllText(targetPath, serialized);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Serializes an <see cref="AltoDocument"/> to an indented XML string.
        /// </summary>
        /// <param name="altoDocument">The document to serialize.</param>
        /// <returns>
        /// The XML text, decoded from UTF-8 and free of a leading byte order mark character.
        /// </returns>
        private string Serialize(AltoDocument altoDocument)
        {
            var serializer = new XmlSerializer(typeof(AltoDocument));
            var settings   = new XmlWriterSettings
            {
                // Encoding.UTF8 makes the writer emit a BOM preamble into the stream, which
                // GetString() then decodes into a stray U+FEFF at the start of the result.
                // A BOM-less UTF-8 encoding keeps the returned string clean.
                Encoding    = new System.Text.UTF8Encoding(false),
                Indent      = true,
                IndentChars = indentChar,
            };

            using (var memoryStream = new System.IO.MemoryStream())
            {
                // Dispose the writer before reading the buffer so that everything it has
                // buffered is guaranteed to be in the stream.
                using (var xmlWriter = XmlWriter.Create(memoryStream, settings))
                {
                    serializer.Serialize(xmlWriter, altoDocument);
                }

                return System.Text.Encoding.UTF8.GetString(memoryStream.ToArray());
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Builds an ALTO document from the PDF at <paramref name="path"/> and writes it as
        /// indented XML next to the input, with the extension changed to "xml".
        /// For every page, words are extracted, filtered to horizontal / 180°-rotated text,
        /// segmented into blocks with the recursive XY-cut, and stored as the page's text blocks.
        /// Timing for word extraction and segmentation is printed to the console.
        /// </summary>
        /// <param name="path">Path of the PDF file to convert.</param>
        public static void Run(string path)
        {
            AltoDocument alto = new AltoDocument()
            {
                Layout = new AltoLayout()
                {
                    StyleRefs = null
                },
                Description   = GetAltoDescription("filename"),
                SchemaVersion = "SCHEMAVERSION1",
                Styles        = new AltoStyles()
                {
                },
                Tags = new AltoTags()
                {
                }
            };

            List <AltoPage> altoPages = new List <AltoPage>();

            using (PdfDocument document = PdfDocument.Open(path))
            {
                // NOTE(review): the result of this call is never used below; it is kept only
                // because its side effects are not visible from here — confirm and remove.
                var testAlto = AltoDocument.FromPdfDocument(document);

                for (var i = 0; i < document.NumberOfPages; i++)
                {
                    // GetPage is called with 1-based page numbers.
                    var pagePdf = document.GetPage(i + 1);

                    Stopwatch stopwatch = Stopwatch.StartNew();
                    var words = pagePdf.GetWords(NearestNeighbourWordExtractor.Instance);
                    stopwatch.Stop();
                    Console.WriteLine("GetWords() - Time elapsed: {0}", stopwatch.Elapsed);

                    // Only horizontal and 180°-rotated words are passed to the segmenter.
                    var pageWordsH = words.Where(x => x.TextDirection == TextDirection.Horizontal || x.TextDirection == TextDirection.Rotate180).ToArray();

                    stopwatch.Restart();
                    var blocks = RecursiveXYCut.Instance.GetBlocks(pageWordsH);
                    stopwatch.Stop();
                    Console.WriteLine("RecursiveXYCut() - Time elapsed: {0}", stopwatch.Elapsed);

                    altoPages.Add(new AltoPage()
                    {
                        Height         = (float)pagePdf.Height,
                        Width          = (float)pagePdf.Width,
                        Accuracy       = float.NaN,
                        Quality        = AltoQuality.OK,
                        QualityDetail  = null,
                        BottomMargin   = null,
                        LeftMargin     = null,
                        RightMargin    = null,
                        TopMargin      = null,
                        Pc             = float.NaN,
                        PhysicalImgNr  = pagePdf.Number,
                        PrintedImgNr   = null,
                        PageClass      = null,
                        Position       = AltoPosition.Cover,
                        Processing     = null,
                        ProcessingRefs = null,
                        StyleRefs      = null,
                        PrintSpace     = new AltoPageSpace()
                        {
                            Height            = (float)pagePdf.Height,          // TBD
                            Width             = (float)pagePdf.Width,           // TBD
                            VPos              = 0f,                             // TBD
                            HPos              = 0f,                             // TBD
                            ComposedBlocks    = null,                           // TBD
                            GraphicalElements = null,                           // TBD
                            Illustrations     = null,                           // TBD
                            ProcessingRefs    = null,                           // TBD
                            StyleRefs         = null,                           // TBD
                            TextBlock         = blocks.Select(b => ToAltoTextBlock(b, pagePdf.Height)).ToArray(),
                            Id = "NA"
                        },
                        Id = "NA"
                    });
                }
            }

            alto.Layout.Pages = altoPages.ToArray();

            XmlSerializer xsSubmit = new XmlSerializer(typeof(AltoDocument));
            byte[]        xmlBytes;

            // Serialize into a UTF-8 buffer rather than a StringWriter: a StringWriter makes the
            // XML declaration say encoding="utf-16" while the file on disk is written as UTF-8,
            // so the declared and actual encodings would disagree.
            using (var buffer = new MemoryStream())
            {
                using (XmlTextWriter writer = new XmlTextWriter(buffer, new System.Text.UTF8Encoding(false))
                {
                    Formatting = Formatting.Indented
                })
                {
                    xsSubmit.Serialize(writer, alto);
                }

                // ToArray() is still valid after the writer has closed the stream.
                xmlBytes = buffer.ToArray();
            }

            // The file is only touched once serialization has fully succeeded.
            File.WriteAllBytes(Path.ChangeExtension(path, "xml"), xmlBytes);
        }