Ejemplo n.º 1
0
        /// <summary>
        /// Builds a <see cref="PageXmlDocument.PageXmlPage"/> for a single page: text regions from
        /// the segmented words, image regions, and optionally regions derived from the page's paths.
        /// </summary>
        /// <param name="page">The page to convert.</param>
        /// <param name="includePaths">
        /// When <c>true</c>, each entry of <c>page.ExperimentalAccess.Paths</c> is passed to
        /// <c>ToPageXmlLineDrawingRegion</c> and every non-null result is added as a region.
        /// </param>
        /// <returns>
        /// The page element whose <c>Items</c> contains the regions in the order: text, images, paths.
        /// </returns>
        private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
        {
            var pageXmlPage = new PageXmlDocument.PageXmlPage()
            {
                ImageFilename = "unknown",
                ImageHeight   = (int)Math.Round(page.Height * scale),
                ImageWidth    = (int)Math.Round(page.Width * scale),
            };

            var regions = new List<PageXmlDocument.PageXmlRegion>();

            var words = page.GetWords(wordExtractor).ToList();

            if (words.Count > 0)
            {
                var blocks = pageSegmenter.GetBlocks(words);

                // The reading order detector is optional; without it the segmenter's order is kept.
                if (readingOrderDetector != null)
                {
                    blocks = readingOrderDetector.Get(blocks).ToList();
                }

                // AddRange enumerates the projection immediately, so every text region has been
                // built before orderedRegions is inspected below.
                // NOTE(review): orderedRegions is presumably populated by ToPageXmlTextRegion - confirm.
                regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));

                if (orderedRegions.Count > 0)
                {
                    pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
                    {
                        Item = new PageXmlDocument.PageXmlOrderedGroup()
                        {
                            Items = orderedRegions.ToArray(),
                            Id    = "g" + groupOrderCount++
                        }
                    };
                }
            }

            // AddRange on an empty sequence is a no-op, so no emptiness check is needed.
            regions.AddRange(page.GetImages().Select(i => ToPageXmlImageRegion(i, page.Width, page.Height)));

            if (includePaths)
            {
                // Enumerate the deferred query exactly once. Counting it first and then filtering it
                // again would run ToPageXmlLineDrawingRegion twice per path, repeating its work and
                // any side effects it may have.
                var graphicalElements = page.ExperimentalAccess.Paths
                    .Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height))
                    .Where(g => g != null);

                regions.AddRange(graphicalElements);
            }

            pageXmlPage.Items = regions.ToArray();
            return pageXmlPage;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Converts a <see cref="CocoEntry"/> into a <see cref="PageXmlDocument.PageXmlPage"/>,
        /// producing one region per annotation.
        /// </summary>
        /// <param name="page">The entry supplying the file name, dimensions and annotations.</param>
        /// <param name="categories">Lookup from an annotation's <c>category_id</c> to its category name.</param>
        /// <returns>The page element with one item per annotation, in annotation order.</returns>
        private static PageXmlDocument.PageXmlPage ToPageXmlPage(CocoEntry page, Dictionary <int, string> categories)
        {
            var items = new List<PageXmlDocument.PageXmlRegion>();

            foreach (var annotation in page.Annotations)
            {
                var label = categories[annotation.category_id];

                // Only the first set of segmentation points is used.
                var points = annotation.GetSegmentationPoints()[0];

                // Pick the region for this category name; anything unrecognised becomes an unknown region.
                PageXmlDocument.PageXmlRegion region;
                switch (label)
                {
                case "title":
                    region = ToPageXmlTextRegion(annotation.id, points, PageXmlDocument.PageXmlTextSimpleType.Heading);
                    break;

                case "text":
                    region = ToPageXmlTextRegion(annotation.id, points, PageXmlDocument.PageXmlTextSimpleType.Paragraph);
                    break;

                case "list":
                    region = ToPageXmlTextRegion(annotation.id, points, PageXmlDocument.PageXmlTextSimpleType.LisLabel);
                    break;

                case "figure":
                    region = ToPageXmlImageRegion(annotation.id, points);
                    break;

                case "table":
                    region = ToPageXmlTableRegion(annotation.id, points);
                    break;

                default:
                    region = ToPageXmlUnknownRegion(annotation.id, points);
                    break;
                }

                items.Add(region);
            }

            var result = new PageXmlDocument.PageXmlPage()
            {
                ImageFilename = page.FileName,
                ImageHeight   = (int)Math.Round(page.Height),
                ImageWidth    = (int)Math.Round(page.Width),
            };
            result.Items = items.ToArray();

            return result;
        }