/// <summary>
/// Builds the PAGE XML page for a PDF page: text regions from the segmented words,
/// image regions from the page images and, optionally, line-drawing regions from the page paths.
/// </summary>
/// <param name="page">The PDF page to convert.</param>
/// <param name="includePaths">When <c>true</c>, the page's paths are converted to line-drawing regions and appended.</param>
/// <returns>
/// A page whose <c>Items</c> contain the text regions first, then the image regions, then the
/// line-drawing regions. The image file name is always the literal "unknown".
/// </returns>
private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
{
    var pageXmlPage = new PageXmlDocument.PageXmlPage()
    {
        ImageFilename = "unknown",
        // Page dimensions are multiplied by the exporter's scale and rounded to whole units.
        ImageHeight = (int)Math.Round(page.Height * scale),
        ImageWidth = (int)Math.Round(page.Width * scale),
    };

    var regions = new List<PageXmlDocument.PageXmlRegion>();

    var words = page.GetWords(wordExtractor).ToList();
    if (words.Count > 0)
    {
        var blocks = pageSegmenter.GetBlocks(words);

        // The reading order detector is optional; without it the segmenter's order is kept.
        if (readingOrderDetector != null)
        {
            blocks = readingOrderDetector.Get(blocks).ToList();
        }

        regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));

        // NOTE(review): orderedRegions is not filled in this method - presumably it is
        // populated while the text regions above are created; confirm against the helper.
        if (orderedRegions.Count > 0)
        {
            pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
            {
                Item = new PageXmlDocument.PageXmlOrderedGroup()
                {
                    Items = orderedRegions.ToArray(),
                    Id = "g" + groupOrderCount++
                }
            };
        }
    }

    var images = page.GetImages().ToList();
    if (images.Count > 0)
    {
        regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Width, page.Height)));
    }

    if (includePaths)
    {
        // Materialise the query exactly once. Enumerating the deferred Select twice
        // (once to count, once to add) would convert every path twice and repeat any
        // side effects of the conversion helper. Paths that convert to null are skipped.
        var graphicalElements = page.ExperimentalAccess.Paths
            .Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height))
            .Where(g => g != null)
            .ToList();

        regions.AddRange(graphicalElements);
    }

    pageXmlPage.Items = regions.ToArray();
    return pageXmlPage;
}
/// <summary>
/// Converts a COCO entry into a PAGE XML page, producing one region per annotation.
/// </summary>
/// <param name="page">The COCO entry supplying the file name, the dimensions and the annotations.</param>
/// <param name="categories">Lookup from an annotation's category id to its category name.</param>
/// <returns>
/// A page whose <c>Items</c> hold the regions in annotation order. "title", "text" and "list"
/// become text regions, "figure" an image region, "table" a table region, and any other
/// category name an unknown region.
/// </returns>
private static PageXmlDocument.PageXmlPage ToPageXmlPage(CocoEntry page, Dictionary<int, string> categories)
{
    var regionList = new List<PageXmlDocument.PageXmlRegion>();

    foreach (var item in page.Annotations)
    {
        // The indexer throws when the category id is not present in the lookup.
        var label = categories[item.category_id];

        // Only the first entry returned by GetSegmentationPoints() is used.
        var outline = item.GetSegmentationPoints()[0];

        PageXmlDocument.PageXmlRegion region;
        switch (label)
        {
            case "title":
                region = ToPageXmlTextRegion(item.id, outline, PageXmlDocument.PageXmlTextSimpleType.Heading);
                break;

            case "text":
                region = ToPageXmlTextRegion(item.id, outline, PageXmlDocument.PageXmlTextSimpleType.Paragraph);
                break;

            case "list":
                region = ToPageXmlTextRegion(item.id, outline, PageXmlDocument.PageXmlTextSimpleType.LisLabel);
                break;

            case "figure":
                region = ToPageXmlImageRegion(item.id, outline);
                break;

            case "table":
                region = ToPageXmlTableRegion(item.id, outline);
                break;

            default:
                region = ToPageXmlUnknownRegion(item.id, outline);
                break;
        }

        regionList.Add(region);
    }

    return new PageXmlDocument.PageXmlPage()
    {
        ImageFilename = page.FileName,
        ImageHeight = (int)Math.Round(page.Height),
        ImageWidth = (int)Math.Round(page.Width),
        Items = regionList.ToArray(),
    };
}