Esempio n. 1
0
        /// <summary>
        /// Converts a single COCO-style json file into several PAGE xml files, one per image entry.
        /// </summary>
        /// <remarks>
        /// Images whose target xml file already exists are skipped, so an interrupted run can be resumed.
        /// A progress line is written to the console for every file that gets written.
        /// </remarks>
        /// <param name="inputFilePath">The path to the json file.</param>
        /// <param name="outputFolderPath">The folder that will contain the PAGE xml files. It is created if it does not exist.</param>
        /// <exception cref="InvalidDataException">The json file deserializes to <c>null</c> (for example, it is empty).</exception>
        public static void Convert(string inputFilePath, string outputFolderPath)
        {
            // OpenRead requests read-only access with shared reading. File.Open(path, FileMode.Open)
            // asks for read/write access with no sharing, which fails on read-only files.
            using (FileStream s = File.OpenRead(inputFilePath))
            using (StreamReader sr = new StreamReader(s))
            using (JsonReader reader = new JsonTextReader(sr))
            {
                JsonSerializer serializer = new JsonSerializer();
                var cocoFile = serializer.Deserialize<CocoFile>(reader);
                if (cocoFile == null)
                {
                    throw new InvalidDataException("The json file '" + inputFilePath + "' does not contain a document.");
                }

                // File.WriteAllText does not create missing folders. An empty folder path means
                // "current directory" for Path.Combine but is rejected by CreateDirectory, so skip it.
                if (!string.IsNullOrEmpty(outputFolderPath))
                {
                    Directory.CreateDirectory(outputFolderPath);
                }

                Dictionary<int, string> categories = cocoFile.categories.ToDictionary(k => k.id, k => k.name);

                // Group the annotations by image once instead of rescanning the whole annotation list
                // for every image. A lookup keeps the original order inside each group and yields an
                // empty sequence for images that have no annotation.
                var annotationsByImage = cocoFile.annotations.ToLookup(a => a.image_id);

                int totalImageCount = cocoFile.images.Count;
                for (int i = 0; i < totalImageCount; i++)
                {
                    var image = cocoFile.images[i];
                    string outputFilePath = Path.ChangeExtension(Path.Combine(outputFolderPath, image.file_name), "xml");
                    if (File.Exists(outputFilePath))
                    {
                        // Already converted by a previous run.
                        continue;
                    }

                    var entry = new CocoEntry(image.id, image.file_name, image.height, image.width);
                    foreach (var annotation in annotationsByImage[image.id])
                    {
                        entry.Annotations.Add(annotation);
                    }

                    var pageXml = Get(entry, categories);
                    File.WriteAllText(outputFilePath, pageXml);

                    // Report a 1-based position so that the last image reads "N/N".
                    Console.WriteLine("Done: \t" + Path.GetFileName(outputFilePath) + "\t" + (i + 1) + @"/" + totalImageCount);
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Builds the PAGE xml page of one entry: the image file name, the rounded image dimensions
        /// and one region per annotation.
        /// </summary>
        /// <param name="page">The entry whose annotations are converted.</param>
        /// <param name="categories">Category names keyed by category id.</param>
        /// <returns>The page, with its <c>Items</c> set to the converted regions in annotation order.</returns>
        private static PageXmlDocument.PageXmlPage ToPageXmlPage(CocoEntry page, Dictionary <int, string> categories)
        {
            var result = new PageXmlDocument.PageXmlPage();
            result.ImageFilename = page.FileName;
            result.ImageHeight   = (int)Math.Round(page.Height);
            result.ImageWidth    = (int)Math.Round(page.Width);

            var items = new List <PageXmlDocument.PageXmlRegion>();

            foreach (var current in page.Annotations)
            {
                string label = categories[current.category_id];

                // Only the first element returned by GetSegmentationPoints() is used.
                var outline = current.GetSegmentationPoints()[0];

                // Pick the region type from the category name; names not listed here
                // end up as an unknown region.
                PageXmlDocument.PageXmlRegion region;
                if (label == "title")
                {
                    region = ToPageXmlTextRegion(current.id, outline, PageXmlDocument.PageXmlTextSimpleType.Heading);
                }
                else if (label == "text")
                {
                    region = ToPageXmlTextRegion(current.id, outline, PageXmlDocument.PageXmlTextSimpleType.Paragraph);
                }
                else if (label == "list")
                {
                    region = ToPageXmlTextRegion(current.id, outline, PageXmlDocument.PageXmlTextSimpleType.LisLabel);
                }
                else if (label == "figure")
                {
                    region = ToPageXmlImageRegion(current.id, outline);
                }
                else if (label == "table")
                {
                    region = ToPageXmlTableRegion(current.id, outline);
                }
                else
                {
                    region = ToPageXmlUnknownRegion(current.id, outline);
                }

                items.Add(region);
            }

            result.Items = items.ToArray();
            return result;
        }
Esempio n. 3
0
        /// <summary>
        /// Get the PAGE-XML (XML) string of the page's layout.
        /// </summary>
        /// <param name="page">The entry (image and its annotations) to convert.</param>
        /// <param name="categories">Category names keyed by category id, passed on to the page conversion.</param>
        /// <returns>The string produced by <c>Serialize</c> for the populated document.</returns>
        private static string Get(CocoEntry page, Dictionary <int, string> categories)
        {
            // Read the clock once so that Created and LastChange hold exactly the same instant.
            DateTime now = DateTime.UtcNow;

            PageXmlDocument pageXmlDocument = new PageXmlDocument()
            {
                Metadata = new PageXmlDocument.PageXmlMetadata()
                {
                    Created    = now,
                    LastChange = now,
                    Creator    = "PublayNetConverter",
                    Comments   = "PubLayNet dataset"
                },
                PcGtsId = "pc" + page.Id.ToString()
            };

            pageXmlDocument.Page = ToPageXmlPage(page, categories);

            return(Serialize(pageXmlDocument));
        }