/// <summary>
/// Converts the single json file into several PAGE xml files.
/// <para>
/// One xml file is written per image listed in the json file. The output file is named after the
/// image's file name with its extension changed to "xml". Images whose output file already exists
/// are skipped, so an interrupted conversion can be resumed.
/// </para>
/// </summary>
/// <param name="inputFilePath">The path to the json file.</param>
/// <param name="outputFolderPath">The folder that will contain the PAGE xml files. It is created if it does not exist.</param>
public static void Convert(string inputFilePath, string outputFolderPath)
{
    CocoFile cocoFile;

    // Open read-only (so read-only inputs work) and release the file as soon as it is parsed.
    using (FileStream s = File.OpenRead(inputFilePath))
    using (StreamReader sr = new StreamReader(s))
    using (JsonReader reader = new JsonTextReader(sr))
    {
        JsonSerializer serializer = new JsonSerializer();
        cocoFile = serializer.Deserialize<CocoFile>(reader);
    }

    Dictionary<int, string> categories = cocoFile.categories.ToDictionary(k => k.id, k => k.name);

    // Index the annotations by image once instead of scanning the whole list for every image.
    // ToLookup keeps the source order inside each group and yields an empty sequence for unknown keys.
    var annotationsByImage = cocoFile.annotations.ToLookup(a => a.image_id);

    // File.WriteAllText does not create missing folders.
    Directory.CreateDirectory(outputFolderPath);

    int totalImageCount = cocoFile.images.Count;
    for (int i = 0; i < totalImageCount; i++)
    {
        var image = cocoFile.images[i];
        string outputFilePath = Path.ChangeExtension(Path.Combine(outputFolderPath, image.file_name), "xml");

        if (File.Exists(outputFilePath))
        {
            // Already converted.
            continue;
        }

        var entry = new CocoEntry(image.id, image.file_name, image.height, image.width);
        foreach (var annotation in annotationsByImage[image.id])
        {
            entry.Annotations.Add(annotation);
        }

        var pageXml = Get(entry, categories);
        File.WriteAllText(outputFilePath, pageXml);
        Console.WriteLine("Done: \t" + Path.GetFileName(outputFilePath) + "\t" + i + @"/" + totalImageCount);
    }
}
/// <summary>
/// Builds the PAGE xml page for a single COCO entry.
/// <para>
/// Each annotation becomes one region, chosen from its category name: "title" gives a heading text
/// region, "text" a paragraph text region, "list" a list label text region, "figure" an image region
/// and "table" a table region. Any other category name gives an unknown region.
/// Only the first element returned by <c>GetSegmentationPoints()</c> is used for a region.
/// </para>
/// </summary>
/// <param name="page">The COCO entry (image and its annotations) to convert.</param>
/// <param name="categories">Category names, indexed by category id.</param>
/// <returns>The page, with its image information and regions set.</returns>
private static PageXmlDocument.PageXmlPage ToPageXmlPage(CocoEntry page, Dictionary<int, string> categories)
{
    var result = new PageXmlDocument.PageXmlPage();
    result.ImageFilename = page.FileName;
    result.ImageHeight = (int)Math.Round(page.Height);
    result.ImageWidth = (int)Math.Round(page.Width);

    var items = new List<PageXmlDocument.PageXmlRegion>();
    foreach (var item in page.Annotations)
    {
        string label = categories[item.category_id];
        var outline = item.GetSegmentationPoints()[0];

        PageXmlDocument.PageXmlRegion region;
        if (label == "title")
        {
            region = ToPageXmlTextRegion(item.id, outline, PageXmlDocument.PageXmlTextSimpleType.Heading);
        }
        else if (label == "text")
        {
            region = ToPageXmlTextRegion(item.id, outline, PageXmlDocument.PageXmlTextSimpleType.Paragraph);
        }
        else if (label == "list")
        {
            region = ToPageXmlTextRegion(item.id, outline, PageXmlDocument.PageXmlTextSimpleType.LisLabel);
        }
        else if (label == "figure")
        {
            region = ToPageXmlImageRegion(item.id, outline);
        }
        else if (label == "table")
        {
            region = ToPageXmlTableRegion(item.id, outline);
        }
        else
        {
            region = ToPageXmlUnknownRegion(item.id, outline);
        }

        items.Add(region);
    }

    result.Items = items.ToArray();
    return result;
}
/// <summary>
/// Get the PAGE-XML (XML) string of the page's layout.
/// </summary>
/// <param name="page">The COCO entry (image and its annotations) to convert.</param>
/// <param name="categories">Category names, indexed by category id.</param>
/// <returns>The serialized PAGE xml document.</returns>
private static string Get(CocoEntry page, Dictionary<int, string> categories)
{
    // Read the clock once so that Created and LastChange are identical for a new document.
    DateTime now = DateTime.UtcNow;

    PageXmlDocument pageXmlDocument = new PageXmlDocument()
    {
        Metadata = new PageXmlDocument.PageXmlMetadata()
        {
            Created = now,
            LastChange = now,
            Creator = "PublayNetConverter",
            Comments = "PubLayNet dataset"
        },
        PcGtsId = "pc" + page.Id.ToString()
    };

    pageXmlDocument.Page = ToPageXmlPage(page, categories);
    return Serialize(pageXmlDocument);
}