/// <summary>
/// Builds the XML element for a text block together with its nested
/// paragraph, line and word elements.
/// </summary>
/// <param name="textBlock">Text block whose paragraphs, lines and words are serialized.</param>
/// <param name="idCounter">Source of the "id" attribute values; one id is taken per created block, paragraph, line and word element.</param>
/// <returns>The text block element with all paragraph elements attached as children.</returns>
private static XElement MakeTextBlockElement(GMTextBlock textBlock, IdCounter idCounter) {
    var blockAttributes = new Dictionary<string, string> {
        { "class", HocrFormatConsts.ClassTextBlock },
        { "id", idCounter.NextTextBlockId },
        { "title", FormatTitleAttribute(textBlock.BoundingBox) }
    };
    var blockNode = MakeXElementWithAttributes(HocrFormatConsts.XNameDiv, blockAttributes);

    foreach (var para in textBlock.Paragraphs()) {
        var paraAttributes = new Dictionary<string, string> {
            { "class", HocrFormatConsts.ClassParagraph },
            { "dir", "ltr" },
            { "id", idCounter.NextParagraphId },
            { "title", FormatTitleAttribute(para.BoundingBox) }
        };
        var paraNode = MakeXElementWithAttributes(HocrFormatConsts.XNameP, paraAttributes);
        blockNode.Add(paraNode);

        foreach (var textLine in para.Lines()) {
            var lineAttributes = new Dictionary<string, string> {
                { "class", HocrFormatConsts.ClassLine },
                { "id", idCounter.NextLineId },
                { "title", FormatTitleAttribute(textLine.BoundingBox) }
            };
            var lineNode = MakeXElementWithAttributes(HocrFormatConsts.XNameSpan, lineAttributes);
            paraNode.Add(lineNode);

            foreach (var token in textLine.Words()) {
                // The "accuracy" attribute carries the word accuracy divided by 100,
                // rendered with the invariant-culture "N2" format.
                var wordAttributes = new Dictionary<string, string> {
                    { "class", HocrFormatConsts.ClassWord },
                    { "id", idCounter.NextWordId },
                    { "title", FormatTitleAttribute(token.BoundingBox, token.Accuracy) },
                    { "accuracy", string.Format(CultureInfo.InvariantCulture, "{0:N2}", token.Accuracy / 100.0) }
                };
                var wordNode = MakeXElementWithAttributes(HocrFormatConsts.XNameSpan, wordAttributes);
                lineNode.Add(wordNode);

                // The word text itself is wrapped in a nested "strong" element.
                var textNode = new XElement(HocrFormatConsts.XNameStrong, token.Text);
                wordNode.Add(textNode);
            }
        }

        // TODO: store information about standalone words (they should be split onto lines)
    }

    return blockNode;
}
/// <summary>
/// Reads a text block from its XML element: the bounding box is parsed from
/// the element itself, and every non-empty paragraph parsed from the
/// element's paragraph children is added to the resulting block.
/// </summary>
/// <param name="textBlockElement">Element describing the text block; must not be null.</param>
/// <returns>A new <see cref="GMTextBlock"/> holding the parsed, non-empty paragraphs.</returns>
private static GMTextBlock ParseTextBlock([NotNull] this XElement textBlockElement) {
    var result = new GMTextBlock(textBlockElement.ParseBBoxAttribute());

    foreach (var paragraphNode in textBlockElement.ParagraphElements()) {
        var parsed = paragraphNode.ParseParagraph();

        // Empty paragraphs are dropped rather than stored in the block.
        if (parsed.IsEmpty()) {
            continue;
        }

        result.AddParagraph(parsed);
    }

    return result;
}