/// <summary>
        /// Serializes a text block into an XML element and nests one child element
        /// for every paragraph, line and word it contains. Element names and class
        /// values are taken from <c>HocrFormatConsts</c>.
        /// </summary>
        /// <param name="textBlock">Text block whose paragraphs, lines and words are written out.</param>
        /// <param name="idCounter">Source of the "id" attribute value for each created element.</param>
        /// <returns>The text block element with all of its descendants attached.</returns>
        private static XElement MakeTextBlockElement(GMTextBlock textBlock, IdCounter idCounter)
        {
            // Ids are requested in document order: the block first, then every
            // paragraph, line and word as it is encountered.
            var blockAttributes = new Dictionary<string, string>
            {
                { "class", HocrFormatConsts.ClassTextBlock },
                { "id", idCounter.NextTextBlockId },
                { "title", FormatTitleAttribute(textBlock.BoundingBox) }
            };
            var blockNode = MakeXElementWithAttributes(HocrFormatConsts.XNameDiv, blockAttributes);

            foreach (var para in textBlock.Paragraphs())
            {
                var paraAttributes = new Dictionary<string, string>
                {
                    { "class", HocrFormatConsts.ClassParagraph },
                    { "dir", "ltr" },
                    { "id", idCounter.NextParagraphId },
                    { "title", FormatTitleAttribute(para.BoundingBox) }
                };
                var paraNode = MakeXElementWithAttributes(HocrFormatConsts.XNameP, paraAttributes);

                foreach (var textLine in para.Lines())
                {
                    var lineAttributes = new Dictionary<string, string>
                    {
                        { "class", HocrFormatConsts.ClassLine },
                        { "id", idCounter.NextLineId },
                        { "title", FormatTitleAttribute(textLine.BoundingBox) }
                    };
                    var lineNode = MakeXElementWithAttributes(HocrFormatConsts.XNameSpan, lineAttributes);

                    foreach (var item in textLine.Words())
                    {
                        // The "accuracy" attribute carries word.Accuracy divided by 100,
                        // formatted with two decimals in the invariant culture.
                        var scaledAccuracy = string.Format(CultureInfo.InvariantCulture, "{0:N2}", item.Accuracy / 100.0);
                        var wordAttributes = new Dictionary<string, string>
                        {
                            { "class", HocrFormatConsts.ClassWord },
                            { "id", idCounter.NextWordId },
                            { "title", FormatTitleAttribute(item.BoundingBox, item.Accuracy) },
                            { "accuracy", scaledAccuracy }
                        };
                        var wordNode = MakeXElementWithAttributes(HocrFormatConsts.XNameSpan, wordAttributes);

                        // The word text itself is wrapped in a nested element.
                        var textNode = new XElement(HocrFormatConsts.XNameStrong, item.Text);
                        wordNode.Add(textNode);

                        lineNode.Add(wordNode);
                    }

                    paraNode.Add(lineNode);
                }
                // TODO: store information about standalone words (they should be split onto lines)

                blockNode.Add(paraNode);
            }

            return blockNode;
        }
Пример #2
0
        /// <summary>
        /// Reads a text block from its XML element: the bounding box comes from the
        /// element itself and the paragraphs from its paragraph child elements.
        /// Paragraphs that report themselves as empty are left out.
        /// </summary>
        /// <param name="textBlockElement">XML element describing the text block.</param>
        /// <returns>A new text block holding every non-empty parsed paragraph.</returns>
        private static GMTextBlock ParseTextBlock([NotNull] this XElement textBlockElement)
        {
            var result = new GMTextBlock(textBlockElement.ParseBBoxAttribute());

            foreach (var childElement in textBlockElement.ParagraphElements())
            {
                var parsed = childElement.ParseParagraph();
                if (parsed.IsEmpty())
                {
                    // Nothing worth keeping in this paragraph.
                    continue;
                }

                result.AddParagraph(parsed);
            }

            return result;
        }