/// <summary>
 /// Parses every table element exposed by <paramref name="pageElement"/> and adds the result to <paramref name="model"/>.
 /// </summary>
 /// <param name="pageElement">The page element whose table elements are enumerated.</param>
 /// <param name="model">The model that receives each parsed table.</param>
 private static void ParseTables(XElement pageElement, TextGeometryModel model)
 {
     foreach (var element in pageElement.TableElements())
     {
         model.AddTable(element.ParseTable());
     }
 }
 /// <summary>
 /// Parses every separator element exposed by <paramref name="pageElement"/> and adds the result to <paramref name="model"/>.
 /// </summary>
 /// <param name="pageElement">The page element whose separator elements are enumerated.</param>
 /// <param name="model">The model that receives each parsed separator.</param>
 private static void ParseSeparators(XElement pageElement, TextGeometryModel model)
 {
     foreach (var element in pageElement.SeparatorElements())
     {
         model.AddSeparator(element.ParseSeparator());
     }
 }
        /// <summary>
        /// Collects the words of the FineReader model that are also found among the words of the Tesseract model.
        /// </summary>
        /// <remarks>
        /// Both models are first rescaled by the factor <c>150 / GridUnit.Divisor</c>. The returned words come from the
        /// rescaled FineReader model, not from the model passed in.
        /// </remarks>
        /// <param name="fineReaderModel">The model whose words are tested.</param>
        /// <param name="tesseractModel">The model whose words form the lookup structure.</param>
        /// <returns>The set of rescaled FineReader words for which the lookup reports a match.</returns>
        public HashSet <GMWord> GetSameWords([NotNull] TextGeometryModel fineReaderModel, [NotNull] TextGeometryModel tesseractModel)
        {
            const double targetResolution = 150;

            var frScaled  = fineReaderModel.ScaleModel(targetResolution / fineReaderModel.GridUnit.Divisor);
            var tesScaled = tesseractModel.ScaleModel(targetResolution / tesseractModel.GridUnit.Divisor);
            var tesLookup = new LinesHashInGeometryModel(tesScaled.Words(), yEpsilon, xEpsilon, tesScaled.PageBox);

            var matches = new HashSet<GMWord>();
            foreach (var candidate in frScaled.Words())
            {
                if (tesLookup.Contains(candidate, String.Compare))
                {
                    matches.Add(candidate);
                }
            }

            return matches;
        }
        /// <summary>
        /// Builds a new model in <paramref name="gridUnit"/> units that combines the words of both input models.
        /// </summary>
        /// <remarks>
        /// Both inputs are rescaled to <paramref name="gridUnit"/>. The FineReader structure is copied into the new
        /// model by <c>EnumerateWordsFromFineReader</c>, and <c>AddWordsFromTesseract</c> then adds the Tesseract
        /// words that are not among the matched ones.
        /// </remarks>
        /// <param name="fineReaderModel">The FineReader model; its rescaled page box becomes the page box of the result.</param>
        /// <param name="tesseractModel">The Tesseract model.</param>
        /// <param name="gridUnit">The grid unit of the resulting model.</param>
        /// <returns>The newly created model.</returns>
        public TextGeometryModel GetSampleModel(TextGeometryModel fineReaderModel, TextGeometryModel tesseractModel, [NotNull] GridUnit gridUnit)
        {
            var frFactor = gridUnit.Divisor / (double)fineReaderModel.GridUnit.Divisor;
            var frScaled = fineReaderModel.ScaleModel(frFactor);

            var tesFactor = gridUnit.Divisor / (double)tesseractModel.GridUnit.Divisor;
            var tesScaled = tesseractModel.ScaleModel(tesFactor);

            var tesLookup = new LinesHashInGeometryModel(tesScaled.Words(), yEpsilon, xEpsilon, tesScaled.PageBox);
            var result    = new TextGeometryModel(frScaled.PageBox, gridUnit);

            // The enumerator fills `result` as a side effect while it is iterated, so it must be
            // fully materialized before the Tesseract words are added.
            var matchedWords = new List<GMWord>(EnumerateWordsFromFineReader(frScaled, result, tesLookup));

            AddWordsFromTesseract(tesScaled, result, matchedWords);
            return result;
        }
 /// <summary>
 /// Parses every text block element exposed by <paramref name="pageElement"/> and adds the non-empty ones to
 /// <paramref name="model"/>.
 /// </summary>
 /// <param name="pageElement">The page element whose text block elements are enumerated.</param>
 /// <param name="model">The model that receives each non-empty parsed text block.</param>
 private static void ParseTextBlocks(XElement pageElement, TextGeometryModel model)
 {
     foreach (var element in pageElement.TextBlockElements())
     {
         var parsed = element.ParseTextBlock();
         if (parsed.IsEmpty())
         {
             // Blocks reported as empty are not added to the model.
             continue;
         }
         model.AddTextBlock(parsed);
     }
 }
        /// <summary>
        /// Copies the block / paragraph / line / word structure of <paramref name="fineReaderModel"/> into
        /// <paramref name="sampleModel"/> and yields every source word that <paramref name="tesseractCache"/> contains.
        /// </summary>
        /// <remarks>
        /// This is a lazy iterator: <paramref name="sampleModel"/> is only filled while the returned sequence is
        /// being enumerated, so callers must enumerate it to the end. Copied words get accuracy 100 when they are
        /// found in <paramref name="tesseractCache"/> and 50 otherwise.
        /// </remarks>
        /// <param name="fineReaderModel">The model whose structure is copied.</param>
        /// <param name="sampleModel">The model that receives the copied structure.</param>
        /// <param name="tesseractCache">The lookup used to decide whether a word counts as matched.</param>
        /// <returns>The matched words, as instances taken from <paramref name="fineReaderModel"/>.</returns>
        private static IEnumerable <GMWord> EnumerateWordsFromFineReader(TextGeometryModel fineReaderModel, TextGeometryModel sampleModel,
                                                                         LinesHashInGeometryModel tesseractCache)
        {
            foreach (var sourceBlock in fineReaderModel.TextBlocks())
            {
                var blockBounds = sourceBlock.BoundingBox;
                var targetBlock = sampleModel.AddTextBlock(
                    new BoundingBox(blockBounds.XMin, blockBounds.YMin, blockBounds.XMax, blockBounds.YMax));

                foreach (var sourceParagraph in sourceBlock.Paragraphs())
                {
                    var paragraphBounds = sourceParagraph.BoundingBox;
                    var targetParagraph = targetBlock.AddParagraph(
                        new BoundingBox(paragraphBounds.XMin, paragraphBounds.YMin, paragraphBounds.XMax, paragraphBounds.YMax));

                    foreach (var sourceLine in sourceParagraph.Lines())
                    {
                        var lineBounds = sourceLine.BoundingBox;
                        var targetLine = targetParagraph.AddLine(
                            new BoundingBox(lineBounds.XMin, lineBounds.YMin, lineBounds.XMax, lineBounds.YMax));

                        foreach (var sourceWord in sourceLine.Words())
                        {
                            var wordBounds = sourceWord.BoundingBox;
                            var confirmed  = tesseractCache.Contains(sourceWord, String.Compare);
                            if (confirmed)
                            {
                                // The matched word is handed to the consumer before its copy is added.
                                yield return sourceWord;
                            }
                            targetLine.AddWord(new GMWord(wordBounds, sourceWord.Text, confirmed ? 100 : 50));
                        }
                    }
                }

                // Words attached directly to the block are handled the same way as words inside lines.
                foreach (var sourceWord in sourceBlock.StandaloneWords())
                {
                    var wordBounds = sourceWord.BoundingBox;
                    var confirmed  = tesseractCache.Contains(sourceWord, String.Compare);
                    if (confirmed)
                    {
                        yield return sourceWord;
                    }
                    targetBlock.AddStandaloneWord(new GMWord(wordBounds, sourceWord.Text, confirmed ? 100 : 50));
                }
            }
        }
        /// <summary>
        /// Writes <paramref name="geometryModel"/> as an XML document to the file
        /// <paramref name="targetFileName"/> inside <paramref name="targetPath"/>.
        /// </summary>
        /// <remarks>
        /// <c>StoreHOCRVisualizerScripts</c> is called for the target directory before the document is built.
        /// The document is built with "./" as the script path prefix. The file is opened with
        /// <see cref="FileMode.Create"/>, so an existing file is overwritten.
        /// </remarks>
        /// <param name="geometryModel">The model to store.</param>
        /// <param name="targetPath">The target directory; a null or empty value means ".".</param>
        /// <param name="targetFileName">The name of the file to create inside the target directory.</param>
        public static void StoreModel(TextGeometryModel geometryModel, string targetPath, string targetFileName)
        {
            var directory = string.IsNullOrEmpty(targetPath) ? "." : targetPath;

            StoreHOCRVisualizerScripts(directory);

            var document    = ProcessModel(geometryModel, "./");
            var destination = Path.Combine(directory, targetFileName);

            using (var output = File.Open(destination, FileMode.Create, FileAccess.Write, FileShare.Read))
            {
                StoreXmlDoc(document, output);
            }
        }
        /// <summary>
        /// Builds a <see cref="TextGeometryModel"/> from a page element.
        /// </summary>
        /// <remarks>
        /// The page box and grid unit are read from the element's attributes. When the parsed grid unit equals
        /// <c>GridUnit.UNKNOWN_UNITS</c> and <paramref name="defaultGridUnit"/> is not null, the default is used
        /// instead. Text blocks, separators and tables are then parsed into the model, in that order.
        /// </remarks>
        /// <param name="pageElement">The page element to parse.</param>
        /// <param name="defaultGridUnit">The fallback grid unit, or null for no fallback.</param>
        /// <returns>The populated model.</returns>
        private static TextGeometryModel ParsePage([NotNull] this XElement pageElement, [CanBeNull] GridUnit defaultGridUnit)
        {
            var pageBox    = pageElement.ParseBBoxAttribute();
            var parsedUnit = pageElement.ParseGridUnitAttribute();

            var effectiveUnit = GridUnit.UNKNOWN_UNITS.Equals(parsedUnit) && defaultGridUnit != null
                                    ? defaultGridUnit
                                    : parsedUnit;

            var result = new TextGeometryModel(pageBox, effectiveUnit);

            ParseTextBlocks(pageElement, result);
            ParseSeparators(pageElement, result);
            ParseTables(pageElement, result);

            return result;
        }
        /// <summary>
        /// Adds to <paramref name="sampleModel"/> every word of <paramref name="tesseractModel"/> that is not
        /// contained in <paramref name="set"/>.
        /// </summary>
        /// <remarks>
        /// One text block, one paragraph and one line, all sized to the Tesseract page box, are always added to
        /// <paramref name="sampleModel"/>. The selected words are placed on that single line with accuracy 50.
        /// Membership in <paramref name="set"/> is decided by <c>GMWordComparerWithInaccuracy.Instance</c>.
        /// </remarks>
        /// <param name="tesseractModel">The model supplying the page box and the candidate words.</param>
        /// <param name="sampleModel">The model that receives the new block and words.</param>
        /// <param name="set">The words that must not be added again.</param>
        private static void AddWordsFromTesseract(TextGeometryModel tesseractModel, TextGeometryModel sampleModel, List <GMWord> set)
        {
            var page      = tesseractModel.PageBox;
            var pageSized = new BoundingBox(page.XMin, page.YMin, page.XMax, page.YMax);

            // A single block / paragraph / line spanning the whole page holds all added words.
            var container = sampleModel.AddTextBlock(pageSized)
                                       .AddParagraph(pageSized)
                                       .AddLine(pageSized);

            // The comparer is expected to tolerate small bounding-box differences (per its name).
            var missingWords = tesseractModel.Words()
                                             .Where(candidate => !set.Contains(candidate, GMWordComparerWithInaccuracy.Instance));

            foreach (var missing in missingWords)
            {
                var source = missing.BoundingBox;
                var copy   = new BoundingBox(source.XMin, source.YMin, source.XMax, source.YMax);
                container.AddWord(new GMWord(copy, missing.Text, 50));
            }
        }
 /// <summary>
 /// Creates a recognition result that is marked as not deskewed and carries no deskew parameters.
 /// </summary>
 /// <param name="geometryModel">The model to store</param>
 public RecognitionResult(TextGeometryModel geometryModel)
 {
     isDeskewed         = false;
     deskewParameters   = null;
     this.geometryModel = geometryModel;
 }
 /// <summary>
 /// Creates a recognition result that is marked as deskewed and stores the transformation that was applied.
 /// </summary>
 /// <param name="geometryModel">The model to store</param>
 /// <param name="deskewParameters">The deskew transformation parameters to store alongside the model</param>
 public RecognitionResult([NotNull] TextGeometryModel geometryModel, [NotNull] DeskewParameters deskewParameters)
 {
     isDeskewed            = true;
     this.deskewParameters = deskewParameters;
     this.geometryModel    = geometryModel;
 }
        /// <summary>
        /// Builds an hOCR-style XML document that describes <paramref name="model"/>.
        /// </summary>
        /// <remarks>
        /// The document consists of a head (title, stylesheet link, two script references and meta elements) and a
        /// body holding a single page element with id "page_1". The model's text blocks, separators and tables are
        /// appended to the page element in that order. Numeric attribute values are formatted with the invariant
        /// culture, so the output does not depend on the culture of the calling thread.
        /// </remarks>
        /// <param name="model">The geometry model to serialize.</param>
        /// <param name="pathToScripts">Prefix prepended to the "hocr.css", "jquery.js" and "hocr.js" references.</param>
        /// <returns>The constructed document.</returns>
        public static XDocument ProcessModel([NotNull] TextGeometryModel model, string pathToScripts = "")
        {
            // Attribute values are machine-readable; formatting them with the current culture
            // could change digits, signs or decimal separators depending on the host locale.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            var bodyElement = new XElement(HocrFormatConsts.XNameBody);
            var headElement = new XElement(
                HocrFormatConsts.XNameHead,
                new XElement(HocrFormatConsts.XNameTitle, "text geometric model"),
                new XElement(HocrFormatConsts.XNameLink, new XAttribute("href", pathToScripts + "hocr.css"), new XAttribute("rel", "stylesheet")),
                // NOTE(review): the " " content presumably keeps the script elements from being
                // written as self-closing tags - confirm before removing.
                new XElement(HocrFormatConsts.XNameScript, new XAttribute("src", pathToScripts + "jquery.js"), " "),
                new XElement(HocrFormatConsts.XNameScript, new XAttribute("src", pathToScripts + "hocr.js"), " "),
                MakeXElementWithAttributes(HocrFormatConsts.XNameMeta, new Dictionary <string, string>()
                {
                    { "http-equiv", "Content-Type" },
                    { "content", "text/html; charset=utf-8" }
                }),
                MakeMetaElement("ocr-system", "kontur-recognition"),
                MakeMetaElement("ocr-capabilities", "ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf ocr_separator ocr_table"));

            var document = new XDocument(new XElement(HocrFormatConsts.XNameHtml, headElement, bodyElement));

            var pageElement = MakeXElementWithAttributes(
                HocrFormatConsts.XNameDiv, new Dictionary <string, string>()
                {
                    { "class", HocrFormatConsts.ClassPage },
                    { "id", "page_1" },
                    { "title", string.Format(invariant, "{0}; ppageno 0", FormatTitleAttribute(model.PageBox)) },
                    { "gridunit", string.Format(invariant, "{0}", model.GridUnit.Divisor) }
                });

            bodyElement.Add(pageElement);

            // One counter is shared by all element kinds that are given ids on the page.
            var idCounter = new IdCounter();

            foreach (var textBlock in model.TextBlocks())
            {
                pageElement.Add(MakeTextBlockElement(textBlock, idCounter));
            }

            foreach (var separator in model.Separators())
            {
                var separatorElement = MakeXElementWithAttributes(
                    HocrFormatConsts.XNameDiv, new Dictionary <string, string>()
                    {
                        { "class", HocrFormatConsts.ClassSeparator },
                        { "id", idCounter.NextSeparatorId },
                        { "title", FormatTitleAttribute(separator.BoundingBox) },
                        { "separator", string.Format(invariant, "{0} {1} {2} {3} {4}",
                                                     separator.StartPointX, separator.StartPointY,
                                                     separator.EndPointX, separator.EndPointY, separator.Width) },
                    });
                pageElement.Add(separatorElement);
            }

            foreach (GMTable table in model.Tables())
            {
                var tableElement = MakeXElementWithAttributes(
                    HocrFormatConsts.XNameDiv, new Dictionary <string, string>()
                    {
                        { "class", HocrFormatConsts.ClassTable },
                        { "id", idCounter.NextTableId },
                        { "title", FormatTitleAttribute(table.BoundingBox) },
                        { HocrFormatConsts.AttrTableRowsCount, string.Format(invariant, "{0}", table.RowsCount) },
                        { HocrFormatConsts.AttrTableColsCount, string.Format(invariant, "{0}", table.ColsCount) }
                    });
                foreach (var cell in table.Cells())
                {
                    tableElement.Add(MakeCellElement(cell, idCounter));
                }
                pageElement.Add(tableElement);
            }

            return document;
        }
        /// <summary>
        /// Writes <paramref name="geometryModel"/> as an XML document to <paramref name="target"/>.
        /// </summary>
        /// <remarks>The document is built with "./" as the script path prefix.</remarks>
        /// <param name="geometryModel">The model to store.</param>
        /// <param name="target">The stream that receives the document.</param>
        public static void StoreModel(TextGeometryModel geometryModel, Stream target)
        {
            StoreXmlDoc(ProcessModel(geometryModel, "./"), target);
        }