예제 #1
0
 /// <summary>
 /// Parses every text-block element found under <paramref name="pageElement"/> and adds the
 /// non-empty results to <paramref name="model"/>.
 /// </summary>
 /// <param name="pageElement">Element whose text-block children are parsed.</param>
 /// <param name="model">Model that receives each non-empty parsed text block.</param>
 private static void ParseTextBlocks(XElement pageElement, TextGeometryModel model)
 {
     foreach (var element in pageElement.TextBlockElements())
     {
         var parsedBlock = element.ParseTextBlock();

         // Blocks that report themselves as empty are not added to the model.
         if (parsedBlock.IsEmpty())
         {
             continue;
         }

         model.AddTextBlock(parsedBlock);
     }
 }
        // Lazily yields the FineReader words that are also found in the Tesseract cache.
        /// <summary>
        /// Copies the block / paragraph / line / word hierarchy of <paramref name="fineReaderModel"/> into
        /// <paramref name="sampleModel"/> and yields every source word for which
        /// <paramref name="tesseractCache"/> reports a match.
        /// </summary>
        /// <remarks>
        /// This is an iterator method: nothing is added to <paramref name="sampleModel"/> until the returned
        /// sequence is enumerated, and each matching word is yielded before its copy is added.
        /// Matching words are copied with accuracy 100, all others with accuracy 50.
        /// </remarks>
        /// <param name="fineReaderModel">Model whose text blocks are walked.</param>
        /// <param name="sampleModel">Model that receives the copied hierarchy.</param>
        /// <param name="tesseractCache">Lookup used to decide whether a word counts as a match.</param>
        /// <returns>The words of <paramref name="fineReaderModel"/> that the cache reports as contained.</returns>
        private static IEnumerable <GMWord> EnumerateWordsFromFineReader(TextGeometryModel fineReaderModel, TextGeometryModel sampleModel,
                                                                         LinesHashInGeometryModel tesseractCache)
        {
            foreach (var sourceBlock in fineReaderModel.TextBlocks())
            {
                var sourceBlockBox = sourceBlock.BoundingBox;
                var copiedBlockBox = new BoundingBox(sourceBlockBox.XMin, sourceBlockBox.YMin,
                                                     sourceBlockBox.XMax, sourceBlockBox.YMax);
                var targetBlock = sampleModel.AddTextBlock(copiedBlockBox);

                foreach (var sourceParagraph in sourceBlock.Paragraphs())
                {
                    var sourceParagraphBox = sourceParagraph.BoundingBox;
                    var copiedParagraphBox = new BoundingBox(sourceParagraphBox.XMin, sourceParagraphBox.YMin,
                                                             sourceParagraphBox.XMax, sourceParagraphBox.YMax);
                    var targetParagraph = targetBlock.AddParagraph(copiedParagraphBox);

                    foreach (var sourceLine in sourceParagraph.Lines())
                    {
                        var sourceLineBox = sourceLine.BoundingBox;
                        var copiedLineBox = new BoundingBox(sourceLineBox.XMin, sourceLineBox.YMin,
                                                            sourceLineBox.XMax, sourceLineBox.YMax);
                        var targetLine = targetParagraph.AddLine(copiedLineBox);

                        foreach (var candidate in sourceLine.Words())
                        {
                            var candidateBox = candidate.BoundingBox;
                            var isConfirmed  = tesseractCache.Contains(candidate, String.Compare);
                            if (isConfirmed)
                            {
                                // The match is handed to the caller before its copy is stored.
                                yield return candidate;
                            }

                            targetLine.AddWord(new GMWord(candidateBox, candidate.Text, isConfirmed ? 100 : 50));
                        }
                    }
                }

                // Words attached directly to the block are handled after all of its paragraphs.
                foreach (var candidate in sourceBlock.StandaloneWords())
                {
                    var candidateBox = candidate.BoundingBox;
                    var isConfirmed  = tesseractCache.Contains(candidate, String.Compare);
                    if (isConfirmed)
                    {
                        // The match is handed to the caller before its copy is stored.
                        yield return candidate;
                    }

                    targetBlock.AddStandaloneWord(new GMWord(candidateBox, candidate.Text, isConfirmed ? 100 : 50));
                }
            }
        }
        /// <summary>
        /// Adds to <paramref name="sampleModel"/> every word of <paramref name="tesseractModel"/> that has no
        /// counterpart in <paramref name="set"/>.
        /// </summary>
        /// <remarks>
        /// The added words all go into one block / paragraph / line whose box is a copy of the Tesseract
        /// page box; that container is created even when no word ends up being added. Each added word is
        /// built with 50 as the third <c>GMWord</c> argument (presumably the accuracy; confirm against
        /// the <c>GMWord</c> constructor).
        /// </remarks>
        /// <param name="tesseractModel">Model whose words are candidates for addition.</param>
        /// <param name="sampleModel">Model that receives the container and the added words.</param>
        /// <param name="set">Words that must not be added again.</param>
        private static void AddWordsFromTesseract(TextGeometryModel tesseractModel, TextGeometryModel sampleModel, List <GMWord> set)
        {
            var pageBox      = tesseractModel.PageBox;
            var containerBox = new BoundingBox(pageBox.XMin, pageBox.YMin, pageBox.XMax, pageBox.YMax);
            var targetLine   = sampleModel.AddTextBlock(containerBox)
                                          .AddParagraph(containerBox)
                                          .AddLine(containerBox);

            foreach (var candidate in tesseractModel.Words())
            {
                // Membership is tested with a dedicated comparer that, per the original note, compares
                // bounding boxes with some tolerance instead of requiring exact equality.
                if (set.Contains(candidate, GMWordComparerWithInaccuracy.Instance))
                {
                    continue;
                }

                var sourceBox = candidate.BoundingBox;
                var copiedBox = new BoundingBox(sourceBox.XMin, sourceBox.YMin, sourceBox.XMax, sourceBox.YMax);
                targetLine.AddWord(new GMWord(copiedBox, candidate.Text, 50));
            }
        }