/// <summary>
/// Parses every text block element found under <paramref name="pageElement"/> and
/// adds each non-empty result to <paramref name="model"/>.
/// </summary>
/// <param name="pageElement">XML element whose text block elements are parsed.</param>
/// <param name="model">Geometry model that receives the parsed, non-empty text blocks.</param>
private static void ParseTextBlocks(XElement pageElement, TextGeometryModel model)
{
    foreach (var element in pageElement.TextBlockElements())
    {
        var parsedBlock = element.ParseTextBlock();
        if (parsedBlock.IsEmpty())
        {
            // Empty blocks are not added to the model.
            continue;
        }

        model.AddTextBlock(parsedBlock);
    }
}
/// <summary>
/// Mirrors the block / paragraph / line / word structure of
/// <paramref name="fineReaderModel"/> into <paramref name="sampleModel"/> and yields
/// every source word that <paramref name="tesseractCache"/> contains.
/// </summary>
/// <remarks>
/// This is an iterator: <paramref name="sampleModel"/> is only filled in while the
/// returned sequence is being enumerated. A word found in the cache is yielded first
/// and its copy is added to the sample model when enumeration resumes.
/// The yielded objects are the original words from <paramref name="fineReaderModel"/>,
/// not the copies stored in <paramref name="sampleModel"/>.
/// </remarks>
/// <param name="fineReaderModel">Model whose text blocks are walked.</param>
/// <param name="sampleModel">Model that receives the copied structure.</param>
/// <param name="tesseractCache">Lookup used to decide whether a word is yielded.</param>
/// <returns>The source words for which the cache lookup succeeded, in traversal order.</returns>
private static IEnumerable<GMWord> EnumerateWordsFromFineReader(TextGeometryModel fineReaderModel, TextGeometryModel sampleModel, LinesHashInGeometryModel tesseractCache)
{
    // Accuracy given to a copied word depending on whether the cache contains it.
    const int matchedAccuracy = 100;
    const int unmatchedAccuracy = 50;

    foreach (var sourceBlock in fineReaderModel.TextBlocks())
    {
        var sourceBlockBox = sourceBlock.BoundingBox;
        var targetBlock = sampleModel.AddTextBlock(
            new BoundingBox(sourceBlockBox.XMin, sourceBlockBox.YMin, sourceBlockBox.XMax, sourceBlockBox.YMax));

        foreach (var sourceParagraph in sourceBlock.Paragraphs())
        {
            var sourceParagraphBox = sourceParagraph.BoundingBox;
            var targetParagraph = targetBlock.AddParagraph(
                new BoundingBox(sourceParagraphBox.XMin, sourceParagraphBox.YMin, sourceParagraphBox.XMax, sourceParagraphBox.YMax));

            foreach (var sourceLine in sourceParagraph.Lines())
            {
                var sourceLineBox = sourceLine.BoundingBox;
                var targetLine = targetParagraph.AddLine(
                    new BoundingBox(sourceLineBox.XMin, sourceLineBox.YMin, sourceLineBox.XMax, sourceLineBox.YMax));

                foreach (var sourceWord in sourceLine.Words())
                {
                    // The word's bounding box is passed through as-is (no new BoundingBox is built here).
                    var sourceWordBox = sourceWord.BoundingBox;

                    int wordAccuracy;
                    if (tesseractCache.Contains(sourceWord, String.Compare))
                    {
                        yield return sourceWord;
                        wordAccuracy = matchedAccuracy;
                    }
                    else
                    {
                        wordAccuracy = unmatchedAccuracy;
                    }

                    targetLine.AddWord(new GMWord(sourceWordBox, sourceWord.Text, wordAccuracy));
                }
            }
        }

        // Words attached directly to the block get the same treatment as words inside lines.
        foreach (var standaloneWord in sourceBlock.StandaloneWords())
        {
            var standaloneBox = standaloneWord.BoundingBox;

            int standaloneAccuracy;
            if (tesseractCache.Contains(standaloneWord, String.Compare))
            {
                yield return standaloneWord;
                standaloneAccuracy = matchedAccuracy;
            }
            else
            {
                standaloneAccuracy = unmatchedAccuracy;
            }

            targetBlock.AddStandaloneWord(new GMWord(standaloneBox, standaloneWord.Text, standaloneAccuracy));
        }
    }
}
/// <summary>
/// Adds to <paramref name="sampleModel"/> one block / paragraph / line built from the
/// page box of <paramref name="tesseractModel"/>, then copies into that line every
/// word of <paramref name="tesseractModel"/> that has no match in <paramref name="set"/>.
/// </summary>
/// <param name="tesseractModel">Model whose page box and words are read.</param>
/// <param name="sampleModel">Model that receives the new block and the copied words.</param>
/// <param name="set">Words that are skipped when a match is found among them.</param>
private static void AddWordsFromTesseract(TextGeometryModel tesseractModel, TextGeometryModel sampleModel, List<GMWord> set)
{
    var pageBox = tesseractModel.PageBox;
    var pageSizedBox = new BoundingBox(pageBox.XMin, pageBox.YMin, pageBox.XMax, pageBox.YMax);

    // The same box instance is used for the block, its paragraph and its line.
    var targetBlock = sampleModel.AddTextBlock(pageSizedBox);
    var targetParagraph = targetBlock.AddParagraph(pageSizedBox);
    var targetLine = targetParagraph.AddLine(pageSizedBox);

    foreach (var candidate in tesseractModel.Words())
    {
        // Matching goes through a dedicated comparer which, per the original author's
        // note, compares bounding boxes with some tolerance rather than exactly.
        if (set.Contains(candidate, GMWordComparerWithInaccuracy.Instance))
        {
            continue;
        }

        var candidateBox = candidate.BoundingBox;
        var copiedBox = new BoundingBox(candidateBox.XMin, candidateBox.YMin, candidateBox.XMax, candidateBox.YMax);
        targetLine.AddWord(new GMWord(copiedBox, candidate.Text, 50));
    }
}