/// <summary>
/// Concatenates the recognized words reachable from the iterator's current
/// position into a single <see cref="TextBlockItem"/>.
/// </summary>
/// <param name="iter">
/// Result iterator to read from. It is advanced word by word within each text
/// line and line by line within the paragraph, and is left wherever the last
/// unsuccessful <c>Next</c> call leaves it.
/// </param>
/// <returns>
/// A new item whose <c>Text</c> holds every word followed by a single space,
/// with a line break appended after the last word of a text line and another
/// one after the last text line of the paragraph.
/// </returns>
private static TextBlockItem ExtractParaphraph(ResultIterator iter)
{
    var buffer = new StringBuilder();

    do
    {
        // Inner loop: the words of the current text line.
        do
        {
            string word = iter.GetText(PageIteratorLevel.Word);
            buffer.Append(word).Append(' ');

            bool lastWordOfLine = iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word);
            if (lastWordOfLine)
            {
                buffer.AppendLine();
            }
        }
        while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

        // An additional line break marks the end of the paragraph.
        bool lastLineOfParagraph = iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine);
        if (lastLineOfParagraph)
        {
            buffer.AppendLine();
        }
    }
    while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));

    var item = new TextBlockItem();
    item.Text = buffer.ToString();
    return item;
}
/// <summary>
/// Runs OCR over an in-memory image and returns the recognized text as a
/// string, with a "&lt;BLOCK&gt;" line emitted at the beginning of every layout block.
/// </summary>
/// <param name="imageBytes">Encoded image data passed to <c>Pix.LoadFromMemory</c>.</param>
/// <returns>
/// The text collected so far. Any exception raised while recognizing is written
/// to <see cref="Trace"/> and not rethrown, so the result may be partial or empty.
/// </returns>
public static string RecognizeBlocks(byte[] imageBytes)
{
    var output = new StringBuilder();

    try
    {
        using (var ocrEngine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
        using (Pix image = Pix.LoadFromMemory(imageBytes))
        using (var ocrPage = ocrEngine.Process(image))
        {
            output.AppendLine("Text (iterator):");

            using (ResultIterator cursor = ocrPage.GetIterator())
            {
                cursor.Begin();

                // Nested walk: blocks -> paragraphs -> text lines -> words.
                do
                {
                    do
                    {
                        do
                        {
                            do
                            {
                                if (cursor.IsAtBeginningOf(PageIteratorLevel.Block))
                                {
                                    output.AppendLine("<BLOCK>");
                                }

                                output.Append(cursor.GetText(PageIteratorLevel.Word)).Append(' ');

                                // Break the output line after the last word of a text line.
                                if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    output.AppendLine();
                                }
                            }
                            while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                            // An extra line break separates paragraphs.
                            if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                            {
                                output.AppendLine();
                            }
                        }
                        while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                    }
                    while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                }
                while (cursor.Next(PageIteratorLevel.Block));
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure and return whatever was collected.
        Trace.TraceError(ex.ToString());
    }

    return output.ToString();
}
/// <summary>
/// Writes the recognized text of a page to the console, printing a
/// "&lt;BLOCK&gt;" line at the beginning of every layout block.
/// </summary>
/// <param name="page">Processed OCR page whose result iterator is walked.</param>
private static void IterateBlocks(Page page)
{
    Console.WriteLine("Text (iterator):");

    using (ResultIterator cursor = page.GetIterator())
    {
        cursor.Begin();

        // Nested walk: blocks -> paragraphs -> text lines -> words.
        do
        {
            do
            {
                do
                {
                    do
                    {
                        bool blockStart = cursor.IsAtBeginningOf(PageIteratorLevel.Block);
                        if (blockStart)
                        {
                            Console.WriteLine("<BLOCK>");
                        }

                        string word = cursor.GetText(PageIteratorLevel.Word);
                        Console.Write(word);
                        Console.Write(" ");

                        // Break the console line after the last word of a text line.
                        if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                        {
                            Console.WriteLine();
                        }
                    }
                    while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                    // An extra empty line separates paragraphs.
                    if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                    {
                        Console.WriteLine();
                    }
                }
                while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
            }
            while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
        }
        while (cursor.Next(PageIteratorLevel.Block));
    }
}
/*
 * Disabled code, kept commented out for reference.
 *
 * int LookingTextMarker(RecognazeRule Rule, Page page, out ResultIterator BestLineIter, out int word)
 * {
 *     word = -1;
 *     BestLineIter = null;
 *     int BestDistance = 10000;
 *
 *     ResultIterator LineIter = page.GetIterator();
 *     string[] Words = Rule.TextMarker.Split(new char[] {' '}, StringSplitOptions.RemoveEmptyEntries);
 *     int NumberOfWords = Words.Length;
 *     LineIter.Begin();
 *     do
 *     {
 *         int CurrentWordNumber = -1;
 *         int CurrentBestDistance = 10000;
 *         string Line = LineIter.GetText(PageIteratorLevel.TextLine);
 *         if(Line == null)
 *             continue;
 *         string[] WordsOfLine = Line.Split(new char[] {' '}, StringSplitOptions.None);
 *         if(WordsOfLine.Length < NumberOfWords)
 *             continue;
 *
 *         for(int shift = 0; shift <= WordsOfLine.Length - NumberOfWords; shift++)
 *         {
 *             int PassDistance = 0;
 *             for(int i = 0; i < NumberOfWords; i++)
 *             {
 *                 PassDistance += FuzzyStringComparer.GetDistanceLevenshtein(WordsOfLine[shift + i],
 *                                                                            Words[i],
 *                                                                            StringComparison.CurrentCultureIgnoreCase);
 *             }
 *             if(PassDistance < CurrentBestDistance)
 *             {
 *                 CurrentBestDistance = PassDistance;
 *                 CurrentWordNumber = shift + 1;
 *             }
 *         }
 *         if(CurrentBestDistance < BestDistance)
 *         {
 *             AddToLog ("new best");
 *             AddToLog (LineIter.GetText(PageIteratorLevel.Word));
 *             word = CurrentWordNumber;
 *             if(BestLineIter != null)
 *                 BestLineIter.Dispose();
 *             BestLineIter = LineIter.Clone();
 *             AddToLog (BestLineIter.GetText(PageIteratorLevel.TextLine));
 *             BestDistance = CurrentBestDistance;
 *         }
 *     } while( LineIter.Next(PageIteratorLevel.TextLine));
 *     LineIter.Dispose();
 *     return BestDistance;
 * }
 */

/// <summary>
/// Searches the recognized text lines of <paramref name="page"/> for the word
/// sequence that is closest (by Levenshtein distance, ignoring case) to
/// <paramref name="Text"/> and reports where that sequence starts.
/// </summary>
/// <param name="Text">Marker text to look for; it is split into words on spaces.</param>
/// <param name="page">Processed OCR page to scan.</param>
/// <param name="PosX">
/// Receives <c>X1</c> of the bounding box of the first word of the best match,
/// or -1 when no candidate was found.
/// </param>
/// <param name="PosY">
/// Receives <c>Y1</c> of the bounding box of the first word of the best match,
/// or -1 when no candidate was found.
/// </param>
/// <param name="AngleRad">
/// Receives the slope angle, in radians, of the baseline of the first matched
/// word, or 0 when no candidate was found.
/// </param>
/// <param name="AfterMarkerRules">
/// Rules to populate from the words that follow the marker. For every rule with
/// <c>NextAfterTextMarker</c> set, <c>AfterTextMarkerValue</c> and
/// <c>AfterTextMarkerConfidence</c> are written when the line is long enough.
/// A null array is treated as empty.
/// </param>
/// <returns>
/// The smallest distance found, or 10000 when no line yielded a candidate.
/// </returns>
int GetTextPosition(string Text, Page page, out int PosX, out int PosY, out double AngleRad, RecognazeRule[] AfterMarkerRules)
{
    // Sentinel distance meaning "nothing has been matched yet".
    const int NoMatchDistance = 10000;

    // Allocated once instead of once per scanned line.
    char[] wordSeparators = { ' ' };

    int bestDistance = NoMatchDistance;
    PosX = -1;
    PosY = -1;
    AngleRad = 0;

    logger.Debug("Marker zone text:{0}", page.GetText());

    // A missing rule array simply means there is nothing to fill in.
    RecognazeRule[] rules = AfterMarkerRules ?? new RecognazeRule[0];
    int markerWordCount = Text.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries).Length;

    // "using" guarantees the iterator is released even if scanning throws.
    using (ResultIterator lineIter = page.GetIterator())
    {
        lineIter.Begin();
        do
        {
            string line = lineIter.GetText(PageIteratorLevel.TextLine);
            if (string.IsNullOrEmpty(line))
            {
                continue;
            }

            line = line.Trim();
            string[] lineWords = line.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);
            if (lineWords.Length == 0)
            {
                continue;
            }

            // Try every window of 1..markerWordCount consecutive words in this line
            // and remember the one closest to the marker text.
            int markerStart = -1; // index of the first word of the best window
            int markerEnd = 0;    // index of the first word after the best window
            int lineBestDistance = NoMatchDistance;
            for (int shift = 0; shift < lineWords.Length; shift++)
            {
                for (int count = 1; count <= markerWordCount && count <= lineWords.Length - shift; count++)
                {
                    string candidate = String.Join(" ", lineWords, shift, count);
                    int distance = FuzzyStringComparer.GetDistanceLevenshtein(candidate, Text, StringComparison.CurrentCultureIgnoreCase);
                    if (distance < lineBestDistance)
                    {
                        lineBestDistance = distance;
                        markerStart = shift;
                        markerEnd = shift + count;
                    }
                }
            }

            if (lineBestDistance >= bestDistance)
            {
                continue;
            }

            logger.Debug("new best");
            logger.Debug(line);

            // Fill the rule fields with the words that follow the marker.
            // NOTE(review): when this line is too short for a rule, the value taken from
            // an earlier (worse) best line is left in place - confirm this is intended.
            foreach (RecognazeRule rule in rules)
            {
                if (rule.NextAfterTextMarker && lineWords.Length > markerEnd + rule.ShiftWordsCount)
                {
                    rule.AfterTextMarkerValue = lineWords[markerEnd + rule.ShiftWordsCount];
                }
            }

            bestDistance = lineBestDistance;

            // Move the iterator from the start of the line to the first marker word.
            // This assumes the OCR word boundaries agree with the space-split words.
            for (int i = 0; i < markerStart; i++)
            {
                lineIter.Next(PageIteratorLevel.Word);
            }

            // NOTE(review): the boolean results of the Try* calls are ignored, as before;
            // on failure the values below come from whatever Rect the call leaves in box.
            Rect box;
            lineIter.TryGetBoundingBox(PageIteratorLevel.Word, out box);
            PosX = box.X1;
            PosY = box.Y1;
            logger.Debug("Position X1:{0} Y1:{1} X2:{2} Y2:{3}", box.X1, box.Y1, box.X2, box.Y2);

            lineIter.TryGetBaseline(PageIteratorLevel.Word, out box);
            logger.Debug("BaseLine X1:{0} Y1:{1} X2:{2} Y2:{3}", box.X1, box.Y1, box.X2, box.Y2);

            // Slope angle of the baseline.
            AngleRad = Math.Atan2(box.Y2 - box.Y1, box.X2 - box.X1);
            double angleGrad = AngleRad * (180 / Math.PI);
            logger.Debug("Angle rad:{0} grad:{1}", AngleRad, angleGrad);

            // Read the recognition confidence of the words that follow the marker.
            // The iterator sits on the first marker word, i.e. (markerStart - markerEnd)
            // words relative to the first word after the marker, hence the negative start.
            int iterAlreadyShifted = markerStart - markerEnd;
            bool reachedEndOfLine = false;
            foreach (RecognazeRule rule in rules.Where(x => x.NextAfterTextMarker).OrderBy(x => x.ShiftWordsCount))
            {
                while (iterAlreadyShifted < rule.ShiftWordsCount)
                {
                    if (lineIter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                    {
                        reachedEndOfLine = true;
                        break;
                    }
                    lineIter.Next(PageIteratorLevel.Word);
                    iterAlreadyShifted++;
                }

                if (reachedEndOfLine)
                {
                    break;
                }

                rule.AfterTextMarkerConfidence = lineIter.GetConfidence(PageIteratorLevel.Word);
                logger.Debug("Cлово {0} со сдвигом {1} имеет точность {2}.", lineIter.GetText(PageIteratorLevel.Word), rule.ShiftWordsCount, rule.AfterTextMarkerConfidence);
            }
        }
        while (lineIter.Next(PageIteratorLevel.TextLine));
    }

    return bestDistance;
}