/// <summary>
/// Logs the current state of <paramref name="iter"/>: for each of the five
/// page levels (block, paragraph, text line, word, symbol) first whether the
/// iterator is at the beginning of that level, then the text at that level.
/// </summary>
/// <param name="iter">Iterator whose current position is reported; it is only queried, never advanced.</param>
public void Print(ResultIterator iter)
{
    // The three arrays are parallel: index i describes one page level.
    PageIteratorLevel[] levels =
    {
        PageIteratorLevel.Block,
        PageIteratorLevel.Para,
        PageIteratorLevel.TextLine,
        PageIteratorLevel.Word,
        PageIteratorLevel.Symbol
    };
    string[] beginningFormats =
    {
        "Is beginning of block: {0}",
        "Is beginning of para: {0}",
        "Is beginning of text line: {0}",
        "Is beginning of word: {0}",
        "Is beginning of symbol: {0}"
    };
    string[] textFormats =
    {
        "Block text: \"{0}\"",
        "Para text: \"{0}\"",
        "TextLine text: \"{0}\"",
        "Word text: \"{0}\"",
        "Symbol text: \"{0}\""
    };

    // All "is beginning" lines are written before any text line.
    for (int index = 0; index < levels.Length; index++)
    {
        logger.Log(beginningFormats[index], iter.IsAtBeginningOf(levels[index]));
    }

    for (int index = 0; index < levels.Length; index++)
    {
        logger.Log(textFormats[index], iter.GetText(levels[index]));
    }
}
/// <summary>
/// Runs OCR over <paramref name="currentImage"/> and returns the bounding
/// rectangle of every recognized symbol whose text consists only of letters.
/// </summary>
/// <param name="currentImage">Image to recognize.</param>
/// <returns>
/// One rectangle (X1, Y1, width, height) per letter-only symbol, in iteration
/// order; empty when nothing qualifies.
/// </returns>
/// <remarks>
/// The engine loads the "eng" data from a "tessdata" folder located next to
/// the executing assembly and processes the image with
/// <c>PageSegMode.AutoOsd</c>.
/// </remarks>
public List<Rectangle> GetTextRects(Bitmap currentImage)
{
    string assemblyDirectory = System.IO.Path.GetDirectoryName(
        System.Reflection.Assembly.GetExecutingAssembly().Location);
    string tessPath = System.IO.Path.Combine(assemblyDirectory ?? string.Empty, "tessdata");

    List<Rectangle> currentRects = new List<Rectangle>();

    // Stacked usings dispose in reverse order (iterator, page, engine) and
    // also run when recognition throws.
    using (TesseractEngine tess = new TesseractEngine(tessPath, "eng"))
    using (Page newPage = tess.Process(currentImage, PageSegMode.AutoOsd))
    using (ResultIterator iterator = newPage.GetIterator())
    {
        // Walk until Next reports the end instead of looping once per
        // character of the page text (which also counts whitespace and so
        // kept polling an exhausted iterator).
        do
        {
            string symbolText = iterator.GetText(PageIteratorLevel.Symbol);
            if (string.IsNullOrEmpty(symbolText) || !symbolText.All(char.IsLetter))
            {
                continue;
            }

            Rect symbolBounds;
            if (iterator.TryGetBoundingBox(PageIteratorLevel.Symbol, out symbolBounds))
            {
                currentRects.Add(new Rectangle(
                    symbolBounds.X1,
                    symbolBounds.Y1,
                    symbolBounds.X2 - symbolBounds.X1,
                    symbolBounds.Y2 - symbolBounds.Y1));
            }
        }
        while (iterator.Next(PageIteratorLevel.Symbol));
    }

    return currentRects;
}
/// <summary>
/// Renders <paramref name="page"/> to an image file, runs OCR on it and
/// lazily yields one chunk per recognized word that has a bounding box.
/// </summary>
/// <param name="page">PDF page to render.</param>
/// <param name="engine">OCR engine used to process the rendered image.</param>
/// <param name="resolution">Horizontal and vertical resolution used for rendering.</param>
/// <param name="tempFileName">Path the rendered image is saved to and loaded back from; it is not deleted here.</param>
/// <returns>Word text, bounding box and confidence for each word, in iteration order.</returns>
private static IEnumerable<RecognizedTextChunk> recognizeWords(PdfPage page, TesseractEngine engine, int resolution, string tempFileName)
{
    // Render the page on a white background at the requested resolution.
    PdfDrawOptions drawOptions = PdfDrawOptions.Create();
    drawOptions.BackgroundColor = new PdfRgbColor(255, 255, 255);
    drawOptions.HorizontalResolution = resolution;
    drawOptions.VerticalResolution = resolution;
    page.Save(tempFileName, drawOptions);

    using (var image = Pix.LoadFromFile(tempFileName))
    using (var ocrPage = engine.Process(image))
    using (ResultIterator words = ocrPage.GetIterator())
    {
        words.Begin();
        do
        {
            // Words without a bounding box are skipped entirely.
            if (!words.TryGetBoundingBox(PageIteratorLevel.Word, out Rect box))
            {
                continue;
            }

            string wordText = words.GetText(PageIteratorLevel.Word);
            float wordConfidence = words.GetConfidence(PageIteratorLevel.Word);
            yield return new RecognizedTextChunk(wordText, box, wordConfidence);
        }
        while (words.Next(PageIteratorLevel.Word));
    }
}
/// <summary>
/// Collects the words of the paragraph the iterator is currently in into a
/// single <see cref="TextBlockItem"/>.
/// </summary>
/// <param name="iter">Iterator positioned inside the paragraph; it is advanced word by word while reading.</param>
/// <returns>
/// An item whose text has a space after every word, a line break after the
/// last word of each text line and an extra line break after the last line
/// of the paragraph.
/// </returns>
private static TextBlockItem ExtractParaphraph(ResultIterator iter)
{
    var buffer = new StringBuilder();

    bool moreLines;
    do
    {
        bool moreWords;
        do
        {
            buffer.Append(iter.GetText(PageIteratorLevel.Word)).Append(" ");

            if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
            {
                buffer.AppendLine();
            }

            moreWords = iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word);
        }
        while (moreWords);

        if (iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
        {
            buffer.AppendLine();
        }

        moreLines = iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine);
    }
    while (moreLines);

    var item = new TextBlockItem();
    item.Text = buffer.ToString();
    return item;
}
/// <summary>
/// Recognizes the image in <paramref name="imageBytes"/> and returns its text
/// laid out by block, paragraph and line.
/// </summary>
/// <param name="imageBytes">Encoded image data loaded through <c>Pix.LoadFromMemory</c>.</param>
/// <returns>
/// A "Text (iterator):" header followed by the words; each block is preceded
/// by a "&lt;BLOCK&gt;" line, each text line ends with a line break and each
/// paragraph with an additional one. If an exception occurs it is written to
/// the trace and whatever was collected so far is returned.
/// </returns>
public static string RecognizeBlocks(byte[] imageBytes)
{
    var output = new StringBuilder();

    try
    {
        using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
        using (Pix image = Pix.LoadFromMemory(imageBytes))
        using (var page = engine.Process(image))
        {
            output.AppendLine("Text (iterator):");

            using (ResultIterator cursor = page.GetIterator())
            {
                cursor.Begin();

                do // every block on the page
                {
                    do // every paragraph in the block
                    {
                        do // every text line in the paragraph
                        {
                            do // every word in the text line
                            {
                                if (cursor.IsAtBeginningOf(PageIteratorLevel.Block))
                                {
                                    output.AppendLine("<BLOCK>");
                                }

                                output.Append(cursor.GetText(PageIteratorLevel.Word)).Append(" ");

                                if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    output.AppendLine();
                                }
                            }
                            while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                            if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                            {
                                output.AppendLine();
                            }
                        }
                        while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                    }
                    while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                }
                while (cursor.Next(PageIteratorLevel.Block));
            }
        }
    }
    catch (Exception error)
    {
        // Best effort: report the failure and fall through to return the partial text.
        Trace.TraceError(error.ToString());
    }

    return output.ToString();
}
/// <summary>
/// Reads a single line of upper-case letters and digits from
/// <paramref name="imagem"/> and returns a string of at least six characters.
/// </summary>
/// <param name="imagem">Image to recognize; processed with <c>PageSegMode.SingleLine</c>.</param>
/// <param name="linguagem">Language passed to the Tesseract engine.</param>
/// <returns>
/// The concatenation of <c>Resolver</c> results for each group of overlapping
/// symbols, padded with <c>LetraAleatoria</c> results while shorter than six
/// characters.
/// </returns>
/// <remarks>
/// Symbols are grouped into horizontal slots: the slot marker starts at 12
/// and moves 28 to the right each time a symbol starts more than 14 past it.
/// Only symbols wider than 13, taller than 15 and with non-blank text count.
/// </remarks>
public static string OCR(Bitmap imagem, string linguagem)
{
    string result = "";

    using (TesseractEngine engine = new TesseractEngine(@"C:\GitHub\operacao-politica-supervisionada\OPS\temp\", linguagem, EngineMode.Default))
    {
        engine.SetVariable("tessedit_char_whitelist", "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ");
        engine.SetVariable("tessedit_unrej_any_wd", true);
        engine.SetVariable("applybox_learn_chars_and_char_frags_mode", true);
        engine.SetVariable("save_blob_choices", true);

        string slotCandidates = "";
        int slotMarker = 12;

        using (Page page = engine.Process(imagem, PageSegMode.SingleLine))
        using (ResultIterator symbols = page.GetIterator())
        {
            do
            {
                string symbolText = symbols.GetText(PageIteratorLevel.Symbol);

                Tesseract.Rect box;
                if (!symbols.TryGetBoundingBox(PageIteratorLevel.Symbol, out box))
                {
                    continue;
                }

                bool largeEnough = box.Width > 13 && box.Height > 15;
                if (!largeEnough || symbolText.Trim() == "")
                {
                    continue;
                }

                // The symbol starts beyond the current slot: resolve what was
                // collected (possibly nothing) once per slot skipped.
                while (box.X1 > slotMarker + 14)
                {
                    result += Resolver(slotCandidates);
                    slotCandidates = "";
                    slotMarker += 28;
                }

                // A "Q" taller than 30 is recorded as "O" instead.
                bool tallQ = symbolText == "Q" && box.Height > 30;
                slotCandidates += tallQ ? "O" : symbolText;
            }
            while (symbols.Next(PageIteratorLevel.Symbol));

            if (result.Length < 6)
            {
                result += Resolver(slotCandidates);

                while (result.Length < 6)
                {
                    result += LetraAleatoria();
                }
            }
        }
    }

    return result;
}
/// <summary>
/// Recognizes <paramref name="pageData"/> and adds one <c>TextArea</c> per
/// valid word, including per-symbol rectangles, to <paramref name="page"/>.
/// </summary>
/// <param name="engine">OCR engine used to process the image.</param>
/// <param name="pageData">Image of the page to recognize.</param>
/// <param name="language">Not used by this method.</param>
/// <param name="page">Document page whose <c>TextAreas</c> collection receives the results.</param>
private void GetPageData(TesseractEngine engine, Pix pageData, string language, ATAPY.Document.Data.Core.Page page)
{
    using (var tessPage = engine.Process(pageData))
    {
        tessPage.Recognize();

        // The iterator is nested inside the page's using so that it is
        // disposed before the page it reads from, also on exceptions.
        using (var resultIterator = tessPage.GetIterator())
        {
            resultIterator.Begin();
            do
            {
                var text = resultIterator.GetText(PageIteratorLevel.Word);
                if (TextIsValid(text) && resultIterator.TryGetBoundingBox(PageIteratorLevel.Word, out var rect))
                {
                    var rectW = GetRect(rect);
                    var area = new TextArea(rectW, text, page);
                    page.TextAreas.Add(area);

                    // One slot per character of the word text; slots with no
                    // symbol box keep the default rectangle.
                    var chars = new System.Windows.Rect[text.Length];
                    int charIter = 0;
                    do
                    {
                        // The symbol count is not guaranteed to equal
                        // text.Length; symbols beyond the array are walked
                        // (so the iterator still reaches the end of the word)
                        // but not stored.
                        if (charIter < chars.Length
                            && resultIterator.TryGetBoundingBox(PageIteratorLevel.Symbol, out var sRect))
                        {
                            chars[charIter] = GetRect(sRect);
                        }

                        charIter++;
                    }
                    while (resultIterator.Next(PageIteratorLevel.Word, PageIteratorLevel.Symbol));

                    area.SetCharProperties(chars);
                }
            }
            while (resultIterator.Next(PageIteratorLevel.Word));
        }
    }
}
/// <summary>
/// Writes the recognized text of <paramref name="page"/> to the console,
/// structured by block, paragraph and text line.
/// </summary>
/// <param name="page">Processed page whose result iterator is walked.</param>
/// <remarks>
/// Output starts with a "Text (iterator):" line. Each block is introduced by
/// a "&lt;BLOCK&gt;" line, every word is followed by a space, every text line
/// ends with a line break and every paragraph with an additional one.
/// </remarks>
private static void IterateBlocks(Page page)
{
    const PageIteratorLevel Block = PageIteratorLevel.Block;
    const PageIteratorLevel Paragraph = PageIteratorLevel.Para;
    const PageIteratorLevel Line = PageIteratorLevel.TextLine;
    const PageIteratorLevel Word = PageIteratorLevel.Word;

    Console.WriteLine("Text (iterator):");

    using (ResultIterator cursor = page.GetIterator())
    {
        cursor.Begin();

        do
        {
            do
            {
                do
                {
                    do
                    {
                        if (cursor.IsAtBeginningOf(Block))
                        {
                            Console.WriteLine("<BLOCK>");
                        }

                        Console.Write(cursor.GetText(Word));
                        Console.Write(" ");

                        if (cursor.IsAtFinalOf(Line, Word))
                        {
                            Console.WriteLine();
                        }
                    }
                    while (cursor.Next(Line, Word));

                    if (cursor.IsAtFinalOf(Paragraph, Line))
                    {
                        Console.WriteLine();
                    }
                }
                while (cursor.Next(Paragraph, Line));
            }
            while (cursor.Next(Block, Paragraph));
        }
        while (cursor.Next(Block));
    }
}
/// <summary>
/// Console entry point: recognizes an image and logs its text, mean
/// confidence and a word-by-word walk of every second text line.
/// </summary>
/// <param name="args">
/// Optional; the first argument is the image path. Defaults to
/// "./phototest.tif".
/// </param>
/// <remarks>
/// Any exception is written to the trace and the console; the method then
/// waits for a key press before returning.
/// </remarks>
public static void Main(string[] args)
{
    var testImagePath = "./phototest.tif";
    if (args.Length > 0)
    {
        testImagePath = args[0];
    }

    try
    {
        var logger = new FormattedConsoleLogger();
        // NOTE(review): resultPrinter is never used below; kept in case the
        // constructor has side effects - confirm and remove.
        var resultPrinter = new ResultPrinter(logger);

        using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
        using (var img = Pix.LoadFromFile(testImagePath))
        using (logger.Begin("Process image"))
        {
            var i = 1;
            using (var page = engine.Process(img))
            {
                var text = page.GetText();
                logger.Log("Text: {0}", text);
                logger.Log("Mean confidence: {0}", page.GetMeanConfidence());

                using (var iter = page.GetIterator())
                {
                    iter.Begin();
                    do
                    {
                        // Only even-numbered lines (counting from 1) are walked word by word.
                        if (i % 2 == 0)
                        {
                            using (logger.Begin("Line {0}", i))
                            {
                                do
                                {
                                    using (logger.Begin("Word Iteration"))
                                    {
                                        if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                                        {
                                            logger.Log("New block");
                                        }

                                        if (iter.IsAtBeginningOf(PageIteratorLevel.Para))
                                        {
                                            logger.Log("New paragraph");
                                        }

                                        if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine))
                                        {
                                            logger.Log("New line");
                                        }

                                        logger.Log("word: " + iter.GetText(PageIteratorLevel.Word));

                                        // The clone is a separate disposable iterator; release it
                                        // right away instead of leaking one per word.
                                        using (ResultIterator testiter = iter.Clone())
                                        {
                                            logger.Log("from clone: " + testiter.GetText(PageIteratorLevel.Word));
                                        }
                                    }
                                }
                                while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
                            }
                        }

                        i++;
                    }
                    while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                }
            }
        }
    }
    catch (Exception e)
    {
        Trace.TraceError(e.ToString());
        Console.WriteLine("Unexpected Error: " + e.Message);
        Console.WriteLine("Details: ");
        Console.WriteLine(e.ToString());
    }

    Console.Write("Press any key to continue . . . ");
    Console.ReadKey(true);
}
/// <summary>
/// Runs single-line OCR over <paramref name="imagem"/> and builds one text
/// line per character of <paramref name="correto"/> in the form
/// "&lt;char&gt; &lt;x1&gt; &lt;y1&gt; &lt;x2&gt; &lt;y2&gt; &lt;pagina&gt;\n", where the coordinates are
/// derived from the bounding boxes of the recognized symbols.
/// </summary>
/// <param name="imagem">Image to recognize; its width bounds the boxes generated for characters that were not located.</param>
/// <param name="linguagem">Language passed to the Tesseract engine.</param>
/// <param name="correto">Expected text; character <c>pos</c> labels the box emitted for slot <c>pos</c>.</param>
/// <param name="pagina">Page number written as the last field of every line.</param>
/// <returns>
/// The concatenated lines, or an empty string when no accepted symbol has a
/// top edge at or above 40 (<c>miny1</c> starts at 50 and is cleared when it
/// stays above 40).
/// </returns>
/// <remarks>
/// Only symbols wider than 13, taller than 15 and with non-blank text are
/// considered. The slot marker <c>ultimo</c> starts at 12 and advances by 28;
/// each time a symbol starts more than 14 past it, a line for the current
/// slot is emitted and <c>pos</c> moves on. Symbols falling into the same
/// slot have their boxes merged. In every emitted line the top is capped at
/// 10 (<c>Math.Min(10, y1)</c>) and the bottom is at least 40
/// (<c>Math.Max(40, y2)</c>). After the walk, if fewer than six lines were
/// emitted, the current slot is written and the remaining slots up to index
/// 5 split the width left of <c>imagem.Width - 6</c> evenly.
/// NOTE(review): <c>correto[pos]</c> is indexed without a bounds check;
/// presumably <c>correto</c> always has at least six characters and the image
/// never yields more than six slots - confirm against callers.
/// NOTE(review): as laid out here the body sits on one physical line with a
/// <c>//</c> comment in the middle of it, which would comment out the rest of
/// that line; looks like lost line breaks - confirm and restore.
/// </remarks>
public static string Box(Bitmap imagem, string linguagem, string correto, int pagina) { string texto = ""; using (TesseractEngine engine = new TesseractEngine(@"C:\GitHub\operacao-politica-supervisionada\OPS\temp\", linguagem)) { engine.SetVariable("tessedit_char_whitelist", "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ"); engine.SetVariable("tessedit_unrej_any_wd", true); engine.SetVariable("applybox_learn_chars_and_char_frags_mode", true); engine.SetVariable("save_blob_choices", true); string sobreposto = ""; int ultimo = 12; using (Page page = engine.Process(imagem, PageSegMode.SingleLine)) { Tesseract.Rect bb; int x1 = 14, y1 = 0, x2 = 0, y2 = 0; int pos = 0; int miny1 = 50; using (ResultIterator ri = page.GetIterator()) { do { string word = ri.GetText(PageIteratorLevel.Symbol); if (ri.TryGetBoundingBox(PageIteratorLevel.Symbol, out bb)) { if ((bb.Width > 13) && (bb.Height > 15) && (word.Trim() != "")) { while (bb.X1 > ultimo + 14) { x2 = Math.Max(x1 + 15, x2); texto += correto[pos] + " " + x1 + " " + Math.Min(10, y1) + " " + x2 + " " + Math.Max(40, y2) + " " + pagina + "\n"; pos++; sobreposto = ""; ultimo += 28; x1 = Math.Max(x1 + 28, x2); } miny1 = Math.Min(miny1, bb.Y1); if (sobreposto != "") { x1 = Math.Min(x1, bb.X1); y1 = Math.Min(y1, bb.Y1); x2 = Math.Max(x2, bb.X2); y2 = Math.Max(y2, bb.Y2); } else { x1 = Math.Max(x2 - 5, bb.X1); y1 = bb.Y1; x2 = bb.X2; y2 = bb.Y2; } //System.Web.HttpContext.Current.Response.Write(word + ": " + bb.X1 + "<br />\n"); if ((word != "Q") || (bb.Height <= 30)) { sobreposto += word; } else { sobreposto += "O"; } } } } while((ri.Next(PageIteratorLevel.Symbol))); int limite = imagem.Width - 6; if (pos < 6) { texto += correto[pos] + " " + x1 + " " + Math.Min(10, y1) + " " + x2 + " " + Math.Max(40, y2) + " " + pagina + "\n"; while (pos < 5) { pos++; x1 = x2; x2 = x1 + (limite - x1) / (6 - pos); texto += correto[pos] + " " + x1 + " " + Math.Min(10, y1) + " " + x2 + " " + Math.Max(40, y2) + " " + pagina + "\n"; } } } if (miny1 > 40) { texto 
= ""; } } } return(texto); }
/// <summary>
/// Searches the recognized text lines of <paramref name="page"/> for the run
/// of words closest (by Levenshtein distance, ignoring case) to
/// <paramref name="Text"/> and reports where that run starts.
/// </summary>
/// <param name="Text">Marker text to look for; may consist of several space-separated words.</param>
/// <param name="page">Processed page covering the zone to search.</param>
/// <param name="PosX">Left edge (X1) of the bounding box of the first word of the best match, or -1 when no line was compared.</param>
/// <param name="PosY">Top edge (Y1) of the same bounding box, or -1 when no line was compared.</param>
/// <param name="AngleRad">Angle of that word's baseline in radians, or 0 when no line was compared.</param>
/// <param name="AfterMarkerRules">
/// Rules to fill in: for every rule with <c>NextAfterTextMarker</c> set,
/// <c>AfterTextMarkerValue</c> receives the word <c>ShiftWordsCount</c>
/// positions after the marker (when the line has that many words) and
/// <c>AfterTextMarkerConfidence</c> receives that word's confidence. Values
/// are overwritten each time a better line is found.
/// </param>
/// <returns>The smallest distance found, or 10000 when no non-empty line was compared.</returns>
int GetTextPosition(string Text, Page page, out int PosX, out int PosY, out double AngleRad, RecognazeRule[] AfterMarkerRules)
{
    int bestDistance = 10000;
    PosX = -1;
    PosY = -1;
    AngleRad = 0;
    logger.Debug("Marker zone text:{0}", page.GetText());

    string[] words = Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    int numberOfWords = words.Length;

    // using guarantees the native iterator is released even if a rule
    // callback, the logger or the comparer throws.
    using (ResultIterator lineIter = page.GetIterator())
    {
        lineIter.Begin();
        do
        {
            int currentWordNumber = -1;
            int currentAfterWord = 0;
            int currentBestDistance = 10000;

            string line = lineIter.GetText(PageIteratorLevel.TextLine);
            if (string.IsNullOrEmpty(line))
            {
                continue;
            }

            line = line.Trim();
            string[] wordsOfLine = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (wordsOfLine.Length == 0)
            {
                continue;
            }

            // Try every run of 1..numberOfWords consecutive words starting at
            // each position of the line and keep the closest one.
            for (int shift = 0; shift < wordsOfLine.Length; shift++)
            {
                for (int i = 1; i <= numberOfWords && i <= wordsOfLine.Length - shift; i++)
                {
                    string passString = String.Join(" ", wordsOfLine, shift, i);
                    int passDistance = FuzzyStringComparer.GetDistanceLevenshtein(passString, Text, StringComparison.CurrentCultureIgnoreCase);
                    if (passDistance < currentBestDistance)
                    {
                        currentBestDistance = passDistance;
                        currentWordNumber = shift;
                        currentAfterWord = shift + i;
                    }
                }
            }

            if (currentBestDistance < bestDistance)
            {
                logger.Debug("new best");
                logger.Debug(lineIter.GetText(PageIteratorLevel.TextLine).Trim());

                // Fill the rule fields with the words that follow the marker.
                foreach (RecognazeRule rule in AfterMarkerRules)
                {
                    if (rule.NextAfterTextMarker && wordsOfLine.Length > currentAfterWord + rule.ShiftWordsCount)
                    {
                        rule.AfterTextMarkerValue = wordsOfLine[currentAfterWord + rule.ShiftWordsCount];
                    }
                }

                bestDistance = currentBestDistance;

                // Move from the start of the line to the first word of the match.
                for (int i = 0; i < currentWordNumber; i++)
                {
                    lineIter.Next(PageIteratorLevel.Word);
                }

                Rect box;
                lineIter.TryGetBoundingBox(PageIteratorLevel.Word, out box);
                PosX = box.X1;
                PosY = box.Y1;
                logger.Debug("Position X1:{0} Y1:{1} X2:{2} Y2:{3}", box.X1, box.Y1, box.X2, box.Y2);

                lineIter.TryGetBaseline(PageIteratorLevel.Word, out box);
                logger.Debug("BaseLine X1:{0} Y1:{1} X2:{2} Y2:{3}", box.X1, box.Y1, box.X2, box.Y2);

                // Tilt angle of the baseline.
                AngleRad = Math.Atan2(box.Y2 - box.Y1, box.X2 - box.X1);
                double angleGrad = AngleRad * (180 / Math.PI);
                logger.Debug("Angle rad:{0} grad:{1}", AngleRad, angleGrad);

                // Read the recognition confidence of the fields after the marker.
                // The iterator sits on the first marker word, so the counter
                // starts at minus the marker length: it reaches 0 on the first
                // word after the marker.
                int iterAlreadyShifted = currentWordNumber - currentAfterWord;
                bool stopIteration = false;
                foreach (RecognazeRule rule in AfterMarkerRules.Where(x => x.NextAfterTextMarker).OrderBy(x => x.ShiftWordsCount))
                {
                    while (iterAlreadyShifted < rule.ShiftWordsCount)
                    {
                        if (lineIter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                        {
                            // Ran out of words on this line; remaining rules get no confidence.
                            stopIteration = true;
                            break;
                        }

                        lineIter.Next(PageIteratorLevel.Word);
                        iterAlreadyShifted++;
                    }

                    if (stopIteration)
                    {
                        break;
                    }

                    rule.AfterTextMarkerConfidence = lineIter.GetConfidence(PageIteratorLevel.Word);
                    logger.Debug("Cлово {0} со сдвигом {1} имеет точность {2}.", lineIter.GetText(PageIteratorLevel.Word), rule.ShiftWordsCount, rule.AfterTextMarkerConfidence);
                }
            }
        }
        while (lineIter.Next(PageIteratorLevel.TextLine));
    }

    return bestDistance;
}