/// <summary>
/// Renders <paramref name="page"/> to <paramref name="tempFileName"/> and lazily yields one
/// <c>RecognizedTextChunk</c> (text, bounding box, confidence) for every word-level element
/// for which the OCR iterator can supply a bounding box.
/// </summary>
/// <remarks>
/// This is an iterator method: nothing (including the page rendering) runs until the
/// returned sequence is enumerated, and the image, OCR page and iterator are released
/// when enumeration finishes or the enumerator is disposed.
/// </remarks>
/// <param name="page">PDF page to render and recognize.</param>
/// <param name="engine">Engine used to process the rendered image.</param>
/// <param name="resolution">Value used for both the horizontal and vertical render resolution.</param>
/// <param name="tempFileName">Path the rendered image is saved to and then loaded from.</param>
private static IEnumerable<RecognizedTextChunk> recognizeWords(PdfPage page, TesseractEngine engine,
                                                               int resolution, string tempFileName)
        {
            // Render the page to an image file with an RGB (255, 255, 255) background.
            PdfDrawOptions drawOptions = PdfDrawOptions.Create();
            drawOptions.BackgroundColor      = new PdfRgbColor(255, 255, 255);
            drawOptions.HorizontalResolution = resolution;
            drawOptions.VerticalResolution   = resolution;
            page.Save(tempFileName, drawOptions);

            using (var image = Pix.LoadFromFile(tempFileName))
            using (var ocrPage = engine.Process(image))
            using (ResultIterator words = ocrPage.GetIterator())
            {
                words.Begin();
                do
                {
                    Rect box;
                    if (!words.TryGetBoundingBox(PageIteratorLevel.Word, out box))
                    {
                        // No box for this element: skip it, but still advance the iterator.
                        continue;
                    }

                    yield return new RecognizedTextChunk(
                        words.GetText(PageIteratorLevel.Word),
                        box,
                        words.GetConfidence(PageIteratorLevel.Word));
                } while (words.Next(PageIteratorLevel.Word));
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs OCR over an in-memory image and returns the recognized words as text,
        /// with a "&lt;BLOCK&gt;" line emitted at the start of every block, a line break after
        /// the last word of each text line and an extra line break after the last line of
        /// each paragraph.
        /// </summary>
        /// <param name="imageBytes">Encoded image data passed to <c>Pix.LoadFromMemory</c>.</param>
        /// <returns>
        /// The text collected so far. Any exception is written to <c>Trace</c> and swallowed,
        /// so the result may be empty or partial.
        /// </returns>
        public static string RecognizeBlocks(byte[] imageBytes)
        {
            StringBuilder output = new StringBuilder();

            try
            {
                using (var ocrEngine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
                using (Pix image = Pix.LoadFromMemory(imageBytes))
                using (var ocrPage = engine_Process(ocrEngine, image))
                {
                    output.AppendLine("Text (iterator):");

                    using (ResultIterator cursor = ocrPage.GetIterator())
                    {
                        cursor.Begin();

                        // Loop nesting, outermost to innermost: block, paragraph, text line, word.
                        do
                        {
                            do
                            {
                                do
                                {
                                    do
                                    {
                                        bool startsBlock = cursor.IsAtBeginningOf(PageIteratorLevel.Block);
                                        if (startsBlock)
                                        {
                                            output.AppendLine("<BLOCK>");
                                        }

                                        output.Append(cursor.GetText(PageIteratorLevel.Word));
                                        output.Append(" ");

                                        bool endsLine = cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word);
                                        if (endsLine)
                                        {
                                            output.AppendLine();
                                        }
                                    } while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                                    bool endsParagraph = cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine);
                                    if (endsParagraph)
                                    {
                                        output.AppendLine();
                                    }
                                } while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                            } while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                        } while (cursor.Next(PageIteratorLevel.Block));
                    }
                }
            }
            catch (Exception failure)
            {
                // Best effort: report the problem and fall through to return what was gathered.
                Trace.TraceError(failure.ToString());
            }

            return output.ToString();

            // Local wrapper so the three resources can be acquired in one stacked using chain.
            Page engine_Process(TesseractEngine eng, Pix pix)
            {
                return eng.Process(pix);
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Recognizes <paramref name="pageData"/> with <paramref name="engine"/> and, for every
        /// word whose text passes <c>TextIsValid</c> and that has a bounding box, adds a
        /// <c>TextArea</c> to <paramref name="page"/> together with the per-symbol rectangles.
        /// </summary>
        /// <param name="engine">Engine used to process the image.</param>
        /// <param name="pageData">Image to recognize.</param>
        /// <param name="language">Not used by this method; kept for signature compatibility.</param>
        /// <param name="page">Document page whose <c>TextAreas</c> collection receives the results.</param>
        private void GetPageData(TesseractEngine engine, Pix pageData, string language, ATAPY.Document.Data.Core.Page page)
        {
            using (var tessPage = engine.Process(pageData))
            {
                tessPage.Recognize();

                // The iterator is scoped inside the page's using block so that it is always
                // released before the page it was obtained from (previously it was disposed
                // in a finally block that ran after the page had already been disposed).
                using (var resultIterator = tessPage.GetIterator())
                {
                    resultIterator.Begin();

                    do
                    {
                        var text = resultIterator.GetText(PageIteratorLevel.Word);
                        if (TextIsValid(text) && resultIterator.TryGetBoundingBox(PageIteratorLevel.Word, out var rect))
                        {
                            var rectW = GetRect(rect);
                            var area  = new TextArea(rectW, text, page);
                            page.TextAreas.Add(area);

                            // One slot per character of the word text; slots for symbols
                            // without a bounding box keep their default value.
                            var chars    = new System.Windows.Rect[text.Length];
                            int charIter = 0;
                            do
                            {
                                // Guard the index: never write past the array if the iterator
                                // reports more symbols than the text has characters.
                                if (charIter < chars.Length
                                    && resultIterator.TryGetBoundingBox(PageIteratorLevel.Symbol, out var sRect))
                                {
                                    chars[charIter] = GetRect(sRect);
                                }
                                charIter++;
                            } while (resultIterator.Next(PageIteratorLevel.Word, PageIteratorLevel.Symbol));

                            area.SetCharProperties(chars);
                        }
                    } while (resultIterator.Next(PageIteratorLevel.Word));
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Writes the recognized words of <paramref name="page"/> to the console: a
        /// "&lt;BLOCK&gt;" line at the start of every block, each word followed by a space,
        /// a line break after the last word of a text line and an extra line break after
        /// the last line of a paragraph.
        /// </summary>
        /// <param name="page">Recognized page to iterate over.</param>
        private static void IterateBlocks(Page page)
        {
            Console.WriteLine("Text (iterator):");

            using (ResultIterator cursor = page.GetIterator())
            {
                cursor.Begin();

                // Loop nesting, outermost to innermost: block, paragraph, text line, word.
                do
                {
                    do
                    {
                        do
                        {
                            do
                            {
                                bool startsBlock = cursor.IsAtBeginningOf(PageIteratorLevel.Block);
                                if (startsBlock)
                                {
                                    Console.WriteLine("<BLOCK>");
                                }

                                Console.Write(cursor.GetText(PageIteratorLevel.Word));
                                Console.Write(" ");

                                bool endsLine = cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word);
                                if (endsLine)
                                {
                                    Console.WriteLine();
                                }
                            } while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                            bool endsParagraph = cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine);
                            if (endsParagraph)
                            {
                                Console.WriteLine();
                            }
                        } while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                    } while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                } while (cursor.Next(PageIteratorLevel.Block));
            }
        }
Exemplo n.º 5
0
        /*
         * int LookingTextMarker(RecognazeRule Rule, Page page, out ResultIterator BestLineIter, out int word)
         * {
         *      word = -1;
         *      BestLineIter = null;
         *      int BestDistance = 10000;
         *
         *      ResultIterator LineIter = page.GetIterator();
         *      string[] Words = Rule.TextMarker.Split(new char[] {' '}, StringSplitOptions.RemoveEmptyEntries);
         *      int NumberOfWords = Words.Length;
         *      LineIter.Begin();
         *      do
         *      {
         *              int CurrentWordNumber = -1;
         *              int CurrentBestDistance = 10000;
         *              string Line = LineIter.GetText(PageIteratorLevel.TextLine);
         *              if(Line == null)
         *                      continue;
         *              string[] WordsOfLine = Line.Split(new char[] {' '}, StringSplitOptions.None);
         *              if(WordsOfLine.Length < NumberOfWords)
         *                      continue;
         *
         *              for(int shift = 0; shift <= WordsOfLine.Length - NumberOfWords; shift++)
         *              {
         *                      int PassDistance = 0;
         *                      for(int i = 0; i < NumberOfWords; i++)
         *                      {
         *                              PassDistance += FuzzyStringComparer.GetDistanceLevenshtein(WordsOfLine[shift + i],
         *                                                                                            Words[i],
         *                                                                                            StringComparison.CurrentCultureIgnoreCase);
         *                      }
         *                      if(PassDistance < CurrentBestDistance)
         *                      {
         *                              CurrentBestDistance = PassDistance;
         *                              CurrentWordNumber = shift + 1;
         *                      }
         *              }
         *              if(CurrentBestDistance < BestDistance)
         *              {
         *                      AddToLog ("new best");
         *                      AddToLog (LineIter.GetText(PageIteratorLevel.Word));
         *                      word = CurrentWordNumber;
         *                      if(BestLineIter != null)
         *                              BestLineIter.Dispose();
         *                      BestLineIter = LineIter.Clone();
         *                      AddToLog (BestLineIter.GetText(PageIteratorLevel.TextLine));
         *                      BestDistance = CurrentBestDistance;
         *              }
         *      } while( LineIter.Next(PageIteratorLevel.TextLine));
         *      LineIter.Dispose();
         *      return BestDistance;
         * } */

        /// <summary>
        /// Searches the recognized text lines of <paramref name="page"/> for the run of words
        /// that is closest to <paramref name="Text"/> (as measured by
        /// <c>FuzzyStringComparer.GetDistanceLevenshtein</c>, case-insensitive, current culture)
        /// and reports where that run starts.
        /// </summary>
        /// <param name="Text">Marker text to look for; split on spaces to determine how many words to compare.</param>
        /// <param name="page">Recognized page whose text lines are scanned.</param>
        /// <param name="PosX">X1 of the bounding box of the first matched word, or -1 if no candidate was found.</param>
        /// <param name="PosY">Y1 of the bounding box of the first matched word, or -1 if no candidate was found.</param>
        /// <param name="AngleRad">Angle of the first matched word's baseline in radians, or 0 if no candidate was found.</param>
        /// <param name="AfterMarkerRules">
        /// Rules to update: for rules with <c>NextAfterTextMarker</c> set, <c>AfterTextMarkerValue</c>
        /// receives the word located <c>ShiftWordsCount</c> words after the match (when the line is
        /// long enough) and <c>AfterTextMarkerConfidence</c> receives that word's confidence.
        /// </param>
        /// <returns>The smallest distance found, or 10000 if no line yielded a candidate.</returns>
        int GetTextPosition(string Text, Page page, out int PosX, out int PosY, out double AngleRad, RecognazeRule[] AfterMarkerRules)
        {
            int BestDistance = 10000;

            PosX     = -1;
            PosY     = -1;
            AngleRad = 0;
            logger.Debug("Marker zone text:{0}", page.GetText());

            string[] Words         = Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            int      NumberOfWords = Words.Length;

            // The using block guarantees the iterator is released even when something below
            // throws; previously Dispose() was only reached on the normal exit path.
            using (ResultIterator LineIter = page.GetIterator())
            {
                LineIter.Begin();
                do
                {
                    int    CurrentWordNumber   = -1;
                    int    CurrentAfterWord    = 0;
                    int    CurrentBestDistance = 10000;
                    string Line = LineIter.GetText(PageIteratorLevel.TextLine);

                    if (string.IsNullOrEmpty(Line))
                    {
                        continue;
                    }
                    Line = Line.Trim();
                    string[] WordsOfLine = Line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    if (WordsOfLine.Length == 0)
                    {
                        continue;
                    }

                    // Try every start word and every run length from 1 up to NumberOfWords
                    // (bounded by the words remaining in the line); keep the closest run.
                    for (int shift = 0; shift < WordsOfLine.Length; shift++)
                    {
                        for (int i = 1; i <= NumberOfWords && i <= WordsOfLine.Length - shift; i++)
                        {
                            string passString = String.Join(" ", WordsOfLine, shift, i);

                            int PassDistance = FuzzyStringComparer.GetDistanceLevenshtein(passString,
                                                                                          Text,
                                                                                          StringComparison.CurrentCultureIgnoreCase);
                            if (PassDistance < CurrentBestDistance)
                            {
                                CurrentBestDistance = PassDistance;
                                CurrentWordNumber   = shift;
                                CurrentAfterWord    = shift + i;
                            }
                        }
                    }

                    if (CurrentBestDistance < BestDistance)
                    {
                        logger.Debug("new best");
                        logger.Debug(LineIter.GetText(PageIteratorLevel.TextLine).Trim());

                        // Fill the rule fields with the words that follow the marker.
                        foreach (RecognazeRule rule in AfterMarkerRules)
                        {
                            if (rule.NextAfterTextMarker && WordsOfLine.Length > CurrentAfterWord + rule.ShiftWordsCount)
                            {
                                rule.AfterTextMarkerValue = WordsOfLine[CurrentAfterWord + rule.ShiftWordsCount];
                            }
                        }

                        BestDistance = CurrentBestDistance;

                        // Advance the iterator to the first word of the matched run.
                        for (int i = 0; i < CurrentWordNumber; i++)
                        {
                            LineIter.Next(PageIteratorLevel.Word);
                        }

                        Rect Box;
                        LineIter.TryGetBoundingBox(PageIteratorLevel.Word, out Box);
                        PosX = Box.X1;
                        PosY = Box.Y1;
                        logger.Debug("Position X1:{0} Y1:{1} X2:{2} Y2:{3}", Box.X1, Box.Y1, Box.X2, Box.Y2);
                        LineIter.TryGetBaseline(PageIteratorLevel.Word, out Box);
                        logger.Debug("BaseLine X1:{0} Y1:{1} X2:{2} Y2:{3}", Box.X1, Box.Y1, Box.X2, Box.Y2);
                        AngleRad = Math.Atan2(Box.Y2 - Box.Y1, Box.X2 - Box.X1); // tilt angle of the baseline
                        double AngleGrad = AngleRad * (180 / Math.PI);
                        logger.Debug("Angle rad:{0} grad:{1}", AngleRad, AngleGrad);

                        // Get the recognition confidence of the words that follow the marker.
                        // The iterator sits on the first matched word, so relative to the first
                        // word after the marker it has been shifted by a negative amount.
                        int  iterAlreadyShifted = CurrentWordNumber - CurrentAfterWord;
                        bool stopIteration      = false;
                        foreach (RecognazeRule rule in AfterMarkerRules.Where(x => x.NextAfterTextMarker).OrderBy(x => x.ShiftWordsCount))
                        {
                            while (iterAlreadyShifted < rule.ShiftWordsCount)
                            {
                                // Do not walk past the last word of the current text line.
                                if (LineIter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    stopIteration = true;
                                    break;
                                }
                                LineIter.Next(PageIteratorLevel.Word);
                                iterAlreadyShifted++;
                            }
                            if (stopIteration)
                            {
                                break;
                            }
                            rule.AfterTextMarkerConfidence = LineIter.GetConfidence(PageIteratorLevel.Word);
                            logger.Debug("Cлово {0} со сдвигом {1} имеет точность {2}.", LineIter.GetText(PageIteratorLevel.Word), rule.ShiftWordsCount, rule.AfterTextMarkerConfidence);
                        }
                    }
                } while (LineIter.Next(PageIteratorLevel.TextLine));
            }

            return BestDistance;
        }