Example #1
0
            /// <summary>
            /// Logs a snapshot of the iterator's current position: first whether it sits at the
            /// beginning of each page level, then the text reported at each level.
            /// </summary>
            /// <param name="iter">Iterator to inspect; it is only queried here, never advanced.</param>
            public void Print(ResultIterator iter)
            {
                // The three tables are index-aligned: entry k of each format belongs to levels[k].
                PageIteratorLevel[] levels =
                {
                    PageIteratorLevel.Block,
                    PageIteratorLevel.Para,
                    PageIteratorLevel.TextLine,
                    PageIteratorLevel.Word,
                    PageIteratorLevel.Symbol
                };
                string[] beginningFormats =
                {
                    "Is beginning of block: {0}",
                    "Is beginning of para: {0}",
                    "Is beginning of text line: {0}",
                    "Is beginning of word: {0}",
                    "Is beginning of symbol: {0}"
                };
                string[] textFormats =
                {
                    "Block text: \"{0}\"",
                    "Para text: \"{0}\"",
                    "TextLine text: \"{0}\"",
                    "Word text: \"{0}\"",
                    "Symbol text: \"{0}\""
                };

                // All "is beginning" flags are reported before any text, coarsest level first.
                for (int index = 0; index < levels.Length; index++)
                {
                    bool atBeginning = iter.IsAtBeginningOf(levels[index]);
                    logger.Log(beginningFormats[index], atBeginning);
                }

                for (int index = 0; index < levels.Length; index++)
                {
                    string levelText = iter.GetText(levels[index]);
                    logger.Log(textFormats[index], levelText);
                }
            }
Example #2
0
        /// <summary>
        /// Runs English OCR over <paramref name="currentImage"/> and collects the bounding
        /// rectangle of every recognised symbol that consists solely of letters.
        /// </summary>
        /// <param name="currentImage">Image to analyse.</param>
        /// <returns>
        /// One rectangle per letter-only symbol for which a bounding box was available, in
        /// iteration order. Empty when nothing qualifies.
        /// </returns>
        public List <Rectangle> GetTextRects(Bitmap currentImage)
        {
            // The language data is looked up in a "tessdata" folder next to the executing assembly.
            string assemblyDir = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location) ?? string.Empty;
            string tessPath    = System.IO.Path.Combine(assemblyDir, "tessdata");

            List <Rectangle> currentRects = new List <Rectangle>();

            // Stacked usings dispose in reverse order (iterator, page, engine) and also run when
            // recognition throws, so no native handle outlives its owner.
            using (TesseractEngine tess = new TesseractEngine(tessPath, "eng"))
            using (Page newPage = tess.Process(currentImage, PageSegMode.AutoOsd))
            using (ResultIterator iterator = newPage.GetIterator())
            {
                iterator.Begin();

                // Walk the symbols until the iterator itself reports the end, rather than looping
                // once per character of the page text (which also counts spaces and line breaks).
                do
                {
                    string symbolText = iterator.GetText(PageIteratorLevel.Symbol);
                    if (string.IsNullOrEmpty(symbolText) || !symbolText.All(c => char.IsLetter(c)))
                    {
                        continue;
                    }

                    Rect foundRect;
                    if (iterator.TryGetBoundingBox(PageIteratorLevel.Symbol, out foundRect))
                    {
                        currentRects.Add(new Rectangle(foundRect.X1, foundRect.Y1, foundRect.X2 - foundRect.X1, foundRect.Y2 - foundRect.Y1));
                    }
                } while (iterator.Next(PageIteratorLevel.Symbol));
            }

            return(currentRects);
        }
        /// <summary>
        /// Renders <paramref name="page"/> to <paramref name="tempFileName"/> as an image, runs it
        /// through <paramref name="engine"/> and lazily yields one chunk per recognised word.
        /// </summary>
        /// <param name="page">PDF page to render.</param>
        /// <param name="engine">OCR engine used to process the rendered image.</param>
        /// <param name="resolution">Value applied to both the horizontal and vertical resolution of the rendering.</param>
        /// <param name="tempFileName">File the page image is saved to and then loaded from.</param>
        /// <returns>
        /// A deferred sequence; nothing is rendered or recognised until enumeration starts.
        /// Words without a bounding box are skipped.
        /// </returns>
        private static IEnumerable <RecognizedTextChunk> recognizeWords(PdfPage page, TesseractEngine engine,
                                                                        int resolution, string tempFileName)
        {
            // Render the page on a white background at the requested resolution.
            PdfDrawOptions drawOptions = PdfDrawOptions.Create();
            drawOptions.BackgroundColor      = new PdfRgbColor(255, 255, 255);
            drawOptions.HorizontalResolution = resolution;
            drawOptions.VerticalResolution   = resolution;
            page.Save(tempFileName, drawOptions);

            using (var image = Pix.LoadFromFile(tempFileName))
            using (var ocrPage = engine.Process(image))
            using (ResultIterator words = ocrPage.GetIterator())
            {
                words.Begin();
                do
                {
                    Rect bounds;
                    if (!words.TryGetBoundingBox(PageIteratorLevel.Word, out bounds))
                    {
                        // No geometry for this word: skip it and move on to the next one.
                        continue;
                    }

                    string wordText       = words.GetText(PageIteratorLevel.Word);
                    float  wordConfidence = words.GetConfidence(PageIteratorLevel.Word);

                    yield return new RecognizedTextChunk(wordText, bounds, wordConfidence);
                } while (words.Next(PageIteratorLevel.Word));
            }
        }
Example #4
0
        /// <summary>
        /// Collects the words of the paragraph the iterator is currently positioned in.
        /// Every word is followed by a space, a line break is added after the last word of each
        /// text line, and one more after the last line of the paragraph.
        /// </summary>
        /// <param name="iter">Iterator positioned inside the paragraph; it is advanced by this call.</param>
        /// <returns>A <c>TextBlockItem</c> whose <c>Text</c> holds the collected text.</returns>
        private static TextBlockItem ExtractParaphraph(ResultIterator iter)
        {
            var  buffer = new StringBuilder();
            bool moreLines;

            do
            {
                bool moreWords;
                do
                {
                    buffer.Append(iter.GetText(PageIteratorLevel.Word)).Append(" ");

                    bool lastWordOfLine = iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word);
                    if (lastWordOfLine)
                    {
                        buffer.AppendLine();
                    }

                    moreWords = iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word);
                } while (moreWords);

                bool lastLineOfParagraph = iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine);
                if (lastLineOfParagraph)
                {
                    buffer.AppendLine();
                }

                moreLines = iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine);
            } while (moreLines);

            var item = new TextBlockItem();
            item.Text = buffer.ToString();
            return item;
        }
        /// <summary>
        /// Runs English OCR on an in-memory image and returns the recognised text, walking the
        /// result block by block, paragraph by paragraph, line by line and word by word.
        /// </summary>
        /// <param name="imageBytes">Encoded image data to load and recognise.</param>
        /// <returns>
        /// The text gathered so far. Any exception is written to the trace output and whatever
        /// had been accumulated up to that point is returned.
        /// </returns>
        public static string RecognizeBlocks(byte[] imageBytes)
        {
            StringBuilder output = new StringBuilder();

            try
            {
                using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
                using (Pix image = Pix.LoadFromMemory(imageBytes))
                using (var page = engine.Process(image))
                {
                    output.AppendLine("Text (iterator):");

                    using (ResultIterator cursor = page.GetIterator())
                    {
                        cursor.Begin();

                        do     // blocks
                        {
                            do     // paragraphs within the block
                            {
                                do     // text lines within the paragraph
                                {
                                    do     // words within the text line
                                    {
                                        if (cursor.IsAtBeginningOf(PageIteratorLevel.Block))
                                        {
                                            output.AppendLine("<BLOCK>");
                                        }

                                        output.Append(cursor.GetText(PageIteratorLevel.Word)).Append(" ");

                                        // End of a line: terminate it in the output.
                                        if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                        {
                                            output.AppendLine();
                                        }
                                    } while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                                    // End of a paragraph: leave an empty line after it.
                                    if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                                    {
                                        output.AppendLine();
                                    }
                                } while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                            } while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                        } while (cursor.Next(PageIteratorLevel.Block));
                    }
                }
            }
            catch (Exception e)
            {
                Trace.TraceError(e.ToString());
            }

            return output.ToString();
        }
Example #6
0
    /// <summary>
    /// Recognises <paramref name="imagem"/> as a single line of upper-case letters and digits,
    /// symbol by symbol. Symbols are accumulated into groups by horizontal position; each
    /// completed group is passed to <c>Resolver</c> and its result appended to the output.
    /// If fewer than six characters were produced, the pending group is resolved and the
    /// result is padded with <c>LetraAleatoria()</c> until it has six characters.
    /// </summary>
    /// <param name="imagem">Image to recognise.</param>
    /// <param name="linguagem">Language name passed to the Tesseract engine.</param>
    /// <returns>The recognised (and possibly padded) text.</returns>
    public static string OCR(Bitmap imagem, string linguagem)
    {
        string resultado = "";

        using (TesseractEngine engine = new TesseractEngine(@"C:\GitHub\operacao-politica-supervisionada\OPS\temp\", linguagem, EngineMode.Default))
        {
            engine.SetVariable("tessedit_char_whitelist", "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ");
            engine.SetVariable("tessedit_unrej_any_wd", true);
            engine.SetVariable("applybox_learn_chars_and_char_frags_mode", true);
            engine.SetVariable("save_blob_choices", true);

            string grupo  = "";   // symbols collected for the slot currently being filled
            int    limite = 12;   // horizontal reference of the current slot; moves 28 per flushed slot

            using (Page page = engine.Process(imagem, PageSegMode.SingleLine))
            using (ResultIterator symbols = page.GetIterator())
            {
                do
                {
                    string         symbol = symbols.GetText(PageIteratorLevel.Symbol);
                    Tesseract.Rect box;
                    if (!symbols.TryGetBoundingBox(PageIteratorLevel.Symbol, out box))
                    {
                        continue;
                    }

                    // Ignore boxes that are too small, and blank symbols.
                    bool largeEnough = (box.Width > 13) && (box.Height > 15);
                    if (!largeEnough || (symbol.Trim() == ""))
                    {
                        continue;
                    }

                    // The symbol starts beyond the current slot: flush slots until it fits.
                    while (box.X1 > limite + 14)
                    {
                        resultado += Resolver(grupo);
                        grupo      = "";
                        limite    += 28;
                    }

                    // A "Q" taller than 30 is recorded as "O".
                    bool tallQ = (symbol == "Q") && (box.Height > 30);
                    grupo += tallQ ? "O" : symbol;
                } while (symbols.Next(PageIteratorLevel.Symbol));

                if (resultado.Length < 6)
                {
                    resultado += Resolver(grupo);
                    while (resultado.Length < 6)
                    {
                        resultado += LetraAleatoria();
                    }
                }
            }
        }

        return resultado;
    }
Example #7
0
        /// <summary>
        /// Recognises <paramref name="pageData"/> and adds one <c>TextArea</c> to
        /// <paramref name="page"/> for every word that passes <c>TextIsValid</c> and has a
        /// bounding box, together with the bounding rectangles of the word's symbols.
        /// </summary>
        /// <param name="engine">OCR engine used to process the image.</param>
        /// <param name="pageData">Image of the page to recognise.</param>
        /// <param name="language">Not used by this method.</param>
        /// <param name="page">Document page that receives the recognised text areas.</param>
        private void GetPageData(TesseractEngine engine, Pix pageData, string language, ATAPY.Document.Data.Core.Page page)
        {
            using (var tessPage = engine.Process(pageData))
            {
                tessPage.Recognize();

                // The iterator is scoped inside the page so that it is disposed before the
                // page it belongs to, also when an exception is thrown.
                using (var resultIterator = tessPage.GetIterator())
                {
                    resultIterator.Begin();

                    do
                    {
                        var text = resultIterator.GetText(PageIteratorLevel.Word);
                        if (TextIsValid(text) && resultIterator.TryGetBoundingBox(PageIteratorLevel.Word, out var rect))
                        {
                            var rectW = GetRect(rect);
                            var area  = new TextArea(rectW, text, page);
                            page.TextAreas.Add(area);

                            // One slot per character of the word text; slots stay at their
                            // default value when a symbol has no bounding box.
                            var chars    = new System.Windows.Rect[text.Length];
                            int charIter = 0;
                            do
                            {
                                // Guard the index: the symbol count is not guaranteed to match
                                // text.Length. The loop still walks every symbol so the iterator
                                // ends on the last symbol of this word.
                                if (charIter < chars.Length &&
                                    resultIterator.TryGetBoundingBox(PageIteratorLevel.Symbol, out var sRect))
                                {
                                    chars[charIter] = GetRect(sRect);
                                }
                                charIter++;
                            } while (resultIterator.Next(PageIteratorLevel.Word, PageIteratorLevel.Symbol));
                            area.SetCharProperties(chars);
                        }
                    } while (resultIterator.Next(PageIteratorLevel.Word));
                }
            }
        }
Example #8
0
        /// <summary>
        /// Writes the recognised text of <paramref name="page"/> to the console, walking the
        /// result block by block, paragraph by paragraph, line by line and word by word.
        /// Each block is introduced by a "&lt;BLOCK&gt;" line.
        /// </summary>
        /// <param name="page">Processed page whose text is printed.</param>
        private static void IterateBlocks(Page page)
        {
            Console.WriteLine("Text (iterator):");

            using (ResultIterator cursor = page.GetIterator())
            {
                cursor.Begin();

                bool moreBlocks;
                do
                {
                    bool moreParagraphs;
                    do
                    {
                        bool moreLines;
                        do
                        {
                            bool moreWords;
                            do
                            {
                                if (cursor.IsAtBeginningOf(PageIteratorLevel.Block))
                                {
                                    Console.WriteLine("<BLOCK>");
                                }

                                string word = cursor.GetText(PageIteratorLevel.Word);
                                Console.Write(word);
                                Console.Write(" ");

                                // Last word of the line: end the console line.
                                if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    Console.WriteLine();
                                }

                                moreWords = cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word);
                            } while (moreWords);

                            // Last line of the paragraph: leave an empty line after it.
                            if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                            {
                                Console.WriteLine();
                            }

                            moreLines = cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine);
                        } while (moreLines);

                        moreParagraphs = cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para);
                    } while (moreParagraphs);

                    moreBlocks = cursor.Next(PageIteratorLevel.Block);
                } while (moreBlocks);
            }
        }
Example #9
0
        /// <summary>
        /// Sample entry point: runs English OCR on an image, logs the full text and the mean
        /// confidence, then walks the text lines and logs the words of every second line
        /// (those for which the running counter <c>i</c> is even).
        /// </summary>
        /// <param name="args">Optional; the first element overrides the default image path "./phototest.tif".</param>
        public static void Main(string[] args)
        {
            var testImagePath = "./phototest.tif";

            if (args.Length > 0)
            {
                testImagePath = args[0];
            }

            try {
                var logger        = new FormattedConsoleLogger();
                var resultPrinter = new ResultPrinter(logger);
                using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default)) {
                    using (var img = Pix.LoadFromFile(testImagePath)) {
                        using (logger.Begin("Process image")) {
                            var i = 1;
                            using (var page = engine.Process(img)) {
                                var text = page.GetText();
                                logger.Log("Text: {0}", text);
                                logger.Log("Mean confidence: {0}", page.GetMeanConfidence());

                                using (var iter = page.GetIterator()) {
                                    iter.Begin();
                                    do
                                    {
                                        if (i % 2 == 0)
                                        {
                                            using (logger.Begin("Line {0}", i)) {
                                                do
                                                {
                                                    using (logger.Begin("Word Iteration")) {
                                                        if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                                                        {
                                                            logger.Log("New block");
                                                        }
                                                        if (iter.IsAtBeginningOf(PageIteratorLevel.Para))
                                                        {
                                                            logger.Log("New paragraph");
                                                        }
                                                        if (iter.IsAtBeginningOf(PageIteratorLevel.TextLine))
                                                        {
                                                            logger.Log("New line");
                                                        }

                                                        // Recognised text goes in as an argument, never as part of
                                                        // the format string, so braces in OCR output are harmless.
                                                        logger.Log("word: {0}", iter.GetText(PageIteratorLevel.Word));

                                                        // The clone is a separate disposable iterator;
                                                        // release it once it has been queried.
                                                        using (ResultIterator testiter = iter.Clone()) {
                                                            logger.Log("from clone: {0}", testiter.GetText(PageIteratorLevel.Word));
                                                        }
                                                    }
                                                } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
                                            }
                                        }
                                        i++;
                                    } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                }
                            }
                        }
                    }
                }
            } catch (Exception e) {
                Trace.TraceError(e.ToString());
                Console.WriteLine("Unexpected Error: " + e.Message);
                Console.WriteLine("Details: ");
                Console.WriteLine(e.ToString());
            }
            Console.Write("Press any key to continue . . . ");
            Console.ReadKey(true);
        }
Example #10
0
    /// <summary>
    /// Recognises <paramref name="imagem"/> as a single line, groups the recognised symbols by
    /// horizontal position, and emits one text line per group in the form
    /// "&lt;char&gt; &lt;x1&gt; &lt;y1&gt; &lt;x2&gt; &lt;y2&gt; &lt;pagina&gt;\n", where the
    /// character is taken from <paramref name="correto"/> rather than from the OCR result.
    /// NOTE(review): the output looks like Tesseract box-file training data — confirm with the caller.
    /// </summary>
    /// <param name="imagem">Image to recognise; its width bounds the boxes of padded entries.</param>
    /// <param name="linguagem">Language name passed to the Tesseract engine.</param>
    /// <param name="correto">Expected text; indexed by position up to index 5 (presumably six characters — verify against caller).</param>
    /// <param name="pagina">Value written as the last field of every emitted line.</param>
    /// <returns>The emitted lines, or an empty string when no accepted symbol had a top edge at or above 40.</returns>
    public static string Box(Bitmap imagem, string linguagem, string correto, int pagina)
    {
        string texto = "";

        using (TesseractEngine engine = new TesseractEngine(@"C:\GitHub\operacao-politica-supervisionada\OPS\temp\", linguagem)) {
            engine.SetVariable("tessedit_char_whitelist", "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ");
            engine.SetVariable("tessedit_unrej_any_wd", true);
            engine.SetVariable("applybox_learn_chars_and_char_frags_mode", true);
            engine.SetVariable("save_blob_choices", true);

            // sobreposto accumulates the symbols of the current group; in this method it is only
            // ever tested for emptiness. ultimo is the horizontal reference of the current slot.
            string sobreposto = "";
            int    ultimo     = 12;
            using (Page page = engine.Process(imagem, PageSegMode.SingleLine)) {
                Tesseract.Rect bb;
                // x1/y1/x2/y2: box of the group being built; pos: index of the next character of
                // correto to emit; miny1: smallest top edge among accepted symbols.
                int            x1 = 14, y1 = 0, x2 = 0, y2 = 0;
                int            pos   = 0;
                int            miny1 = 50;
                using (ResultIterator ri = page.GetIterator()) {
                    do
                    {
                        string word = ri.GetText(PageIteratorLevel.Symbol);
                        if (ri.TryGetBoundingBox(PageIteratorLevel.Symbol, out bb))
                        {
                            // Ignore boxes that are too small, and blank symbols.
                            if ((bb.Width > 13) && (bb.Height > 15) && (word.Trim() != ""))
                            {
                                // The symbol starts beyond the current slot: emit the pending box
                                // (at least 15 wide) and advance one slot (28) at a time until it fits.
                                while (bb.X1 > ultimo + 14)
                                {
                                    x2     = Math.Max(x1 + 15, x2);
                                    texto += correto[pos] + " " + x1 + " " + Math.Min(10, y1) + " " + x2 + " " + Math.Max(40, y2) + " " + pagina + "\n";
                                    pos++;
                                    sobreposto = "";
                                    ultimo    += 28;
                                    x1         = Math.Max(x1 + 28, x2);
                                }
                                miny1 = Math.Min(miny1, bb.Y1);
                                if (sobreposto != "")
                                {
                                    // Group already has symbols: grow its box to include this one.
                                    x1 = Math.Min(x1, bb.X1);
                                    y1 = Math.Min(y1, bb.Y1);
                                    x2 = Math.Max(x2, bb.X2);
                                    y2 = Math.Max(y2, bb.Y2);
                                }
                                else
                                {
                                    // First symbol of a group: start a new box, letting its left
                                    // edge reach back at most 5 before the previous right edge.
                                    x1 = Math.Max(x2 - 5, bb.X1);
                                    y1 = bb.Y1;
                                    x2 = bb.X2;
                                    y2 = bb.Y2;
                                }
                                //System.Web.HttpContext.Current.Response.Write(word + ": " + bb.X1 + "<br />\n");
                                // A "Q" taller than 30 is recorded as "O".
                                if ((word != "Q") || (bb.Height <= 30))
                                {
                                    sobreposto += word;
                                }
                                else
                                {
                                    sobreposto += "O";
                                }
                            }
                        }
                    } while((ri.Next(PageIteratorLevel.Symbol)));
                    // Right-hand bound used when distributing the boxes of the padded entries.
                    int limite = imagem.Width - 6;
                    if (pos < 6)
                    {
                        // Emit the box still pending, then pad up to index 5 by splitting the
                        // remaining width evenly between the entries that are still missing.
                        texto += correto[pos] + " " + x1 + " " + Math.Min(10, y1) + " " + x2 + " " + Math.Max(40, y2) + " " + pagina + "\n";
                        while (pos < 5)
                        {
                            pos++;
                            x1     = x2;
                            x2     = x1 + (limite - x1) / (6 - pos);
                            texto += correto[pos] + " " + x1 + " " + Math.Min(10, y1) + " " + x2 + " " + Math.Max(40, y2) + " " + pagina + "\n";
                        }
                    }
                }
                // No accepted symbol had a top edge at or above 40: discard everything.
                if (miny1 > 40)
                {
                    texto = "";
                }
            }
        }
        return(texto);
    }
Example #11
0
        /*
         * int LookingTextMarker(RecognazeRule Rule, Page page, out ResultIterator BestLineIter, out int word)
         * {
         *      word = -1;
         *      BestLineIter = null;
         *      int BestDistance = 10000;
         *
         *      ResultIterator LineIter = page.GetIterator();
         *      string[] Words = Rule.TextMarker.Split(new char[] {' '}, StringSplitOptions.RemoveEmptyEntries);
         *      int NumberOfWords = Words.Length;
         *      LineIter.Begin();
         *      do
         *      {
         *              int CurrentWordNumber = -1;
         *              int CurrentBestDistance = 10000;
         *              string Line = LineIter.GetText(PageIteratorLevel.TextLine);
         *              if(Line == null)
         *                      continue;
         *              string[] WordsOfLine = Line.Split(new char[] {' '}, StringSplitOptions.None);
         *              if(WordsOfLine.Length < NumberOfWords)
         *                      continue;
         *
         *              for(int shift = 0; shift <= WordsOfLine.Length - NumberOfWords; shift++)
         *              {
         *                      int PassDistance = 0;
         *                      for(int i = 0; i < NumberOfWords; i++)
         *                      {
         *                              PassDistance += FuzzyStringComparer.GetDistanceLevenshtein(WordsOfLine[shift + i],
         *                                                                                            Words[i],
         *                                                                                            StringComparison.CurrentCultureIgnoreCase);
         *                      }
         *                      if(PassDistance < CurrentBestDistance)
         *                      {
         *                              CurrentBestDistance = PassDistance;
         *                              CurrentWordNumber = shift + 1;
         *                      }
         *              }
         *              if(CurrentBestDistance < BestDistance)
         *              {
         *                      AddToLog ("new best");
         *                      AddToLog (LineIter.GetText(PageIteratorLevel.Word));
         *                      word = CurrentWordNumber;
         *                      if(BestLineIter != null)
         *                              BestLineIter.Dispose();
         *                      BestLineIter = LineIter.Clone();
         *                      AddToLog (BestLineIter.GetText(PageIteratorLevel.TextLine));
         *                      BestDistance = CurrentBestDistance;
         *              }
         *      } while( LineIter.Next(PageIteratorLevel.TextLine));
         *      LineIter.Dispose();
         *      return BestDistance;
         * } */

        /// <summary>
        /// Scans the recognised page line by line for the run of consecutive words that is
        /// closest to <paramref name="Text"/> by Levenshtein distance (current-culture,
        /// case-insensitive). For each line that improves on the best distance so far, it
        /// reports the position and baseline angle of the first word of the match, and fills
        /// the "after marker" value and confidence of the applicable rules.
        /// </summary>
        /// <param name="Text">Marker text to look for.</param>
        /// <param name="page">Processed OCR page to search.</param>
        /// <param name="PosX">X1 of the bounding box of the first matched word; -1 when no line matched.</param>
        /// <param name="PosY">Y1 of the bounding box of the first matched word; -1 when no line matched.</param>
        /// <param name="AngleRad">Slope angle of that word's baseline, in radians; 0 when no line matched.</param>
        /// <param name="AfterMarkerRules">
        /// Rules to update; only those with <c>NextAfterTextMarker</c> set receive
        /// <c>AfterTextMarkerValue</c> and <c>AfterTextMarkerConfidence</c>.
        /// </param>
        /// <returns>The best (smallest) distance found, or 10000 when no line produced a candidate.</returns>
        int GetTextPosition(string Text, Page page, out int PosX, out int PosY, out double AngleRad, RecognazeRule[] AfterMarkerRules)
        {
            int BestDistance = 10000;

            PosX     = -1;
            PosY     = -1;
            AngleRad = 0;
            logger.Debug("Marker zone text:{0}", page.GetText());

            string[] Words         = Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            int      NumberOfWords = Words.Length;

            // using guarantees the native iterator is released even if something below throws.
            using (ResultIterator LineIter = page.GetIterator())
            {
                LineIter.Begin();
                do
                {
                    int    CurrentWordNumber   = -1;
                    int    CurrentAfterWord    = 0;
                    int    CurrentBestDistance = 10000;
                    string Line = LineIter.GetText(PageIteratorLevel.TextLine);

                    if (string.IsNullOrEmpty(Line))
                    {
                        continue;
                    }
                    Line = Line.Trim();
                    string[] WordsOfLine = Line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    if (WordsOfLine.Length == 0)
                    {
                        continue;
                    }

                    // Try every run of 1..NumberOfWords consecutive words starting at each
                    // position of the line and keep the run closest to the marker text.
                    for (int shift = 0; shift < WordsOfLine.Length; shift++)
                    {
                        for (int i = 1; i <= NumberOfWords && i <= WordsOfLine.Length - shift; i++)
                        {
                            string passString = String.Join(" ", WordsOfLine, shift, i);

                            int PassDistance = FuzzyStringComparer.GetDistanceLevenshtein(passString,
                                                                                          Text,
                                                                                          StringComparison.CurrentCultureIgnoreCase);
                            if (PassDistance < CurrentBestDistance)
                            {
                                CurrentBestDistance = PassDistance;
                                CurrentWordNumber   = shift;       // index of the first matched word
                                CurrentAfterWord    = shift + i;   // index of the first word after the match
                            }
                        }
                    }

                    if (CurrentBestDistance < BestDistance)
                    {
                        logger.Debug("new best");
                        // Line already holds the trimmed text of the current line.
                        logger.Debug(Line);

                        // Fill the rule fields with the words that follow the marker.
                        foreach (RecognazeRule rule in AfterMarkerRules)
                        {
                            if (rule.NextAfterTextMarker && WordsOfLine.Length > CurrentAfterWord + rule.ShiftWordsCount)
                            {
                                rule.AfterTextMarkerValue = WordsOfLine[CurrentAfterWord + rule.ShiftWordsCount];
                            }
                        }

                        BestDistance = CurrentBestDistance;

                        // Move the iterator from the start of the line to the first matched word.
                        for (int i = 0; i < CurrentWordNumber; i++)
                        {
                            LineIter.Next(PageIteratorLevel.Word);
                        }

                        // The Try* results are not checked: on failure Box keeps whatever the
                        // call left in it and those values are reported as-is.
                        Rect Box;
                        LineIter.TryGetBoundingBox(PageIteratorLevel.Word, out Box);
                        PosX = Box.X1;
                        PosY = Box.Y1;
                        logger.Debug("Position X1:{0} Y1:{1} X2:{2} Y2:{3}", Box.X1, Box.Y1, Box.X2, Box.Y2);
                        LineIter.TryGetBaseline(PageIteratorLevel.Word, out Box);
                        logger.Debug("BaseLine X1:{0} Y1:{1} X2:{2} Y2:{3}", Box.X1, Box.Y1, Box.X2, Box.Y2);
                        AngleRad = Math.Atan2(Box.Y2 - Box.Y1, Box.X2 - Box.X1);                     // slope angle of the baseline
                        double AngleGrad = AngleRad * (180 / Math.PI);
                        logger.Debug("Angle rad:{0} grad:{1}", AngleRad, AngleGrad);

                        // Read the recognition confidence of the fields that follow the marker.
                        // The iterator stands on the first matched word, i.e.
                        // (CurrentAfterWord - CurrentWordNumber) words before the first word
                        // after the marker, hence the negative starting offset.
                        int  iterAlreadyShifted = CurrentWordNumber - CurrentAfterWord;
                        bool stopIteration      = false;
                        foreach (RecognazeRule rule in AfterMarkerRules.Where(x => x.NextAfterTextMarker).OrderBy(x => x.ShiftWordsCount))
                        {
                            while (iterAlreadyShifted < rule.ShiftWordsCount)
                            {
                                // Never step past the end of the current line.
                                if (LineIter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    stopIteration = true;
                                    break;
                                }
                                LineIter.Next(PageIteratorLevel.Word);
                                iterAlreadyShifted++;
                            }
                            if (stopIteration)
                            {
                                break;
                            }
                            rule.AfterTextMarkerConfidence = LineIter.GetConfidence(PageIteratorLevel.Word);
                            logger.Debug("Cлово {0} со сдвигом {1} имеет точность {2}.", LineIter.GetText(PageIteratorLevel.Word), rule.ShiftWordsCount, rule.AfterTextMarkerConfidence);
                        }
                    }
                } while (LineIter.Next(PageIteratorLevel.TextLine));
            }

            return(BestDistance);
        }