예제 #1
0
        /// <summary>
        /// Concatenates recognized words into a single <see cref="TextBlockItem"/>, starting at the
        /// iterator's current position and continuing for as long as
        /// <c>Next(Para, TextLine)</c> keeps succeeding.
        /// </summary>
        /// <param name="iter">Result iterator to read from; it is advanced by this call.</param>
        /// <returns>A new <see cref="TextBlockItem"/> whose <c>Text</c> is the collected text.</returns>
        private static TextBlockItem ExtractParaphraph(ResultIterator iter)
        {
            var buffer = new StringBuilder();

            do
            {
                // Inner loop: one pass per word of the current text line.
                do
                {
                    buffer.Append(iter.GetText(PageIteratorLevel.Word)).Append(' ');

                    // A line break follows the last word of each line.
                    if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                    {
                        buffer.AppendLine();
                    }
                }
                while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                // An extra line break follows the last line of the paragraph.
                if (iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                {
                    buffer.AppendLine();
                }
            }
            while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));

            var item = new TextBlockItem();
            item.Text = buffer.ToString();
            return item;
        }
예제 #2
0
        /// <summary>
        /// Runs OCR over an in-memory image and returns the recognized words, with a
        /// <c>&lt;BLOCK&gt;</c> marker line written whenever the iterator is at the start of a block.
        /// </summary>
        /// <param name="imageBytes">Encoded image data handed to <c>Pix.LoadFromMemory</c>.</param>
        /// <returns>
        /// The accumulated text. If an exception is thrown it is reported through
        /// <c>Trace.TraceError</c> and whatever had been accumulated so far is returned.
        /// </returns>
        public static string RecognizeBlocks(byte[] imageBytes)
        {
            var output = new StringBuilder();

            try
            {
                using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
                using (Pix pix = Pix.LoadFromMemory(imageBytes))
                using (var page = engine.Process(pix))
                {
                    output.AppendLine("Text (iterator):");

                    using (ResultIterator cursor = page.GetIterator())
                    {
                        cursor.Begin();

                        // Loop nesting, outermost to innermost: block, paragraph, text line, word.
                        do
                        {
                            do
                            {
                                do
                                {
                                    do
                                    {
                                        if (cursor.IsAtBeginningOf(PageIteratorLevel.Block))
                                        {
                                            output.AppendLine("<BLOCK>");
                                        }

                                        output.Append(cursor.GetText(PageIteratorLevel.Word)).Append(" ");

                                        // End of a text line: terminate the output line.
                                        if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                        {
                                            output.AppendLine();
                                        }
                                    }
                                    while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                                    // End of a paragraph: add a blank separator line.
                                    if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                                    {
                                        output.AppendLine();
                                    }
                                }
                                while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                            }
                            while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                        }
                        while (cursor.Next(PageIteratorLevel.Block));
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: log the failure and fall through to return the partial text.
                Trace.TraceError(e.ToString());
            }

            return output.ToString();
        }
예제 #3
0
        /// <summary>
        /// Runs OCR on <paramref name="currentImage"/> and returns the bounding rectangle of every
        /// recognized symbol that consists solely of letters.
        /// </summary>
        /// <param name="currentImage">Bitmap to recognize.</param>
        /// <returns>One rectangle per letter-only symbol; empty when nothing qualifies.</returns>
        /// <remarks>
        /// The engine, page and iterator are released in reverse order of creation, including when
        /// recognition throws. Iteration stops when the iterator reports no further symbol.
        /// </remarks>
        public List <Rectangle> GetTextRects(Bitmap currentImage)
        {
            // Language data is expected in a "tessdata" folder next to the executing assembly.
            string           tessPath     = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location) + "\\tessdata";
            List <Rectangle> currentRects = new List <Rectangle>();

            using (TesseractEngine tess = new TesseractEngine(tessPath, "eng"))
            using (Page newPage = tess.Process(currentImage, PageSegMode.AutoOsd))
            using (ResultIterator iterator = newPage.GetIterator())
            {
                do
                {
                    string symbolText = iterator.GetText(PageIteratorLevel.Symbol);
                    bool   hasText    = symbolText != null && symbolText != "";

                    Rect foundRect;
                    bool gotBoundingBox = iterator.TryGetBoundingBox(PageIteratorLevel.Symbol, out foundRect);

                    // Keep only symbols made up entirely of letters that also have a usable box.
                    if (hasText && gotBoundingBox && symbolText.All(c => char.IsLetter(c)))
                    {
                        currentRects.Add(new Rectangle(foundRect.X1,
                                                       foundRect.Y1,
                                                       foundRect.X2 - foundRect.X1,
                                                       foundRect.Y2 - foundRect.Y1));
                    }
                }
                while (iterator.Next(PageIteratorLevel.Symbol));
            }

            return currentRects;
        }
        /// <summary>
        /// Renders a PDF page to an image file, runs OCR on it and lazily yields one
        /// <see cref="RecognizedTextChunk"/> per word for which a bounding box is available.
        /// </summary>
        /// <param name="page">PDF page to rasterize.</param>
        /// <param name="engine">OCR engine used to process the rendered image.</param>
        /// <param name="resolution">Value applied to both the horizontal and vertical render resolution.</param>
        /// <param name="tempFileName">Path the rendered image is written to and read back from.</param>
        /// <returns>A deferred sequence of recognized words with their bounds and confidence.</returns>
        private static IEnumerable <RecognizedTextChunk> recognizeWords(PdfPage page, TesseractEngine engine,
                                                                        int resolution, string tempFileName)
        {
            // Rasterize the page onto a white background at the requested resolution.
            PdfDrawOptions drawOptions = PdfDrawOptions.Create();
            drawOptions.BackgroundColor      = new PdfRgbColor(255, 255, 255);
            drawOptions.HorizontalResolution = resolution;
            drawOptions.VerticalResolution   = resolution;
            page.Save(tempFileName, drawOptions);

            using (var image = Pix.LoadFromFile(tempFileName))
            using (var ocrPage = engine.Process(image))
            using (ResultIterator words = ocrPage.GetIterator())
            {
                words.Begin();
                do
                {
                    // Words without a bounding box are skipped.
                    if (!words.TryGetBoundingBox(PageIteratorLevel.Word, out Rect box))
                    {
                        continue;
                    }

                    string wordText       = words.GetText(PageIteratorLevel.Word);
                    float  wordConfidence = words.GetConfidence(PageIteratorLevel.Word);

                    yield return new RecognizedTextChunk(wordText, box, wordConfidence);
                }
                while (words.Next(PageIteratorLevel.Word));
            }
        }
예제 #5
0
 /// <summary>
 /// Lazily yields one <see cref="TextBlockItem"/> per call to <c>ExtractParaphraph</c>, advancing
 /// the iterator with <c>Next(Block, Para)</c> between items until that call returns false.
 /// </summary>
 /// <param name="iter">Result iterator shared with <c>ExtractParaphraph</c>; it is advanced as items are consumed.</param>
 /// <returns>A deferred sequence containing at least one item.</returns>
 private static IEnumerable <TextBlockItem> ExtractPage(ResultIterator iter)
 {
     while (true)
     {
         yield return ExtractParaphraph(iter);

         if (!iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para))
         {
             yield break;
         }
     }
 }
예제 #6
0
        /// <summary>
        /// Walks the iterator line by line, appending a <c>TextLine</c> (with its words, bounds and
        /// per-word confidence) to <paramref name="_textLines"/> for each line, and stores the
        /// concatenated line text in the <c>text</c> member.
        /// </summary>
        /// <param name="iter">Result iterator to read from; it is advanced by this call.</param>
        /// <param name="_textLines">List that receives one entry per visited text line.</param>
        private void IterateFullPage(ResultIterator iter, ref List <TextLine> _textLines)
        {
            const PageIteratorLevel LineLevel = PageIteratorLevel.RIL_TEXTLINE;
            const PageIteratorLevel WordLevel = PageIteratorLevel.RIL_WORD;

            int x1, y1, x2, y2;
            StringBuilder pageText = new StringBuilder();

            do
            {
                string lineText = iter.GetUTF8Text(LineLevel);
                pageText.Append(lineText);

                iter.BoundingBox(LineLevel, out x1, out y1, out x2, out y2);

                TextLine line = new TextLine();
                line.Bounds = new Rectangle(x1, y1, x2 - x1, y2 - y1);
                line.Text   = lineText ?? string.Empty;
                line.Words  = new List <Word>();

                // Collect the words of this line; stop on the line's last word or when the
                // iterator cannot advance any further.
                while (true)
                {
                    iter.BoundingBox(WordLevel, out x1, out y1, out x2, out y2);

                    Word word = new Word();
                    word.Text       = iter.GetUTF8Text(WordLevel);
                    word.Confidence = iter.Confidence(WordLevel);
                    word.Bounds     = new Rectangle(x1, y1, x2 - x1, y2 - y1);
                    line.Words.Add(word);

                    if (iter.IsAtFinalElement(LineLevel, WordLevel) || !iter.Next(WordLevel))
                    {
                        break;
                    }
                }

                pageText.AppendLine();
                _textLines.Add(line);
            }
            while (iter.Next(LineLevel));

            text = pageText.ToString();
        }
예제 #7
0
        // Recognizes the text inside a fixed-size rectangle whose top-left corner is
        // (partX, partY) and maps it to an item name via FindClosestWord.
        // NOTE(review): when the recognized text is within 4 edits of "Blueprint" this method
        // calls itself with Y = 550; nothing visible here bounds that recursion - confirm.
        private string GetText(TessBaseAPI tessBaseAPI, int partX, int partY)
        {
            // Region size is tuned for 1440p with full HUD scaling; other resolutions,
            // HUD scales or UI layouts are not supported yet.
            // TODO: Dynamic scaling
            const int RegionWidth  = 311;
            const int RegionHeight = 33;

            // Restrict recognition to the region of interest, then run it.
            tessBaseAPI.SetRectangle(partX, partY, RegionWidth, RegionHeight);
            tessBaseAPI.Recognize();

            // Concatenate the text of every paragraph the iterator visits.
            ResultIterator paragraphs = tessBaseAPI.GetIterator();
            StringBuilder  recognized = new StringBuilder();

            do
            {
                recognized.Append(paragraphs.GetUTF8Text(PageIteratorLevel.RIL_PARA));
            }
            while (paragraphs.Next(PageIteratorLevel.RIL_PARA));

            // Line feeds are stripped before the text is matched against item names.
            string guess = recognized.Replace("\n", String.Empty).ToString();

            // Levenshtein distance is used because detecting the "Blueprint" word matters here.
            Levenshtein blueprintMatcher = new Fastenshtein.Levenshtein("Blueprint");
            int         levBPDistance    = blueprintMatcher.DistanceFrom(guess);

            Debug.WriteLine("Distance from Blueprint: " + levBPDistance);

            // Up to 4 edits away counts as "Blueprint": the part name spans two lines,
            // so read again at the adjusted vertical offset.
            if (levBPDistance < 5)
            {
                guess = GetText(tessBaseAPI, partX, 550);
            }

            // An exact "Blueprint" or anything containing "Forma" is returned unchanged.
            if (guess == "Blueprint" || guess.Contains("Forma"))
            {
                return guess;
            }

            // Otherwise snap the guess to the closest known item name.
            Debug.Write("");
            Debug.Write("Old: " + guess);

            guess = FindClosestWord(guess);

            Debug.WriteLine(" | New: " + guess);

            return guess;
        }
예제 #8
0
        /// <summary>
        /// Recognizes <paramref name="pageData"/> and adds one <c>TextArea</c> per valid word to
        /// <paramref name="page"/>, including a bounding rectangle for each symbol of the word.
        /// </summary>
        /// <param name="engine">OCR engine used to process the image.</param>
        /// <param name="pageData">Image to recognize.</param>
        /// <param name="language">Not read by this method.</param>
        /// <param name="page">Document page whose <c>TextAreas</c> collection receives the results.</param>
        /// <remarks>
        /// The result iterator is scoped inside the page's <c>using</c> so that it is disposed
        /// before the page that produced it, on both the normal and the exceptional path.
        /// </remarks>
        private void GetPageData(TesseractEngine engine, Pix pageData, string language, ATAPY.Document.Data.Core.Page page)
        {
            using (var tessPage = engine.Process(pageData))
            {
                tessPage.Recognize();

                using (ResultIterator resultIterator = tessPage.GetIterator())
                {
                    resultIterator.Begin();

                    do
                    {
                        var text = resultIterator.GetText(PageIteratorLevel.Word);
                        if (TextIsValid(text) && resultIterator.TryGetBoundingBox(PageIteratorLevel.Word, out var rect))
                        {
                            var rectW = GetRect(rect);
                            var area  = new TextArea(rectW, text, page);
                            page.TextAreas.Add(area);

                            // One slot per character of the word; slots whose symbol has no
                            // bounding box keep their default value.
                            var chars    = new System.Windows.Rect[text.Length];
                            int charIter = 0;
                            do
                            {
                                if (resultIterator.TryGetBoundingBox(PageIteratorLevel.Symbol, out var sRect))
                                {
                                    chars[charIter] = GetRect(sRect);
                                }
                                charIter++;
                            } while (resultIterator.Next(PageIteratorLevel.Word, PageIteratorLevel.Symbol));

                            area.SetCharProperties(chars);
                        }
                    } while (resultIterator.Next(PageIteratorLevel.Word));
                }
            }
        }
예제 #9
0
    /// <summary>
    /// Reads a single line of upper-case letters and digits from <paramref name="imagem"/>.
    /// Symbols are grouped into horizontal slots 28 pixels apart; each slot's candidates are
    /// passed to <c>Resolver</c>, and a result shorter than 6 characters is padded with
    /// <c>LetraAleatoria</c>.
    /// </summary>
    /// <param name="imagem">Image to recognize.</param>
    /// <param name="linguagem">Language name passed to the Tesseract engine.</param>
    /// <returns>The assembled text.</returns>
    public static string OCR(Bitmap imagem, string linguagem)
    {
        string result = "";

        using (TesseractEngine engine = new TesseractEngine(@"C:\GitHub\operacao-politica-supervisionada\OPS\temp\", linguagem, EngineMode.Default))
        {
            engine.SetVariable("tessedit_char_whitelist", "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ");
            engine.SetVariable("tessedit_unrej_any_wd", true);
            engine.SetVariable("applybox_learn_chars_and_char_frags_mode", true);
            engine.SetVariable("save_blob_choices", true);

            string pending  = "";   // candidate symbols collected for the current slot
            int    slotEdge = 12;   // X reference of the current slot

            using (Page page = engine.Process(imagem, PageSegMode.SingleLine))
            using (ResultIterator ri = page.GetIterator())
            {
                do
                {
                    string         symbol = ri.GetText(PageIteratorLevel.Symbol);
                    Tesseract.Rect box;

                    if (!ri.TryGetBoundingBox(PageIteratorLevel.Symbol, out box))
                    {
                        continue;
                    }

                    // Ignore boxes that are too small and symbols that are blank.
                    if ((box.Width <= 13) || (box.Height <= 15) || (symbol.Trim() == ""))
                    {
                        continue;
                    }

                    // The symbol starts beyond the current slot: flush and advance slot by slot.
                    while (box.X1 > slotEdge + 14)
                    {
                        result  += Resolver(pending);
                        pending  = "";
                        slotEdge += 28;
                    }

                    // A "Q" taller than 30 pixels is recorded as "O".
                    bool tallQ = (symbol == "Q") && (box.Height > 30);
                    pending += tallQ ? "O" : symbol;
                }
                while (ri.Next(PageIteratorLevel.Symbol));

                if (result.Length < 6)
                {
                    // Flush the last slot, then pad up to 6 characters.
                    result += Resolver(pending);
                    while (result.Length < 6)
                    {
                        result += LetraAleatoria();
                    }
                }
            }
        }

        return result;
    }
예제 #10
0
        /// <summary>
        /// Writes the recognized words of <paramref name="page"/> to the console, printing a
        /// <c>&lt;BLOCK&gt;</c> marker line whenever the iterator is at the start of a block.
        /// </summary>
        /// <param name="page">Recognized page to iterate.</param>
        private static void IterateBlocks(Page page)
        {
            Console.WriteLine("Text (iterator):");

            using (ResultIterator cursor = page.GetIterator())
            {
                cursor.Begin();

                // Loop nesting, outermost to innermost: block, paragraph, text line, word.
                do
                {
                    do
                    {
                        do
                        {
                            do
                            {
                                if (cursor.IsAtBeginningOf(PageIteratorLevel.Block))
                                {
                                    Console.WriteLine("<BLOCK>");
                                }

                                string word = cursor.GetText(PageIteratorLevel.Word);
                                Console.Write(word);
                                Console.Write(" ");

                                // End of a text line: terminate the console line.
                                if (cursor.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    Console.WriteLine();
                                }
                            }
                            while (cursor.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                            // End of a paragraph: emit a blank separator line.
                            if (cursor.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                            {
                                Console.WriteLine();
                            }
                        }
                        while (cursor.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                    }
                    while (cursor.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                }
                while (cursor.Next(PageIteratorLevel.Block));
            }
        }
예제 #11
0
        /// <summary>
        /// Runs Tesseract over <c>./_tmp.bmp</c> using the "jpn" language data in
        /// <c>./tessdata/</c> and returns the text of each paragraph the iterator visits, with
        /// line feeds converted to CR LF.
        /// </summary>
        /// <returns>One string per paragraph for which text was returned.</returns>
        /// <exception cref="InvalidOperationException">Tesseract could not be initialized.</exception>
        /// <remarks>
        /// The API object and the input image are released in a <c>finally</c> block so they do
        /// not leak when initialization or recognition fails.
        /// </remarks>
        private List <string> run_tessract()
        {
            const string dataPath  = "./tessdata/";
            const string language  = "jpn";
            const string inputFile = "./_tmp.bmp";

            List <string> re          = new List <string>();
            TessBaseAPI   tessBaseAPI = new TessBaseAPI();
            Pix           pix         = null;

            try
            {
                // Initialize tesseract-ocr
                if (!tessBaseAPI.Init(dataPath, language, OcrEngineMode.DEFAULT))
                {
                    throw new InvalidOperationException("Could not initialize tesseract.");
                }

                // Set the Page Segmentation mode
                tessBaseAPI.SetPageSegMode(PageSegmentationMode.AUTO_OSD);

                // Set the input image
                pix = tessBaseAPI.SetImage(inputFile);

                tessBaseAPI.SetVariable("number", "1234567890");

                // Recognize image
                tessBaseAPI.Recognize();

                ResultIterator    resultIterator    = tessBaseAPI.GetIterator();
                PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;

                // Collect the text of each paragraph, skipping paragraphs without text.
                do
                {
                    string str = resultIterator.GetUTF8Text(pageIteratorLevel);

                    if (str != null)
                    {
                        re.Add(Regex.Replace(str, @"\n", "\r\n"));
                    }
                } while (resultIterator.Next(pageIteratorLevel));
            }
            finally
            {
                tessBaseAPI.Dispose();
                if (pix != null)
                {
                    pix.Dispose();
                }
            }

            return re;
        }
예제 #12
0
        /// <summary>
        /// Runs Tesseract over <c>./imageSaved.bmp</c> and stores the recognized words, separated
        /// by single spaces and trimmed, in <c>textParsed</c>. Any exception is shown to the user
        /// in a message box instead of being propagated.
        /// </summary>
        /// <remarks>
        /// The API object is released in a <c>finally</c> block so it does not leak when
        /// initialization or recognition fails.
        /// </remarks>
        private void ConvertImageToText()
        {
            try
            {
                const string         dataPath  = "./tessdata";
                const string         language  = "eng";
                const string         inputFile = "./imageSaved.bmp";
                OcrEngineMode        oem       = OcrEngineMode.DEFAULT;
                PageSegmentationMode psm       = PageSegmentationMode.AUTO_OSD;

                string      parsed;
                TessBaseAPI tessBaseAPI = new TessBaseAPI();

                try
                {
                    if (!tessBaseAPI.Init(dataPath, language, oem))
                    {
                        throw new InvalidOperationException("Could not initialize tesseract.");
                    }

                    // Set the Page Segmentation mode
                    tessBaseAPI.SetPageSegMode(psm);

                    // Set the input image
                    tessBaseAPI.SetImage(inputFile);

                    // Recognize image
                    tessBaseAPI.Recognize();

                    ResultIterator resultIterator = tessBaseAPI.GetIterator();

                    // Extract text word by word, separating words with a space.
                    StringBuilder     stringBuilder     = new StringBuilder();
                    PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_WORD;

                    do
                    {
                        stringBuilder.Append(resultIterator.GetUTF8Text(pageIteratorLevel) + " ");
                    } while (resultIterator.Next(pageIteratorLevel));

                    parsed = stringBuilder.ToString().Trim();
                }
                finally
                {
                    tessBaseAPI.Dispose();
                }

                // Published only after the engine has been released, as before.
                textParsed = parsed;
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
        }
예제 #13
0
        /// <summary>
        /// Example: recognizes <c>./input.png</c> with the "chi_sim" language data in
        /// <c>./tessdata/</c> and concatenates the text of each paragraph the iterator visits.
        /// The concatenated text is not returned or stored.
        /// </summary>
        /// <exception cref="InvalidOperationException">Tesseract could not be initialized.</exception>
        /// <remarks>
        /// The API object and the input image are released in a <c>finally</c> block so they do
        /// not leak when initialization or recognition fails.
        /// </remarks>
        static void example3()
        {
            const string         dataPath  = "./tessdata/";
            const string         language  = "chi_sim";
            const string         inputFile = "./input.png";
            OcrEngineMode        oem       = OcrEngineMode.DEFAULT;
            PageSegmentationMode psm       = PageSegmentationMode.AUTO_OSD;

            TessBaseAPI tessBaseAPI = new TessBaseAPI();
            Pix         pix         = null;

            try
            {
                // Initialize tesseract-ocr
                if (!tessBaseAPI.Init(dataPath, language, oem))
                {
                    throw new InvalidOperationException("Could not initialize tesseract.");
                }

                // Set the Page Segmentation mode
                tessBaseAPI.SetPageSegMode(psm);

                // Set the input image
                pix = tessBaseAPI.SetImage(inputFile);

                // Recognize image
                tessBaseAPI.Recognize();

                ResultIterator resultIterator = tessBaseAPI.GetIterator();

                // Extract text from result iterator, one paragraph at a time.
                StringBuilder     stringBuilder     = new StringBuilder();
                PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;

                do
                {
                    stringBuilder.Append(resultIterator.GetUTF8Text(pageIteratorLevel));
                } while (resultIterator.Next(pageIteratorLevel));
            }
            finally
            {
                tessBaseAPI.Dispose();
                if (pix != null)
                {
                    pix.Dispose();
                }
            }
        }
예제 #14
0
        /// <summary>
        /// Runs OCR on the image carried by <paramref name="message"/>. When at least 7 non-empty
        /// paragraphs are recognized, the text is stored on the message and a
        /// <c>TextArtifact</c> is enqueued; otherwise the outcome is only logged.
        /// </summary>
        /// <param name="message">Image artifact to process.</param>
        /// <returns>Always <c>ApiResult.Success</c>.</returns>
        /// <remarks>
        /// The bitmap is unlocked in a <c>finally</c> block so it does not stay locked when OCR throws.
        /// </remarks>
        protected override ApiResult ProcessClientQueueMessage(ImageArtifact message)
        {
            BitmapData bData = message.Image.LockBits(
                new Rectangle(0, 0, message.Image.Width, message.Image.Height), ImageLockMode.ReadOnly, message.Image.PixelFormat);
            List <string> text = new List <string>();

            try
            {
                int w = bData.Width, h = bData.Height, bpp = Image.GetPixelFormatSize(bData.PixelFormat) / 8;

                // Hand the locked pixel buffer to Tesseract.
                unsafe
                {
                    TesseractImage.SetImage(new UIntPtr(bData.Scan0.ToPointer()), w, h, bpp, bData.Stride);
                }
                Pix = TesseractImage.GetInputImage();

                Debug("Pix has width: {0} height: {1} depth: {2} xres: {3} yres: {4}.", Pix.Width, Pix.Height, Pix.Depth,
                      Pix.XRes, Pix.YRes);

                using (var op = Begin("Tesseract OCR (fast)"))
                {
                    TesseractImage.Recognize();
                    ResultIterator    resultIterator    = TesseractImage.GetIterator();
                    PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;

                    // Collect the trimmed text of every non-empty paragraph.
                    do
                    {
                        string r = resultIterator.GetUTF8Text(pageIteratorLevel);
                        if (!r.IsEmpty())
                        {
                            text.Add(r.Trim());
                        }
                    }while (resultIterator.Next(pageIteratorLevel));

                    if (text.Count == 0)
                    {
                        Info("No text recognized in artifact id {0}.", message.Id);
                    }
                    else if (text.Count < 7)
                    {
                        // Fewer than 7 paragraphs is treated as "not a text image".
                        Info("Artifact id {0} is likely a photo or non-text image.", message.Id);
                    }
                    else
                    {
                        string alltext = text.Aggregate((s1, s2) => s1 + " " + s2).Trim();
                        message.OCRText = text;
                        Info("OCR Text: {0}", alltext);
                    }
                    op.Complete();
                }
            }
            finally
            {
                message.Image.UnlockBits(bData);
            }

            if (text.Count >= 7)
            {
                TextArtifact artifact = new TextArtifact(message.Name + ".txt", text);
                EnqueueMessage(artifact);
                Info("{0} added artifact id {1} of type {2} from artifact {3}.", Name, artifact.Id, artifact.GetType(),
                     message.Id);
            }

            return(ApiResult.Success);
        }
        /// <summary>
        /// Saves a browser screenshot under the project's <c>Screenshots</c> folder, crops it to
        /// <paramref name="element"/> when one is given, then runs Tesseract on a fixed image
        /// path, writes the thresholded image out, and waits for console input.
        /// </summary>
        /// <param name="element">Element whose location and size define the crop area; may be null, in which case no crop is made.</param>
        /// <param name="uniqueName">File name (without extension) used for the saved screenshot.</param>
        /// <remarks>
        /// NOTE(review): OCR reads the hard-coded <c>imgPath</c>, not the screenshot or the crop,
        /// and the recognized text is not returned - confirm the intended behaviour.
        /// All GDI+ objects and the Tesseract API are released even when an exception is thrown.
        /// </remarks>
        public void /*Image*/ GetTextFromImage(IWebElement element, string uniqueName)
        {
            Screenshot screenshot = ((ITakesScreenshot)Driver).GetScreenshot();

            // Build <project>/Screenshots/<uniqueName>.jpeg from the calling assembly's location.
            string pth       = Assembly.GetCallingAssembly().CodeBase;
            string finalpth  = pth.Substring(0, pth.LastIndexOf("bin")) + "Screenshots/" + uniqueName + ".jpeg";
            string localpath = new Uri(finalpth).LocalPath;

            screenshot.SaveAsFile(localpath, ScreenshotImageFormat.Jpeg);

            // Image.FromFile keeps the file locked until the image is disposed.
            using (Image img = Image.FromFile(localpath /*uniqueName*/))
            using (Bitmap bmpImage = new Bitmap(img))
            {
                // Cropping with an empty rectangle throws, so only crop when an element is given.
                if (element != null)
                {
                    Point     p    = element.Location;
                    Rectangle rect = new Rectangle(p.X, p.Y, element.Size.Width, element.Size.Height);

                    using (Bitmap cropedImag = bmpImage.Clone(rect, bmpImage.PixelFormat))
                    {
                        // The cropped image is not consumed by the OCR step below.
                    }
                }
            }

            string dataPath = @"C:\Betsold\AutomationTesting\Tests\testdata\";
            string language = "eng";
            string imgPath  = @"C:\Betsold\AutomationTesting\Tests\Screenshots\logo-test.jpeg";

            OcrEngineMode        oem = OcrEngineMode.LSTM_ONLY;
            PageSegmentationMode psm = PageSegmentationMode.AUTO;

            TessBaseAPI tessBaseAPI = new TessBaseAPI(dataPath, language, oem, psm);

            try
            {
                // Set the input image
                tessBaseAPI.SetImage(imgPath);

                var processedImage = tessBaseAPI.GetThresholdedImage();

                processedImage.Write(@"C:\Users\ibozhinovski\Desktop\", ImageFileFormatTypes.IFF_JFIF_JPEG);

                // Recognize image
                tessBaseAPI.Recognize();

                ResultIterator resultIterator = tessBaseAPI.GetIterator();

                // Extract text from result iterator, one paragraph at a time.
                StringBuilder     text = new StringBuilder();
                PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;

                do
                {
                    text.Append(resultIterator.GetUTF8Text(pageIteratorLevel));
                } while (resultIterator.Next(pageIteratorLevel));
            }
            finally
            {
                tessBaseAPI.Dispose();
            }

            Console.Read();
        }
예제 #16
0
        /*
         * int LookingTextMarker(RecognazeRule Rule, Page page, out ResultIterator BestLineIter, out int word)
         * {
         *      word = -1;
         *      BestLineIter = null;
         *      int BestDistance = 10000;
         *
         *      ResultIterator LineIter = page.GetIterator();
         *      string[] Words = Rule.TextMarker.Split(new char[] {' '}, StringSplitOptions.RemoveEmptyEntries);
         *      int NumberOfWords = Words.Length;
         *      LineIter.Begin();
         *      do
         *      {
         *              int CurrentWordNumber = -1;
         *              int CurrentBestDistance = 10000;
         *              string Line = LineIter.GetText(PageIteratorLevel.TextLine);
         *              if(Line == null)
         *                      continue;
         *              string[] WordsOfLine = Line.Split(new char[] {' '}, StringSplitOptions.None);
         *              if(WordsOfLine.Length < NumberOfWords)
         *                      continue;
         *
         *              for(int shift = 0; shift <= WordsOfLine.Length - NumberOfWords; shift++)
         *              {
         *                      int PassDistance = 0;
         *                      for(int i = 0; i < NumberOfWords; i++)
         *                      {
         *                              PassDistance += FuzzyStringComparer.GetDistanceLevenshtein(WordsOfLine[shift + i],
         *                                                                                            Words[i],
         *                                                                                            StringComparison.CurrentCultureIgnoreCase);
         *                      }
         *                      if(PassDistance < CurrentBestDistance)
         *                      {
         *                              CurrentBestDistance = PassDistance;
         *                              CurrentWordNumber = shift + 1;
         *                      }
         *              }
         *              if(CurrentBestDistance < BestDistance)
         *              {
         *                      AddToLog ("new best");
         *                      AddToLog (LineIter.GetText(PageIteratorLevel.Word));
         *                      word = CurrentWordNumber;
         *                      if(BestLineIter != null)
         *                              BestLineIter.Dispose();
         *                      BestLineIter = LineIter.Clone();
         *                      AddToLog (BestLineIter.GetText(PageIteratorLevel.TextLine));
         *                      BestDistance = CurrentBestDistance;
         *              }
         *      } while( LineIter.Next(PageIteratorLevel.TextLine));
         *      LineIter.Dispose();
         *      return BestDistance;
         * } */

        /// <summary>
        /// Walks the recognized text of <paramref name="page"/> line by line and finds the run of
        /// consecutive words whose case-insensitive Levenshtein distance to <paramref name="Text"/>
        /// is the smallest. For the best match it reports the position and baseline angle of the
        /// first matched word and fills the "after marker" rules from the words that follow.
        /// </summary>
        /// <param name="Text">Marker text to look for. Its word count (split on spaces) caps the length of the word runs that are tried.</param>
        /// <param name="page">Recognized page whose result iterator is walked.</param>
        /// <param name="PosX">X1 of the bounding box of the first word of the best match; -1 if no candidate was found.</param>
        /// <param name="PosY">Y1 of the bounding box of the first word of the best match; -1 if no candidate was found.</param>
        /// <param name="AngleRad">Slope of the baseline of the first matched word, in radians; 0 if no candidate was found.</param>
        /// <param name="AfterMarkerRules">
        /// Rules to populate. Every rule with <c>NextAfterTextMarker</c> set receives the word located
        /// <c>ShiftWordsCount</c> words after the match (<c>AfterTextMarkerValue</c>) and the recognition
        /// confidence of that word (<c>AfterTextMarkerConfidence</c>) when the line is long enough.
        /// </param>
        /// <returns>The smallest distance found, or 10000 when no line produced a candidate.</returns>
        int GetTextPosition(string Text, Page page, out int PosX, out int PosY, out double AngleRad, RecognazeRule[] AfterMarkerRules)
        {
            int BestDistance = 10000;

            PosX     = -1;
            PosY     = -1;
            AngleRad = 0;
            logger.Debug("Marker zone text:{0}", page.GetText());

            string[] Words         = Text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            int      NumberOfWords = Words.Length;

            // The iterator wraps a native handle; 'using' guarantees it is released even when
            // something inside the loop throws.
            using (ResultIterator LineIter = page.GetIterator())
            {
                LineIter.Begin();
                do
                {
                    int    CurrentWordNumber   = -1;
                    int    CurrentAfterWord    = 0;
                    int    CurrentBestDistance = 10000;
                    string Line = LineIter.GetText(PageIteratorLevel.TextLine);

                    // 'continue' in a do/while jumps to the loop condition, so the iterator still advances.
                    if (string.IsNullOrEmpty(Line))
                    {
                        continue;
                    }
                    Line = Line.Trim();
                    string[] WordsOfLine = Line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    if (WordsOfLine.Length == 0)
                    {
                        continue;
                    }

                    // Try every run of 1..NumberOfWords consecutive words starting at every offset of the line.
                    for (int shift = 0; shift < WordsOfLine.Length; shift++)
                    {
                        for (int i = 1; i <= NumberOfWords && i <= WordsOfLine.Length - shift; i++)
                        {
                            string passString = String.Join(" ", WordsOfLine, shift, i);

                            int PassDistance = FuzzyStringComparer.GetDistanceLevenshtein(passString,
                                                                                          Text,
                                                                                          StringComparison.CurrentCultureIgnoreCase);
                            if (PassDistance < CurrentBestDistance)
                            {
                                CurrentBestDistance = PassDistance;
                                CurrentWordNumber   = shift;       // index of the first matched word
                                CurrentAfterWord    = shift + i;   // index of the first word after the match
                            }
                        }
                    }

                    if (CurrentBestDistance < BestDistance)
                    {
                        logger.Debug("new best");
                        logger.Debug(Line);

                        // Fill the rule fields with the words that follow the marker.
                        // NOTE(review): a rule keeps the value taken from an earlier, worse line when the
                        // new best line is too short for it - confirm this is intended.
                        foreach (RecognazeRule rule in AfterMarkerRules)
                        {
                            if (rule.NextAfterTextMarker && WordsOfLine.Length > CurrentAfterWord + rule.ShiftWordsCount)
                            {
                                rule.AfterTextMarkerValue = WordsOfLine[CurrentAfterWord + rule.ShiftWordsCount];
                            }
                        }

                        BestDistance = CurrentBestDistance;

                        // Move the iterator from the start of the line to the first matched word.
                        for (int i = 0; i < CurrentWordNumber; i++)
                        {
                            LineIter.Next(PageIteratorLevel.Word);
                        }
                        Rect Box;
                        LineIter.TryGetBoundingBox(PageIteratorLevel.Word, out Box);
                        PosX = Box.X1;
                        PosY = Box.Y1;
                        logger.Debug("Position X1:{0} Y1:{1} X2:{2} Y2:{3}", Box.X1, Box.Y1, Box.X2, Box.Y2);
                        LineIter.TryGetBaseline(PageIteratorLevel.Word, out Box);
                        logger.Debug("BaseLine X1:{0} Y1:{1} X2:{2} Y2:{3}", Box.X1, Box.Y1, Box.X2, Box.Y2);
                        AngleRad = Math.Atan2(Box.Y2 - Box.Y1, Box.X2 - Box.X1); // slope angle of the baseline
                        double AngleGrad = AngleRad * (180 / Math.PI);
                        logger.Debug("Angle rad:{0} grad:{1}", AngleRad, AngleGrad);

                        // Read the recognition confidence of the words that follow the marker.
                        // The iterator sits on the first matched word, i.e. (CurrentAfterWord - CurrentWordNumber)
                        // words before shift 0, hence the negative starting offset.
                        int  iterAlreadyShifted = CurrentWordNumber - CurrentAfterWord;
                        bool stopIteration      = false;
                        foreach (RecognazeRule rule in AfterMarkerRules.Where(x => x.NextAfterTextMarker).OrderBy(x => x.ShiftWordsCount))
                        {
                            while (iterAlreadyShifted < rule.ShiftWordsCount)
                            {
                                // Never step past the last word of the current line.
                                if (LineIter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    stopIteration = true;
                                    break;
                                }
                                LineIter.Next(PageIteratorLevel.Word);
                                iterAlreadyShifted++;
                            }
                            if (stopIteration)
                            {
                                break;
                            }
                            rule.AfterTextMarkerConfidence = LineIter.GetConfidence(PageIteratorLevel.Word);
                            logger.Debug("Cлово {0} со сдвигом {1} имеет точность {2}.", LineIter.GetText(PageIteratorLevel.Word), rule.ShiftWordsCount, rule.AfterTextMarkerConfidence);
                        }
                    }
                }while(LineIter.Next(PageIteratorLevel.TextLine));
            }
            return(BestDistance);
        }
예제 #17
0
        /// <summary>
        /// Runs Tesseract OCR over the bitmap carried by <paramref name="message"/>, keeps the
        /// filtered text of each paragraph-level result, and, when at least seven non-empty
        /// fragments were recognized, attaches the text to the message and enqueues a new
        /// <c>TextArtifact</c> built from it.
        /// </summary>
        /// <param name="message">Image artifact whose <c>Image</c> is recognized.</param>
        /// <returns>Always <c>ApiResult.Success</c> when no exception is thrown.</returns>
        protected override ApiResult ProcessClientQueueMessage(ImageArtifact message)
        {
            // Fewer recognized fragments than this and the image is treated as a photo / non-text image.
            const int minTextFragments = 7;

            List <string> text = new List <string>();

            BitmapData bData = message.Image.LockBits(
                new Rectangle(0, 0, message.Image.Width, message.Image.Height), ImageLockMode.ReadOnly, message.Image.PixelFormat);

            try
            {
                int w = bData.Width, h = bData.Height, bpp = Image.GetPixelFormatSize(bData.PixelFormat) / 8;

                unsafe
                {
                    TesseractImage.SetImage(new UIntPtr(bData.Scan0.ToPointer()), w, h, bpp, bData.Stride);
                }
                Pix = TesseractImage.GetInputImage();

                Debug("Pix has width: {0} height: {1} depth: {2} xres: {3} yres: {4}.", Pix.Width, Pix.Height, Pix.Depth,
                      Pix.XRes, Pix.YRes);

                using (var op = Begin("Tesseract OCR (fast)"))
                {
                    TesseractImage.Recognize();
                    ResultIterator    resultIterator    = TesseractImage.GetIterator();
                    PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;
                    do
                    {
                        string ant = TextArtifact.GetAlphaNumericString(resultIterator.GetUTF8Text(pageIteratorLevel));

                        // Keep numbers, words longer than three characters and known short common words.
                        ant = string.Join(" ",
                                          ant.Split(new[] { ' ', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
                                          .Where(word => TextArtifact.IsNumber(word) || word.Length > 3 || Pipeline.Dictionaries["common_words_en_3grams"].Contains(word)))
                              .Trim();
                        if (!ant.IsEmpty())
                        {
                            text.Add(ant);
                        }
                    }while (resultIterator.Next(pageIteratorLevel));

                    if (text.Count == 0)
                    {
                        Info("No text recognized in artifact id {0}.", message.Id);
                    }
                    else if (text.Count < minTextFragments)
                    {
                        Info("Artifact id {0} is likely a photo or non-text image.", message.Id);
                    }
                    else
                    {
                        message.OCRText = text;
                        Info("OCR Text: {0}", string.Join(" ", text.ToArray()).Trim());
                    }
                    op.Complete();
                }
            }
            finally
            {
                // Always release the bitmap lock, even when OCR throws; otherwise the image stays locked.
                message.Image.UnlockBits(bData);
            }

            if (text.Count >= minTextFragments)
            {
                TextArtifact artifact = new TextArtifact(message.Name + ".txt", string.Join(Environment.NewLine, text.ToArray()));
                artifact.Source             = message.Source;
                artifact.CurrentProcess     = message.CurrentProcess;
                artifact.CurrentWindowTitle = message.CurrentWindowTitle;
                artifact.Image       = message;
                message.TextArtifact = artifact;
                EnqueueMessage(artifact);
                Info("{0} added artifact id {1} of type {2} from artifact {3}.", Name, artifact.Id, artifact.GetType(),
                     message.Id);
            }
            return(ApiResult.Success);
        }
예제 #18
0
    /// <summary>
    /// Runs single-line OCR over <paramref name="imagem"/> and emits one text line per character of
    /// <paramref name="correto"/> in the form "char x1 y1 x2 y2 page", where the rectangle is derived
    /// from the bounding boxes of the recognized symbols.
    /// </summary>
    /// <param name="imagem">Image to recognize.</param>
    /// <param name="linguagem">Language passed to the Tesseract engine.</param>
    /// <param name="correto">Expected characters; indexed 0..5 by the tail fill-in (presumably a six-character text - confirm with callers).</param>
    /// <param name="pagina">Page number written as the last field of every line.</param>
    /// <returns>The generated lines, or an empty string when no accepted symbol starts at Y1 of 40 or less.</returns>
    public static string Box(Bitmap imagem, string linguagem, string correto, int pagina)
    {
        // One output record: character, left, top, right, bottom, page.
        const string formatoLinha = "{0} {1} {2} {3} {4} {5}\n";

        System.Text.StringBuilder linhas = new System.Text.StringBuilder();
        int menorTopo = 50;

        using (TesseractEngine engine = new TesseractEngine(@"C:\GitHub\operacao-politica-supervisionada\OPS\temp\", linguagem))
        {
            engine.SetVariable("tessedit_char_whitelist", "1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ");
            engine.SetVariable("tessedit_unrej_any_wd", true);
            engine.SetVariable("applybox_learn_chars_and_char_frags_mode", true);
            engine.SetVariable("save_blob_choices", true);

            using (Page page = engine.Process(imagem, PageSegMode.SingleLine))
            {
                string acumulado = "";   // symbols merged into the current slot; only its emptiness is tested
                int    faixa     = 12;   // advances by 28 per emitted slot
                int    esquerda  = 14, topo = 0, direita = 0, fundo = 0;
                int    pos       = 0;    // index into 'correto' of the slot being built

                using (ResultIterator simbolos = page.GetIterator())
                {
                    do
                    {
                        string         simbolo = simbolos.GetText(PageIteratorLevel.Symbol);
                        Tesseract.Rect caixa;

                        if (!simbolos.TryGetBoundingBox(PageIteratorLevel.Symbol, out caixa))
                        {
                            continue;
                        }
                        // Ignore small boxes and blank symbols.
                        if (caixa.Width <= 13 || caixa.Height <= 15 || simbolo.Trim() == "")
                        {
                            continue;
                        }

                        // The symbol lies beyond the current slot: flush slots until it fits.
                        while (caixa.X1 > faixa + 14)
                        {
                            direita = Math.Max(esquerda + 15, direita);
                            linhas.AppendFormat(formatoLinha, correto[pos], esquerda, Math.Min(10, topo), direita, Math.Max(40, fundo), pagina);
                            pos++;
                            acumulado = "";
                            faixa    += 28;
                            esquerda  = Math.Max(esquerda + 28, direita);
                        }

                        menorTopo = Math.Min(menorTopo, caixa.Y1);

                        if (acumulado == "")
                        {
                            // First symbol of the slot: start a new rectangle.
                            esquerda = Math.Max(direita - 5, caixa.X1);
                            topo     = caixa.Y1;
                            direita  = caixa.X2;
                            fundo    = caixa.Y2;
                        }
                        else
                        {
                            // Further symbol in the same slot: grow the rectangle to cover it.
                            esquerda = Math.Min(esquerda, caixa.X1);
                            topo     = Math.Min(topo, caixa.Y1);
                            direita  = Math.Max(direita, caixa.X2);
                            fundo    = Math.Max(fundo, caixa.Y2);
                        }

                        // A "Q" taller than 30 is recorded as "O".
                        acumulado += (simbolo == "Q" && caixa.Height > 30) ? "O" : simbolo;
                    } while (simbolos.Next(PageIteratorLevel.Symbol));

                    int limite = imagem.Width - 6;
                    if (pos < 6)
                    {
                        linhas.AppendFormat(formatoLinha, correto[pos], esquerda, Math.Min(10, topo), direita, Math.Max(40, fundo), pagina);

                        // Spread the remaining characters evenly over the space left up to 'limite'.
                        for (int restante = pos + 1; restante <= 5; restante++)
                        {
                            esquerda = direita;
                            direita  = esquerda + (limite - esquerda) / (6 - restante);
                            linhas.AppendFormat(formatoLinha, correto[restante], esquerda, Math.Min(10, topo), direita, Math.Max(40, fundo), pagina);
                        }
                    }
                }
            }
        }

        // Discard everything when no accepted symbol started at Y1 <= 40.
        return (menorTopo > 40) ? "" : linhas.ToString();
    }
예제 #19
0
        //public static void  clearFaceFlag() { faceFlag = 0; }
        /// <summary>
        /// Loads the document with the given id, runs Tesseract OCR over its file line by line,
        /// stores the recognized lines (text plus "left,top,right,bottom" coordinates) as JSON on
        /// the document, and updates <c>FaceFlag</c> according to <c>HasFace</c> on the scaled image.
        /// </summary>
        /// <param name="id">Identifier of the document to recognize.</param>
        /// <returns>The JSON that was stored for the document.</returns>
        /// <exception cref="InvalidOperationException">Tesseract could not be initialized.</exception>
        public string RecognizeText(int id)
        {
            DBService dbs = new DBService();
            Document  doc = dbs.FindDocumentById(id);

            Image <Gray, Byte> img = scale(doc);

            // Must be set before the native API object is created.
            Tesseract.Native.DllImports.TesseractDirectory = System.Web.HttpContext.Current.Server.MapPath("~/Tesseract/bin/Debug/DLLS/");

            string dataPath  = System.Web.HttpContext.Current.Server.MapPath("~/tessdata/");
            string language  = "eng";
            string inputFile = doc.Path;

            OcrEngineMode        oem = OcrEngineMode.DEFAULT;
            PageSegmentationMode psm = PageSegmentationMode.AUTO_OSD;

            List<OCRText> forJson = new List<OCRText>();

            TessBaseAPI tessBaseAPI = new TessBaseAPI();
            Pix         pix         = null;

            try
            {
                // Format explicitly: Debug.WriteLine(string, string) treats the second argument as a
                // category, not as a format argument, so "{0}" would otherwise never be replaced.
                System.Diagnostics.Debug.WriteLine(string.Format("The current version is {0}", tessBaseAPI.GetVersion()));

                // Initialize tesseract-ocr
                if (!tessBaseAPI.Init(dataPath, language, oem))
                {
                    throw new InvalidOperationException("Could not initialize tesseract.");
                }

                // Set the Page Segmentation mode
                tessBaseAPI.SetPageSegMode(psm);

                // Set the input image
                pix = tessBaseAPI.SetImage(inputFile);

                // Recognize image
                tessBaseAPI.Recognize();

                ResultIterator    resultIterator    = tessBaseAPI.GetIterator();
                PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_TEXTLINE;

                // NOTE(review): assumes GetIterator() yields null when nothing was recognized - confirm
                // against the wrapper; the guard turns that case into an empty result instead of a crash.
                if (resultIterator != null)
                {
                    int top, bottom, left, right;

                    // Collect the text and bounding box of every recognized line.
                    do
                    {
                        string textContent = resultIterator.GetUTF8Text(pageIteratorLevel);
                        resultIterator.BoundingBox(pageIteratorLevel, out left, out top, out right, out bottom);
                        string coordsString = "" + left + "," + top + "," + right + "," + bottom;

                        forJson.Add(new OCRText()
                        {
                            Coords = coordsString, Text = textContent
                        });
                    } while (resultIterator.Next(pageIteratorLevel));
                }
            }
            finally
            {
                // Release the native resources on every path, including failures.
                tessBaseAPI.Dispose();
                if (pix != null)
                {
                    pix.Dispose();
                }
            }

            var textForReturn = JsonConvert.SerializeObject(forJson);

            dbs.UpdateDocument(textForReturn, id);

            if (HasFace(img) == true)
            {
                FaceFlag = 1;
            }
            else
            {
                FaceFlag = 0;
            }

            return(textForReturn);
        }