/// <summary>
        /// Extracts the text of a document using the requested OCR mode.
        /// </summary>
        /// <param name="inputDocument">Path of the document file that is loaded into the extractor.</param>
        /// <param name="ocrMode">OCR mode assigned to the extractor before the text is read.</param>
        /// <returns>The text returned by the extractor for the loaded document.</returns>
        private static string _ExtractTextWithSpecificOCRMode(string inputDocument, OCRMode ocrMode)
        {
            // Language code: "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc.,
            // matching the files present in the language data folder.
            // More language files are available at https://github.com/bytescout/ocrdata
            const string languageCode = "eng";

            // Folder that holds the OCR language data files
            const string languageDataDirectory = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            using (var extractor = new TextExtractor("demo", "demo"))
            {
                extractor.LoadDocumentFromFile(inputDocument);

                // Configure OCR: mode first, then language data location and language
                extractor.OCRMode               = ocrMode;
                extractor.OCRLanguageDataFolder = languageDataDirectory;
                extractor.OCRLanguage           = languageCode;

                string extractedText = extractor.GetText();
                return extractedText;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Handles an OCR upload: reads the posted "RemoteFile", runs OCR on its bytes, saves the
        /// result and writes a status string to the response.
        /// </summary>
        /// <remarks>
        /// The response is "OK;" followed by a DownLoadOCR.aspx URL on success, or "EXP;" followed by
        /// the exception message on failure. Nothing is written when no "RemoteFile" was posted.
        /// The form fields "OCRLanguage" and "FileFormat" both default to "0" when absent.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            try
            {
                HttpFileCollection files      = HttpContext.Current.Request.Files;
                HttpPostedFile     uploadfile = files["RemoteFile"];
                if (uploadfile != null)
                {
                    int    iFileLength = uploadfile.ContentLength;
                    Byte[] inputBuffer = new Byte[iFileLength];

                    // Stream.Read may return fewer bytes than requested, so keep reading until the
                    // buffer is full or the stream ends (any unread tail stays zero-filled).
                    System.IO.Stream inputStream = uploadfile.InputStream;
                    int              iTotalRead  = 0;
                    while (iTotalRead < iFileLength)
                    {
                        int iRead = inputStream.Read(inputBuffer, iTotalRead, iFileLength - iTotalRead);
                        if (iRead <= 0)
                        {
                            break;
                        }
                        iTotalRead += iRead;
                    }

                    // Missing form fields fall back to "0". Any other failure while reading the form
                    // is reported by the outer handler instead of being silently swallowed.
                    string strLanguage = HttpContext.Current.Request.Form["OCRLanguage"] ?? "0";
                    string strFormat   = HttpContext.Current.Request.Form["FileFormat"] ?? "0";

                    string strReturnValue = "";
                    try
                    {
                        // Parsed inside the try so a malformed format is reported as "EXP;..."
                        int    iFormat     = Convert.ToInt32(strFormat);
                        byte[] content     = OCRMode.OCR(inputBuffer, strLanguage, iFormat);
                        string strFileName = SaveOCRContentAsFile(content, iFormat);
                        strReturnValue = "OK;" + "DownLoadOCR.aspx?FileId=" + Server.UrlEncode(strFileName);
                    }
                    catch (Exception exp)
                    {
                        strReturnValue = "EXP;" + exp.Message;
                    }
                    Response.Write(strReturnValue);
                }
            }
            catch (Exception exp)
            {
                Response.Write("EXP;" + exp.Message + "; 0;");
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates the shared Tesseract engine for the requested mode and wires the OCRRoutine
        /// worker's event handlers.
        /// </summary>
        /// <param name="mode">
        /// NUMBERS creates a Tesseract-only engine whose character whitelist is the ten digits;
        /// COMBINED creates an engine in the combined Tesseract/Cube mode. Any other value leaves
        /// the current engine untouched.
        /// </param>
        public static void OCRSetup(OCRMode mode)
        {
            // Language data is looked up relative to the process's current directory
            string dir = Environment.CurrentDirectory + @"\Modules\OCR\tessdata";

            // NOTE(review): a previously created _ocr instance is replaced without being disposed —
            // confirm whether it needs explicit cleanup and is not in use by a running worker.
            switch (mode)
            {
            case OCRMode.NUMBERS:
                _ocr = new Tesseract(dir, "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_ONLY);
                _ocr.SetVariable("tessedit_char_whitelist", "1234567890");
                break;

            case OCRMode.COMBINED:
                _ocr = new Tesseract(dir, "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED);
                break;
            }

            // Detach before attaching: removing a handler that is not subscribed is a no-op, and it
            // keeps repeated setup calls (e.g. switching mode) from stacking duplicate handlers that
            // would each fire once per subscription.
            OCRRoutine.DoWork               -= new DoWorkEventHandler(OCRRoutine_doWork);
            OCRRoutine.ProgressChanged      -= new ProgressChangedEventHandler(OCRRoutine_ProgressChanged);
            OCRRoutine.RunWorkerCompleted   -= new RunWorkerCompletedEventHandler(OCRRoutine_WorkerCompleted);

            OCRRoutine.DoWork               += new DoWorkEventHandler(OCRRoutine_doWork);
            OCRRoutine.ProgressChanged      += new ProgressChangedEventHandler(OCRRoutine_ProgressChanged);
            OCRRoutine.RunWorkerCompleted   += new RunWorkerCompletedEventHandler(OCRRoutine_WorkerCompleted);
            OCRRoutine.WorkerReportsProgress = true;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Runs OCR on an image, either over the whole page or region by region, and outlines the
        /// recognized characters (and, in region mode, the text regions) in red.
        /// </summary>
        /// <param name="ocr">Tesseract engine used for recognition; its current image is replaced by this call.</param>
        /// <param name="image">Source image. A single-channel image is accepted as well as a multi-channel one.</param>
        /// <param name="mode">
        /// <c>OCRMode.FullPage</c> recognizes the whole image in one pass; any other value first detects
        /// text regions with the ER filters and recognizes each region separately.
        /// </param>
        /// <param name="imageColor">Destination that receives a colour copy of <paramref name="image"/> and the drawn rectangles.</param>
        /// <returns>
        /// The engine's UTF-8 text in full-page mode; otherwise the text of every region, each followed
        /// by a newline (an empty string when no region is found).
        /// </returns>
        /// <exception cref="Exception">Thrown when the engine's Recognize call returns a non-zero status.</exception>
        private static String OcrImage(Tesseract ocr, Mat image, OCRMode mode, Mat imageColor)
        {
            Bgr drawCharColor = new Bgr(Color.Red);

            // Make sure the drawing surface is a colour copy of the input
            if (image.NumberOfChannels == 1)
            {
                CvInvoke.CvtColor(image, imageColor, ColorConversion.Gray2Bgr);
            }
            else
            {
                image.CopyTo(imageColor);
            }

            if (mode == OCRMode.FullPage)
            {
                ocr.SetImage(imageColor);

                if (ocr.Recognize() != 0)
                {
                    throw new Exception("Failed to recognize image");
                }

                Tesseract.Character[] characters = ocr.GetCharacters();
                if (characters.Length == 0)
                {
                    // Nothing found on the plain image: retry on a binarized copy (threshold 65).
                    Mat imgThresholded = new Mat();
                    if (image.NumberOfChannels == 1)
                    {
                        // Already single-channel: a Bgr2Gray conversion would reject this input,
                        // so threshold it directly.
                        CvInvoke.Threshold(image, imgThresholded, 65, 255, ThresholdType.Binary);
                    }
                    else
                    {
                        // The grey intermediate is only needed to produce the thresholded image
                        using (Mat imgGrey = new Mat())
                        {
                            CvInvoke.CvtColor(image, imgGrey, ColorConversion.Bgr2Gray);
                            CvInvoke.Threshold(imgGrey, imgThresholded, 65, 255, ThresholdType.Binary);
                        }
                    }

                    // NOTE(review): Recognize() is not called again after SetImage here, unlike the
                    // other paths — confirm GetCharacters() yields results without it.
                    ocr.SetImage(imgThresholded);
                    characters = ocr.GetCharacters();

                    // NOTE(review): this rebinds the local parameter only, so the rectangles below are
                    // drawn on the thresholded image rather than on the caller's Mat — confirm intent.
                    imageColor = imgThresholded;
                    if (characters.Length == 0)
                    {
                        // Second retry with a higher threshold (190), applied to the original image
                        CvInvoke.Threshold(image, imgThresholded, 190, 255, ThresholdType.Binary);
                        ocr.SetImage(imgThresholded);
                        characters = ocr.GetCharacters();
                        imageColor = imgThresholded;
                    }
                }

                foreach (Tesseract.Character c in characters)
                {
                    CvInvoke.Rectangle(imageColor, c.Region, drawCharColor.MCvScalar);
                }

                return ocr.GetUTF8Text();
            }
            else
            {
                // When set, the inverted version of every channel is searched for regions as well
                bool checkInvert = true;

                Rectangle[] regions;

                using (ERFilterNM1 er1 = new ERFilterNM1("trained_classifierNM1.xml", 8, 0.00025f, 0.13f, 0.4f, true, 0.1f))
                using (ERFilterNM2 er2 = new ERFilterNM2("trained_classifierNM2.xml", 0.3f))
                {
                    int    channelCount = image.NumberOfChannels;
                    UMat[] channels     = new UMat[checkInvert ? channelCount * 2 : channelCount];

                    // First half: the image's own channels
                    for (int i = 0; i < channelCount; i++)
                    {
                        UMat c = new UMat();
                        CvInvoke.ExtractChannel(image, c, i);
                        channels[i] = c;
                    }

                    // Second half: the bitwise inverse of each channel
                    if (checkInvert)
                    {
                        for (int i = 0; i < channelCount; i++)
                        {
                            UMat c = new UMat();
                            CvInvoke.BitwiseNot(channels[i], c);
                            channels[i + channelCount] = c;
                        }
                    }

                    VectorOfERStat[] regionVecs = new VectorOfERStat[channels.Length];
                    for (int i = 0; i < regionVecs.Length; i++)
                    {
                        regionVecs[i] = new VectorOfERStat();
                    }

                    try
                    {
                        // Run both filter stages on every channel, then group the results into regions
                        for (int i = 0; i < channels.Length; i++)
                        {
                            er1.Run(channels[i], regionVecs[i]);
                            er2.Run(channels[i], regionVecs[i]);
                        }
                        using (VectorOfUMat vm = new VectorOfUMat(channels))
                        {
                            regions = ERFilter.ERGrouping(image, vm, regionVecs, ERFilter.GroupingMethod.OrientationHoriz,
                                                          "trained_classifier_erGrouping.xml", 0.5f);
                        }
                    }
                    finally
                    {
                        // Release the per-channel buffers whether or not detection succeeded
                        foreach (UMat tmp in channels)
                        {
                            if (tmp != null)
                            {
                                tmp.Dispose();
                            }
                        }
                        foreach (VectorOfERStat tmp in regionVecs)
                        {
                            if (tmp != null)
                            {
                                tmp.Dispose();
                            }
                        }
                    }

                    // Scale each region by 1.1 and clip it to the image bounds
                    Rectangle imageRegion = new Rectangle(Point.Empty, imageColor.Size);
                    for (int i = 0; i < regions.Length; i++)
                    {
                        Rectangle r = ScaleRectangle(regions[i], 1.1);

                        r.Intersect(imageRegion);
                        regions[i] = r;
                    }
                }

                List<Tesseract.Character> allChars = new List<Tesseract.Character>();
                System.Text.StringBuilder allText = new System.Text.StringBuilder();
                foreach (Rectangle rect in regions)
                {
                    using (Mat region = new Mat(image, rect))
                    {
                        ocr.SetImage(region);
                        if (ocr.Recognize() != 0)
                        {
                            throw new Exception("Failed to recognize image");
                        }
                        Tesseract.Character[] characters = ocr.GetCharacters();

                        // Convert the coordinates from the local region to global
                        for (int i = 0; i < characters.Length; i++)
                        {
                            Rectangle charRegion = characters[i].Region;
                            charRegion.Offset(rect.Location);
                            characters[i].Region = charRegion;
                        }
                        allChars.AddRange(characters);

                        // One line of output per region
                        allText.AppendLine(ocr.GetUTF8Text());
                    }
                }

                Bgr drawRegionColor = new Bgr(Color.Red);
                foreach (Rectangle rect in regions)
                {
                    CvInvoke.Rectangle(imageColor, rect, drawRegionColor.MCvScalar);
                }
                foreach (Tesseract.Character c in allChars)
                {
                    CvInvoke.Rectangle(imageColor, c.Region, drawCharColor.MCvScalar);
                }

                return allText.ToString();
            }
        }