예제 #1
0
        /// <summary>
        /// (Re)creates the Tesseract engine stored in <c>_ocr</c>. Any existing engine is disposed
        /// first, and the language files are requested before the new engine is constructed.
        /// </summary>
        /// <param name="path">Tesseract data directory; when null or empty, <c>Tesseract.DefaultTesseractDirectory</c> is used.</param>
        /// <param name="lang">Language name passed to the language-file download helper and to the engine.</param>
        /// <param name="mode">OCR engine mode passed to the <c>Tesseract</c> constructor.</param>
        /// <exception cref="Exception">
        /// A <see cref="System.Net.WebException"/> is wrapped (kept as inner exception) with a download hint;
        /// every other failure is rethrown unchanged. In both cases <c>_ocr</c> is left null.
        /// </exception>
        private void InitOcr(String path, String lang, OcrEngineMode mode)
        {
            try
            {
                if (_ocr != null)
                {
                    _ocr.Dispose();
                    _ocr = null;
                }

                if (String.IsNullOrEmpty(path))
                {
                    path = Tesseract.DefaultTesseractDirectory;
                }

                TesseractDownloadLangFile(path, lang);
                TesseractDownloadLangFile(path, "osd"); //script orientation detection

                _ocr = new Tesseract(path, lang, mode);
            }
            catch (System.Net.WebException e)
            {
                _ocr = null;
                throw new Exception("Unable to download tesseract lang file. Please check internet connection.", e);
            }
            catch (Exception)
            {
                _ocr = null;
                // A bare "throw" preserves the original stack trace; "throw e" would reset it to this frame.
                throw;
            }
        }
예제 #2
0
        /*
         * /// <summary>
         * /// Check of the specific Ocr Engine is supported for the current tesseract release
         * /// </summary>
         * /// <param name="mode">The Engine mode</param>
         * /// <returns>True if supported, false otherwise</returns>
         * public bool IsEngineModeSupported(OcrEngineMode mode)
         * {
         * Version v = Version;
         * if ((mode == OcrEngineMode.OEM_CUBE_ONLY || mode == OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED)
         *    && (v.Major < 3 || (v.Major == 3 && v.Minor < 1)))
         * {
         *    return false;
         * }
         * return true;
         * }*/

        /// <summary>
        /// Initialize the OCR engine using the specific dataPath and language name.
        /// </summary>
        /// <param name="dataPath">
        /// The datapath must be the name of the parent directory of tessdata and
        /// must end in / . Any name after the last / will be stripped.
        /// </param>
        /// <param name="language">
        /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call
        /// Init multiple times on the same instance to change language, or just
        /// to reset the classifier.
        /// The language may be a string of the form [~]&lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
        /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
        /// English. Languages may specify internally that they want to be loaded
        /// with one or more other languages, so the ~ sign is available to override
        /// that. Eg if hin were set to load eng by default, then hin+~eng would force
        /// loading only hin. The number of loaded languages is limited only by
        /// memory, with the caveat that loading additional languages will impact
        /// both speed and accuracy, as there is more work to do to decide on the
        /// applicable language, and there is more chance of hallucinating incorrect
        /// words.
        /// </param>
        /// <param name="mode">OCR engine mode</param>
        public void Init(String dataPath, String language, OcrEngineMode mode)
        {
            char separator = System.IO.Path.DirectorySeparatorChar;
            bool endsWithSeparator = dataPath.Length > 0 && dataPath[dataPath.Length - 1] == separator;

            if (!endsWithSeparator)
            {
                // The path does NOT end with a separator: keep everything up to and including the
                // last separator, the same way tesseract-ocr calculates the folder. A path without
                // any separator is passed through untouched.
                int separatorIndex = dataPath.LastIndexOf(separator);
                if (separatorIndex >= 0)
                {
                    dataPath = dataPath.Substring(0, separatorIndex + 1);
                }
            }

            // Any non-zero status from the native init call is reported as a failure.
            if (TessBaseAPIInit(_ptr, dataPath, language, mode) == 0)
            {
                return;
            }

            String message = String.Format("Unable to create ocr model using Path {0} and language {1}.", dataPath, language);
            throw new ArgumentException(message);
        }
예제 #3
0
 // Externally implemented (extern) initialization entry point for a tesseract engine instance.
 // Returns an int status; the Init wrappers that call a function of this name throw on any
 // non-zero result, so presumably 0 means success - confirm against the native library.
 internal static extern int TessBaseAPIInit(
    // Pointer identifying the engine instance (callers pass their _ptr field).
    IntPtr ocr,
    // Data path string, marshaled using CvInvoke.StringMarshalType.
    [MarshalAs(CvInvoke.StringMarshalType)]
    String dataPath,
    // Language string, marshaled using CvInvoke.StringMarshalType.
    [MarshalAs(CvInvoke.StringMarshalType)]
    String language,
    // Requested OCR engine mode.
    OcrEngineMode mode);
예제 #4
0
        /// <summary>
        /// Instances are now mostly thread-safe and totally independent,
        /// but some global parameters remain. Basically it is safe to use multiple
        /// TessBaseAPIs in different threads in parallel, UNLESS:
        /// you use SetVariable on some of the Params in classify and textord.
        /// If you do, then the effect will be to change it for all your instances.
        ///
        /// Start tesseract. Returns zero on success and -1 on failure.
        /// NOTE that the only members that may be called before Init are those
        /// listed above here in the class definition.
        ///
        /// The datapath must be the name of the parent directory of tessdata and
        /// must end in / . Any name after the last / will be stripped.
        /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call
        /// Init multiple times on the same instance to change language, or just
        /// to reset the classifier.
        /// The language may be a string of the form [~]&lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
        /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
        /// English. Languages may specify internally that they want to be loaded
        /// with one or more other languages, so the ~ sign is available to override
        /// that. Eg if hin were set to load eng by default, then hin+~eng would force
        /// loading only hin. The number of loaded languages is limited only by
        /// memory, with the caveat that loading additional languages will impact
        /// both speed and accuracy, as there is more work to do to decide on the
        /// applicable language, and there is more chance of hallucinating incorrect
        /// words.
        /// WARNING: On changing languages, all Tesseract parameters are reset
        /// back to their default values. (Which may vary between languages.)
        /// If you have a rare need to set a Variable that controls
        /// initialization for a second call to Init you should explicitly
        /// call End() and then use SetVariable before Init. This is only a very
        /// rare use case, since there are very few uses that require any parameters
        /// to be set before Init.
        ///
        /// If set_only_non_debug_params is true, only params that do not contain
        /// "debug" in the name will be set.
        /// </summary>
        /// <param name="dataPath">Data path forwarded unchanged to the native init call.</param>
        /// <param name="language">Language string forwarded unchanged to the native init call.</param>
        /// <param name="tessOcrEngineMode">OCR engine mode forwarded to the native init call.</param>
        /// <param name="configs">Optional config names; null is passed through with a count of 0.</param>
        /// <param name="varsVec">Optional variable names; null is passed through with a count of 0.</param>
        /// <param name="varsValues">Values paired with <paramref name="varsVec"/>; must hold at least as many entries.</param>
        /// <param name="setOnlyNonDebugParams">Forwarded to the native call as 1 (true) or 0 (false).</param>
        /// <returns>True when the native call returns 0, false otherwise.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="varsValues"/> has fewer entries than <paramref name="varsVec"/>.
        /// </exception>
        public bool Init(string dataPath, string language, OcrEngineMode tessOcrEngineMode,
                         string[] configs, string[] varsVec, string[] varsValues, bool setOnlyNonDebugParams = false)
        {
            int configsSize = configs == null ? 0 : configs.Length;
            int varsCount = varsVec == null ? 0 : varsVec.Length;
            int valuesCount = varsValues == null ? 0 : varsValues.Length;

            // Only one element count (the length of varsVec) is handed to the native call for both
            // arrays, so a shorter or missing varsValues would presumably be read past its end on
            // the native side. Reject that combination up front.
            if (valuesCount < varsCount)
            {
                throw new ArgumentException("varsValues must contain a value for every entry in varsVec.", "varsValues");
            }

            UIntPtr varsVecSize = new UIntPtr((uint)varsCount);

            return Native.DllImports.TessBaseAPIInit4((HandleRef)this, dataPath, language, tessOcrEngineMode,
                                                      configs, configsSize, varsVec, varsValues, varsVecSize, setOnlyNonDebugParams ? 1 : 0) == 0;
        }
예제 #5
0
파일: Tesseract.cs 프로젝트: v5chn/emgucv
        /// <summary>
        /// Initialize the OCR engine using the specific dataPath and language name.
        /// </summary>
        /// <param name="dataPath">
        /// The datapath must be the name of the parent directory of tessdata and
        /// must end in / . Any name after the last / will be stripped.
        /// </param>
        /// <param name="language">
        /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call
        /// Init multiple times on the same instance to change language, or just
        /// to reset the classifier.
        /// The language may be a string of the form [~]&lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
        /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
        /// English. Languages may specify internally that they want to be loaded
        /// with one or more other languages, so the ~ sign is available to override
        /// that. Eg if hin were set to load eng by default, then hin+~eng would force
        /// loading only hin. The number of loaded languages is limited only by
        /// memory, with the caveat that loading additional languages will impact
        /// both speed and accuracy, as there is more work to do to decide on the
        /// applicable language, and there is more chance of hallucinating incorrect
        /// words.
        /// </param>
        /// <param name="mode">OCR engine mode</param>
        public void Init(String dataPath, String language, OcrEngineMode mode)
        {
            using (CvString csDataPath = new CvString(dataPath))
            using (CvString csLanguage = new CvString(language))
            {
                int initResult = OcrInvoke.cveTessBaseAPIInit(_ptr, csDataPath, csLanguage, mode);
                if (initResult == 0)
                {
                    return;
                }

                // An empty path is reported as the full path of the current directory. The check is
                // null-safe so that a null dataPath still produces the ArgumentException below
                // instead of a NullReferenceException that would hide the real failure.
                String reportedPath = String.IsNullOrEmpty(dataPath) ? Path.GetFullPath(".") : dataPath;

                throw new ArgumentException(
                          String.Format("Unable to create ocr model using Path '{0}', language '{1}' and OcrEngineMode '{2}'.", reportedPath,
                                        language, mode));
            }
        }
예제 #6
0
        /// <summary>
        /// (Re)creates the Tesseract engine stored in <c>_ocr</c>. Any existing engine is disposed
        /// first, the language files are requested, and the engine is constructed with a data path
        /// that ends with the platform directory separator.
        /// </summary>
        /// <param name="path">Tesseract data directory; when null or empty, "." is used.</param>
        /// <param name="lang">Language name passed to the language-file download helper and to the engine.</param>
        /// <param name="mode">OCR engine mode passed to the <c>Tesseract</c> constructor.</param>
        /// <exception cref="Exception">
        /// A <see cref="System.Net.WebException"/> is wrapped (kept as inner exception) with a download hint;
        /// every other failure is rethrown unchanged. In both cases <c>_ocr</c> is left null.
        /// </exception>
        private void InitOcr(String path, String lang, OcrEngineMode mode)
        {
            try
            {
                if (_ocr != null)
                {
                    _ocr.Dispose();
                    _ocr = null;
                }

                if (String.IsNullOrEmpty(path))
                {
                    path = ".";
                }

                TesseractDownloadLangFile(path, lang);
                TesseractDownloadLangFile(path, "osd"); //script orientation detection

                // path is never empty here, so only a missing trailing separator needs handling.
                String separator = Path.DirectorySeparatorChar.ToString();
                String pathFinal = path.EndsWith(separator, StringComparison.Ordinal)
                    ? path
                    : path + separator;

                _ocr = new Tesseract(pathFinal, lang, mode);
            }
            catch (System.Net.WebException e)
            {
                _ocr = null;
                throw new Exception("Unable to download tesseract lang file. Please check internet connection.", e);
            }
            catch (Exception)
            {
                _ocr = null;
                // A bare "throw" preserves the original stack trace; "throw e" would reset it to this frame.
                throw;
            }
        }
예제 #7
0
 // Externally implemented (extern) initialization entry point for a tesseract engine instance.
 // Returns an int status; the Init wrappers that call a function of this name throw on any
 // non-zero result, so presumably 0 means success - confirm against the native library.
 internal static extern int TessBaseAPIInit(
     // Pointer identifying the engine instance (callers pass their _ptr field).
     IntPtr ocr,
     // Data path string, marshaled using CvInvoke.StringMarshalType.
     [MarshalAs(CvInvoke.StringMarshalType)]
     String dataPath,
     // Language string, marshaled using CvInvoke.StringMarshalType.
     [MarshalAs(CvInvoke.StringMarshalType)]
     String language,
     // Requested OCR engine mode.
     OcrEngineMode mode);
예제 #8
0
        /// <summary>
        /// (Re)creates the Tesseract engine stored in <c>_ocr</c> and reflects the outcome in
        /// <c>languageNameLabel</c>. Failures are shown in a message box instead of being thrown.
        /// </summary>
        /// <param name="path">Tesseract data directory; when null or empty, "." is used.</param>
        /// <param name="lang">Language name passed to the language-file download helper and to the engine.</param>
        /// <param name="mode">OCR engine mode passed to the <c>Tesseract</c> constructor.</param>
        private void InitOcr(String path, String lang, OcrEngineMode mode)
        {
            try
            {
                // Release the previous engine before building a new one.
                if (_ocr != null)
                {
                    _ocr.Dispose();
                    _ocr = null;
                }

                if (String.IsNullOrEmpty(path))
                {
                    path = ".";
                }

                TesseractDownloadLangFile(path, lang);
                TesseractDownloadLangFile(path, "osd"); //script orientation detection

                // Hand the engine a path that ends with the platform directory separator.
                char separator = Path.DirectorySeparatorChar;
                bool alreadyTerminated = path.Length == 0 || path[path.Length - 1] == separator;
                String pathFinal = path;
                if (!alreadyTerminated)
                {
                    pathFinal = String.Concat(path, separator.ToString());
                }

                _ocr = new Tesseract(pathFinal, lang, mode);

                languageNameLabel.Text = String.Format("{0} : {1}", lang, mode.ToString());
            }
            catch (Exception ex)
            {
                // Report the problem to the user and leave the form without an engine.
                _ocr = null;
                MessageBox.Show(ex.Message, "Failed to initialize tesseract OCR engine", MessageBoxButtons.OK);
                languageNameLabel.Text = "Failed to initialize tesseract OCR engine";
            }
        }
예제 #9
0
        InitTesseract(String lang, OcrEngineMode mode, System.Net.DownloadProgressChangedEventHandler onDownloadProgressChanged = null)
        {
            // NOTE(review): the modifiers / return type of this method are not visible in this excerpt.
            // Lazily creates the Tesseract engine: nothing is done when _ocr is already set.
            if (_ocr == null)
            {
                // Register the requested language file and the "osd" file with the download manager.
                // Presumably _modelFolderName is the destination folder - confirm against FileDownloadManager.AddFile.
                FileDownloadManager manager = new FileDownloadManager();
                manager.AddFile(Emgu.CV.OCR.Tesseract.GetLangFileUrl(lang), _modelFolderName);
                manager.AddFile(Emgu.CV.OCR.Tesseract.GetLangFileUrl("osd"), _modelFolderName); //script orientation detection

                // Forward progress notifications only when the caller supplied a handler.
                if (onDownloadProgressChanged != null)
                {
                    manager.OnDownloadProgressChanged += onDownloadProgressChanged;
                }
                // Under the listed Unity symbols the download is yielded; otherwise it is awaited.
#if UNITY_EDITOR || UNITY_IOS || UNITY_ANDROID || UNITY_STANDALONE || UNITY_WEBGL
                yield return(manager.Download());
#else
                await manager.Download();
#endif

                // The engine is only created when the manager reports that every file was downloaded;
                // otherwise _ocr stays null.
                if (manager.AllFilesDownloaded)
                {
                    _lang = lang;
                    _mode = mode;
                    // The directory of the first registered file is used as the engine's data path.
                    FileInfo fi = new FileInfo(manager.Files[0].LocalFile);
                    _ocr = new Tesseract(fi.DirectoryName, _lang, _mode);
                }
            }
        }
예제 #10
0
        /// <summary>
        /// Tears down the current engine when the requested data path, language or engine mode
        /// means it cannot be reused. The remaining parameters are not used yet, and the method
        /// always returns 0.
        /// </summary>
        public int Init(string datapath,
            string language,
            OcrEngineMode oem,
            string configs,
            int configs_size,
            List<string> vars_vec,
            List<string> vars_values,
            bool set_only_init_params)
        {
            // Nothing to tear down when no engine exists.
            if (tesseract == null)
            {
                return 0;
            }

            // C# rendering of the C++ check:
            //   datapath_ == NULL || language_ == NULL ||
            //   *datapath_ != datapath || last_oem_requested_ != oem ||
            //   (*language_ != language && tesseract_->lang != language)
            bool engineIsStale = string.IsNullOrEmpty(datapath)
                                 || string.IsNullOrEmpty(language)
                                 || this.datapath != datapath
                                 || !last_oem_requested.Equals(oem)
                                 || (this.language != language && tesseract.Language != language);

            if (engineIsStale)
            {
                tesseract.end_tesseract();
                tesseract = null;
            }

            return 0;
        }
예제 #11
0
        /// <summary>
        /// Tears down the current engine when the requested data path, language or engine mode
        /// means it cannot be reused. The remaining parameters are not used yet, and the method
        /// always returns 0.
        /// </summary>
        public int Init(string datapath,
                        string language,
                        OcrEngineMode oem,
                        string configs,
                        int configs_size,
                        List<string> vars_vec,
                        List<string> vars_values,
                        bool set_only_init_params)
        {
            // Nothing to tear down when no engine exists.
            if (tesseract == null)
            {
                return 0;
            }

            // C# rendering of the C++ check:
            //   datapath_ == NULL || language_ == NULL ||
            //   *datapath_ != datapath || last_oem_requested_ != oem ||
            //   (*language_ != language && tesseract_->lang != language)
            bool engineIsStale = string.IsNullOrEmpty(datapath)
                                 || string.IsNullOrEmpty(language)
                                 || this.datapath != datapath
                                 || !last_oem_requested.Equals(oem)
                                 || (this.language != language && tesseract.Language != language);

            if (engineIsStale)
            {
                tesseract.end_tesseract();
                tesseract = null;
            }

            return 0;
        }
 //"", "eng", OcrEngineMode.TesseractLstmCombined
 /// <summary>
 /// Replaces the engine stored in <c>_ocr</c> with a new <c>Tesseract</c> instance, disposing
 /// the previous one first.
 /// </summary>
 /// <param name="path">Data path passed to the <c>Tesseract</c> constructor.</param>
 /// <param name="lang">Language passed to the <c>Tesseract</c> constructor.</param>
 /// <param name="mode">OCR engine mode passed to the <c>Tesseract</c> constructor.</param>
 public void Init(string path, string lang, OcrEngineMode mode)
 {
     if (_ocr != null)
     {
         _ocr.Dispose();
         // Clear the field so that a constructor failure below cannot leave a disposed engine behind.
         _ocr = null;
     }
     _ocr = new Tesseract(path, lang, mode);
 }
예제 #13
0
        /// <summary>
        /// Creates the recogniser with a Tesseract engine for the given language, reading data
        /// from the "tessdata" folder.
        /// </summary>
        /// <param name="lang">Language name; "rusf" selects the LSTM-only engine mode, anything else the combined mode.</param>
        public Recogniser(string lang = "rusf")
        {
            // Folder that holds the traineddata files
            const string dataPath = "tessdata";

            OcrEngineMode mode;
            if (lang == "rusf")
            {
                mode = OcrEngineMode.LstmOnly;
            }
            else
            {
                mode = OcrEngineMode.TesseractLstmCombined;
            }

            _tesseract = new Tesseract(dataPath, lang, mode);

            // Set the dpi explicitly so that tesseract does not complain or print warnings
            _tesseract.SetVariable("user_defined_dpi", "300");
        }
예제 #14
0
 /// <summary>
 /// Makes sure the shared <c>ocr</c> engine exists and matches the requested mode. A new "eng"
 /// engine in single-line page segmentation mode is created when there is none yet, when the
 /// current engine's mode differs, or whenever a white list is supplied.
 /// </summary>
 /// <param name="mode">Requested OCR engine mode.</param>
 /// <param name="whiteList">Optional character white list passed to the <c>Tesseract</c> constructor.</param>
 private static void InitTesseract(OcrEngineMode mode = OcrEngineMode.Default, string whiteList = null)
 {
     bool needsNewEngine = ocr == null || ocr.Oem != mode || whiteList != null;
     if (!needsNewEngine)
     {
         return;
     }

     // Build the replacement first, so a constructor failure leaves the current engine untouched.
     Tesseract replacement = new Tesseract(ocrDataSetPath, "eng", mode, whiteList);

     // Release the engine being replaced; previously it was simply dropped without Dispose.
     // NOTE(review): assumes no other code still holds the old instance - confirm.
     if (ocr != null)
     {
         ocr.Dispose();
     }

     ocr             = replacement;
     ocr.PageSegMode = PageSegMode.SingleLine;
 }
예제 #15
0
        /// <summary>
        /// Create an tesseract OCR engine.
        /// </summary>
        /// <param name="dataPath">
        /// The datapath must be the name of the directory of tessdata and
        /// must end in / . Any name after the last / will be stripped.
        /// </param>
        /// <param name="language">
        /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call
        /// Init multiple times on the same instance to change language, or just
        /// to reset the classifier.
        /// The language may be a string of the form [~]&lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
        /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
        /// English. Languages may specify internally that they want to be loaded
        /// with one or more other languages, so the ~ sign is available to override
        /// that. Eg if hin were set to load eng by default, then hin+~eng would force
        /// loading only hin. The number of loaded languages is limited only by
        /// memory, with the caveat that loading additional languages will impact
        /// both speed and accuracy, as there is more work to do to decide on the
        /// applicable language, and there is more chance of hallucinating incorrect
        /// words.
        /// </param>
        /// <param name="mode">OCR engine mode</param>
        /// <param name="whiteList">This can be used to specify a white list for OCR. e.g. specify "1234567890" to recognize digits only. Note that the white list currently seems to only work with OcrEngineMode.OEM_TESSERACT_ONLY</param>
        public Tesseract(String dataPath, String language, OcrEngineMode mode, String whiteList)
            : this(dataPath, language, mode)
        {
            // The three-argument constructor has already run; only the character white list
            // remains to be applied. No engine-mode check is performed here.
            const String whiteListVariable = "tessedit_char_whitelist";

            SetVariable(whiteListVariable, whiteList);
        }
예제 #16
0
        /// <summary>
        /// Example: constructs a <c>TessBaseAPI</c> from fixed arguments (data path, language,
        /// engine mode and page segmentation mode). The instance is not used any further.
        /// </summary>
        static void example2()
        {
            TessBaseAPI tessBaseAPI = new TessBaseAPI(
                "./tessdata/",
                "eng",
                OcrEngineMode.DEFAULT,
                PageSegmentationMode.AUTO_OSD);
        }
예제 #17
0
        /// <summary>
        /// Initialize the OCR engine using the specific dataPath and language name.
        /// </summary>
        /// <param name="dataPath">The path where the language file is located</param>
        /// <param name="language">The 3 letter language code </param>
        /// <param name="mode">OCR engine mode</param>
        private void Init(String dataPath, String language, OcrEngineMode mode)
        {
            // Any non-zero status from the native init call is reported as a failure.
            if (TessBaseAPIInit(_ptr, dataPath, language, mode) == 0)
            {
                return;
            }

            String message = String.Format("Unable to create ocr model using Path {0} and language {1}.", dataPath, language);
            throw new ArgumentException(message);
        }
예제 #18
0
        /// <summary>
        /// Create an tesseract OCR engine.
        /// </summary>
        /// <param name="dataPath">The path where the language file is located</param>
        /// <param name="language">The 3 letter language code </param>
        /// <param name="mode">OCR engine mode</param>
        public Tesseract(String dataPath, String language, OcrEngineMode mode)
        {
            // Reject unsupported engine modes before creating the native engine.
            bool modeSupported = IsEngineModeSupported(mode);
            if (!modeSupported)
            {
                String message = String.Format("The Ocr engine mode {0} is not supported in tesseract v{1}", mode, Version);
                throw new ArgumentException(message);
            }

            // Create the native engine first, then initialize it with the requested data and language.
            _ptr = TessBaseAPICreate();
            Init(dataPath, language, mode);
        }
예제 #19
0
        /// <summary>
        /// Create an tesseract OCR engine.
        /// </summary>
        /// <param name="dataPath">
        /// The datapath must be the name of the parent directory of tessdata and
        /// must end in / . Any name after the last / will be stripped.
        /// </param>
        /// <param name="language">
        /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call
        /// Init multiple times on the same instance to change language, or just
        /// to reset the classifier.
        /// The language may be a string of the form [~]&lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
        /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
        /// English. Languages may specify internally that they want to be loaded
        /// with one or more other languages, so the ~ sign is available to override
        /// that. Eg if hin were set to load eng by default, then hin+~eng would force
        /// loading only hin. The number of loaded languages is limited only by
        /// memory, with the caveat that loading additional languages will impact
        /// both speed and accuracy, as there is more work to do to decide on the
        /// applicable language, and there is more chance of hallucinating incorrect
        /// words.
        /// </param>
        /// <param name="mode">OCR engine mode</param>
        /// <param name="whiteList">This can be used to specify a white list for OCR. e.g. specify "1234567890" to recognize digits only. Note that the white list currently seems to only work with OcrEngineMode.OEM_TESSERACT_ONLY</param>
        public Tesseract(String dataPath, String language, OcrEngineMode mode, String whiteList)
            : this(dataPath, language, mode)
        {
            // Both CUBE-based modes are rejected. The check runs after the three-argument
            // constructor has already initialized the engine.
            switch (mode)
            {
                case OcrEngineMode.OEM_CUBE_ONLY:
                case OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED:
                    throw new ArgumentException("White list is not supported by CUBE engine");
            }

            SetVariable("tessedit_char_whitelist", whiteList);
        }
예제 #20
0
        /// <summary>
        /// Instances are now mostly thread-safe and totally independent,
        /// but some global parameters remain. Basically it is safe to use multiple
        /// TessBaseAPIs in different threads in parallel, UNLESS:
        /// you use SetVariable on some of the Params in classify and textord.
        /// If you do, then the effect will be to change it for all your instances.
        ///
        /// Start tesseract. Returns zero on success and -1 on failure.
        /// NOTE that the only members that may be called before Init are those
        /// listed above here in the class definition.
        ///
        /// The datapath must be the name of the parent directory of tessdata and
        /// must end in / . Any name after the last / will be stripped.
        /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call
        /// Init multiple times on the same instance to change language, or just
        /// to reset the classifier.
        /// The language may be a string of the form [~]&lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
        /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
        /// English. Languages may specify internally that they want to be loaded
        /// with one or more other languages, so the ~ sign is available to override
        /// that. Eg if hin were set to load eng by default, then hin+~eng would force
        /// loading only hin. The number of loaded languages is limited only by
        /// memory, with the caveat that loading additional languages will impact
        /// both speed and accuracy, as there is more work to do to decide on the
        /// applicable language, and there is more chance of hallucinating incorrect
        /// words.
        /// WARNING: On changing languages, all Tesseract parameters are reset
        /// back to their default values. (Which may vary between languages.)
        /// If you have a rare need to set a Variable that controls
        /// initialization for a second call to Init you should explicitly
        /// call End() and then use SetVariable before Init. This is only a very
        /// rare use case, since there are very few uses that require any parameters
        /// to be set before Init.
        ///
        /// If set_only_non_debug_params is true, only params that do not contain
        /// "debug" in the name will be set.
        /// </summary>
        /// <param name="dataPath">Data path forwarded unchanged to the native init call.</param>
        /// <param name="language">Language string forwarded unchanged to the native init call.</param>
        /// <param name="tessOcrEngineMode">OCR engine mode forwarded to the native init call.</param>
        /// <param name="configs">Optional config names; null is passed through with a count of 0.</param>
        /// <returns>True when the native call returns 0, false otherwise.</returns>
        public bool Init(string dataPath, string language, OcrEngineMode tessOcrEngineMode, string[] configs)
        {
            // A missing config array is passed through as-is with an element count of zero.
            int configCount = configs == null ? 0 : configs.Length;

            bool succeeded = Native.DllImports.TessBaseAPIInit1((HandleRef)this, dataPath, language, tessOcrEngineMode, configs, configCount) == 0;
            return succeeded;
        }
예제 #21
0
        /*
         * /// <summary>
         * /// Check of the specific Ocr Engine is supported for the current tesseract release
         * /// </summary>
         * /// <param name="mode">The Engine mode</param>
         * /// <returns>True if supported, false otherwise</returns>
         * public bool IsEngineModeSupported(OcrEngineMode mode)
         * {
         * Version v = Version;
         * if ((mode == OcrEngineMode.OEM_CUBE_ONLY || mode == OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED)
         *    && (v.Major < 3 || (v.Major == 3 && v.Minor < 1)))
         * {
         *    return false;
         * }
         * return true;
         * }*/

        /// <summary>
        /// Initialize the OCR engine using the specific dataPath and language name.
        /// </summary>
        /// <param name="dataPath">The path where the language file is located</param>
        /// <param name="language">The 3 letter language code </param>
        /// <param name="mode">OCR engine mode</param>
        public void Init(String dataPath, String language, OcrEngineMode mode)
        {
            // The native call reports success with a zero status. No engine-mode support check
            // is performed before calling it.
            bool initialized = TessBaseAPIInit(_ptr, dataPath, language, mode) == 0;

            if (!initialized)
            {
                String message = String.Format("Unable to create ocr model using Path {0} and language {1}.", dataPath, language);
                throw new ArgumentException(message);
            }
        }
예제 #22
0
 /// <summary>
 /// Initializes the underlying API for the given language and records the outcome in
 /// <c>Initialized</c>.
 /// </summary>
 /// <param name="language">Language to load; null or empty returns false without touching the API.</param>
 /// <param name="mode">Optional engine mode; when omitted, the API overload without a mode is called.</param>
 /// <returns>The result reported by the API's Init call, or false for a missing language.</returns>
 public async Task<bool> Init (string language, OcrEngineMode? mode = null)
 {
     if (string.IsNullOrEmpty (language))
     {
         return false;
     }

     // The path produced by CopyAssets is what the API is initialized with.
     var assetsPath = await CopyAssets ();

     bool succeeded;
     if (mode.HasValue)
     {
         succeeded = _api.Init (assetsPath, language, GetOcrEngineMode (mode.Value));
     }
     else
     {
         succeeded = _api.Init (assetsPath, language);
     }

     Initialized = succeeded;
     return succeeded;
 }
예제 #23
0
        /// <summary>
        /// Check of the specific Ocr Engine is supported for the current tesseract release
        /// </summary>
        /// <param name="mode">The Engine mode</param>
        /// <returns>True if supported, false otherwise</returns>
        public bool IsEngineModeSupported(OcrEngineMode mode)
        {
            Version current = Version;

            // Only the two CUBE-based modes are version dependent; every other mode is accepted.
            bool isCubeMode = mode == OcrEngineMode.OEM_CUBE_ONLY
                              || mode == OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED;
            if (!isCubeMode)
            {
                return true;
            }

            // CUBE modes are rejected for any version below 3.1.
            bool olderThan31 = current.Major < 3 || (current.Major == 3 && current.Minor < 1);
            return !olderThan31;
        }
예제 #24
0
        /*
         * /// <summary>
         * /// Create an tesseract OCR engine.
         * /// </summary>
         * /// <param name="dataPath">
         * /// The datapath must be the name of the directory of tessdata and
         * /// must end in / . Any name after the last / will be stripped.
         * /// </param>
         * /// <param name="language">
         * /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
         * /// It is entirely safe (and eventually will be efficient too) to call
         * /// Init multiple times on the same instance to change language, or just
         * /// to reset the classifier.
         * /// The language may be a string of the form [~]%lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
         * /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
         * /// English. Languages may specify internally that they want to be loaded
         * /// with one or more other languages, so the ~ sign is available to override
         * /// that. Eg if hin were set to load eng by default, then hin+~eng would force
         * /// loading only hin. The number of loaded languages is limited only by
         * /// memory, with the caveat that loading additional languages will impact
         * /// both speed and accuracy, as there is more work to do to decide on the
         * /// applicable language, and there is more chance of hallucinating incorrect
         * /// words.
         * /// </param>
         * /// <param name="mode">OCR engine mode</param>
         * public Tesseract(String dataPath, String language, OcrEngineMode mode)
         * : this()
         * {
         *  Init(dataPath, language, mode);
         * }*/

        /// <summary>
        /// Create a Tesseract OCR engine and initialize it for the given data path, language and mode.
        /// </summary>
        /// <param name="dataPath">
        /// The name of the directory of tessdata. It must end in / ; any name after the
        /// last / will be stripped.
        /// </param>
        /// <param name="language">
        /// Usually an ISO 639-3 string; NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call Init multiple
        /// times on the same instance to change language, or just to reset the classifier.
        /// Several languages can be requested with a string of the form
        /// [~]&lt;lang&gt;[+[~]&lt;lang&gt;]*, e.g. hin+eng will load Hindi and English.
        /// A language may specify internally that it wants to be loaded with one or more other
        /// languages; the ~ sign overrides that, e.g. if hin were set to load eng by default,
        /// then hin+~eng would force loading only hin.
        /// The number of loaded languages is limited only by memory, with the caveat that each
        /// additional language impacts both speed and accuracy: there is more work to do to
        /// decide on the applicable language and more chance of hallucinating incorrect words.
        /// </param>
        /// <param name="mode">OCR engine mode</param>
        /// <param name="whiteList">
        /// Optional white list for OCR, e.g. "1234567890" to recognize digits only. When null no
        /// white list is set. Note that the white list currently seems to only work with
        /// OcrEngineMode.OEM_TESSERACT_ONLY.
        /// </param>
        /// <param name="enforceLocale">
        /// If true, the locale is changed to "C" before the tesseract engine is initialized and
        /// reverted once the initialization is complete. If false, it is the caller's
        /// responsibility to set the locale to "C", otherwise an exception will be thrown.
        /// See https://github.com/tesseract-ocr/tesseract/issues/1670
        /// </param>
        public Tesseract(String dataPath, String language, OcrEngineMode mode, String whiteList = null, bool enforceLocale = true)
            : this(enforceLocale)
        {
            // The former check rejecting white lists for the CUBE engine is disabled here.
            Init(dataPath, language, mode);

            if (whiteList == null)
            {
                return;
            }

            SetVariable("tessedit_char_whitelist", whiteList);
        }
 /// <summary>
 /// Creates a G8Tesseract instance for the given language, wires up the progress
 /// handler, initializes it and optionally applies an engine mode.
 /// </summary>
 /// <param name="language">Language passed to the G8Tesseract constructor.</param>
 /// <param name="mode">Optional engine mode applied through SetOcrEngineMode.</param>
 /// <returns>The value of Initialized: true when every step completed, false when any step threw.</returns>
 public async Task<bool> Init (string language, OcrEngineMode? mode = null)
 {
     try {
         var engine = new G8Tesseract (language);
         engine.Delegate = _progressHandler;
         _api = engine;
         _api.Init ();

         if (mode.HasValue) {
             SetOcrEngineMode (mode.Value);
         }

         Initialized = true;
     } catch {
         // Any failure is reported to the caller only through the false return value.
         Initialized = false;
     }

     return Initialized;
 }
예제 #26
0
        /// <summary>
        /// Initialize the OCR engine using the specific dataPath and language name.
        /// </summary>
        /// <param name="dataPath">
        /// The datapath must be the name of the parent directory of tessdata and
        /// must end in / . Any name after the last / will be stripped.
        /// </param>
        /// <param name="language">
        /// The language is (usually) an ISO 639-3 string or NULL will default to eng.
        /// It is entirely safe (and eventually will be efficient too) to call
        /// Init multiple times on the same instance to change language, or just
        /// to reset the classifier.
        /// The language may be a string of the form [~]&lt;lang&gt;[+[~]&lt;lang&gt;]* indicating
        /// that multiple languages are to be loaded. Eg hin+eng will load Hindi and
        /// English. Languages may specify internally that they want to be loaded
        /// with one or more other languages, so the ~ sign is available to override
        /// that. Eg if hin were set to load eng by default, then hin+~eng would force
        /// loading only hin. The number of loaded languages is limited only by
        /// memory, with the caveat that loading additional languages will impact
        /// both speed and accuracy, as there is more work to do to decide on the
        /// applicable language, and there is more chance of hallucinating incorrect
        /// words.
        /// </param>
        /// <param name="mode">OCR engine mode</param>
        /// <exception cref="ArgumentException">
        /// Thrown when the native initialization call returns a non-zero result.
        /// </exception>
        public void Init(String dataPath, String language, OcrEngineMode mode)
        {
            using (CvString csDataPath = new CvString(dataPath))
            using (CvString csLanguage = new CvString(language))
            {
                int initResult = OcrInvoke.TessBaseAPIInit(_ptr, csDataPath, csLanguage, mode);
                if (initResult != 0)
                {
#if !NETFX_CORE
                    // Show the current directory in the message instead of a blank path.
                    // IsNullOrEmpty (rather than Equals on the instance) keeps a null dataPath
                    // from raising a NullReferenceException that would hide the real failure.
                    if (String.IsNullOrEmpty(dataPath))
                    {
                        dataPath = Path.GetFullPath(".");
                    }
#endif
                    throw new ArgumentException(
                              String.Format("Unable to create ocr model using Path '{0}', language '{1}' and OcrEngineMode '{2}'.", dataPath,
                                            language, mode));
                }
            }
        }
예제 #27
0
        /// <summary>
        /// Creates and initializes the iOS G8Tesseract binding for the given language,
        /// optionally applying an engine mode.
        /// </summary>
        /// <param name="language">Language passed to the G8Tesseract constructor.</param>
        /// <param name="mode">Optional engine mode applied through SetOcrEngineMode.</param>
        /// <returns>The value of Initialized: true when every step completed, false when any step threw.</returns>
        public async Task<bool> Init (string language, OcrEngineMode? mode = null)
        {
            try {
                _api = new Tesseract.Binding.iOS.G8Tesseract (language);
                _api.Init ();

                if (mode != null) {
                    SetOcrEngineMode (mode.Value);
                }

                Initialized = true;
            } catch {
                // Any failure is reported to the caller only through the false return value.
                Initialized = false;
            }

            return Initialized;
        }
예제 #28
0
        /// <summary>
        /// Runs tesseract on "./_tmp.bmp" with the "jpn" language data from "./tessdata/"
        /// and collects the recognized text paragraph by paragraph.
        /// </summary>
        /// <returns>
        /// One entry per paragraph for which the iterator returned text, with every "\n"
        /// replaced by "\r\n".
        /// </returns>
        /// <exception cref="Exception">Thrown when the engine cannot be initialized.</exception>
        private List <string> run_tessract()
        {
            const string dataPath  = "./tessdata/";
            const string language  = "jpn";
            const string inputFile = "./_tmp.bmp";
            OcrEngineMode        oem = OcrEngineMode.DEFAULT;
            PageSegmentationMode psm = PageSegmentationMode.AUTO_OSD;

            List <string> paragraphs = new List <string>();

            TessBaseAPI tessBaseAPI = new TessBaseAPI();
            Pix         pix         = null;

            try
            {
                // Initialize tesseract-ocr
                if (!tessBaseAPI.Init(dataPath, language, oem))
                {
                    throw new Exception("Could not initialize tesseract.");
                }

                // Set the Page Segmentation mode
                tessBaseAPI.SetPageSegMode(psm);

                // Set the input image
                pix = tessBaseAPI.SetImage(inputFile);

                tessBaseAPI.SetVariable("number", "1234567890");

                // Recognize image
                tessBaseAPI.Recognize();

                ResultIterator    resultIterator    = tessBaseAPI.GetIterator();
                PageIteratorLevel pageIteratorLevel = PageIteratorLevel.RIL_PARA;

                // Extract text from the result iterator, one paragraph at a time
                do
                {
                    string str = resultIterator.GetUTF8Text(pageIteratorLevel);
                    if (str != null)
                    {
                        paragraphs.Add(str.Replace("\n", "\r\n"));
                    }
                } while (resultIterator.Next(pageIteratorLevel));

                return paragraphs;
            }
            finally
            {
                // Release the engine and the image even when a step above throws.
                tessBaseAPI.Dispose();
                if (pix != null)
                {
                    pix.Dispose();
                }
            }
        }
예제 #29
0
        /// <summary>
        /// Creates the shared Tesseract instance on first use, downloading the language
        /// file for <paramref name="lang"/> and the "osd" file beforehand.
        /// Does nothing when the instance already exists.
        /// </summary>
        /// <param name="lang">Language whose data file is downloaded and loaded.</param>
        /// <param name="mode">OCR engine mode passed to the Tesseract constructor.</param>
        private async Task InitTesseract(String lang, OcrEngineMode mode)
        {
            if (_ocr != null)
            {
                return;
            }

            FileDownloadManager downloader = new FileDownloadManager();
            downloader.AddFile(Emgu.CV.OCR.Tesseract.GetLangFileUrl(lang), _modelFolderName);
            // "osd" is used for script orientation detection
            downloader.AddFile(Emgu.CV.OCR.Tesseract.GetLangFileUrl("osd"), _modelFolderName);
            downloader.OnDownloadProgressChanged += DownloadManager_OnDownloadProgressChanged;

            await downloader.Download();

            // The engine is pointed at the directory holding the first downloaded file.
            String modelDirectory = new FileInfo(downloader.Files[0].LocalFile).DirectoryName;
            _ocr = new Tesseract(modelDirectory, lang, mode);
        }
예제 #30
0
 /// <summary>
 /// Replaces the current OCR engine with a new Tesseract instance.
 /// </summary>
 /// <param name="dataPath">Data path passed to the Tesseract constructor.</param>
 /// <param name="lang">Language passed to the Tesseract constructor.</param>
 /// <param name="mode">OCR engine mode passed to the Tesseract constructor.</param>
 /// <remarks>
 /// On failure the engine field is reset to null and the original exception is rethrown.
 /// </remarks>
 private void InitializeOCR(String dataPath, String lang, OcrEngineMode mode)
 {
     try
     {
         if (_ocr != null)
         {
             _ocr.Dispose();
         }
         _ocr = new Tesseract(dataPath, lang, mode);
     }
     catch (Exception)
     {
         _ocr = null;
         // Plain "throw" keeps the original stack trace; "throw e" would reset it.
         throw;
     }
 }
 /// <summary>
 /// Initializes the underlying API for the given language, treating an
 /// IllegalArgumentException from the initialization as a failed attempt.
 /// </summary>
 /// <param name="language">Language to load; a null or empty value makes the call return false without touching the API.</param>
 /// <param name="mode">Optional engine mode; when supplied it is translated with GetOcrEngineMode and forwarded to the API.</param>
 /// <returns>
 /// The result reported by the API's Init call (also stored in Initialized), or false when
 /// an IllegalArgumentException was caught.
 /// </returns>
 public async Task<bool> Init (string language, OcrEngineMode? mode = null)
 {
     if (string.IsNullOrEmpty (language)) {
         return false;
     }

     try {
         var assetsPath = await CopyAssets ();

         bool succeeded;
         if (mode.HasValue) {
             succeeded = _api.Init (assetsPath, language, GetOcrEngineMode (mode.Value));
         } else {
             succeeded = _api.Init (assetsPath, language);
         }

         Initialized = succeeded;
         return succeeded;
     } catch (IllegalArgumentException ex) {
         // Logged at debug level; the caller only sees the false result.
         Log.Debug ("TesseractApi", ex, ex.Message);
         Initialized = false;
         return false;
     }
 }
예제 #32
0
        /// <summary>
        /// Applies the given engine mode to the underlying API by mapping it to the
        /// matching G8OCREngineMode value.
        /// </summary>
        /// <param name="mode">The engine mode to apply; values other than the three handled below leave the API unchanged.</param>
        public void SetOcrEngineMode(OcrEngineMode mode)
        {
            if (mode == OcrEngineMode.CubeOnly)
            {
                _api.EngineMode = G8OCREngineMode.CubeOnly;
            }
            else if (mode == OcrEngineMode.TesseractCubeCombined)
            {
                _api.EngineMode = G8OCREngineMode.TesseractCubeCombined;
            }
            else if (mode == OcrEngineMode.TesseractOnly)
            {
                _api.EngineMode = G8OCREngineMode.TesseractOnly;
            }
        }
예제 #33
0
        /// <summary>
        /// Translates an OcrEngineMode into the corresponding OcrMode integer constant.
        /// </summary>
        /// <param name="mode">The engine mode to translate.</param>
        /// <returns>
        /// The matching OcrMode constant; OcrMode.CubeOnly is returned for CubeOnly and
        /// for any value not handled explicitly.
        /// </returns>
        private int GetOcrEngineMode(OcrEngineMode mode)
        {
            if (mode == OcrEngineMode.TesseractCubeCombined)
            {
                return OcrMode.TesseractCubeCombined;
            }

            if (mode == OcrEngineMode.TesseractOnly)
            {
                return OcrMode.TesseractOnly;
            }

            // CubeOnly and every unrecognized value share the same result.
            return OcrMode.CubeOnly;
        }
예제 #34
0
        /// <summary>
        /// Minimal example: creates a TessBaseAPI, initializes it for "eng" using the data
        /// in "./tessdata/" and selects the AUTO_OSD page segmentation mode.
        /// </summary>
        /// <exception cref="Exception">Thrown when the engine cannot be initialized.</exception>
        static void example1()
        {
            string               dataPath = "./tessdata/";
            string               language = "eng";
            OcrEngineMode        oem      = OcrEngineMode.DEFAULT;
            PageSegmentationMode psm      = PageSegmentationMode.AUTO_OSD;

            TessBaseAPI tessBaseAPI = new TessBaseAPI();

            try
            {
                // Initialize tesseract-ocr
                if (!tessBaseAPI.Init(dataPath, language, oem))
                {
                    throw new Exception("Could not initialize tesseract.");
                }

                // Set the Page Segmentation mode
                tessBaseAPI.SetPageSegMode(psm);
            }
            finally
            {
                // Release the engine on both the success and the failure path.
                tessBaseAPI.Dispose();
            }
        }
예제 #35
0
 /// <summary>
 /// Replaces the shared OCR engine with a new Tesseract instance restricted to the
 /// digit white list "1234567890".
 /// </summary>
 /// <param name="path">Data path passed to the Tesseract constructor.</param>
 /// <param name="lang">Language passed to the Tesseract constructor.</param>
 /// <param name="mode">OCR engine mode passed to the Tesseract constructor.</param>
 /// <remarks>
 /// Best effort: on failure the engine field is left null and no exception escapes;
 /// the error message is written to the debug output.
 /// </remarks>
 private static void InitOcr(String path, String lang, OcrEngineMode mode)
 {
     try
     {
         if (_ocr != null)
         {
             _ocr.Dispose();
             _ocr = null;
         }
         _ocr = new Tesseract(path, lang, mode, "1234567890");
     }
     catch (Exception e)
     {
         _ocr = null;
         // Keep the failure reason visible instead of discarding it silently.
         System.Diagnostics.Debug.WriteLine("Failed to initialize tesseract OCR engine, error: " + e.Message);
     }
 }
예제 #36
0
        /// <summary>
        /// Replaces the current OCR engine with a new Tesseract instance.
        /// </summary>
        /// <param name="dataPath">Data path passed to the Tesseract constructor.</param>
        /// <param name="lang">Language passed to the Tesseract constructor.</param>
        /// <param name="mode">OCR engine mode passed to the Tesseract constructor.</param>
        /// <remarks>
        /// On failure OCR is set to null and the error message is written to the console;
        /// no exception escapes.
        /// </remarks>
        private void InitOCR(string dataPath, string lang, OcrEngineMode mode)
        {
            try
            {
                // Drop the previous engine, if any, before building the new one.
                if (OCR != null)
                {
                    OCR.Dispose();
                    OCR = null;
                }

                OCR = new Tesseract(dataPath, lang, mode);
            }
            catch (Exception ex)
            {
                OCR = null;

                string report = "Failed to initialize tesseract OCR engine, error: " + ex.Message;
                Console.WriteLine(report);
            }
        }
예제 #37
0
파일: OCRForm.cs 프로젝트: neutmute/emgucv
 /// <summary>
 /// Replaces the current OCR engine with a new Tesseract instance and shows the
 /// selected language and mode in the language label.
 /// </summary>
 /// <param name="path">Data path passed to the Tesseract constructor.</param>
 /// <param name="lang">Language passed to the Tesseract constructor.</param>
 /// <param name="mode">OCR engine mode passed to the Tesseract constructor.</param>
 /// <remarks>
 /// On failure the engine field is set to null, the error message is shown in a
 /// message box and the label reports the failure; no exception escapes.
 /// </remarks>
 private void InitOcr(String path, String lang, OcrEngineMode mode)
 {
    const String failureCaption = "Failed to initialize tesseract OCR engine";

    try
    {
       // Drop the previous engine, if any, before building the new one.
       if (_ocr != null)
       {
          _ocr.Dispose();
          _ocr = null;
       }

       _ocr = new Tesseract(path, lang, mode);

       String description = String.Format("{0} : {1}", lang, mode.ToString());
       languageNameLabel.Text = description;
    }
    catch (Exception ex)
    {
       _ocr = null;
       MessageBox.Show(ex.Message, failureCaption, MessageBoxButtons.OK);
       languageNameLabel.Text = failureCaption;
    }
 }
 /// <summary>
 /// Applies the given engine mode to the underlying API by mapping it to the
 /// matching G8OCREngineMode value.
 /// </summary>
 /// <param name="mode">The engine mode to apply; values other than the three handled below leave the API unchanged.</param>
 public void SetOcrEngineMode (OcrEngineMode mode)
 {
     if (mode == OcrEngineMode.CubeOnly) {
         _api.EngineMode = G8OCREngineMode.CubeOnly;
     } else if (mode == OcrEngineMode.TesseractCubeCombined) {
         _api.EngineMode = G8OCREngineMode.TesseractCubeCombined;
     } else if (mode == OcrEngineMode.TesseractOnly) {
         _api.EngineMode = G8OCREngineMode.TesseractOnly;
     }
 }
예제 #39
0
 /// <summary>
 /// Translates an OcrEngineMode into the corresponding OcrMode integer constant.
 /// </summary>
 /// <param name="mode">The engine mode to translate.</param>
 /// <returns>
 /// The matching OcrMode constant; OcrMode.CubeOnly is returned for CubeOnly and
 /// for any value not handled explicitly.
 /// </returns>
 private int GetOcrEngineMode (OcrEngineMode mode)
 {
     if (mode == OcrEngineMode.TesseractCubeCombined) {
         return OcrMode.TesseractCubeCombined;
     }

     if (mode == OcrEngineMode.TesseractOnly) {
         return OcrMode.TesseractOnly;
     }

     // CubeOnly and every unrecognized value share the same result.
     return OcrMode.CubeOnly;
 }
예제 #40
0
      /// <summary>
      /// Create a tesseract OCR engine that recognizes only the characters of a white list.
      /// </summary>
      /// <param name="dataPath">
      /// The name of the parent directory of tessdata. It must end in / ; any name after
      /// the last / will be stripped.
      /// </param>
      /// <param name="language">
      /// Usually an ISO 639-3 string; NULL will default to eng.
      /// It is entirely safe (and eventually will be efficient too) to call Init multiple
      /// times on the same instance to change language, or just to reset the classifier.
      /// Several languages can be requested with a string of the form
      /// [~]&lt;lang&gt;[+[~]&lt;lang&gt;]*, e.g. hin+eng will load Hindi and English.
      /// A language may specify internally that it wants to be loaded with one or more other
      /// languages; the ~ sign overrides that, e.g. if hin were set to load eng by default,
      /// then hin+~eng would force loading only hin.
      /// The number of loaded languages is limited only by memory, with the caveat that each
      /// additional language impacts both speed and accuracy: there is more work to do to
      /// decide on the applicable language and more chance of hallucinating incorrect words.
      /// </param>
      /// <param name="mode">OCR engine mode</param>
      /// <param name="whiteList">
      /// The white list for OCR, e.g. "1234567890" to recognize digits only. Note that the
      /// white list currently seems to only work with OcrEngineMode.OEM_TESSERACT_ONLY.
      /// </param>
      /// <exception cref="ArgumentException">
      /// Thrown when <paramref name="mode"/> is CubeOnly or TesseractCubeCombined.
      /// </exception>
      public Tesseract(String dataPath, String language, OcrEngineMode mode, String whiteList)
         : this(dataPath, language, mode)
      {
         bool usesCube = mode == OcrEngineMode.CubeOnly
                         || mode == OcrEngineMode.TesseractCubeCombined;
         if (usesCube)
         {
            throw new ArgumentException("White list is not supported by CUBE engine");
         }

         SetVariable("tessedit_char_whitelist", whiteList);
      }
예제 #41
0
      /// <summary>
      /// Initialize the OCR engine using the specific dataPath and language name.
      /// </summary>
      /// <param name="dataPath">
      /// The name of the parent directory of tessdata. It must end in / ; any name after
      /// the last / will be stripped.
      /// </param>
      /// <param name="language">
      /// Usually an ISO 639-3 string; NULL will default to eng.
      /// It is entirely safe (and eventually will be efficient too) to call Init multiple
      /// times on the same instance to change language, or just to reset the classifier.
      /// Several languages can be requested with a string of the form
      /// [~]&lt;lang&gt;[+[~]&lt;lang&gt;]*, e.g. hin+eng will load Hindi and English.
      /// A language may specify internally that it wants to be loaded with one or more other
      /// languages; the ~ sign overrides that, e.g. if hin were set to load eng by default,
      /// then hin+~eng would force loading only hin.
      /// The number of loaded languages is limited only by memory, with the caveat that each
      /// additional language impacts both speed and accuracy: there is more work to do to
      /// decide on the applicable language and more chance of hallucinating incorrect words.
      /// </param>
      /// <param name="mode">OCR engine mode</param>
      /// <exception cref="ArgumentException">
      /// Thrown when the native initialization call returns a non-zero result.
      /// </exception>
      public void Init(String dataPath, String language, OcrEngineMode mode)
      {
         char separator = System.IO.Path.DirectorySeparatorChar;
         bool endsWithSeparator = dataPath.Length > 0 && dataPath[dataPath.Length - 1] == separator;

         if (!endsWithSeparator)
         {
            // The path does not end in a separator: when it contains one, keep only the part
            // up to and including the last separator, the same way tesseract-ocr calculates the folder.
            int lastSeparator = dataPath.LastIndexOf(separator);
            if (lastSeparator != -1)
            {
               dataPath = dataPath.Substring(0, lastSeparator + 1);
            }
         }

         int initResult = OcrInvoke.TessBaseAPIInit(_ptr, dataPath, language, mode);
         if (initResult != 0)
         {
            throw new ArgumentException(String.Format("Unable to create ocr model using Path {0} and language {1}.", dataPath, language));
         }
      }
예제 #42
0
 /// <summary>
 /// Create a tesseract OCR engine and initialize it for the given data path, language and mode.
 /// </summary>
 /// <param name="dataPath">
 /// The name of the parent directory of tessdata. It must end in / ; any name after
 /// the last / will be stripped.
 /// </param>
 /// <param name="language">
 /// Usually an ISO 639-3 string; NULL will default to eng.
 /// It is entirely safe (and eventually will be efficient too) to call Init multiple
 /// times on the same instance to change language, or just to reset the classifier.
 /// Several languages can be requested with a string of the form
 /// [~]&lt;lang&gt;[+[~]&lt;lang&gt;]*, e.g. hin+eng will load Hindi and English.
 /// A language may specify internally that it wants to be loaded with one or more other
 /// languages; the ~ sign overrides that, e.g. if hin were set to load eng by default,
 /// then hin+~eng would force loading only hin.
 /// The number of loaded languages is limited only by memory, with the caveat that each
 /// additional language impacts both speed and accuracy: there is more work to do to
 /// decide on the applicable language and more chance of hallucinating incorrect words.
 /// </param>
 /// <param name="mode">OCR engine mode</param>
 public Tesseract(String dataPath, String language, OcrEngineMode mode)
    : this()
 {
    // All of the setup is delegated to Init.
    this.Init(dataPath, language, mode);
 }