Esempio n. 1
0
 /// <summary>
 /// Check whether tesseract executable is installed on the machine and
 /// provided path to tesseract executable is correct.
 /// </summary>
 /// <param name="execPath">path to tesseract executable</param>
 private void CheckTesseractInstalled(String execPath)
 {
     // The executable is invoked with "--version" as its only argument;
     // a failure reported by the helper is treated as "tesseract not found".
     var versionArguments = JavaCollectionsUtil.SingletonList <String>("--version");

     try
     {
         TesseractHelper.RunCommand(execPath, versionArguments);
     }
     catch (Tesseract4OcrException cause)
     {
         // Keep the original failure as the inner exception so the root cause is not lost.
         throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_NOT_FOUND, cause);
     }
 }
Esempio n. 2
0
        /// <summary>
        /// Performs tesseract OCR using command line tool for the selected page
        /// of input image (by default 1st).
        /// </summary>
        /// <remarks>
        /// Performs tesseract OCR using command line tool for the selected page
        /// of input image (by default 1st).
        /// Please note that list of output files is accepted instead of a single file because
        /// page number parameter is not respected in case of TIFF images not requiring preprocessing.
        /// In other words, if the passed image is the TIFF image and according to the
        /// <see cref="Tesseract4OcrEngineProperties"/>
        /// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
        /// is expected to be same as number of pages in the image, otherwise, only one file is expected
        /// </remarks>
        /// <param name="inputImage">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFiles">
        /// <see cref="System.Collections.Generic.IList{T}"/>
        /// of output files
        /// (one per page)
        /// </param>
        /// <param name="outputFormat">
        /// selected
        /// <see cref="OutputFormat"/>
        /// for tesseract
        /// </param>
        /// <param name="pageNumber">number of page to be processed</param>
        internal override void DoTesseractOcr(FileInfo inputImage, IList <FileInfo> outputFiles, OutputFormat outputFormat
                                              , int pageNumber)
        {
            ScheduledCheck();
            IList <String> @params = new List <String>();
            // Stays null until the input path has been read, so the cleanup in the
            // finally block can tell whether there is an image path to compare at all.
            String imagePath = null;

            try {
                imagePath = inputImage.FullName;
                // path to tesseract executable; read once, IsNullOrEmpty already covers null
                String configuredPath = GetPathToExecutable();
                if (String.IsNullOrEmpty(configuredPath))
                {
                    throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE);
                }
                // on Windows the path is passed through AddQuotes, elsewhere it is used as is
                String execPath = IsWindows() ? AddQuotes(configuredPath) : configuredPath;
                @params.Add(execPath);
                CheckTesseractInstalled(execPath);
                // path to tess data
                AddTessData(@params);
                // validate languages before preprocessing started
                ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
                // preprocess input file if needed and add it;
                // from here on imagePath may differ from the original input path
                imagePath = PreprocessImage(inputImage, pageNumber);
                AddInputFile(@params, imagePath);
                // move to image directory as tesseract cannot parse non ascii
                // characters in input path
                IList <String> moveToDirectoryParams = MoveToImageDirectory(imagePath);
                // output file (only the first entry of the list is passed on here)
                AddOutputFile(@params, outputFiles[0], outputFormat, imagePath);
                // page segmentation mode
                AddPageSegMode(@params);
                // add user words if needed
                AddUserWords(@params, imagePath);
                // required languages
                AddLanguages(@params);
                if (outputFormat.Equals(OutputFormat.HOCR))
                {
                    // path to hocr script
                    SetHocrOutput(@params);
                }
                // set default user defined dpi
                AddDefaultDpi(@params);
                OnEvent();
                // the assembled command is run through the platform shell
                TesseractHelper.RunCommand(IsWindows() ? "cmd" : "bash",
                                           CreateCommandList(moveToDirectoryParams, @params));
            }
            catch (Tesseract4OcrException e) {
                // log the failure, then rethrow wrapped with the same message and
                // the original exception attached as the inner exception
                LogManager.GetLogger(GetType()).Error(e.Message);
                throw new Tesseract4OcrException(e.Message, e);
            }
            finally {
                // Cleanup step 1: delete the processed image, but only when its path differs
                // from the input image's own path, so the caller's file is never removed.
                try {
                    if (imagePath != null && !inputImage.FullName.Equals(imagePath))
                    {
                        TesseractHelper.DeleteFile(imagePath);
                    }
                }
                catch (SecurityException e) {
                    LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
                        Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, imagePath, e.Message));
                }
                // Cleanup step 2: delete the user words file when the engine properties
                // report a path for it and mark it as temporary.
                try {
                    if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null
                        && GetTesseract4OcrEngineProperties().IsUserWordsFileTemporary())
                    {
                        TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
                    }
                }
                catch (SecurityException e) {
                    LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
                        Tesseract4LogMessageConstant.CANNOT_DELETE_FILE,
                        GetTesseract4OcrEngineProperties().GetPathToUserWordsFile(), e.Message));
                }
            }
        }