Esempio n. 1
0
        /// <summary>
        /// Checks that the executable-based engine writes an hOCR file for an image whose
        /// path contains a non-ASCII character, and that the produced file can be deleted.
        /// </summary>
        public virtual void TestDoTesseractOcrForNonAsciiPathForExecutable()
        {
            // Arrange: the image lives in a directory whose name contains a non-ASCII letter.
            FileInfo sourceImage = new FileInfo(TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png");
            FileInfo hocrTarget = new FileInfo(TesseractOcrUtil.GetTempFilePath("test", ".hocr"));

            Tesseract4OcrEngineProperties engineProperties = new Tesseract4OcrEngineProperties();
            engineProperties.SetPathToTessData(GetTessDataDirectory());
            engineProperties.SetPreprocessingImages(false);
            Tesseract4ExecutableOcrEngine ocrEngine =
                new Tesseract4ExecutableOcrEngine(GetTesseractDirectory(), engineProperties);

            // Act
            ocrEngine.DoTesseractOcr(sourceImage, hocrTarget, OutputFormat.HOCR);

            // Assert: the output exists after OCR and is gone after the explicit delete.
            string hocrPath = hocrTarget.FullName;
            NUnit.Framework.Assert.IsTrue(File.Exists(hocrPath));
            TesseractHelper.DeleteFile(hocrPath);
            NUnit.Framework.Assert.IsFalse(File.Exists(hocrPath));
        }
Esempio n. 2
0
        /// <summary>
        /// Runs OCR on the given input image file and collects the recognized data
        /// in the requested output format.
        /// </summary>
        /// <remarks>
        /// Temporary output files are created for every OCR pass and all of them are deleted
        /// before the method returns. A <see cref="System.IO.IOException"/> raised while
        /// processing is logged and not rethrown; the result built so far is returned in that
        /// case, which is <c>null</c> when no pass has completed.
        /// </remarks>
        /// <param name="input">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFormat">
        /// <see cref="OutputFormat"/>
        /// of the result to be produced
        /// </param>
        /// <returns>
        /// <see cref="ITesseractOcrResult"/>
        /// instance:
        /// <see cref="TextInfoTesseractOcrResult"/>
        /// when the output format is HOCR,
        /// <see cref="StringTesseractOcrResult"/>
        /// otherwise
        /// </returns>
        private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat
                                                                                  )
        {
            IDictionary<int, IList<TextInfo>> recognizedPages = new LinkedDictionary<int, IList<TextInfo>>();
            StringBuilder recognizedText = new StringBuilder();
            IList<FileInfo> ocrOutputs = new List<FileInfo>();
            AbstractTesseract4OcrEngine.ITesseractOcrResult ocrResult = null;

            try {
                // Only TIFF images are asked for their page count; anything else counts as one page.
                int pagesInImage = 1;
                if (ImagePreprocessingUtil.IsTiffImage(input))
                {
                    pagesInImage = ImagePreprocessingUtil.GetNumberOfPageTiff(input);
                }

                // With preprocessing: one OCR pass per page, one output file per pass.
                // Without preprocessing: a single pass with one output file per page.
                int passCount;
                int outputsPerPass;
                if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                {
                    passCount = pagesInImage;
                    outputsPerPass = 1;
                }
                else
                {
                    passCount = 1;
                    outputsPerPass = pagesInImage;
                }

                for (int pass = 1; pass <= passCount; pass++)
                {
                    bool hocrRequested = outputFormat.Equals(OutputFormat.HOCR);
                    String outputExtension;
                    if (hocrRequested)
                    {
                        outputExtension = ".hocr";
                    }
                    else
                    {
                        outputExtension = ".txt";
                    }

                    // The list is not reset between passes: it keeps every temporary file created
                    // so far and is handed as a whole to the OCR call and to the readers below.
                    for (int created = 0; created < outputsPerPass; created++)
                    {
                        ocrOutputs.Add(CreateTempFile(outputExtension));
                    }

                    DoTesseractOcr(input, ocrOutputs, outputFormat, pass);

                    if (hocrRequested)
                    {
                        IDictionary<int, IList<TextInfo>> parsedPages = TesseractHelper.ParseHocrFile(
                            ocrOutputs, GetTesseract4OcrEngineProperties().GetTextPositioning());
                        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                        {
                            // Store the parsed entry with key 1 under the current pass number.
                            recognizedPages.Put(pass, parsedPages.Get(1));
                        }
                        else
                        {
                            recognizedPages = parsedPages;
                        }
                        ocrResult = new AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult(recognizedPages);
                        continue;
                    }

                    // Plain-text output: append the content of every output file that exists.
                    foreach (FileInfo textOutput in ocrOutputs)
                    {
                        if (!File.Exists(textOutput.FullName))
                        {
                            continue;
                        }
                        recognizedText.Append(TesseractHelper.ReadTxtFile(textOutput));
                    }
                    ocrResult = new AbstractTesseract4OcrEngine.StringTesseractOcrResult(recognizedText.ToString());
                }
            }
            catch (System.IO.IOException e) {
                LogManager.GetLogger(GetType()).Error(
                    MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE, e.Message));
            }
            finally {
                // Remove every temporary file created above, whether or not OCR succeeded.
                foreach (FileInfo leftover in ocrOutputs)
                {
                    TesseractHelper.DeleteFile(leftover.FullName);
                }
            }

            return ocrResult;
        }
Esempio n. 3
0
        /// <summary>
        /// Runs the tesseract command line tool on the selected page of the input image.
        /// </summary>
        /// <remarks>
        /// A list of output files is accepted instead of a single file because the page number
        /// parameter is not respected for TIFF images that do not require preprocessing: in that
        /// case, according to
        /// <see cref="Tesseract4OcrEngineProperties"/>
        /// , every page of the TIFF image is OCRed and the list is expected to hold one file per
        /// page; otherwise only one file is expected. This method passes only the first entry of
        /// the list to the command line.
        /// <para>
        /// A
        /// <see cref="Tesseract4OcrException"/>
        /// raised while preparing or running the command is logged and rethrown wrapped in a new
        /// <see cref="Tesseract4OcrException"/>
        /// carrying the same message. Afterwards the image path returned by preprocessing is
        /// deleted when it differs from the input path, and the user words file is deleted when
        /// the properties report it as temporary.
        /// </para>
        /// </remarks>
        /// <param name="inputImage">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFiles">
        /// <see cref="System.Collections.Generic.IList{T}"/>
        /// of output files (one per each page)
        /// </param>
        /// <param name="outputFormat">
        /// selected
        /// <see cref="OutputFormat"/>
        /// for tesseract
        /// </param>
        /// <param name="pageNumber">number of page to be processed</param>
        internal override void DoTesseractOcr(FileInfo inputImage, IList <FileInfo> outputFiles, OutputFormat outputFormat
                                              , int pageNumber)
        {
            ScheduledCheck();

            IList<String> commandArgs = new List<String>();
            String workingImagePath = null;

            try {
                workingImagePath = inputImage.FullName;

                // Resolve the tesseract executable; it is the first command argument.
                String configuredExecutable = GetPathToExecutable();
                if (String.IsNullOrEmpty(configuredExecutable))
                {
                    throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE);
                }
                String executable = IsWindows() ? AddQuotes(configuredExecutable) : configuredExecutable;
                commandArgs.Add(executable);
                CheckTesseractInstalled(executable);

                // Tess data location.
                AddTessData(commandArgs);

                // Languages are validated before any preprocessing starts.
                ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());

                // Preprocess the input if needed and register the resulting image as input.
                workingImagePath = PreprocessImage(inputImage, pageNumber);
                AddInputFile(commandArgs, workingImagePath);

                // Tesseract cannot parse non-ASCII characters in the input path,
                // so the command first moves to the image directory.
                IList<String> changeDirectoryArgs = MoveToImageDirectory(workingImagePath);

                // Remaining arguments, appended in the order the command line expects.
                AddOutputFile(commandArgs, outputFiles[0], outputFormat, workingImagePath);
                AddPageSegMode(commandArgs);
                AddUserWords(commandArgs, workingImagePath);
                AddLanguages(commandArgs);
                if (outputFormat.Equals(OutputFormat.HOCR))
                {
                    SetHocrOutput(commandArgs);
                }
                AddDefaultDpi(commandArgs);

                OnEvent();

                String shell = IsWindows() ? "cmd" : "bash";
                TesseractHelper.RunCommand(shell, CreateCommandList(changeDirectoryArgs, commandArgs));
            }
            catch (Tesseract4OcrException e) {
                LogManager.GetLogger(GetType()).Error(e.Message);
                throw new Tesseract4OcrException(e.Message, e);
            }
            finally {
                // Delete the working image only when it is not the caller's own input file.
                try {
                    bool workingImageIsSeparateFile =
                        workingImagePath != null && !inputImage.FullName.Equals(workingImagePath);
                    if (workingImageIsSeparateFile)
                    {
                        TesseractHelper.DeleteFile(workingImagePath);
                    }
                }
                catch (SecurityException e) {
                    LogManager.GetLogger(GetType()).Error(
                        MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, workingImagePath, e.Message));
                }

                // Delete the user words file when the properties mark it as temporary.
                try {
                    String userWordsPath = GetTesseract4OcrEngineProperties().GetPathToUserWordsFile();
                    if (userWordsPath != null && GetTesseract4OcrEngineProperties().IsUserWordsFileTemporary())
                    {
                        TesseractHelper.DeleteFile(userWordsPath);
                    }
                }
                catch (SecurityException e) {
                    LogManager.GetLogger(GetType()).Error(
                        MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE,
                                                 GetTesseract4OcrEngineProperties().GetPathToUserWordsFile(), e.Message));
                }
            }
        }