Beispiel #1
0
 /// <summary>
 /// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
 /// of input image (by default 1st).
 /// </summary>
 /// <remarks>
 /// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
 /// of input image (by default 1st).
 /// Please note that list of output files is accepted instead of a single file because
 /// page number parameter is not respected in case of TIFF images not requiring preprocessing.
 /// In other words, if the passed image is the TIFF image and according to the
 /// <see cref="Tesseract4OcrEngineProperties"/>
 /// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
 /// is expected to be same as number of pages in the image, otherwise, only one file is expected
 /// </remarks>
 /// <param name="inputImage">
 /// input image
 /// <see cref="System.IO.FileInfo"/>
 /// </param>
 /// <param name="outputFiles">
 ///
 /// <see cref="System.Collections.IList{E}"/>
 /// of output files
 /// (one per each page)
 /// </param>
 /// <param name="outputFormat">
 /// selected
 /// <see cref="OutputFormat"/>
 /// for tesseract
 /// </param>
 /// <param name="pageNumber">number of page to be processed</param>
 internal override void DoTesseractOcr(FileInfo inputImage, IList <FileInfo> outputFiles, OutputFormat outputFormat
                                       , int pageNumber)
 {
     ScheduledCheck();
     try {
         ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
         InitializeTesseract(outputFormat);
         OnEvent();
         // if preprocessing is not needed and provided image is tiff,
         // the image will be paginated and separate pages will be OCRed
         IList <String> resultList = new List <String>();
         if (!GetTesseract4OcrEngineProperties().IsPreprocessingImages() && ImagePreprocessingUtil.IsTiffImage(inputImage
                                                                                                               ))
         {
             resultList = GetOcrResultForMultiPage(inputImage, outputFormat);
         }
         else
         {
             resultList.Add(GetOcrResultForSinglePage(inputImage, outputFormat, pageNumber));
         }
         // list of result strings is written to separate files
         // (one for each page)
         for (int i = 0; i < resultList.Count; i++)
         {
             String   result     = resultList[i];
             FileInfo outputFile = i >= outputFiles.Count ? null : outputFiles[i];
             if (result != null && outputFile != null)
             {
                 try {
                     using (TextWriter writer = new StreamWriter(new FileStream(outputFile.FullName, FileMode.Create), System.Text.Encoding
                                                                 .UTF8)) {
                         writer.Write(result);
                     }
                 }
                 catch (System.IO.IOException e) {
                     LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE
                                                                                    , e.Message));
                     throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED);
                 }
             }
         }
     }
     catch (Tesseract4OcrException e) {
         LogManager.GetLogger(GetType()).Error(e.Message);
         throw new Tesseract4OcrException(e.Message, e);
     }
     finally {
         if (tesseractInstance != null)
         {
             TesseractOcrUtil.DisposeTesseractInstance(tesseractInstance);
         }
         if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null && GetTesseract4OcrEngineProperties
                 ().IsUserWordsFileTemporary())
         {
             TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
         }
     }
 }
Beispiel #2
0
 /// <summary>
 /// Check whether tesseract executable is installed on the machine and
 /// provided path to tesseract executable is correct.
 /// </summary>
 /// <param name="execPath">path to tesseract executable</param>
 private void CheckTesseractInstalled(String execPath)
 {
     try {
         TesseractHelper.RunCommand(execPath, JavaCollectionsUtil.SingletonList <String>("--version"));
     }
     catch (Tesseract4OcrException e) {
         throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_NOT_FOUND, e);
     }
 }
Beispiel #3
0
        public virtual void TestDetectAndFixBrokenBBoxes()
        {
            FileInfo hocrFile = new FileInfo(TEST_DOCUMENTS_DIRECTORY + "broken_bboxes.hocr");
            IDictionary <int, IList <TextInfo> > parsedHocr = TesseractHelper.ParseHocrFile(JavaCollectionsUtil.SingletonList
                                                                                                (hocrFile), TextPositioning.BY_WORDS_AND_LINES);
            TextInfo textInfo = parsedHocr.Get(1)[1];

            NUnit.Framework.Assert.AreEqual(383.0f, (float)textInfo.GetBbox()[0], 0.1);
            NUnit.Framework.Assert.AreEqual(101.0f, (float)textInfo.GetBbox()[1], 0.1);
            NUnit.Framework.Assert.AreEqual(514.0f, (float)textInfo.GetBbox()[2], 0.1);
            NUnit.Framework.Assert.AreEqual(136.0f, (float)textInfo.GetBbox()[3], 0.1);
        }
Beispiel #4
0
        /// <summary>
        /// Performs OCR using provided
        /// <see cref="iText.Pdfocr.IOcrEngine"/>
        /// for the given list of
        /// input images and saves output to a text file using provided path.
        /// </summary>
        /// <param name="inputImages">
        ///
        /// <see cref="System.Collections.IList{E}"/>
        /// of images to be OCRed
        /// </param>
        /// <param name="txtFile">file to be created</param>
        public virtual void CreateTxtFile(IList <FileInfo> inputImages, FileInfo txtFile)
        {
            LogManager.GetLogger(GetType()).Info(MessageFormatUtil.Format(Tesseract4LogMessageConstant.START_OCR_FOR_IMAGES
                                                                          , inputImages.Count));
            StringBuilder content = new StringBuilder();

            foreach (FileInfo inputImage in inputImages)
            {
                content.Append(DoImageOcr(inputImage, OutputFormat.TXT));
            }
            // write to file
            TesseractHelper.WriteToTextFile(txtFile.FullName, content.ToString());
        }
Beispiel #5
0
        public virtual void TestPixSavingAsPng()
        {
            String path        = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
            String tmpFileName = GetTargetDirectory() + "testPixSavingAsPng.png";

            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
            Pix pix = ImagePreprocessingUtil.ReadPix(new FileInfo(path));

            TesseractOcrUtil.SavePixToTempPngFile(tmpFileName, pix);
            NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(tmpFileName)));
            TesseractHelper.DeleteFile(tmpFileName);
            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
        }
Beispiel #6
0
        public virtual void TestImageSavingAsPng()
        {
            String path        = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
            String tmpFileName = GetTargetDirectory() + "testImageSavingAsPng.png";

            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
            System.Drawing.Bitmap bi = (System.Drawing.Bitmap)System.Drawing.Image.FromStream(new FileStream(path, FileMode.Open
                                                                                                             , FileAccess.Read));
            TesseractOcrUtil.SaveImageToTempPngFile(tmpFileName, bi);
            NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(tmpFileName)));
            TesseractHelper.DeleteFile(tmpFileName);
            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
        }
Beispiel #7
0
        public virtual void TestDoTesseractOcrForNonAsciiPathForExecutable()
        {
            String   path       = TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png";
            FileInfo imgFile    = new FileInfo(path);
            FileInfo outputFile = new FileInfo(TesseractOcrUtil.GetTempFilePath("test", ".hocr"));
            Tesseract4OcrEngineProperties properties = new Tesseract4OcrEngineProperties();

            properties.SetPathToTessData(GetTessDataDirectory());
            properties.SetPreprocessingImages(false);
            Tesseract4ExecutableOcrEngine engine = new Tesseract4ExecutableOcrEngine(GetTesseractDirectory(), properties
                                                                                     );

            engine.DoTesseractOcr(imgFile, outputFile, OutputFormat.HOCR);
            NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(outputFile.FullName)));
            TesseractHelper.DeleteFile(outputFile.FullName);
            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(outputFile.FullName)));
        }
Beispiel #8
0
        /// <summary>Reads data from the provided input image file.</summary>
        /// <param name="input">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFormat">
        ///
        /// <see cref="OutputFormat"/>
        /// for the result returned
        /// by
        /// <see cref="iText.Pdfocr.IOcrEngine"/>
        /// </param>
        /// <returns>
        ///
        /// <see cref="ITesseractOcrResult"/>
        /// instance, either
        /// <see cref="StringTesseractOcrResult"/>
        /// if output format is TXT, or
        /// <see cref="TextInfoTesseractOcrResult"/>
        /// if the output format is HOCR
        /// </returns>
        private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat
                                                                                  )
        {
            IDictionary <int, IList <TextInfo> > imageData = new LinkedDictionary <int, IList <TextInfo> >();
            StringBuilder    data      = new StringBuilder();
            IList <FileInfo> tempFiles = new List <FileInfo>();

            AbstractTesseract4OcrEngine.ITesseractOcrResult result = null;
            try {
                // image needs to be paginated only if it's tiff
                // or preprocessing isn't required
                int realNumOfPages = !ImagePreprocessingUtil.IsTiffImage(input) ? 1 : ImagePreprocessingUtil.GetNumberOfPageTiff
                                         (input);
                int numOfPages = GetTesseract4OcrEngineProperties().IsPreprocessingImages() ? realNumOfPages : 1;
                int numOfFiles = GetTesseract4OcrEngineProperties().IsPreprocessingImages() ? 1 : realNumOfPages;
                for (int page = 1; page <= numOfPages; page++)
                {
                    String extension = outputFormat.Equals(OutputFormat.HOCR) ? ".hocr" : ".txt";
                    for (int i = 0; i < numOfFiles; i++)
                    {
                        tempFiles.Add(CreateTempFile(extension));
                    }
                    DoTesseractOcr(input, tempFiles, outputFormat, page);
                    if (outputFormat.Equals(OutputFormat.HOCR))
                    {
                        IDictionary <int, IList <TextInfo> > pageData = TesseractHelper.ParseHocrFile(tempFiles, GetTesseract4OcrEngineProperties
                                                                                                          ().GetTextPositioning());
                        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                        {
                            imageData.Put(page, pageData.Get(1));
                        }
                        else
                        {
                            imageData = pageData;
                        }
                        result = new AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult(imageData);
                    }
                    else
                    {
                        foreach (FileInfo tmpFile in tempFiles)
                        {
                            if (File.Exists(System.IO.Path.Combine(tmpFile.FullName)))
                            {
                                data.Append(TesseractHelper.ReadTxtFile(tmpFile));
                            }
                        }
                        result = new AbstractTesseract4OcrEngine.StringTesseractOcrResult(data.ToString());
                    }
                }
            }
            catch (System.IO.IOException e) {
                LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE
                                                                               , e.Message));
            }
            finally {
                foreach (FileInfo file in tempFiles)
                {
                    TesseractHelper.DeleteFile(file.FullName);
                }
            }
            return(result);
        }
Beispiel #9
0
        /// <summary>
        /// Performs tesseract OCR using command line tool for the selected page
        /// of input image (by default 1st).
        /// </summary>
        /// <remarks>
        /// Performs tesseract OCR using command line tool for the selected page
        /// of input image (by default 1st).
        /// Please note that list of output files is accepted instead of a single file because
        /// page number parameter is not respected in case of TIFF images not requiring preprocessing.
        /// In other words, if the passed image is the TIFF image and according to the
        /// <see cref="Tesseract4OcrEngineProperties"/>
        /// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
        /// is expected to be same as number of pages in the image, otherwise, only one file is expected
        /// </remarks>
        /// <param name="inputImage">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFiles">
        ///
        /// <see cref="System.Collections.IList{E}"/>
        /// of output files
        /// (one per each page)
        /// </param>
        /// <param name="outputFormat">
        /// selected
        /// <see cref="OutputFormat"/>
        /// for tesseract
        /// </param>
        /// <param name="pageNumber">number of page to be processed</param>
        internal override void DoTesseractOcr(FileInfo inputImage, IList <FileInfo> outputFiles, OutputFormat outputFormat
                                              , int pageNumber)
        {
            ScheduledCheck();
            IList <String> @params   = new List <String>();
            String         execPath  = null;
            String         imagePath = null;

            try {
                imagePath = inputImage.FullName;
                // path to tesseract executable
                if (GetPathToExecutable() == null || String.IsNullOrEmpty(GetPathToExecutable()))
                {
                    throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE);
                }
                else
                {
                    if (IsWindows())
                    {
                        execPath = AddQuotes(GetPathToExecutable());
                    }
                    else
                    {
                        execPath = GetPathToExecutable();
                    }
                    @params.Add(execPath);
                }
                CheckTesseractInstalled(execPath);
                // path to tess data
                AddTessData(@params);
                // validate languages before preprocessing started
                ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
                // preprocess input file if needed and add it
                imagePath = PreprocessImage(inputImage, pageNumber);
                AddInputFile(@params, imagePath);
                // move to image directory as tesseract cannot parse non ascii
                // characters in input path
                IList <String> moveToDirectoryParams = MoveToImageDirectory(imagePath);
                // output file
                AddOutputFile(@params, outputFiles[0], outputFormat, imagePath);
                // page segmentation mode
                AddPageSegMode(@params);
                // add user words if needed
                AddUserWords(@params, imagePath);
                // required languages
                AddLanguages(@params);
                if (outputFormat.Equals(OutputFormat.HOCR))
                {
                    // path to hocr script
                    SetHocrOutput(@params);
                }
                // set default user defined dpi
                AddDefaultDpi(@params);
                OnEvent();
                TesseractHelper.RunCommand(IsWindows() ? "cmd" : "bash", CreateCommandList(moveToDirectoryParams, @params)
                                           );
            }
            catch (Tesseract4OcrException e) {
                LogManager.GetLogger(GetType()).Error(e.Message);
                throw new Tesseract4OcrException(e.Message, e);
            }
            finally {
                try {
                    if (imagePath != null && !inputImage.FullName.Equals(imagePath))
                    {
                        TesseractHelper.DeleteFile(imagePath);
                    }
                }
                catch (SecurityException e) {
                    LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE
                                                                                   , imagePath, e.Message));
                }
                try {
                    if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null && GetTesseract4OcrEngineProperties
                            ().IsUserWordsFileTemporary())
                    {
                        TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
                    }
                }
                catch (SecurityException e) {
                    LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_DELETE_FILE
                                                                                   , GetTesseract4OcrEngineProperties().GetPathToUserWordsFile(), e.Message));
                }
            }
        }