Example #1
0
        /// <summary>
        /// Gets OCR result from provided multi-page image and returns result as
        /// list of strings for each page.
        /// </summary>
        /// <remarks>
        /// Gets OCR result from provided multi-page image and returns result as
        /// list of strings for each page. This method is used for tiff images
        /// when preprocessing is not needed.
        /// </remarks>
        /// <param name="inputImage">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFormat">
        /// selected
        /// <see cref="OutputFormat"/>
        /// for tesseract
        /// </param>
        /// <returns>
        /// list of result string that will be written to a temporary files
        /// later
        /// </returns>
        private IList <String> GetOcrResultForMultiPage(FileInfo inputImage, OutputFormat outputFormat)
        {
            IList <String> resultList = new List <String>();

            try {
                InitializeTesseract(outputFormat);
                TesseractOcrUtil util = new TesseractOcrUtil();
                util.InitializeImagesListFromTiff(inputImage);
                int numOfPages = util.GetListOfPages().Count;
                for (int i = 0; i < numOfPages; i++)
                {
                    String result = util.GetOcrResultAsString(GetTesseractInstance(), util.GetListOfPages()[i], outputFormat);
                    resultList.Add(result);
                }
            }
            catch (TesseractException e) {
                String msg = MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED, e.Message);
                LogManager.GetLogger(GetType()).Error(msg);
                throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED);
            }
            finally {
                TesseractOcrUtil.DisposeTesseractInstance(GetTesseractInstance());
            }
            return(resultList);
        }
Example #2
0
 /// <summary>
 /// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
 /// of input image (by default 1st).
 /// </summary>
 /// <remarks>
 /// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
 /// of input image (by default 1st).
 /// Please note that list of output files is accepted instead of a single file because
 /// page number parameter is not respected in case of TIFF images not requiring preprocessing.
 /// In other words, if the passed image is the TIFF image and according to the
 /// <see cref="Tesseract4OcrEngineProperties"/>
 /// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
 /// is expected to be same as number of pages in the image, otherwise, only one file is expected
 /// </remarks>
 /// <param name="inputImage">
 /// input image
 /// <see cref="System.IO.FileInfo"/>
 /// </param>
 /// <param name="outputFiles">
 ///
 /// <see cref="System.Collections.IList{E}"/>
 /// of output files
 /// (one per each page)
 /// </param>
 /// <param name="outputFormat">
 /// selected
 /// <see cref="OutputFormat"/>
 /// for tesseract
 /// </param>
 /// <param name="pageNumber">number of page to be processed</param>
 internal override void DoTesseractOcr(FileInfo inputImage, IList <FileInfo> outputFiles, OutputFormat outputFormat
                                       , int pageNumber)
 {
     ScheduledCheck();
     try {
         ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
         InitializeTesseract(outputFormat);
         OnEvent();
         // if preprocessing is not needed and provided image is tiff,
         // the image will be paginated and separate pages will be OCRed
         IList <String> resultList = new List <String>();
         if (!GetTesseract4OcrEngineProperties().IsPreprocessingImages() && ImagePreprocessingUtil.IsTiffImage(inputImage
                                                                                                               ))
         {
             resultList = GetOcrResultForMultiPage(inputImage, outputFormat);
         }
         else
         {
             resultList.Add(GetOcrResultForSinglePage(inputImage, outputFormat, pageNumber));
         }
         // list of result strings is written to separate files
         // (one for each page)
         for (int i = 0; i < resultList.Count; i++)
         {
             String   result     = resultList[i];
             FileInfo outputFile = i >= outputFiles.Count ? null : outputFiles[i];
             if (result != null && outputFile != null)
             {
                 try {
                     using (TextWriter writer = new StreamWriter(new FileStream(outputFile.FullName, FileMode.Create), System.Text.Encoding
                                                                 .UTF8)) {
                         writer.Write(result);
                     }
                 }
                 catch (System.IO.IOException e) {
                     LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE
                                                                                    , e.Message));
                     throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED);
                 }
             }
         }
     }
     catch (Tesseract4OcrException e) {
         LogManager.GetLogger(GetType()).Error(e.Message);
         throw new Tesseract4OcrException(e.Message, e);
     }
     finally {
         if (tesseractInstance != null)
         {
             TesseractOcrUtil.DisposeTesseractInstance(tesseractInstance);
         }
         if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null && GetTesseract4OcrEngineProperties
                 ().IsUserWordsFileTemporary())
         {
             TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
         }
     }
 }