/// <summary>
/// Gets OCR result from provided multi-page image and returns result as
/// list of strings for each page.
/// </summary>
/// <remarks>
/// Gets OCR result from provided multi-page image and returns result as
/// list of strings for each page. This method is used for tiff images
/// when preprocessing is not needed.
/// <para>
/// Tesseract is initialized for the requested output format, the pages of the
/// image are loaded through <see cref="TesseractOcrUtil"/> and every page is
/// recognized in order. Regardless of the outcome, the current tesseract
/// instance (if any) is handed to
/// <c>TesseractOcrUtil.DisposeTesseractInstance</c> before returning.
/// </para>
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <returns>
/// list of result strings (one per page) that will be written to
/// temporary files later
/// </returns>
/// <exception cref="Tesseract4OcrException">
/// thrown when a <c>TesseractException</c> occurs during recognition; the
/// original exception is attached as the inner exception
/// </exception>
private IList<String> GetOcrResultForMultiPage(FileInfo inputImage, OutputFormat outputFormat) {
    IList<String> resultList = new List<String>();
    try {
        InitializeTesseract(outputFormat);
        TesseractOcrUtil util = new TesseractOcrUtil();
        util.InitializeImagesListFromTiff(inputImage);
        int numOfPages = util.GetListOfPages().Count;
        for (int i = 0; i < numOfPages; i++) {
            String result = util.GetOcrResultAsString(GetTesseractInstance(), util.GetListOfPages()[i],
                outputFormat);
            resultList.Add(result);
        }
    }
    catch (TesseractException e) {
        String msg = MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED, e.Message);
        LogManager.GetLogger(GetType()).Error(msg);
        // Attach the tesseract failure as the inner exception so the root
        // cause (type and stack trace) is not lost to callers.
        throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED, e);
    }
    finally {
        // Only dispose an instance that actually exists, so that a failure
        // before the instance was created cannot be masked by an error
        // raised from the cleanup itself.
        // NOTE(review): assumes nothing needs to be released when the instance is null - confirm.
        var instanceToDispose = GetTesseractInstance();
        if (instanceToDispose != null) {
            TesseractOcrUtil.DisposeTesseractInstance(instanceToDispose);
        }
    }
    return resultList;
}
/// <summary>
/// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
/// of input image (by default 1st).
/// </summary>
/// <remarks>
/// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
/// of input image (by default 1st).
/// Please note that list of output files is accepted instead of a single file because
/// page number parameter is not respected in case of TIFF images not requiring preprocessing.
/// In other words, if the passed image is the TIFF image and according to the
/// <see cref="Tesseract4OcrEngineProperties"/>
/// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
/// is expected to be same as number of pages in the image, otherwise, only one file is expected.
/// <para>
/// A result is skipped (nothing is written) when it is <c>null</c> or when the list of
/// output files has no non-null entry at the same index.
/// After processing, the tesseract instance is disposed (if present) and the user words
/// file is deleted when the engine properties mark it as temporary.
/// </para>
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFiles">
/// <see cref="System.Collections.Generic.IList{T}"/>
/// of output files
/// (one per each page)
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <param name="pageNumber">number of page to be processed</param>
/// <exception cref="Tesseract4OcrException">
/// thrown when a <see cref="Tesseract4OcrException"/> is raised while processing (it is logged
/// and re-thrown wrapped with the same message), including the case when a result cannot be
/// written to its output file
/// </exception>
internal override void DoTesseractOcr(FileInfo inputImage, IList<FileInfo> outputFiles, OutputFormat outputFormat,
    int pageNumber) {
    ScheduledCheck();
    try {
        ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
        InitializeTesseract(outputFormat);
        OnEvent();
        // if preprocessing is not needed and provided image is tiff,
        // the image will be paginated and separate pages will be OCRed
        IList<String> resultList;
        if (!GetTesseract4OcrEngineProperties().IsPreprocessingImages()
            && ImagePreprocessingUtil.IsTiffImage(inputImage)) {
            resultList = GetOcrResultForMultiPage(inputImage, outputFormat);
        }
        else {
            // single page: the list is created only here, so the multi-page
            // branch does not allocate a list that is immediately discarded
            resultList = new List<String>();
            resultList.Add(GetOcrResultForSinglePage(inputImage, outputFormat, pageNumber));
        }
        // list of result strings is written to separate files
        // (one for each page)
        for (int i = 0; i < resultList.Count; i++) {
            String result = resultList[i];
            // there may be fewer output files than results; such results are skipped
            FileInfo outputFile = i >= outputFiles.Count ? null : outputFiles[i];
            if (result != null && outputFile != null) {
                try {
                    using (TextWriter writer = new StreamWriter(new FileStream(outputFile.FullName, FileMode.Create),
                        System.Text.Encoding.UTF8)) {
                        writer.Write(result);
                    }
                }
                catch (System.IO.IOException e) {
                    LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
                        Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE, e.Message));
                    // Keep the I/O failure as the inner exception so the root
                    // cause is still reachable from the exception chain.
                    throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED, e);
                }
            }
        }
    }
    catch (Tesseract4OcrException e) {
        LogManager.GetLogger(GetType()).Error(e.Message);
        throw new Tesseract4OcrException(e.Message, e);
    }
    finally {
        if (tesseractInstance != null) {
            TesseractOcrUtil.DisposeTesseractInstance(tesseractInstance);
        }
        // remove the user words file only if the properties flag it as temporary
        if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null
            && GetTesseract4OcrEngineProperties().IsUserWordsFileTemporary()) {
            TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
        }
    }
}