/// <summary>
/// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
/// of input image (by default 1st).
/// </summary>
/// <remarks>
/// Please note that list of output files is accepted instead of a single file because
/// page number parameter is not respected in case of TIFF images not requiring preprocessing.
/// In other words, if the passed image is the TIFF image and according to the
/// <see cref="Tesseract4OcrEngineProperties"/>
/// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
/// is expected to be same as number of pages in the image, otherwise, only one file is expected.
/// A result is written only when it is not null and an output file exists at the same index;
/// other results are skipped.
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFiles">
/// list of output files (one per each page)
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <param name="pageNumber">number of page to be processed</param>
/// <exception cref="Tesseract4OcrException">
/// rethrown (wrapped, after logging) when OCR fails, or thrown with
/// <c>TESSERACT_FAILED</c> when a result cannot be written to its output file
/// </exception>
internal override void DoTesseractOcr(FileInfo inputImage, IList<FileInfo> outputFiles, OutputFormat outputFormat,
    int pageNumber)
{
    ScheduledCheck();
    try
    {
        ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
        InitializeTesseract(outputFormat);
        OnEvent();

        // if preprocessing is not needed and provided image is tiff,
        // the image will be paginated and separate pages will be OCRed
        IList<String> resultList = new List<String>();
        if (!GetTesseract4OcrEngineProperties().IsPreprocessingImages()
            && ImagePreprocessingUtil.IsTiffImage(inputImage))
        {
            resultList = GetOcrResultForMultiPage(inputImage, outputFormat);
        }
        else
        {
            resultList.Add(GetOcrResultForSinglePage(inputImage, outputFormat, pageNumber));
        }

        // list of result strings is written to separate files
        // (one for each page)
        for (int i = 0; i < resultList.Count; i++)
        {
            String result = resultList[i];
            // there may be fewer output files than results; extra results are dropped
            FileInfo outputFile = i >= outputFiles.Count ? null : outputFiles[i];
            if (result != null && outputFile != null)
            {
                try
                {
                    using (TextWriter writer = new StreamWriter(
                        new FileStream(outputFile.FullName, FileMode.Create), System.Text.Encoding.UTF8))
                    {
                        writer.Write(result);
                    }
                }
                catch (System.IO.IOException e)
                {
                    LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
                        Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE, e.Message));
                    // keep the I/O failure as the inner exception so the root cause is not lost
                    throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED, e);
                }
            }
        }
    }
    catch (Tesseract4OcrException e)
    {
        LogManager.GetLogger(GetType()).Error(e.Message);
        throw new Tesseract4OcrException(e.Message, e);
    }
    finally
    {
        // release the tesseract instance regardless of the OCR outcome
        if (tesseractInstance != null)
        {
            TesseractOcrUtil.DisposeTesseractInstance(tesseractInstance);
        }
        // remove the user words file only when it is flagged as temporary
        if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null
            && GetTesseract4OcrEngineProperties().IsUserWordsFileTemporary())
        {
            TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
        }
    }
}
/// <summary>
/// Verifies that the tesseract executable can be launched from the given path
/// by running it with the <c>--version</c> argument.
/// </summary>
/// <param name="execPath">path to tesseract executable</param>
/// <exception cref="Tesseract4OcrException">
/// with the <c>TESSERACT_NOT_FOUND</c> message, wrapping the original failure,
/// when running the command fails
/// </exception>
private void CheckTesseractInstalled(String execPath)
{
    try
    {
        TesseractHelper.RunCommand(
            execPath,
            JavaCollectionsUtil.SingletonList<String>("--version"));
    }
    catch (Tesseract4OcrException cause)
    {
        // report a failed launch as "tesseract not found", keeping the original cause
        throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_NOT_FOUND, cause);
    }
}
// Parses "broken_bboxes.hocr" and checks the bbox of the second text item
// on page 1 against the expected coordinates (tolerance 0.1).
public virtual void TestDetectAndFixBrokenBBoxes()
{
    FileInfo hocrFile = new FileInfo(TEST_DOCUMENTS_DIRECTORY + "broken_bboxes.hocr");
    IDictionary<int, IList<TextInfo>> parsedHocr = TesseractHelper.ParseHocrFile(
        JavaCollectionsUtil.SingletonList(hocrFile),
        TextPositioning.BY_WORDS_AND_LINES);
    TextInfo textInfo = parsedHocr.Get(1)[1];

    // expected bbox values, in the same order as the entries returned by GetBbox()
    float[] expectedBbox = { 383.0f, 101.0f, 514.0f, 136.0f };
    for (int idx = 0; idx < expectedBbox.Length; idx++)
    {
        NUnit.Framework.Assert.AreEqual(expectedBbox[idx], (float)textInfo.GetBbox()[idx], 0.1);
    }
}
/// <summary>
/// Runs OCR in TXT output format on every image of the given list, concatenates
/// the results in list order and writes the combined text to the given file.
/// </summary>
/// <param name="inputImages">
/// list of images to be OCRed
/// </param>
/// <param name="txtFile">file to be created</param>
public virtual void CreateTxtFile(IList<FileInfo> inputImages, FileInfo txtFile)
{
    String startMessage = MessageFormatUtil.Format(
        Tesseract4LogMessageConstant.START_OCR_FOR_IMAGES, inputImages.Count);
    LogManager.GetLogger(GetType()).Info(startMessage);

    // accumulate the recognized text of all images
    StringBuilder recognizedText = new StringBuilder();
    foreach (FileInfo image in inputImages)
    {
        recognizedText.Append(DoImageOcr(image, OutputFormat.TXT));
    }

    // write to file
    String targetPath = txtFile.FullName;
    TesseractHelper.WriteToTextFile(targetPath, recognizedText.ToString());
}
// Reads "numbers_01.jpg" as a Pix, saves it as a PNG in the target directory,
// then checks that the file appears and is gone again after deletion.
public virtual void TestPixSavingAsPng()
{
    String sourceImagePath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
    String pngPath = GetTargetDirectory() + "testPixSavingAsPng.png";

    // the target file must not exist before the test starts
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(pngPath)));

    FileInfo sourceImage = new FileInfo(sourceImagePath);
    Pix pix = ImagePreprocessingUtil.ReadPix(sourceImage);
    TesseractOcrUtil.SavePixToTempPngFile(pngPath, pix);
    NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(pngPath)));

    // clean up and verify the file was removed
    TesseractHelper.DeleteFile(pngPath);
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(pngPath)));
}
// Loads "numbers_01.jpg" as a Bitmap, saves it as a PNG in the target directory,
// then checks that the file appears and is gone again after deletion.
public virtual void TestImageSavingAsPng()
{
    String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
    String tmpFileName = GetTargetDirectory() + "testImageSavingAsPng.png";

    // the target file must not exist before the test starts
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));

    // The source stream must stay open for the lifetime of the image created from it,
    // so both are disposed together once the PNG has been written.
    using (FileStream imageStream = new FileStream(path, FileMode.Open, FileAccess.Read))
    using (System.Drawing.Bitmap bi = (System.Drawing.Bitmap)System.Drawing.Image.FromStream(imageStream))
    {
        TesseractOcrUtil.SaveImageToTempPngFile(tmpFileName, bi);
    }
    NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(tmpFileName)));

    // clean up and verify the file was removed
    TesseractHelper.DeleteFile(tmpFileName);
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
}
// Runs the executable-based engine (preprocessing disabled) on an image whose
// path contains a non-ASCII character and checks that the HOCR output file is
// created; the file is deleted afterwards.
public virtual void TestDoTesseractOcrForNonAsciiPathForExecutable()
{
    FileInfo imgFile = new FileInfo(TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png");
    FileInfo outputFile = new FileInfo(TesseractOcrUtil.GetTempFilePath("test", ".hocr"));

    Tesseract4OcrEngineProperties engineProperties = new Tesseract4OcrEngineProperties();
    engineProperties.SetPathToTessData(GetTessDataDirectory());
    engineProperties.SetPreprocessingImages(false);

    Tesseract4ExecutableOcrEngine ocrEngine =
        new Tesseract4ExecutableOcrEngine(GetTesseractDirectory(), engineProperties);
    ocrEngine.DoTesseractOcr(imgFile, outputFile, OutputFormat.HOCR);

    NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(outputFile.FullName)));

    // clean up and verify the file was removed
    TesseractHelper.DeleteFile(outputFile.FullName);
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(outputFile.FullName)));
}
/// <summary>Reads data from the provided input image file.</summary>
/// <remarks>
/// OCR output is written to temporary files, which are read back and always deleted
/// before returning. If an <see cref="System.IO.IOException"/> occurs it is logged and
/// whatever result was built so far (possibly <c>null</c>) is returned.
/// </remarks>
/// <param name="input">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// <see cref="OutputFormat"/>
/// for the result returned by
/// <see cref="iText.Pdfocr.IOcrEngine"/>
/// </param>
/// <returns>
/// <see cref="ITesseractOcrResult"/>
/// instance, either
/// <see cref="StringTesseractOcrResult"/>
/// if output format is TXT, or
/// <see cref="TextInfoTesseractOcrResult"/>
/// if the output format is HOCR
/// </returns>
private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat)
{
    IDictionary<int, IList<TextInfo>> textByPage = new LinkedDictionary<int, IList<TextInfo>>();
    StringBuilder plainText = new StringBuilder();
    IList<FileInfo> tmpOutputs = new List<FileInfo>();
    AbstractTesseract4OcrEngine.ITesseractOcrResult ocrResult = null;
    try
    {
        // image needs to be paginated only if it's tiff
        // or preprocessing isn't required
        int realNumOfPages;
        if (ImagePreprocessingUtil.IsTiffImage(input))
        {
            realNumOfPages = ImagePreprocessingUtil.GetNumberOfPageTiff(input);
        }
        else
        {
            realNumOfPages = 1;
        }

        // with preprocessing: one OCR call per page, one output file per call;
        // without preprocessing: a single OCR call with one output file per page
        int numOfPages;
        int numOfFiles;
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
        {
            numOfPages = realNumOfPages;
            numOfFiles = 1;
        }
        else
        {
            numOfPages = 1;
            numOfFiles = realNumOfPages;
        }

        bool hocrRequested = outputFormat.Equals(OutputFormat.HOCR);
        String extension = hocrRequested ? ".hocr" : ".txt";

        for (int page = 1; page <= numOfPages; page++)
        {
            // temp files accumulate across iterations; the whole list is passed on each call
            for (int fileIdx = 0; fileIdx < numOfFiles; fileIdx++)
            {
                tmpOutputs.Add(CreateTempFile(extension));
            }
            DoTesseractOcr(input, tmpOutputs, outputFormat, page);

            if (hocrRequested)
            {
                IDictionary<int, IList<TextInfo>> pageData = TesseractHelper.ParseHocrFile(
                    tmpOutputs, GetTesseract4OcrEngineProperties().GetTextPositioning());
                if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                {
                    textByPage.Put(page, pageData.Get(1));
                }
                else
                {
                    textByPage = pageData;
                }
                ocrResult = new AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult(textByPage);
            }
            else
            {
                foreach (FileInfo tmpOutput in tmpOutputs)
                {
                    if (File.Exists(System.IO.Path.Combine(tmpOutput.FullName)))
                    {
                        plainText.Append(TesseractHelper.ReadTxtFile(tmpOutput));
                    }
                }
                ocrResult = new AbstractTesseract4OcrEngine.StringTesseractOcrResult(plainText.ToString());
            }
        }
    }
    catch (System.IO.IOException e)
    {
        LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
            Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE, e.Message));
    }
    finally
    {
        // temporary output files are removed in every case
        foreach (FileInfo tmpOutput in tmpOutputs)
        {
            TesseractHelper.DeleteFile(tmpOutput.FullName);
        }
    }
    return ocrResult;
}
/// <summary>
/// Performs tesseract OCR using command line tool for the selected page
/// of input image (by default 1st).
/// </summary>
/// <remarks>
/// Please note that list of output files is accepted instead of a single file because
/// page number parameter is not respected in case of TIFF images not requiring preprocessing.
/// In other words, if the passed image is the TIFF image and according to the
/// <see cref="Tesseract4OcrEngineProperties"/>
/// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
/// is expected to be same as number of pages in the image, otherwise, only one file is expected.
/// Only the first entry of the list is passed to the command as the output file.
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFiles">
/// list of output files (one per each page)
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <param name="pageNumber">number of page to be processed</param>
/// <exception cref="Tesseract4OcrException">
/// when the path to the executable is not set, or rethrown (wrapped, after logging)
/// when any step of building or running the command fails
/// </exception>
internal override void DoTesseractOcr(FileInfo inputImage, IList<FileInfo> outputFiles, OutputFormat outputFormat,
    int pageNumber)
{
    ScheduledCheck();
    IList<String> command = new List<String>();
    String execPath = null;
    String imagePath = null;
    try
    {
        imagePath = inputImage.FullName;

        // path to tesseract executable
        if (String.IsNullOrEmpty(GetPathToExecutable()))
        {
            throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE);
        }
        if (IsWindows())
        {
            execPath = AddQuotes(GetPathToExecutable());
        }
        else
        {
            execPath = GetPathToExecutable();
        }
        command.Add(execPath);
        CheckTesseractInstalled(execPath);

        // path to tess data
        AddTessData(command);

        // validate languages before preprocessing started
        ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());

        // preprocess input file if needed and add it
        imagePath = PreprocessImage(inputImage, pageNumber);
        AddInputFile(command, imagePath);

        // move to image directory as tesseract cannot parse non ascii
        // characters in input path
        IList<String> changeDirectoryCommand = MoveToImageDirectory(imagePath);

        // output file
        AddOutputFile(command, outputFiles[0], outputFormat, imagePath);

        // page segmentation mode
        AddPageSegMode(command);

        // add user words if needed
        AddUserWords(command, imagePath);

        // required languages
        AddLanguages(command);

        if (outputFormat.Equals(OutputFormat.HOCR))
        {
            // path to hocr script
            SetHocrOutput(command);
        }

        // set default user defined dpi
        AddDefaultDpi(command);

        OnEvent();

        // run through the platform shell
        TesseractHelper.RunCommand(
            IsWindows() ? "cmd" : "bash",
            CreateCommandList(changeDirectoryCommand, command));
    }
    catch (Tesseract4OcrException e)
    {
        LogManager.GetLogger(GetType()).Error(e.Message);
        throw new Tesseract4OcrException(e.Message, e);
    }
    finally
    {
        // delete the image file only when its path differs from the input image path
        try
        {
            if (imagePath != null && !inputImage.FullName.Equals(imagePath))
            {
                TesseractHelper.DeleteFile(imagePath);
            }
        }
        catch (SecurityException e)
        {
            LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
                Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, imagePath, e.Message));
        }

        // remove the user words file only when it is flagged as temporary
        try
        {
            if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null
                && GetTesseract4OcrEngineProperties().IsUserWordsFileTemporary())
            {
                TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
            }
        }
        catch (SecurityException e)
        {
            LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
                Tesseract4LogMessageConstant.CANNOT_DELETE_FILE,
                GetTesseract4OcrEngineProperties().GetPathToUserWordsFile(), e.Message));
        }
    }
}