/// <summary>
/// Loads the input file as a <see cref="Tesseract.Pix"/>.
/// </summary>
/// <remarks>
/// The file is first read as a <see cref="System.Drawing.Bitmap"/> and converted to
/// <see cref="Tesseract.Pix"/>. If that attempt yields nothing (or throws), the file is
/// loaded directly with <c>Tesseract.Pix.LoadFromFile</c>. Exceptions from either attempt
/// are logged at info level and are not rethrown.
/// </remarks>
/// <param name="inputFile">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <returns>
/// the
/// <see cref="Tesseract.Pix"/>
/// read from the input file, or <c>null</c> if both attempts failed
/// </returns>
internal static Pix ReadPix(FileInfo inputFile) {
    Pix result = null;

    // First attempt: read as a bitmap and convert it.
    try {
        System.Drawing.Bitmap image = iText.Pdfocr.Tesseract4.ImagePreprocessingUtil.ReadImageFromFile(inputFile);
        if (image != null) {
            result = TesseractOcrUtil.ConvertImageToPix(image);
        }
    }
    catch (Exception e) { // NOSONAR
        LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).Info(
            MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_CONVERT_IMAGE_TO_PIX,
                inputFile.FullName, e.Message));
    }
    if (result != null) {
        return result;
    }

    // Second attempt: let the Pix loader read the file directly.
    try {
        result = Tesseract.Pix.LoadFromFile(inputFile.FullName);
    }
    catch (Exception e) { // NOSONAR
        LogManager.GetLogger(typeof(iText.Pdfocr.Tesseract4.ImagePreprocessingUtil)).Info(
            MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_CONVERT_IMAGE_TO_PIX,
                inputFile.FullName, e.Message));
    }
    return result;
}
/// <summary>Tells whether two file paths share the same parent directory.</summary>
/// <param name="firstPath">path to the first file</param>
/// <param name="secondPath">path to the second file</param>
/// <returns>
/// true if the first path has a non-null parent directory that equals the parent directory
/// of the second path, otherwise false
/// </returns>
private bool AreEqualParentDirectories(String firstPath, String secondPath) {
    String parentOfFirst = TesseractOcrUtil.GetParentDirectory(firstPath);
    String parentOfSecond = TesseractOcrUtil.GetParentDirectory(secondPath);
    if (parentOfFirst == null) {
        return false;
    }
    return parentOfFirst.Equals(parentOfSecond);
}
/// <summary>
/// Gets OCR result from provided multi-page image and returns result as
/// list of strings for each page.
/// </summary>
/// <remarks>
/// Gets OCR result from provided multi-page image and returns result as
/// list of strings for each page. This method is used for tiff images
/// when preprocessing is not needed.
/// The tesseract instance is disposed when this method exits, whether it succeeds or fails.
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <returns>
/// list of result string that will be written to a temporary files
/// later
/// </returns>
/// <exception cref="Tesseract4OcrException">
/// thrown when tesseract fails; the original <c>TesseractException</c> is attached as the inner exception
/// </exception>
private IList<String> GetOcrResultForMultiPage(FileInfo inputImage, OutputFormat outputFormat) {
    IList<String> resultList = new List<String>();
    try {
        InitializeTesseract(outputFormat);
        TesseractOcrUtil util = new TesseractOcrUtil();
        util.InitializeImagesListFromTiff(inputImage);
        int numOfPages = util.GetListOfPages().Count;
        for (int i = 0; i < numOfPages; i++) {
            String result = util.GetOcrResultAsString(GetTesseractInstance(), util.GetListOfPages()[i], outputFormat);
            resultList.Add(result);
        }
    }
    catch (TesseractException e) {
        String msg = MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED, e.Message);
        LogManager.GetLogger(GetType()).Error(msg);
        // Keep the original exception as the cause so its stack trace is not lost.
        throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED, e);
    }
    finally {
        TesseractOcrUtil.DisposeTesseractInstance(GetTesseractInstance());
    }
    return resultList;
}
/// <summary>Produces the image file that should be passed to OCR, preprocessing it when enabled.</summary>
/// <remarks>
/// When preprocessing is enabled in the engine properties, the preprocessed
/// <see cref="Tesseract.Pix"/> is saved to a temporary file (falling back to saving it as a
/// <see cref="System.Drawing.Bitmap"/> if the file was not created). When preprocessing is
/// disabled, or no temporary file exists after that, the original image is copied to the
/// temporary path. An <see cref="System.IO.IOException"/> is logged and not rethrown.
/// </remarks>
/// <param name="inputImage">
/// original input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="pageNumber">number of page to be OCRed</param>
/// <returns>
/// path to the temporary image if it exists, otherwise the path of the original image, as
/// <see cref="System.String"/>
/// </returns>
private String PreprocessImage(FileInfo inputImage, int pageNumber) {
    String tempImagePath = TesseractOcrUtil.GetTempFilePath(Guid.NewGuid().ToString(), GetExtension(inputImage));
    String resultPath = inputImage.FullName;
    try {
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
            Pix preprocessed = ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber);
            TesseractOcrUtil.SavePixToTempPngFile(tempImagePath, preprocessed);
            bool savedFromPix = File.Exists(System.IO.Path.Combine(tempImagePath));
            if (!savedFromPix) {
                // Saving the Pix produced no file: retry through a bitmap conversion.
                System.Drawing.Bitmap fallbackImage = TesseractOcrUtil.ConvertPixToImage(preprocessed);
                if (fallbackImage != null) {
                    TesseractOcrUtil.SaveImageToTempPngFile(tempImagePath, fallbackImage);
                }
            }
        }

        bool preprocessedFileReady = GetTesseract4OcrEngineProperties().IsPreprocessingImages()
            && File.Exists(System.IO.Path.Combine(tempImagePath));
        if (!preprocessedFileReady) {
            TesseractOcrUtil.CreateTempFileCopy(resultPath, tempImagePath);
        }

        if (File.Exists(System.IO.Path.Combine(tempImagePath))) {
            resultPath = tempImagePath;
        }
    }
    catch (System.IO.IOException e) {
        LogManager.GetLogger(GetType()).Error(
            MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message));
    }
    return resultPath;
}
/// <summary>
/// Checks that reading page index 0 of "example_03.tiff" through
/// <c>TesseractOcrUtil.ReadPixPageFromTiff</c> yields <c>null</c>.
/// </summary>
public virtual void TestReadingPageFromInvalidTiff() {
    FileInfo tiffFile = new FileInfo(TEST_IMAGES_DIRECTORY + "example_03.tiff");

    Pix firstPage = TesseractOcrUtil.ReadPixPageFromTiff(tiffFile, 0);

    NUnit.Framework.Assert.IsNull(firstPage);
}
/// <summary>
/// Calls <c>TesseractOcrUtil.SaveImageToTempPngFile</c> with a <c>null</c> target path and a
/// bitmap loaded from "numbers_01.jpg".
/// </summary>
/// <remarks>
/// The source stream and the bitmap are disposed when the test finishes so that the image
/// file handle and GDI resources are released. The stream is kept open for as long as the
/// bitmap is in use.
/// </remarks>
public virtual void TestImageSavingAsPngWithError() {
    String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
    using (FileStream imageStream = new FileStream(path, FileMode.Open, FileAccess.Read)) {
        using (System.Drawing.Bitmap bi = (System.Drawing.Bitmap)System.Drawing.Image.FromStream(imageStream)) {
            TesseractOcrUtil.SaveImageToTempPngFile(null, bi);
        }
    }
}
/// <summary>
/// Runs tesseract OCR through the Tesseract OCR API wrapper for the selected page
/// of the input image and writes the results to the given output files.
/// </summary>
/// <remarks>
/// A list of output files is accepted instead of a single file because the page number
/// is not respected for TIFF images that do not require preprocessing: when the image is a TIFF and
/// <see cref="Tesseract4OcrEngineProperties"/>
/// says no preprocessing is needed, every page of the TIFF is OCRed and one output file per page
/// is expected. In every other case a single result is produced and only one file is expected.
/// Results without a matching output file (and null results) are skipped.
/// On exit the tesseract instance is disposed and a temporary user words file, if any, is deleted.
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFiles">
/// <see cref="System.Collections.Generic.IList{T}"/>
/// of output files
/// (one per each page)
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <param name="pageNumber">number of page to be processed</param>
internal override void DoTesseractOcr(FileInfo inputImage, IList<FileInfo> outputFiles, OutputFormat outputFormat,
    int pageNumber) {
    ScheduledCheck();
    try {
        ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
        InitializeTesseract(outputFormat);
        OnEvent();

        // A TIFF that needs no preprocessing is split into pages and every page is OCRed;
        // anything else produces exactly one result for the requested page.
        IList<String> pageResults = new List<String>();
        bool ocrEveryTiffPage = !GetTesseract4OcrEngineProperties().IsPreprocessingImages()
            && ImagePreprocessingUtil.IsTiffImage(inputImage);
        if (ocrEveryTiffPage) {
            pageResults = GetOcrResultForMultiPage(inputImage, outputFormat);
        }
        else {
            pageResults.Add(GetOcrResultForSinglePage(inputImage, outputFormat, pageNumber));
        }

        // Write each page result to its own output file.
        for (int pageIndex = 0; pageIndex < pageResults.Count; pageIndex++) {
            String pageText = pageResults[pageIndex];
            FileInfo target = pageIndex < outputFiles.Count ? outputFiles[pageIndex] : null;
            if (pageText == null || target == null) {
                continue;
            }
            try {
                using (TextWriter writer = new StreamWriter(new FileStream(target.FullName, FileMode.Create),
                    System.Text.Encoding.UTF8)) {
                    writer.Write(pageText);
                }
            }
            catch (System.IO.IOException e) {
                LogManager.GetLogger(GetType()).Error(
                    MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE, e.Message));
                throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED);
            }
        }
    }
    catch (Tesseract4OcrException e) {
        LogManager.GetLogger(GetType()).Error(e.Message);
        throw new Tesseract4OcrException(e.Message, e);
    }
    finally {
        if (tesseractInstance != null) {
            TesseractOcrUtil.DisposeTesseractInstance(tesseractInstance);
        }
        bool hasTemporaryUserWords = GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null
            && GetTesseract4OcrEngineProperties().IsUserWordsFileTemporary();
        if (hasTemporaryUserWords) {
            TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
        }
    }
}
/// <summary>
/// Checks that initializing the page list from "example_03.tiff" leaves the list of pages empty.
/// </summary>
public virtual void TestInitializeImagesListFromInvalidTiff() {
    FileInfo tiffFile = new FileInfo(TEST_IMAGES_DIRECTORY + "example_03.tiff");
    TesseractOcrUtil ocrUtil = new TesseractOcrUtil();

    ocrUtil.InitializeImagesListFromTiff(tiffFile);

    int pageCount = ocrUtil.GetListOfPages().Count;
    NUnit.Framework.Assert.AreEqual(0, pageCount);
}
/// <summary>
/// Checks that the Pix helpers of <c>TesseractOcrUtil</c> return <c>null</c> for a <c>null</c>
/// input, and calls <c>DestroyPix</c> with <c>null</c> as well.
/// </summary>
public virtual void TestPreprocessingConditions() {
    Pix nullPix = null;

    Pix grayscale = TesseractOcrUtil.ConvertToGrayscale(nullPix);
    NUnit.Framework.Assert.IsNull(grayscale);

    Pix thresholded = TesseractOcrUtil.OtsuImageThresholding(nullPix);
    NUnit.Framework.Assert.IsNull(thresholded);

    System.Drawing.Bitmap converted = TesseractOcrUtil.ConvertPixToImage(nullPix);
    NUnit.Framework.Assert.IsNull(converted);

    TesseractOcrUtil.DestroyPix(nullPix);
}
/// <summary>
/// Checks that saving a <c>null</c> bitmap or a <c>null</c> Pix does not create the target PNG file.
/// </summary>
public virtual void TestNullSavingAsPng() {
    String targetPng = TesseractOcrUtil.GetTempFilePath(GetTargetDirectory() + "/testNullSavingAsPng", ".png");

    // null bitmap
    TesseractOcrUtil.SaveImageToTempPngFile(targetPng, null);
    bool existsAfterImageSave = File.Exists(System.IO.Path.Combine(targetPng));
    NUnit.Framework.Assert.IsFalse(existsAfterImageSave);

    // null Pix
    TesseractOcrUtil.SavePixToTempPngFile(targetPng, null);
    bool existsAfterPixSave = File.Exists(System.IO.Path.Combine(targetPng));
    NUnit.Framework.Assert.IsFalse(existsAfterPixSave);
}
/// <summary>
/// Checks that a bitmap loaded from "numbers_01.jpg" is saved as a PNG file by
/// <c>TesseractOcrUtil.SaveImageToTempPngFile</c>, and that the file can be deleted afterwards.
/// </summary>
/// <remarks>
/// The source stream and the bitmap are disposed once the image has been saved so that the
/// image file handle and GDI resources are released. The stream is kept open for as long as
/// the bitmap is in use.
/// </remarks>
public virtual void TestImageSavingAsPng() {
    String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
    String tmpFileName = GetTargetDirectory() + "testImageSavingAsPng.png";
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
    using (FileStream imageStream = new FileStream(path, FileMode.Open, FileAccess.Read)) {
        using (System.Drawing.Bitmap bi = (System.Drawing.Bitmap)System.Drawing.Image.FromStream(imageStream)) {
            TesseractOcrUtil.SaveImageToTempPngFile(tmpFileName, bi);
        }
    }
    NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(tmpFileName)));
    TesseractHelper.DeleteFile(tmpFileName);
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
}
/// <summary>
/// Checks that a Pix read from "numbers_01.jpg" is saved as a PNG file by
/// <c>TesseractOcrUtil.SavePixToTempPngFile</c>, and that the file can be deleted afterwards.
/// </summary>
public virtual void TestPixSavingAsPng() {
    String sourceImage = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
    String targetPng = GetTargetDirectory() + "testPixSavingAsPng.png";

    // Precondition: the target file is not there yet.
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(targetPng)));

    FileInfo sourceFile = new FileInfo(sourceImage);
    Pix sourcePix = ImagePreprocessingUtil.ReadPix(sourceFile);
    TesseractOcrUtil.SavePixToTempPngFile(targetPng, sourcePix);
    NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(targetPng)));

    // Clean up and confirm the file is gone.
    TesseractHelper.DeleteFile(targetPng);
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(targetPng)));
}
/// <summary>
/// Checks that OCR of the file "numbers_01.jpg" in TXT format produces text containing "619121".
/// </summary>
public virtual void TestGetOcrResultAsStringForFile() {
    String expected = "619121";
    FileInfo imageFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg");

    Tesseract4LibOcrEngine engine = GetTesseract4LibOcrEngine();
    Tesseract4OcrEngineProperties engineProperties = new Tesseract4OcrEngineProperties()
        .SetPathToTessData(GetTessDataDirectory());
    engine.SetTesseract4OcrEngineProperties(engineProperties);
    engine.InitializeTesseract(OutputFormat.TXT);

    TesseractOcrUtil ocrUtil = new TesseractOcrUtil();
    String recognized = ocrUtil.GetOcrResultAsString(engine.GetTesseractInstance(), imageFile, OutputFormat.TXT);

    NUnit.Framework.Assert.IsTrue(recognized.Contains(expected));
}
/// <summary>
/// Checks that OCR of a Pix read from "numbers_02.jpg" in TXT format produces text containing "0123456789".
/// </summary>
public virtual void TestTesseract4OcrForPix() {
    String expected = "0123456789";
    FileInfo imageFile = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_02.jpg");
    Pix imagePix = ImagePreprocessingUtil.ReadPix(imageFile);

    Tesseract4LibOcrEngine engine = GetTesseract4LibOcrEngine();
    Tesseract4OcrEngineProperties engineProperties = new Tesseract4OcrEngineProperties()
        .SetPathToTessData(GetTessDataDirectory());
    engine.SetTesseract4OcrEngineProperties(engineProperties);
    engine.InitializeTesseract(OutputFormat.TXT);

    TesseractOcrUtil ocrUtil = new TesseractOcrUtil();
    String recognized = ocrUtil.GetOcrResultAsString(engine.GetTesseractInstance(), imagePix, OutputFormat.TXT);

    NUnit.Framework.Assert.IsTrue(recognized.Contains(expected));
}
/// <summary>
/// Checks that the executable-based engine produces an hOCR output file for an image located
/// under a directory with a non-ASCII name ("tèst"), with preprocessing disabled, and that
/// the output file can be deleted afterwards.
/// </summary>
public virtual void TestDoTesseractOcrForNonAsciiPathForExecutable() {
    FileInfo imageFile = new FileInfo(TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png");
    FileInfo hocrFile = new FileInfo(TesseractOcrUtil.GetTempFilePath("test", ".hocr"));

    Tesseract4OcrEngineProperties engineProperties = new Tesseract4OcrEngineProperties();
    engineProperties.SetPathToTessData(GetTessDataDirectory());
    engineProperties.SetPreprocessingImages(false);
    Tesseract4ExecutableOcrEngine executableEngine =
        new Tesseract4ExecutableOcrEngine(GetTesseractDirectory(), engineProperties);

    executableEngine.DoTesseractOcr(imageFile, hocrFile, OutputFormat.HOCR);
    NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(hocrFile.FullName)));

    // Clean up and confirm the output file is gone.
    TesseractHelper.DeleteFile(hocrFile.FullName);
    NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(hocrFile.FullName)));
}
/// <summary>Reads the requested page of the input image and runs basic preprocessing on it.</summary>
/// <remarks>
/// TIFF inputs are read page-wise via <c>TesseractOcrUtil.ReadPixPageFromTiff</c> using the
/// zero-based index <c>pageNumber - 1</c>; all other inputs are read with <c>ReadPix</c>.
/// The resulting Pix is passed to <c>TesseractOcrUtil.PreprocessPix</c>.
/// </remarks>
/// <param name="inputFile">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="pageNumber">number of page to be preprocessed</param>
/// <returns>
/// created preprocessed image as
/// <see cref="Tesseract.Pix"/>
/// </returns>
/// <exception cref="Tesseract4OcrException">thrown when the image could not be read</exception>
internal static Pix PreprocessImage(FileInfo inputFile, int pageNumber) {
    // read image
    Pix source = IsTiffImage(inputFile)
        ? TesseractOcrUtil.ReadPixPageFromTiff(inputFile, pageNumber - 1)
        : ReadPix(inputFile);

    if (source != null) {
        return TesseractOcrUtil.PreprocessPix(source);
    }

    throw new Tesseract4OcrException(Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE)
        .SetMessageParams(inputFile.FullName);
}
/// <summary>
/// Gets OCR result from provided single page image and preprocesses it if
/// it is needed.
/// </summary>
/// <remarks>
/// Up to three attempts are made, each one only if the previous produced no result:
/// OCR of the preprocessed image (only when preprocessing is enabled), OCR of the image read as a
/// <see cref="System.Drawing.Bitmap"/>, and finally OCR of the original input file.
/// A failure of the bitmap attempt is logged at info level and does not abort the method.
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <param name="pageNumber">number of page to be OCRed</param>
/// <returns>result as string that will be written to a temporary file later</returns>
/// <exception cref="Tesseract4OcrException">
/// thrown when OCR fails; the original exception is attached as the inner exception
/// </exception>
private String GetOcrResultForSinglePage(FileInfo inputImage, OutputFormat outputFormat, int pageNumber) {
    String result = null;
    try {
        // preprocess if required
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
            // preprocess and try to ocr
            result = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(),
                ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber), outputFormat);
        }
        if (result == null) {
            System.Drawing.Bitmap bufferedImage = ImagePreprocessingUtil.ReadImage(inputImage);
            if (bufferedImage != null) {
                try {
                    result = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(), bufferedImage,
                        outputFormat);
                }
                catch (Exception e) { // NOSONAR
                    // Best effort: fall through to OCR of the original file below.
                    LogManager.GetLogger(GetType()).Info(
                        MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE, e.Message));
                }
            }
            if (result == null) {
                // perform ocr using original input image
                result = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(), inputImage,
                    outputFormat);
            }
        }
    }
    catch (Exception e) { // NOSONAR
        LogManager.GetLogger(GetType()).Error(
            MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED, e.Message));
        // Keep the original exception as the cause so its type and stack trace are not lost.
        throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED, e);
    }
    return result;
}
/// <summary>
/// Builds the command parameters that change the working directory to the
/// parent directory of the given image.
/// </summary>
/// <remarks>
/// Any "file:///" or "file:/" prefix in the parent directory is replaced with an empty string
/// on Windows and with "/" otherwise, and the directory is wrapped with <c>AddQuotes</c>.
/// </remarks>
/// <param name="imagePath">path to input image</param>
/// <returns>command list: the "cd" command followed by the quoted parent directory</returns>
private IList<String> MoveToImageDirectory(String imagePath) {
    // NOTE(review): the Replace call below would throw if GetParentDirectory returned null —
    // confirm that callers only pass paths that have a parent.
    String imageDirectory = TesseractOcrUtil.GetParentDirectory(imagePath);
    String prefixReplacement = IsWindows() ? "" : "/";
    imageDirectory = imageDirectory.Replace("file:///", prefixReplacement);
    imageDirectory = imageDirectory.Replace("file:/", prefixReplacement);

    // Use "/d" parameter to handle cases when the current directory on Windows
    // is located on a different drive compared to the directory we move to
    String changeDirCommand = IsWindows() ? "cd /d" : "cd";

    IList<String> command = new List<String>();
    command.Add(changeDirCommand);
    command.Add(AddQuotes(imageDirectory));
    return command;
}
/// <summary>
/// Using provided input stream there will be created
/// temporary file (with name 'language.user-words')
/// containing words (one per line) which ends with
/// a new line character.
/// </summary>
/// <remarks>
/// Using provided input stream there will be created
/// temporary file (with name 'language.user-words')
/// containing words (one per line) which ends with
/// a new line character. Train data for provided language
/// should exist in specified tess data directory.
/// The input stream is decoded as UTF-8 and copied to the file character by character.
/// If the file cannot be written, the path to the user words file is reset to <c>null</c>
/// and a warning is logged.
/// NOTE:
/// User words dictionary doesn't work properly in tesseract4
/// and hidden for public usage until fix is available
/// </remarks>
/// <param name="language">
/// language as
/// <see cref="System.String"/>
/// , tessdata for
/// this languages has to exist in tess data directory
/// </param>
/// <param name="inputStream">
/// custom user words as
/// <see cref="System.IO.Stream"/>
/// </param>
/// <returns>
/// the
/// <see cref="Tesseract4OcrEngineProperties"/>
/// instance
/// </returns>
/// <exception cref="Tesseract4OcrException">
/// thrown when the language is neither in the list of languages nor the default language
/// </exception>
internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWords(String language, Stream inputStream) {
    SetPathToUserWordsFile(null);
    if (!GetLanguages().Contains(language)) {
        if (DEFAULT_LANGUAGE.Equals(language.ToLowerInvariant())) {
            IList<String> languagesList = GetLanguages();
            languagesList.Add(language);
            SetLanguages(languagesList);
        }
        else {
            throw new Tesseract4OcrException(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST)
                .SetMessageParams(language);
        }
    }
    String userWordsFileName = TesseractOcrUtil.GetTempFilePath(language, "." + DEFAULT_USER_WORDS_SUFFIX);
    try {
        using (StreamWriter writer = new StreamWriter(userWordsFileName)) {
            // The reader is left undisposed, as before: disposing a StreamReader also
            // closes the underlying stream, which is supplied by the caller.
            TextReader reader = new StreamReader(inputStream, System.Text.Encoding.UTF8);
            int data;
            while ((data = reader.Read()) != -1) {
                // Read() returns the character as an int. Writing the int directly would emit
                // its decimal digits (e.g. "97" for 'a'), so cast back to char first.
                writer.Write((char)data);
            }
            writer.Write(Environment.NewLine);
            SetPathToUserWordsFile(userWordsFileName, true);
        }
    }
    catch (System.IO.IOException e) {
        SetPathToUserWordsFile(null);
        LogManager.GetLogger(GetType()).Warn(
            MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS, e.Message));
    }
    return this;
}
/// <summary>
/// Makes sure a usable tesseract instance exists and applies all required settings to it.
/// </summary>
/// <remarks>
/// A new instance is created when there is none yet or the current one has been disposed.
/// The "tessedit_create_hocr" variable is set to "1" for the HOCR output format and "0" otherwise,
/// and "user_defined_dpi" is set to "300". When a user words file is configured,
/// "load_system_dawg" and "load_freq_dawg" are set to "0" and the user words suffix and file
/// are passed to tesseract. Finally the remaining properties are applied through
/// <c>TesseractOcrUtil.SetTesseractProperties</c>.
/// </remarks>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
public virtual void InitializeTesseract(OutputFormat outputFormat) {
    bool needsNewInstance = GetTesseractInstance() == null
        || TesseractOcrUtil.IsTesseractInstanceDisposed(GetTesseractInstance());
    if (needsNewInstance) {
        tesseractInstance = TesseractOcrUtil.InitializeTesseractInstance(IsWindows(), GetTessData(),
            GetLanguagesAsString(), GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
    }

    String createHocrFlag;
    if (outputFormat.Equals(OutputFormat.HOCR)) {
        createHocrFlag = "1";
    }
    else {
        createHocrFlag = "0";
    }
    GetTesseractInstance().SetVariable("tessedit_create_hocr", createHocrFlag);
    GetTesseractInstance().SetVariable("user_defined_dpi", "300");

    if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null) {
        GetTesseractInstance().SetVariable("load_system_dawg", "0");
        GetTesseractInstance().SetVariable("load_freq_dawg", "0");
        GetTesseractInstance().SetVariable("user_words_suffix",
            GetTesseract4OcrEngineProperties().GetDefaultUserWordsSuffix());
        GetTesseractInstance().SetVariable("user_words_file",
            GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
    }

    TesseractOcrUtil.SetTesseractProperties(
        GetTesseractInstance(),
        GetTessData(),
        GetLanguagesAsString(),
        GetTesseract4OcrEngineProperties().GetPageSegMode(),
        GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
}
/// <summary>
/// Creates a new
/// <see cref="Tesseract4LibOcrEngine"/>
/// instance and an initial tesseract instance with no tess data, languages or user words file set.
/// </summary>
/// <param name="tesseract4OcrEngineProperties">set of properties</param>
public Tesseract4LibOcrEngine(Tesseract4OcrEngineProperties tesseract4OcrEngineProperties)
    : base(tesseract4OcrEngineProperties) {
    bool runningOnWindows = IsWindows();
    tesseractInstance = TesseractOcrUtil.InitializeTesseractInstance(runningOnWindows, null, null, null);
}
/// <summary>
/// Loads the input file as a Leptonica
/// <see cref="Tesseract.Pix"/>
/// and converts it to a
/// <see cref="System.Drawing.Bitmap"/>.
/// </summary>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <returns>
/// the
/// <see cref="System.Drawing.Bitmap"/>
/// produced by <c>TesseractOcrUtil.ConvertPixToImage</c>
/// </returns>
internal static System.Drawing.Bitmap ReadAsPixAndConvertToBufferedImage(FileInfo inputImage) {
    String imagePath = inputImage.FullName;
    // NOTE(review): the loaded Pix is never released here — confirm whether ConvertPixToImage
    // copies the pixel data, in which case the Pix could be destroyed after conversion.
    Pix loadedPix = Tesseract.Pix.LoadFromFile(imagePath);
    System.Drawing.Bitmap converted = TesseractOcrUtil.ConvertPixToImage(loadedPix);
    return converted;
}
/// <summary>
/// Builds a <see cref="System.IO.FileInfo"/> for a temporary file path with a freshly
/// generated GUID as the name and the given extension.
/// </summary>
/// <param name="extension">
/// file extension for a new file
/// <see cref="System.String"/>
/// </param>
/// <returns>
/// a new
/// <see cref="System.IO.FileInfo"/>
/// instance pointing at the temporary path
/// </returns>
private FileInfo CreateTempFile(String extension) {
    String uniqueName = Guid.NewGuid().ToString();
    String tempPath = TesseractOcrUtil.GetTempFilePath(uniqueName, extension);
    return new FileInfo(tempPath);
}