/// <summary>Preprocess given image if it is needed.</summary>
/// <remarks>
/// When preprocessing is enabled in the engine properties, the preprocessed image is written
/// to a temporary file (first directly from the Pix, then, if that produced no file, through
/// a Bitmap conversion). When preprocessing is disabled, or no temporary file was produced,
/// the input image is copied to the temporary location instead. If no temporary file exists
/// in the end, or an <see cref="System.IO.IOException"/> is caught (it is logged), the path
/// of the original input image is returned.
/// </remarks>
/// <param name="inputImage">
/// original input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="pageNumber">number of page to be OCRed</param>
/// <returns>
/// path to output image as
/// <see cref="System.String"/>
/// </returns>
private String PreprocessImage(FileInfo inputImage, int pageNumber) {
    String tmpFileName = TesseractOcrUtil.GetTempFilePath(Guid.NewGuid().ToString(), GetExtension(inputImage));
    String path = inputImage.FullName;
    try {
        // Read the flag once so both decisions below are based on the same value.
        bool preprocessingEnabled = GetTesseract4OcrEngineProperties().IsPreprocessingImages();
        if (preprocessingEnabled) {
            // NOTE(review): pix is not released in this method - confirm whether Pix needs explicit disposal.
            Pix pix = ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber);
            TesseractOcrUtil.SavePixToTempPngFile(tmpFileName, pix);
            if (!File.Exists(System.IO.Path.Combine(tmpFileName))) {
                // Fallback: save through a Bitmap. The Bitmap holds native GDI+ resources,
                // so dispose it as soon as it has been written out (using tolerates null).
                using (System.Drawing.Bitmap img = TesseractOcrUtil.ConvertPixToImage(pix)) {
                    if (img != null) {
                        TesseractOcrUtil.SaveImageToTempPngFile(tmpFileName, img);
                    }
                }
            }
        }
        // No preprocessing requested, or preprocessing produced no file: work on a plain copy.
        if (!preprocessingEnabled || !File.Exists(System.IO.Path.Combine(tmpFileName))) {
            TesseractOcrUtil.CreateTempFileCopy(path, tmpFileName);
        }
        if (File.Exists(System.IO.Path.Combine(tmpFileName))) {
            path = tmpFileName;
        }
    }
    catch (System.IO.IOException e) {
        // Best effort: log and fall back to whatever path has been chosen so far.
        LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message));
    }
    return path;
}
/// <summary>
/// Checks that no file appears at the target path after each PNG-saving helper
/// is called with a null image argument.
/// </summary>
public virtual void TestNullSavingAsPng() {
    String pngPath = TesseractOcrUtil.GetTempFilePath(GetTargetDirectory() + "/testNullSavingAsPng", ".png");

    // Saving a null image must leave no file behind.
    TesseractOcrUtil.SaveImageToTempPngFile(pngPath, null);
    bool existsAfterImageSave = File.Exists(System.IO.Path.Combine(pngPath));
    NUnit.Framework.Assert.IsFalse(existsAfterImageSave);

    // Same expectation for the Pix-based saving helper.
    TesseractOcrUtil.SavePixToTempPngFile(pngPath, null);
    bool existsAfterPixSave = File.Exists(System.IO.Path.Combine(pngPath));
    NUnit.Framework.Assert.IsFalse(existsAfterPixSave);
}
/// <summary>
/// Runs the executable-based OCR engine (preprocessing disabled) on an image whose path
/// contains a non-ASCII character, then checks that the HOCR output file exists and
/// that it no longer exists after being deleted.
/// </summary>
public virtual void TestDoTesseractOcrForNonAsciiPathForExecutable() {
    FileInfo sourceImage = new FileInfo(TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png");
    String hocrPath = TesseractOcrUtil.GetTempFilePath("test", ".hocr");
    FileInfo hocrFile = new FileInfo(hocrPath);

    Tesseract4OcrEngineProperties engineProperties = new Tesseract4OcrEngineProperties();
    engineProperties.SetPathToTessData(GetTessDataDirectory());
    engineProperties.SetPreprocessingImages(false);

    Tesseract4ExecutableOcrEngine ocrEngine = new Tesseract4ExecutableOcrEngine(GetTesseractDirectory(), engineProperties);
    ocrEngine.DoTesseractOcr(sourceImage, hocrFile, OutputFormat.HOCR);

    // The OCR run must have produced the output file...
    bool createdByOcr = File.Exists(System.IO.Path.Combine(hocrFile.FullName));
    NUnit.Framework.Assert.IsTrue(createdByOcr);

    // ...and the file must be gone once the helper has deleted it.
    TesseractHelper.DeleteFile(hocrFile.FullName);
    bool stillPresent = File.Exists(System.IO.Path.Combine(hocrFile.FullName));
    NUnit.Framework.Assert.IsFalse(stillPresent);
}
/// <summary>
/// Using provided input stream there will be created
/// temporary file (with name 'language.user-words')
/// containing words (one per line) which ends with
/// a new line character.
/// </summary>
/// <remarks>
/// Using provided input stream there will be created
/// temporary file (with name 'language.user-words')
/// containing words (one per line) which ends with
/// a new line character. Train data for provided language
/// should exist in specified tess data directory.
/// NOTE:
/// User words dictionary doesn't work properly in tesseract4
/// and hidden for public usage until fix is available
/// <para>
/// The previously configured user words file path is reset first. If writing the
/// temporary file fails with an <see cref="System.IO.IOException"/>, the path is reset
/// again and a warning is logged instead of propagating the error.
/// </para>
/// </remarks>
/// <param name="language">
/// language as
/// <see cref="System.String"/>
/// , tessdata for
/// this languages has to exist in tess data directory
/// </param>
/// <param name="inputStream">
/// custom user words as
/// <see cref="System.IO.Stream"/>
/// , decoded as UTF-8
/// </param>
/// <returns>
/// the
/// <see cref="Tesseract4OcrEngineProperties"/>
/// instance
/// </returns>
/// <exception cref="Tesseract4OcrException">
/// if the language is neither in the configured languages list nor the default language
/// </exception>
internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWords(String language, Stream inputStream) {
    SetPathToUserWordsFile(null);
    if (!GetLanguages().Contains(language)) {
        if (DEFAULT_LANGUAGE.Equals(language.ToLowerInvariant())) {
            // The default language is accepted implicitly: register it in the list.
            IList<String> languagesList = GetLanguages();
            languagesList.Add(language);
            SetLanguages(languagesList);
        }
        else {
            throw new Tesseract4OcrException(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST).SetMessageParams(language);
        }
    }
    String userWordsFileName = TesseractOcrUtil.GetTempFilePath(language, "." + DEFAULT_USER_WORDS_SUFFIX);
    try {
        using (StreamWriter writer = new StreamWriter(userWordsFileName)) {
            // The reader is not disposed here: disposing a StreamReader would also close
            // inputStream, which is supplied by the caller.
            TextReader reader = new StreamReader(inputStream, System.Text.Encoding.UTF8);
            int data;
            while ((data = reader.Read()) != -1) {
                // Read() returns the character as an int. It must be cast back to char,
                // otherwise TextWriter.Write(int) emits the decimal digits of the code
                // point (e.g. "97" for 'a') instead of the character itself.
                writer.Write((char)data);
            }
            writer.Write(Environment.NewLine);
            SetPathToUserWordsFile(userWordsFileName, true);
        }
    }
    catch (System.IO.IOException e) {
        SetPathToUserWordsFile(null);
        LogManager.GetLogger(GetType()).Warn(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS, e.Message));
    }
    return this;
}
/// <summary>
/// Builds a <see cref="System.IO.FileInfo"/> for a temporary file path whose name is a
/// freshly generated GUID and whose extension is the one supplied.
/// </summary>
/// <param name="extension">
/// file extension to use for the temporary file path, as
/// <see cref="System.String"/>
/// </param>
/// <returns>
/// a new
/// <see cref="System.IO.FileInfo"/>
/// instance wrapping the temporary file path
/// </returns>
private FileInfo CreateTempFile(String extension) {
    // A GUID-based name keeps concurrently requested temporary paths distinct.
    String uniqueName = Guid.NewGuid().ToString();
    return new FileInfo(TesseractOcrUtil.GetTempFilePath(uniqueName, extension));
}