/// <summary>
/// Checks that a file referenced without any extension ("example_03_10MB")
/// is not reported as a TIFF image by <c>ImagePreprocessingUtil.IsTiffImage</c>.
/// </summary>
public virtual void TestCheckForInvalidTiff() {
    FileInfo candidate = new FileInfo(TEST_IMAGES_DIRECTORY + "example_03_10MB");

    bool reportedAsTiff = ImagePreprocessingUtil.IsTiffImage(candidate);

    NUnit.Framework.Assert.IsFalse(reportedAsTiff);
}
/// <summary>Preprocess given image if it is needed.</summary>
/// <remarks>
/// When preprocessing is enabled in the engine properties, the preprocessed
/// <c>Pix</c> is saved to a temporary PNG file; if that file does not appear,
/// the <c>Pix</c> is converted to a bitmap and saved through the bitmap path.
/// When preprocessing is disabled, or no temporary file was produced, the
/// original file is copied to the temporary location instead.
/// An <see cref="System.IO.IOException"/> is logged and not rethrown; in that
/// case the path selected so far (the original image path) is returned.
/// </remarks>
/// <param name="inputImage">
/// original input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="pageNumber">number of page to be OCRed</param>
/// <returns>
/// path to output image as
/// <see cref="System.String"/>
/// : the temporary file if it exists after processing, otherwise the
/// full name of the original input image
/// </returns>
private String PreprocessImage(FileInfo inputImage, int pageNumber) {
    String tmpFileName = TesseractOcrUtil.GetTempFilePath(Guid.NewGuid().ToString(), GetExtension(inputImage));
    String path = inputImage.FullName;
    try {
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
            Pix pix = ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber);
            TesseractOcrUtil.SavePixToTempPngFile(tmpFileName, pix);
            if (!File.Exists(System.IO.Path.Combine(tmpFileName))) {
                // Saving the Pix directly produced no file: fall back to the bitmap route.
                // The bitmap is only needed for the save call, so it is disposed right
                // after it instead of being left for the finalizer.
                using (System.Drawing.Bitmap img = TesseractOcrUtil.ConvertPixToImage(pix)) {
                    if (img != null) {
                        TesseractOcrUtil.SaveImageToTempPngFile(tmpFileName, img);
                    }
                }
            }
        }
        // No preprocessing requested, or preprocessing produced nothing: work on a plain copy.
        if (!GetTesseract4OcrEngineProperties().IsPreprocessingImages()
            || !File.Exists(System.IO.Path.Combine(tmpFileName))) {
            TesseractOcrUtil.CreateTempFileCopy(path, tmpFileName);
        }
        if (File.Exists(System.IO.Path.Combine(tmpFileName))) {
            path = tmpFileName;
        }
    }
    catch (System.IO.IOException e) {
        // Best effort: report the problem and continue with whatever path is currently selected.
        LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
            Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message));
    }
    return path;
}
/// <summary>
/// Runs tesseract OCR through the Tesseract OCR API wrapper for the selected page
/// of the input image (by default the 1st) and writes the text to the output files.
/// </summary>
/// <remarks>
/// A list of output files is accepted instead of a single file because the page
/// number is not respected for TIFF images that do not require preprocessing:
/// if the passed image is a TIFF image and, according to the
/// <see cref="Tesseract4OcrEngineProperties"/>
/// , no preprocessing is needed, every page of the TIFF image is OCRed and the
/// list is expected to hold one output file per page. Otherwise only one file
/// is expected. Results without a matching (non-null) output file are skipped.
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFiles">
/// list of output files (one per each page)
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <param name="pageNumber">number of page to be processed</param>
internal override void DoTesseractOcr(FileInfo inputImage, IList<FileInfo> outputFiles, OutputFormat outputFormat,
    int pageNumber) {
    ScheduledCheck();
    try {
        ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
        InitializeTesseract(outputFormat);
        OnEvent();

        // A tiff image that needs no preprocessing is paginated and every page is OCRed;
        // anything else yields exactly one result for the requested page.
        bool ocrEveryTiffPage = !GetTesseract4OcrEngineProperties().IsPreprocessingImages()
            && ImagePreprocessingUtil.IsTiffImage(inputImage);
        IList<String> pageResults;
        if (ocrEveryTiffPage) {
            pageResults = GetOcrResultForMultiPage(inputImage, outputFormat);
        }
        else {
            pageResults = new List<String>();
            pageResults.Add(GetOcrResultForSinglePage(inputImage, outputFormat, pageNumber));
        }

        // Each result string goes to its own file (one per page).
        for (int index = 0; index < pageResults.Count; index++) {
            String pageText = pageResults[index];
            FileInfo target = index < outputFiles.Count ? outputFiles[index] : null;
            if (pageText == null || target == null) {
                continue;
            }
            try {
                using (TextWriter writer = new StreamWriter(new FileStream(target.FullName, FileMode.Create),
                    System.Text.Encoding.UTF8)) {
                    writer.Write(pageText);
                }
            }
            catch (System.IO.IOException writeError) {
                LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
                    Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE, writeError.Message));
                throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED);
            }
        }
    }
    catch (Tesseract4OcrException ocrError) {
        LogManager.GetLogger(GetType()).Error(ocrError.Message);
        throw new Tesseract4OcrException(ocrError.Message, ocrError);
    }
    finally {
        // Always release the engine instance and remove a temporary user words file.
        if (tesseractInstance != null) {
            TesseractOcrUtil.DisposeTesseractInstance(tesseractInstance);
        }
        if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null
            && GetTesseract4OcrEngineProperties().IsUserWordsFileTemporary()) {
            TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
        }
    }
}
/// <summary>
/// Checks that preprocessing the path "numbers_02" (given without an extension)
/// fails with a <see cref="Tesseract4OcrException"/>.
/// </summary>
public virtual void TestReadingInvalidImagePath() {
    NUnit.Framework.Assert.That(() => {
        FileInfo invalidImage = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_02");
        ImagePreprocessingUtil.PreprocessImage(invalidImage, 1);
    }, NUnit.Framework.Throws.InstanceOf<Tesseract4OcrException>());
}
/// <summary>
/// Checks the life cycle of a temporary PNG file: it does not exist up front,
/// exists after the <c>Pix</c> read from "numbers_01.jpg" is saved, and is gone
/// again after <c>TesseractHelper.DeleteFile</c> is called.
/// </summary>
public virtual void TestPixSavingAsPng() {
    String pngTarget = GetTargetDirectory() + "testPixSavingAsPng.png";
    bool existsBeforeSaving = File.Exists(System.IO.Path.Combine(pngTarget));
    NUnit.Framework.Assert.IsFalse(existsBeforeSaving);

    FileInfo sourceImage = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg");
    Pix loadedPix = ImagePreprocessingUtil.ReadPix(sourceImage);
    TesseractOcrUtil.SavePixToTempPngFile(pngTarget, loadedPix);
    bool existsAfterSaving = File.Exists(System.IO.Path.Combine(pngTarget));
    NUnit.Framework.Assert.IsTrue(existsAfterSaving);

    TesseractHelper.DeleteFile(pngTarget);
    bool existsAfterDeleting = File.Exists(System.IO.Path.Combine(pngTarget));
    NUnit.Framework.Assert.IsFalse(existsAfterDeleting);
}
/// <summary>
/// Reads "numbers_02.jpg" as a <c>Pix</c>, runs OCR on it in TXT output format
/// with an engine initialized from the test tess data directory, and checks
/// that the recognized text contains "0123456789".
/// </summary>
public virtual void TestTesseract4OcrForPix() {
    FileInfo digitsImage = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_02.jpg");
    Pix digitsPix = ImagePreprocessingUtil.ReadPix(digitsImage);

    // Configure and initialize the engine before asking it for its tesseract instance.
    Tesseract4LibOcrEngine engine = GetTesseract4LibOcrEngine();
    engine.SetTesseract4OcrEngineProperties(
        new Tesseract4OcrEngineProperties().SetPathToTessData(GetTessDataDirectory()));
    engine.InitializeTesseract(OutputFormat.TXT);

    String recognizedText = new TesseractOcrUtil().GetOcrResultAsString(
        engine.GetTesseractInstance(), digitsPix, OutputFormat.TXT);

    bool containsAllDigits = recognizedText.Contains("0123456789");
    NUnit.Framework.Assert.IsTrue(containsAllDigits);
}
/// <summary>
/// Gets OCR result from provided single page image and preprocesses it if
/// it is needed.
/// </summary>
/// <remarks>
/// Up to three attempts are made, each one only if the previous produced no result:
/// OCR of the preprocessed image (only when preprocessing is enabled), OCR of the
/// image read as a bitmap, and finally OCR of the original input file.
/// A failure of the bitmap attempt is logged at info level and does not stop
/// processing; any other failure is logged as an error and reported as a
/// <see cref="Tesseract4OcrException"/>.
/// </remarks>
/// <param name="inputImage">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// selected
/// <see cref="OutputFormat"/>
/// for tesseract
/// </param>
/// <param name="pageNumber">number of page to be OCRed</param>
/// <returns>result as string that will be written to a temporary file later</returns>
private String GetOcrResultForSinglePage(FileInfo inputImage, OutputFormat outputFormat, int pageNumber) {
    String result = null;
    try {
        // preprocess if required
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
            // preprocess and try to ocr
            result = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(),
                ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber), outputFormat);
        }
        if (result == null) {
            // The bitmap is needed only for this attempt, so it is disposed as soon as
            // the attempt is over (and before falling back to the original file).
            using (System.Drawing.Bitmap bufferedImage = ImagePreprocessingUtil.ReadImage(inputImage)) {
                if (bufferedImage != null) {
                    try {
                        result = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(),
                            bufferedImage, outputFormat);
                    }
                    catch (Exception e) {
                        // NOSONAR
                        // Deliberately non-fatal: the original file is tried next.
                        LogManager.GetLogger(GetType()).Info(MessageFormatUtil.Format(
                            Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE, e.Message));
                    }
                }
            }
            if (result == null) {
                // perform ocr using original input image
                result = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(), inputImage,
                    outputFormat);
            }
        }
    }
    catch (Exception e) {
        // NOSONAR
        LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
            Tesseract4LogMessageConstant.TESSERACT_FAILED, e.Message));
        throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED);
    }
    return result;
}
/// <summary>Reads data from the provided input image file.</summary>
/// <remarks>
/// Temporary output files are created for the OCR step, handed to
/// <c>DoTesseractOcr</c>, read back, and always deleted before returning.
/// An <see cref="System.IO.IOException"/> is logged and not rethrown; the result
/// built so far (possibly <c>null</c>) is returned in that case.
/// </remarks>
/// <param name="input">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// <see cref="OutputFormat"/>
/// for the result returned
/// by
/// <see cref="iText.Pdfocr.IOcrEngine"/>
/// </param>
/// <returns>
/// <see cref="ITesseractOcrResult"/>
/// instance, either
/// <see cref="StringTesseractOcrResult"/>
/// if output format is TXT, or
/// <see cref="TextInfoTesseractOcrResult"/>
/// if the output format is HOCR
/// </returns>
private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat) {
    IDictionary<int, IList<TextInfo>> hocrPages = new LinkedDictionary<int, IList<TextInfo>>();
    StringBuilder plainText = new StringBuilder();
    IList<FileInfo> tempFiles = new List<FileInfo>();
    AbstractTesseract4OcrEngine.ITesseractOcrResult ocrResult = null;
    try {
        // Only tiff images can hold more than one page.
        int tiffPageCount = ImagePreprocessingUtil.IsTiffImage(input)
            ? ImagePreprocessingUtil.GetNumberOfPageTiff(input)
            : 1;

        // With preprocessing: one OCR call per page, one temp file per call.
        // Without preprocessing: a single OCR call with one temp file per page.
        int ocrPasses;
        int filesPerPass;
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
            ocrPasses = tiffPageCount;
            filesPerPass = 1;
        }
        else {
            ocrPasses = 1;
            filesPerPass = tiffPageCount;
        }

        for (int page = 1; page <= ocrPasses; page++) {
            bool hocrRequested = outputFormat.Equals(OutputFormat.HOCR);
            String extension = hocrRequested ? ".hocr" : ".txt";
            for (int created = 0; created < filesPerPass; created++) {
                tempFiles.Add(CreateTempFile(extension));
            }
            // NOTE(review): tempFiles is not cleared between passes, so the whole
            // accumulated list is handed over (and read back) on every pass.
            DoTesseractOcr(input, tempFiles, outputFormat, page);

            if (hocrRequested) {
                IDictionary<int, IList<TextInfo>> parsedPages = TesseractHelper.ParseHocrFile(tempFiles,
                    GetTesseract4OcrEngineProperties().GetTextPositioning());
                if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
                    // Each pass covers a single page: store its entry under the real page number.
                    hocrPages.Put(page, parsedPages.Get(1));
                }
                else {
                    hocrPages = parsedPages;
                }
                ocrResult = new AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult(hocrPages);
                continue;
            }

            foreach (FileInfo textFile in tempFiles) {
                if (!File.Exists(System.IO.Path.Combine(textFile.FullName))) {
                    continue;
                }
                plainText.Append(TesseractHelper.ReadTxtFile(textFile));
            }
            ocrResult = new AbstractTesseract4OcrEngine.StringTesseractOcrResult(plainText.ToString());
        }
    }
    catch (System.IO.IOException ioError) {
        LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(
            Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE, ioError.Message));
    }
    finally {
        // Temporary files are removed whether or not OCR succeeded.
        foreach (FileInfo leftover in tempFiles) {
            TesseractHelper.DeleteFile(leftover.FullName);
        }
    }
    return ocrResult;
}