ImagePreprocessingUtil, iText.Pdfocr.Tesseract4 C# (CSharp)代码示例

示例#1

0

显示文件

        public virtual void TestCheckForInvalidTiff()
        {
            // The "example_03_10MB" entry of the test images directory must not be reported as a TIFF image.
            FileInfo candidate = new FileInfo(TEST_IMAGES_DIRECTORY + "example_03_10MB");
            bool reportedAsTiff = ImagePreprocessingUtil.IsTiffImage(candidate);

            NUnit.Framework.Assert.IsFalse(reportedAsTiff);
        }

示例#2

0

显示文件

        /// <summary>
        /// Produces the file that should be used for recognition: a preprocessed temporary PNG when
        /// preprocessing is enabled in the engine properties, otherwise a temporary copy of the input.
        /// </summary>
        /// <param name="inputImage">
        /// original input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="pageNumber">number of page to be OCRed</param>
        /// <returns>
        /// path of the temporary file if it exists after processing, otherwise the full name of
        /// <paramref name="inputImage"/>, as
        /// <see cref="System.String"/>
        /// </returns>
        private String PreprocessImage(FileInfo inputImage, int pageNumber)
        {
            String tempTarget = TesseractOcrUtil.GetTempFilePath(Guid.NewGuid().ToString(), GetExtension(inputImage));
            String resultPath = inputImage.FullName;

            try
            {
                if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                {
                    Pix processedPix = ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber);
                    TesseractOcrUtil.SavePixToTempPngFile(tempTarget, processedPix);

                    bool pixWasSaved = File.Exists(System.IO.Path.Combine(tempTarget));
                    if (!pixWasSaved)
                    {
                        // Saving the pix did not produce a file: try again through a bitmap conversion.
                        System.Drawing.Bitmap converted = TesseractOcrUtil.ConvertPixToImage(processedPix);
                        if (converted != null)
                        {
                            TesseractOcrUtil.SaveImageToTempPngFile(tempTarget, converted);
                        }
                    }
                }

                // Without preprocessing, or when no preprocessed file was produced, copy the original instead.
                bool preprocessedFileReady = GetTesseract4OcrEngineProperties().IsPreprocessingImages()
                                             && File.Exists(System.IO.Path.Combine(tempTarget));
                if (!preprocessedFileReady)
                {
                    TesseractOcrUtil.CreateTempFileCopy(resultPath, tempTarget);
                }

                // Only switch to the temporary file when it is really there.
                resultPath = File.Exists(System.IO.Path.Combine(tempTarget)) ? tempTarget : resultPath;
            }
            catch (System.IO.IOException e)
            {
                // An I/O failure is logged only; the path determined so far is still returned.
                LogManager.GetLogger(GetType()).Error(
                    MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, e.Message));
            }

            return resultPath;
        }

示例#3

0

显示文件

 /// <summary>
 /// Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page
 /// of input image (by default 1st) and writes every result to the matching output file.
 /// </summary>
 /// <remarks>
 /// Please note that list of output files is accepted instead of a single file because
 /// page number parameter is not respected in case of TIFF images not requiring preprocessing.
 /// In other words, if the passed image is the TIFF image and according to the
 /// <see cref="Tesseract4OcrEngineProperties"/>
 /// no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list
 /// is expected to be same as number of pages in the image, otherwise, only one file is expected.
 /// A result is skipped when it is null or when the list contains no output file at its index.
 /// </remarks>
 /// <param name="inputImage">
 /// input image
 /// <see cref="System.IO.FileInfo"/>
 /// </param>
 /// <param name="outputFiles">
 ///
 /// <see cref="System.Collections.IList{E}"/>
 /// of output files
 /// (one per each page)
 /// </param>
 /// <param name="outputFormat">
 /// selected
 /// <see cref="OutputFormat"/>
 /// for tesseract
 /// </param>
 /// <param name="pageNumber">number of page to be processed</param>
 /// <exception cref="Tesseract4OcrException">
 /// thrown when a
 /// <see cref="Tesseract4OcrException"/>
 /// is raised during processing (it is logged and rethrown wrapped, with the original
 /// as inner exception), including the case when a result cannot be written to its output file
 /// </exception>
 internal override void DoTesseractOcr(FileInfo inputImage, IList <FileInfo> outputFiles, OutputFormat outputFormat
                                       , int pageNumber)
 {
     ScheduledCheck();
     try {
         ValidateLanguages(GetTesseract4OcrEngineProperties().GetLanguages());
         InitializeTesseract(outputFormat);
         OnEvent();
         // if preprocessing is not needed and provided image is tiff,
         // the image will be paginated and separate pages will be OCRed
         IList <String> resultList = new List <String>();
         if (!GetTesseract4OcrEngineProperties().IsPreprocessingImages() && ImagePreprocessingUtil.IsTiffImage(inputImage
                                                                                                               ))
         {
             resultList = GetOcrResultForMultiPage(inputImage, outputFormat);
         }
         else
         {
             resultList.Add(GetOcrResultForSinglePage(inputImage, outputFormat, pageNumber));
         }
         // list of result strings is written to separate files
         // (one for each page)
         for (int i = 0; i < resultList.Count; i++)
         {
             String   result     = resultList[i];
             // results without a matching output file are silently skipped
             FileInfo outputFile = i >= outputFiles.Count ? null : outputFiles[i];
             if (result != null && outputFile != null)
             {
                 try {
                     using (TextWriter writer = new StreamWriter(new FileStream(outputFile.FullName, FileMode.Create), System.Text.Encoding
                                                                 .UTF8)) {
                         writer.Write(result);
                     }
                 }
                 catch (System.IO.IOException e) {
                     LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE
                                                                                    , e.Message));
                     // keep the I/O failure as inner exception so the root cause stays in the exception chain
                     throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED, e);
                 }
             }
         }
     }
     catch (Tesseract4OcrException e) {
         LogManager.GetLogger(GetType()).Error(e.Message);
         throw new Tesseract4OcrException(e.Message, e);
     }
     finally {
         // the tesseract instance is released whether or not OCR succeeded
         if (tesseractInstance != null)
         {
             TesseractOcrUtil.DisposeTesseractInstance(tesseractInstance);
         }
         // a user words file flagged as temporary is removed after the run
         if (GetTesseract4OcrEngineProperties().GetPathToUserWordsFile() != null && GetTesseract4OcrEngineProperties
                 ().IsUserWordsFileTemporary())
         {
             TesseractHelper.DeleteFile(GetTesseract4OcrEngineProperties().GetPathToUserWordsFile());
         }
     }
 }

示例#4

0

显示文件

 public virtual void TestReadingInvalidImagePath()
 {
     // Preprocessing the "numbers_02" entry of the test images directory is expected
     // to fail with a Tesseract4OcrException.
     NUnit.Framework.TestDelegate preprocessInvalidPath = () => {
         FileInfo invalidImage = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_02");
         ImagePreprocessingUtil.PreprocessImage(invalidImage, 1);
     };

     NUnit.Framework.Assert.That(preprocessInvalidPath, NUnit.Framework.Throws.InstanceOf <Tesseract4OcrException>());
 }

示例#5

0

显示文件

        public virtual void TestPixSavingAsPng()
        {
            String sourceImage = TEST_IMAGES_DIRECTORY + "numbers_01.jpg";
            String pngTarget   = GetTargetDirectory() + "testPixSavingAsPng.png";

            // The target file must not exist before the pix is saved.
            bool existsBeforeSave = File.Exists(System.IO.Path.Combine(pngTarget));
            NUnit.Framework.Assert.IsFalse(existsBeforeSave);

            // Saving the pix read from the source image has to create the target file.
            Pix loadedPix = ImagePreprocessingUtil.ReadPix(new FileInfo(sourceImage));
            TesseractOcrUtil.SavePixToTempPngFile(pngTarget, loadedPix);
            bool existsAfterSave = File.Exists(System.IO.Path.Combine(pngTarget));
            NUnit.Framework.Assert.IsTrue(existsAfterSave);

            // Deleting through the helper has to remove the file again.
            TesseractHelper.DeleteFile(pngTarget);
            bool existsAfterDelete = File.Exists(System.IO.Path.Combine(pngTarget));
            NUnit.Framework.Assert.IsFalse(existsAfterDelete);
        }

示例#6

0

显示文件

        public virtual void TestTesseract4OcrForPix()
        {
            String expectedDigits = "0123456789";

            // Read the pix first, then configure and initialize the engine for plain-text output.
            Pix sourcePix = ImagePreprocessingUtil.ReadPix(new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_02.jpg"));
            Tesseract4LibOcrEngine engine = GetTesseract4LibOcrEngine();
            engine.SetTesseract4OcrEngineProperties(
                new Tesseract4OcrEngineProperties().SetPathToTessData(GetTessDataDirectory()));
            engine.InitializeTesseract(OutputFormat.TXT);

            // Recognize the pix directly with the engine's tesseract instance.
            TesseractOcrUtil ocrUtil = new TesseractOcrUtil();
            String recognized = ocrUtil.GetOcrResultAsString(engine.GetTesseractInstance(), sourcePix, OutputFormat.TXT);

            NUnit.Framework.Assert.IsTrue(recognized.Contains(expectedDigits));
        }

示例#7

0

显示文件

        /// <summary>
        /// Runs OCR for a single page image: the preprocessed image is tried first when
        /// preprocessing is enabled, then the image read as a bitmap, and finally the
        /// original input file.
        /// </summary>
        /// <param name="inputImage">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFormat">
        /// selected
        /// <see cref="OutputFormat"/>
        /// for tesseract
        /// </param>
        /// <param name="pageNumber">number of page to be OCRed</param>
        /// <returns>result as string that will be written to a temporary file later</returns>
        private String GetOcrResultForSinglePage(FileInfo inputImage, OutputFormat outputFormat, int pageNumber)
        {
            try
            {
                String ocrText = null;

                // first attempt: OCR of the preprocessed image, only when preprocessing is enabled
                if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                {
                    ocrText = new TesseractOcrUtil().GetOcrResultAsString(
                        GetTesseractInstance(),
                        ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber),
                        outputFormat);
                }
                if (ocrText != null)
                {
                    return ocrText;
                }

                // second attempt: OCR of the image read as a bitmap; a failure here is logged, not rethrown
                System.Drawing.Bitmap bitmap = ImagePreprocessingUtil.ReadImage(inputImage);
                if (bitmap != null)
                {
                    try
                    {
                        ocrText = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(), bitmap, outputFormat);
                    }
                    catch (Exception e)
                    {
                        // NOSONAR
                        LogManager.GetLogger(GetType()).Info(
                            MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE, e.Message));
                    }
                }

                // last attempt: perform ocr using original input image
                if (ocrText == null)
                {
                    ocrText = new TesseractOcrUtil().GetOcrResultAsString(GetTesseractInstance(), inputImage, outputFormat);
                }
                return ocrText;
            }
            catch (Exception e)
            {
                // NOSONAR
                LogManager.GetLogger(GetType()).Error(
                    MessageFormatUtil.Format(Tesseract4LogMessageConstant.TESSERACT_FAILED, e.Message));
                throw new Tesseract4OcrException(Tesseract4OcrException.TESSERACT_FAILED);
            }
        }

示例#8

0

显示文件

        /// <summary>Reads data from the provided input image file.</summary>
        /// <remarks>
        /// The image is OCRed into temporary files which are then either parsed as HOCR or read as
        /// plain text. All temporary files created by this method are deleted before it returns.
        /// An <see cref="System.IO.IOException"/> is logged and not rethrown, so the method
        /// may return null when the failure happens before any result was built.
        /// </remarks>
        /// <param name="input">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFormat">
        ///
        /// <see cref="OutputFormat"/>
        /// for the result returned
        /// by
        /// <see cref="iText.Pdfocr.IOcrEngine"/>
        /// </param>
        /// <returns>
        ///
        /// <see cref="ITesseractOcrResult"/>
        /// instance, either
        /// <see cref="StringTesseractOcrResult"/>
        /// if output format is TXT, or
        /// <see cref="TextInfoTesseractOcrResult"/>
        /// if the output format is HOCR
        /// </returns>
        private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat
                                                                                  )
        {
            // accumulators shared by all loop iterations below
            IDictionary <int, IList <TextInfo> > imageData = new LinkedDictionary <int, IList <TextInfo> >();
            StringBuilder    data      = new StringBuilder();
            IList <FileInfo> tempFiles = new List <FileInfo>();

            AbstractTesseract4OcrEngine.ITesseractOcrResult result = null;
            try {
                // a non-TIFF image counts as one page, for TIFF the page count is queried
                int realNumOfPages = !ImagePreprocessingUtil.IsTiffImage(input) ? 1 : ImagePreprocessingUtil.GetNumberOfPageTiff
                                         (input);
                // with preprocessing: one loop iteration per page, one temp file added per iteration;
                // without preprocessing: a single iteration that adds one temp file per page
                int numOfPages = GetTesseract4OcrEngineProperties().IsPreprocessingImages() ? realNumOfPages : 1;
                int numOfFiles = GetTesseract4OcrEngineProperties().IsPreprocessingImages() ? 1 : realNumOfPages;
                for (int page = 1; page <= numOfPages; page++)
                {
                    String extension = outputFormat.Equals(OutputFormat.HOCR) ? ".hocr" : ".txt";
                    // tempFiles is not cleared between iterations, so it grows with every page
                    // and the whole list is handed to DoTesseractOcr each time
                    for (int i = 0; i < numOfFiles; i++)
                    {
                        tempFiles.Add(CreateTempFile(extension));
                    }
                    DoTesseractOcr(input, tempFiles, outputFormat, page);
                    if (outputFormat.Equals(OutputFormat.HOCR))
                    {
                        IDictionary <int, IList <TextInfo> > pageData = TesseractHelper.ParseHocrFile(tempFiles, GetTesseract4OcrEngineProperties
                                                                                                          ().GetTextPositioning());
                        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                        {
                            // only the entry with key 1 of the parsed data is stored, under the current page number
                            imageData.Put(page, pageData.Get(1));
                        }
                        else
                        {
                            // without preprocessing the parsed data is used as is
                            imageData = pageData;
                        }
                        result = new AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult(imageData);
                    }
                    else
                    {
                        // every existing temp file in the list is appended to the shared text buffer
                        // NOTE(review): with several iterations, files added in earlier iterations are read
                        // again here - confirm this relies on DoTesseractOcr overwriting the first file
                        foreach (FileInfo tmpFile in tempFiles)
                        {
                            if (File.Exists(System.IO.Path.Combine(tmpFile.FullName)))
                            {
                                data.Append(TesseractHelper.ReadTxtFile(tmpFile));
                            }
                        }
                        result = new AbstractTesseract4OcrEngine.StringTesseractOcrResult(data.ToString());
                    }
                }
            }
            catch (System.IO.IOException e) {
                // logged only: the result built so far (possibly null) is still returned
                LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE
                                                                               , e.Message));
            }
            finally {
                // temporary files are removed on success and on failure
                foreach (FileInfo file in tempFiles)
                {
                    TesseractHelper.DeleteFile(file.FullName);
                }
            }
            return(result);
        }

C# (CSharp) iText.Pdfocr.Tesseract4 ImagePreprocessingUtil示例