Exemplo n.º 1
0
        /// <summary>Preprocess given image if it is needed.</summary>
        /// <param name="inputImage">
        /// original input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="pageNumber">number of page to be OCRed</param>
        /// <returns>
        /// path to output image as
        /// <see cref="System.String"/>
        /// </returns>
        private String PreprocessImage(FileInfo inputImage, int pageNumber)
        {
            String tmpFileName = TesseractOcrUtil.GetTempFilePath(Guid.NewGuid().ToString(), GetExtension(inputImage));
            String path        = inputImage.FullName;

            try {
                if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                {
                    Pix pix = ImagePreprocessingUtil.PreprocessImage(inputImage, pageNumber);
                    TesseractOcrUtil.SavePixToTempPngFile(tmpFileName, pix);
                    if (!File.Exists(System.IO.Path.Combine(tmpFileName)))
                    {
                        System.Drawing.Bitmap img = TesseractOcrUtil.ConvertPixToImage(pix);
                        if (img != null)
                        {
                            TesseractOcrUtil.SaveImageToTempPngFile(tmpFileName, img);
                        }
                    }
                }
                if (!GetTesseract4OcrEngineProperties().IsPreprocessingImages() || !File.Exists(System.IO.Path.Combine(tmpFileName
                                                                                                                       )))
                {
                    TesseractOcrUtil.CreateTempFileCopy(path, tmpFileName);
                }
                if (File.Exists(System.IO.Path.Combine(tmpFileName)))
                {
                    path = tmpFileName;
                }
            }
            catch (System.IO.IOException e) {
                LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE
                                                                               , e.Message));
            }
            return(path);
        }
Exemplo n.º 2
0
        public virtual void TestNullSavingAsPng()
        {
            String tmpFileName = TesseractOcrUtil.GetTempFilePath(GetTargetDirectory() + "/testNullSavingAsPng", ".png"
                                                                  );

            TesseractOcrUtil.SaveImageToTempPngFile(tmpFileName, null);
            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
            TesseractOcrUtil.SavePixToTempPngFile(tmpFileName, null);
            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(tmpFileName)));
        }
Exemplo n.º 3
0
        public virtual void TestDoTesseractOcrForNonAsciiPathForExecutable()
        {
            String   path       = TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png";
            FileInfo imgFile    = new FileInfo(path);
            FileInfo outputFile = new FileInfo(TesseractOcrUtil.GetTempFilePath("test", ".hocr"));
            Tesseract4OcrEngineProperties properties = new Tesseract4OcrEngineProperties();

            properties.SetPathToTessData(GetTessDataDirectory());
            properties.SetPreprocessingImages(false);
            Tesseract4ExecutableOcrEngine engine = new Tesseract4ExecutableOcrEngine(GetTesseractDirectory(), properties
                                                                                     );

            engine.DoTesseractOcr(imgFile, outputFile, OutputFormat.HOCR);
            NUnit.Framework.Assert.IsTrue(File.Exists(System.IO.Path.Combine(outputFile.FullName)));
            TesseractHelper.DeleteFile(outputFile.FullName);
            NUnit.Framework.Assert.IsFalse(File.Exists(System.IO.Path.Combine(outputFile.FullName)));
        }
Exemplo n.º 4
0
        /// <summary>
        /// Using provided input stream there will be created
        /// temporary file (with name 'language.user-words')
        /// containing words (one per line) which ends with
        /// a new line character.
        /// </summary>
        /// <remarks>
        /// Using provided input stream there will be created
        /// temporary file (with name 'language.user-words')
        /// containing words (one per line) which ends with
        /// a new line character. Train data for provided language
        /// should exist in specified tess data directory.
        /// NOTE:
        /// User words dictionary doesn't work properly in tesseract4
        /// and hidden for public usage until fix is available
        /// </remarks>
        /// <param name="language">
        /// language as
        /// <see cref="System.String"/>
        /// , tessdata for
        /// this languages has to exist in tess data directory
        /// </param>
        /// <param name="inputStream">
        /// custom user words as
        /// <see cref="System.IO.Stream"/>
        /// </param>
        /// <returns>
        /// the
        /// <see cref="Tesseract4OcrEngineProperties"/>
        /// instance
        /// </returns>
        internal virtual iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetUserWords(String language, Stream
                                                                                            inputStream)
        {
            SetPathToUserWordsFile(null);
            if (!GetLanguages().Contains(language))
            {
                if (DEFAULT_LANGUAGE.Equals(language.ToLowerInvariant()))
                {
                    IList <String> languagesList = GetLanguages();
                    languagesList.Add(language);
                    SetLanguages(languagesList);
                }
                else
                {
                    throw new Tesseract4OcrException(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST).SetMessageParams(language
                                                                                                                          );
                }
            }
            String userWordsFileName = TesseractOcrUtil.GetTempFilePath(language, "." + DEFAULT_USER_WORDS_SUFFIX);

            try {
                using (StreamWriter writer = new StreamWriter(userWordsFileName)) {
                    TextReader reader = new StreamReader(inputStream, System.Text.Encoding.UTF8);
                    int        data;
                    while ((data = reader.Read()) != -1)
                    {
                        writer.Write(data);
                    }
                    writer.Write(Environment.NewLine);
                    SetPathToUserWordsFile(userWordsFileName, true);
                }
            }
            catch (System.IO.IOException e) {
                SetPathToUserWordsFile(null);
                LogManager.GetLogger(GetType()).Warn(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS
                                                                              , e.Message));
            }
            return(this);
        }
Exemplo n.º 5
0
        /// <summary>Creates a temporary file with given extension.</summary>
        /// <param name="extension">
        /// file extension for a new file
        /// <see cref="System.String"/>
        /// </param>
        /// <returns>
        /// a new created
        /// <see cref="System.IO.FileInfo"/>
        /// instance
        /// </returns>
        private FileInfo CreateTempFile(String extension)
        {
            String tmpFileName = TesseractOcrUtil.GetTempFilePath(Guid.NewGuid().ToString(), extension);

            return(new FileInfo(tmpFileName));
        }