예제 #1
0
        /// <summary>
        /// Reads data from the provided input image file and returns retrieved
        /// data as string.
        /// </summary>
        /// <param name="input">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFormat">
        /// return
        /// <see cref="OutputFormat"/>
        /// result
        /// </param>
        /// <returns>
        /// OCR result as a
        /// <see cref="System.String"/>
        /// that is
        /// returned after processing the given image
        /// </returns>
        public String DoImageOcr(FileInfo input, OutputFormat outputFormat)
        {
            String result = "";

            VerifyImageFormatValidity(input);
            AbstractTesseract4OcrEngine.ITesseractOcrResult processedData = ProcessInputFiles(input, outputFormat);
            if (processedData != null)
            {
                if (outputFormat.Equals(OutputFormat.TXT))
                {
                    result = ((AbstractTesseract4OcrEngine.StringTesseractOcrResult)processedData).GetData();
                }
                else
                {
                    StringBuilder outputText = new StringBuilder();
                    IDictionary <int, IList <TextInfo> > outputMap = ((AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult)processedData
                                                                      ).GetTextInfos();
                    foreach (int page in outputMap.Keys)
                    {
                        StringBuilder pageText = new StringBuilder();
                        foreach (TextInfo textInfo in outputMap.Get(page))
                        {
                            pageText.Append(textInfo.GetText());
                            pageText.Append(Environment.NewLine);
                        }
                        outputText.Append(pageText);
                        outputText.Append(Environment.NewLine);
                    }
                    result = outputText.ToString();
                }
            }
            return(result);
        }
예제 #2
0
        /// <summary>Reads data from the provided input image file.</summary>
        /// <param name="input">
        /// input image
        /// <see cref="System.IO.FileInfo"/>
        /// </param>
        /// <param name="outputFormat">
        ///
        /// <see cref="OutputFormat"/>
        /// for the result returned
        /// by
        /// <see cref="iText.Pdfocr.IOcrEngine"/>
        /// </param>
        /// <returns>
        ///
        /// <see cref="ITesseractOcrResult"/>
        /// instance, either
        /// <see cref="StringTesseractOcrResult"/>
        /// if output format is TXT, or
        /// <see cref="TextInfoTesseractOcrResult"/>
        /// if the output format is HOCR
        /// </returns>
        private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat
                                                                                  )
        {
            IDictionary <int, IList <TextInfo> > imageData = new LinkedDictionary <int, IList <TextInfo> >();
            StringBuilder    data      = new StringBuilder();
            IList <FileInfo> tempFiles = new List <FileInfo>();

            AbstractTesseract4OcrEngine.ITesseractOcrResult result = null;
            try {
                // image needs to be paginated only if it's tiff
                // or preprocessing isn't required
                int realNumOfPages = !ImagePreprocessingUtil.IsTiffImage(input) ? 1 : ImagePreprocessingUtil.GetNumberOfPageTiff
                                         (input);
                int numOfPages = GetTesseract4OcrEngineProperties().IsPreprocessingImages() ? realNumOfPages : 1;
                int numOfFiles = GetTesseract4OcrEngineProperties().IsPreprocessingImages() ? 1 : realNumOfPages;
                for (int page = 1; page <= numOfPages; page++)
                {
                    String extension = outputFormat.Equals(OutputFormat.HOCR) ? ".hocr" : ".txt";
                    for (int i = 0; i < numOfFiles; i++)
                    {
                        tempFiles.Add(CreateTempFile(extension));
                    }
                    DoTesseractOcr(input, tempFiles, outputFormat, page);
                    if (outputFormat.Equals(OutputFormat.HOCR))
                    {
                        IDictionary <int, IList <TextInfo> > pageData = TesseractHelper.ParseHocrFile(tempFiles, GetTesseract4OcrEngineProperties
                                                                                                          ().GetTextPositioning());
                        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages())
                        {
                            imageData.Put(page, pageData.Get(1));
                        }
                        else
                        {
                            imageData = pageData;
                        }
                        result = new AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult(imageData);
                    }
                    else
                    {
                        foreach (FileInfo tmpFile in tempFiles)
                        {
                            if (File.Exists(System.IO.Path.Combine(tmpFile.FullName)))
                            {
                                data.Append(TesseractHelper.ReadTxtFile(tmpFile));
                            }
                        }
                        result = new AbstractTesseract4OcrEngine.StringTesseractOcrResult(data.ToString());
                    }
                }
            }
            catch (System.IO.IOException e) {
                LogManager.GetLogger(GetType()).Error(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE
                                                                               , e.Message));
            }
            finally {
                foreach (FileInfo file in tempFiles)
                {
                    TesseractHelper.DeleteFile(file.FullName);
                }
            }
            return(result);
        }