/// <summary>
/// Reads data from the provided input image file and returns the retrieved
/// data as a string.
/// </summary>
/// <remarks>
/// For <c>OutputFormat.TXT</c> the text held by the OCR result is returned as is.
/// For any other output format the text of every recognized item is appended
/// followed by a line break, and every page is terminated by one additional
/// line break. A page without a list of text items contributes only that
/// terminating line break.
/// </remarks>
/// <param name="input">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// requested
/// <see cref="OutputFormat"/>
/// of the OCR result
/// </param>
/// <returns>
/// OCR result as a
/// <see cref="System.String"/>
/// that is returned after processing the given image; an empty string when
/// processing produced no result object
/// </returns>
public String DoImageOcr(FileInfo input, OutputFormat outputFormat) {
    VerifyImageFormatValidity(input);
    AbstractTesseract4OcrEngine.ITesseractOcrResult processedData = ProcessInputFiles(input, outputFormat);
    if (processedData == null) {
        // ProcessInputFiles returns null when no page was processed,
        // e.g. after an I/O error that it only logs.
        return "";
    }
    if (outputFormat.Equals(OutputFormat.TXT)) {
        return ((AbstractTesseract4OcrEngine.StringTesseractOcrResult)processedData).GetData();
    }
    IDictionary<int, IList<TextInfo>> outputMap =
        ((AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult)processedData).GetTextInfos();
    StringBuilder outputText = new StringBuilder();
    foreach (int page in outputMap.Keys) {
        IList<TextInfo> pageTextInfos = outputMap.Get(page);
        // ProcessInputFiles stores the parsed list for a page without checking it,
        // so a page entry may hold no list at all (presumably Get yields null for
        // a missing key - confirm). Skip the items instead of failing on it.
        if (pageTextInfos != null) {
            foreach (TextInfo textInfo in pageTextInfos) {
                outputText.Append(textInfo.GetText());
                outputText.Append(Environment.NewLine);
            }
        }
        // Page separator, emitted even for a page without text items.
        outputText.Append(Environment.NewLine);
    }
    return outputText.ToString();
}
/// <summary>
/// Runs OCR on the provided input image file and collects the output
/// written to temporary files.
/// </summary>
/// <remarks>
/// All temporary files created here are deleted before the method returns.
/// A <see cref="System.IO.IOException"/> is logged and not rethrown.
/// </remarks>
/// <param name="input">
/// input image
/// <see cref="System.IO.FileInfo"/>
/// </param>
/// <param name="outputFormat">
/// <see cref="OutputFormat"/>
/// for the result returned by
/// <see cref="iText.Pdfocr.IOcrEngine"/>
/// </param>
/// <returns>
/// <see cref="ITesseractOcrResult"/>
/// instance, either
/// <see cref="StringTesseractOcrResult"/>
/// if output format is TXT, or
/// <see cref="TextInfoTesseractOcrResult"/>
/// if the output format is HOCR; <c>null</c> if no page was processed
/// </returns>
private AbstractTesseract4OcrEngine.ITesseractOcrResult ProcessInputFiles(FileInfo input, OutputFormat outputFormat) {
    IDictionary<int, IList<TextInfo>> textInfosByPage = new LinkedDictionary<int, IList<TextInfo>>();
    StringBuilder plainText = new StringBuilder();
    IList<FileInfo> tempFiles = new List<FileInfo>();
    AbstractTesseract4OcrEngine.ITesseractOcrResult ocrResult = null;
    try {
        // Only a tiff image can hold more than one page.
        int realNumOfPages;
        if (ImagePreprocessingUtil.IsTiffImage(input)) {
            realNumOfPages = ImagePreprocessingUtil.GetNumberOfPageTiff(input);
        } else {
            realNumOfPages = 1;
        }

        // With preprocessing enabled: one pass per page, one temp file per pass.
        // Without preprocessing: a single pass with one temp file per page.
        int numOfPages = 1;
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
            numOfPages = realNumOfPages;
        }
        int numOfFiles = realNumOfPages;
        if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
            numOfFiles = 1;
        }

        for (int pageNumber = 1; pageNumber <= numOfPages; pageNumber++) {
            bool hocrRequested = outputFormat.Equals(OutputFormat.HOCR);
            String extension;
            if (hocrRequested) {
                extension = ".hocr";
            } else {
                extension = ".txt";
            }

            // The list is shared by all passes, so files created for earlier
            // pages are still part of it on later passes.
            for (int created = 0; created < numOfFiles; created++) {
                FileInfo tempFile = CreateTempFile(extension);
                tempFiles.Add(tempFile);
            }

            DoTesseractOcr(input, tempFiles, outputFormat, pageNumber);

            if (hocrRequested) {
                IDictionary<int, IList<TextInfo>> parsedPages = TesseractHelper.ParseHocrFile(
                    tempFiles, GetTesseract4OcrEngineProperties().GetTextPositioning());
                if (GetTesseract4OcrEngineProperties().IsPreprocessingImages()) {
                    // Store the entry parsed under key 1 under the current page number.
                    textInfosByPage.Put(pageNumber, parsedPages.Get(1));
                } else {
                    textInfosByPage = parsedPages;
                }
                ocrResult = new AbstractTesseract4OcrEngine.TextInfoTesseractOcrResult(textInfosByPage);
            } else {
                foreach (FileInfo tempFile in tempFiles) {
                    String tempFilePath = System.IO.Path.Combine(tempFile.FullName);
                    if (!File.Exists(tempFilePath)) {
                        continue;
                    }
                    plainText.Append(TesseractHelper.ReadTxtFile(tempFile));
                }
                ocrResult = new AbstractTesseract4OcrEngine.StringTesseractOcrResult(plainText.ToString());
            }
        }
    }
    catch (System.IO.IOException e) {
        // Logged only: the caller receives whatever result was built so far.
        String logMessage = MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE, e.Message);
        LogManager.GetLogger(GetType()).Error(logMessage);
    }
    finally {
        foreach (FileInfo tempFile in tempFiles) {
            TesseractHelper.DeleteFile(tempFile.FullName);
        }
    }
    return ocrResult;
}