/// <summary>
/// Runs OCR over the given images and builds a PDF document that is written
/// to an in-memory stream.
/// </summary>
/// <param name="tesseractReader">OCR engine passed to the <c>OcrPdfCreator</c></param>
/// <param name="imageFiles">images to be processed</param>
private static void DoImageToPdfOcr(AbstractTesseract4OcrEngine tesseractReader, IList<FileInfo> imageFiles) {
    OcrPdfCreator pdfCreator = new OcrPdfCreator(tesseractReader);
    // The output is not inspected here, so a throw-away in-memory writer is used.
    PdfWriter inMemoryWriter = new PdfWriter(new MemoryStream());
    pdfCreator.CreatePdf(imageFiles, inMemoryWriter);
}
/// <summary>
/// Recognizes text in the image at the given path and asserts that the
/// recognized text contains the expected fragment.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="path">path to the image file</param>
/// <param name="expectedOutput">text that must be contained in the recognized output</param>
private void TestImageOcrText(AbstractTesseract4OcrEngine tesseractReader, String path, String expectedOutput) {
    FileInfo imageFile = new FileInfo(path);
    String recognizedText = GetTextUsingTesseractFromImage(tesseractReader, imageFile);
    bool containsExpected = recognizedText.Contains(expectedOutput);
    NUnit.Framework.Assert.IsTrue(containsExpected);
}
/// <summary>
/// Runs OCR for the given image, saves the result as a text file in the
/// target directory and compares that file with the expected one.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="imgPath">path to the input image</param>
/// <param name="expectedPath">path to the expected text file</param>
/// <param name="languages">languages passed to OCR and used to build the result file name</param>
/// <returns>the result of <c>CompareTxtFiles</c> for the expected and produced files</returns>
private bool DoOcrAndCompareTxtFiles(AbstractTesseract4OcrEngine tesseractReader, String imgPath, String expectedPath, IList<String> languages) {
    String actualPath = GetTargetDirectory() + GetImageName(imgPath, languages) + ".txt";
    DoOcrAndSaveToTextFile(tesseractReader, imgPath, actualPath, languages);
    bool filesMatch = CompareTxtFiles(expectedPath, actualPath);
    return filesMatch;
}
/// <summary>Creates a runnable that stores everything needed to OCR one image.</summary>
/// <param name="tesseractReader">OCR engine to use</param>
/// <param name="metaInfo">meta info stored with the runnable</param>
/// <param name="imgFile">input image file</param>
/// <param name="outputFile">output file</param>
/// <param name="createPdf">flag stored as <c>createPdf</c></param>
internal DoImageOcrRunnable(AbstractTesseract4OcrEngine tesseractReader, IMetaInfo metaInfo, FileInfo imgFile, FileInfo outputFile, bool createPdf) {
    // Engine and its meta info.
    this.tesseractReader = tesseractReader;
    this.metaInfo = metaInfo;

    // Input and output locations.
    this.imgFile = imgFile;
    this.outputFile = outputFile;

    // Requested output kind.
    this.createPdf = createPdf;
}
/// <summary>
/// Creates the test for the given reader type, remembering whether the
/// executable reader is used and the matching test file type name.
/// </summary>
/// <param name="type">reader type the test is run with</param>
public EventCountingTest(IntegrationTestHelper.ReaderType type) {
    isExecutableReaderType = type.Equals(IntegrationTestHelper.ReaderType.EXECUTABLE);
    // Test file type name is "executable" for the executable reader and "lib" otherwise.
    testFileTypeName = isExecutableReaderType ? "executable" : "lib";
    tesseractReader = GetTesseractReader(type);
}
/// <summary>
/// Runs OCR for the given file, saves the result as a PDF in the target
/// directory and retrieves text from the specified page of that PDF.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="file">input image file</param>
/// <param name="page">page to retrieve the text from</param>
/// <param name="languages">languages passed to OCR and used to build the PDF file name</param>
/// <param name="fonts">font paths passed on to PDF creation</param>
/// <returns>
/// the extracted text, or <c>null</c> if a <c>System.IO.IOException</c>
/// was thrown (the exception message is logged)
/// </returns>
protected internal virtual String GetTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, FileInfo file, int page, IList<String> languages, IList<String> fonts) {
    try {
        String sourcePath = file.FullName;
        String targetPdf = GetTargetDirectory() + GetImageName(sourcePath, languages) + ".pdf";
        DoOcrAndSavePdfToPath(tesseractReader, file.FullName, targetPdf, languages, fonts);
        return GetTextFromPdfLayer(targetPdf, null, page);
    }
    catch (System.IO.IOException ioException) {
        LOGGER.Error(ioException.Message);
    }
    return null;
}
/// <summary>
/// Performs OCR for the image at the provided path, saves the result to a
/// text file in the target directory and reads the text back from that file.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="input">path to the input image</param>
/// <param name="languages">languages passed to OCR and used to build the text file name</param>
/// <returns>
/// the text read from the produced file, or <c>null</c> if any exception
/// was thrown (the exception message is logged)
/// </returns>
protected internal virtual String GetRecognizedTextFromTextFile(AbstractTesseract4OcrEngine tesseractReader, String input, IList<String> languages) {
    try {
        String outputPath = GetTargetDirectory() + GetImageName(input, languages) + ".txt";
        DoOcrAndSaveToTextFile(tesseractReader, input, outputPath, languages);
        FileInfo producedFile = new FileInfo(outputPath);
        return GetTextFromTextFile(producedFile);
    }
    catch (Exception failure) {
        LOGGER.Error(failure.Message);
    }
    return null;
}
/// <summary>
/// Performs OCR for the image at <paramref name="imgPath"/> and saves the
/// resulting PDF document to <paramref name="pdfPath"/>.
/// </summary>
/// <remarks>
/// (Method is used for compare tool)
/// <para>
/// The created document uses "en-US" as PDF language and an empty title.
/// A <c>System.IO.IOException</c> thrown while writing is logged, not rethrown.
/// </para>
/// </remarks>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="imgPath">path to the input image</param>
/// <param name="pdfPath">path the PDF document is written to</param>
/// <param name="languages">languages to set on the engine; skipped when <c>null</c></param>
/// <param name="fonts">font paths to register; skipped when <c>null</c> or empty</param>
/// <param name="color">text color to set; skipped when <c>null</c></param>
protected internal virtual void DoOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath, String pdfPath, IList<String> languages, IList<String> fonts, Color color) {
    // Configure the engine languages when they are provided.
    if (languages != null) {
        Tesseract4OcrEngineProperties engineProperties = tesseractReader.GetTesseract4OcrEngineProperties();
        engineProperties.SetLanguages(languages);
        tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);
    }

    OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
    creatorProperties.SetPdfLang("en-US");
    creatorProperties.SetTitle("");

    // Register every requested font under the name mapped to its path.
    bool hasFonts = fonts != null && fonts.Count > 0;
    if (hasFonts) {
        FontProvider provider = new FontProvider();
        foreach (String path in fonts) {
            String fontName = FONT_PATH_TO_FONT_NAME_MAP.Get(path);
            provider.GetFontSet().AddFont(path, PdfEncodings.IDENTITY_H, fontName);
        }
        creatorProperties.SetFontProvider(provider);
    }

    if (color != null) {
        creatorProperties.SetTextColor(color);
    }

    // The engine must report as many languages as were requested.
    if (languages != null) {
        int configuredCount = tesseractReader.GetTesseract4OcrEngineProperties().GetLanguages().Count;
        NUnit.Framework.Assert.AreEqual(languages.Count, configuredCount);
    }

    OcrPdfCreator pdfCreator = new OcrPdfCreator(tesseractReader, creatorProperties);
    try {
        using (PdfWriter writer = GetPdfWriter(pdfPath)) {
            FileInfo image = new FileInfo(imgPath);
            PdfDocument document = pdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(image), writer);
            NUnit.Framework.Assert.IsNotNull(document);
            document.Close();
        }
    }
    catch (System.IO.IOException ioException) {
        LOGGER.Error(ioException.Message);
    }
}
/// <summary>
/// Performs OCR for the image at <paramref name="imgPath"/> and saves the
/// result to the text file at <paramref name="txtPath"/>.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="imgPath">path to the input image</param>
/// <param name="txtPath">path of the text file to create</param>
/// <param name="languages">
/// languages to set on the engine; when <c>null</c> the engine properties are
/// left untouched and no language-count check is made
/// </param>
protected internal virtual void DoOcrAndSaveToTextFile(AbstractTesseract4OcrEngine tesseractReader, String imgPath, String txtPath, IList<String> languages) {
    bool languagesProvided = languages != null;
    if (languagesProvided) {
        Tesseract4OcrEngineProperties engineProperties = tesseractReader.GetTesseract4OcrEngineProperties();
        engineProperties.SetLanguages(languages);
        tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);
    }

    tesseractReader.CreateTxtFile(
        JavaCollectionsUtil.SingletonList<FileInfo>(new FileInfo(imgPath)),
        new FileInfo(txtPath));

    // After OCR the engine must still report as many languages as were requested.
    if (languagesProvided) {
        int configuredCount = tesseractReader.GetTesseract4OcrEngineProperties().GetLanguages().Count;
        NUnit.Framework.Assert.AreEqual(languages.Count, configuredCount);
    }
}
/// <summary>
/// Runs OCR over the given images and builds a PDF/A document that is
/// written to an in-memory stream.
/// </summary>
/// <remarks>
/// The output intent is built from the "sRGB_CS_profile.icm" file located in
/// <c>PROFILE_FOLDER</c>. The profile stream is disposed once the document
/// has been created so that the file handle is not leaked.
/// </remarks>
/// <param name="tesseractReader">OCR engine passed to the <c>OcrPdfCreator</c></param>
/// <param name="imageFiles">images to be processed</param>
private static void DoImageToPdfAOcr(AbstractTesseract4OcrEngine tesseractReader, IList<FileInfo> imageFiles) {
    OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, new OcrPdfCreatorProperties().SetPdfLang("en-US"));
    Stream @is = null;
    try {
        try {
            @is = new FileStream(PROFILE_FOLDER + "sRGB_CS_profile.icm", FileMode.Open, FileAccess.Read);
        }
        catch (FileNotFoundException) {
            // Not expected. As before, the (null) stream is still handed to
            // PdfOutputIntent below, so any resulting failure surfaces there.
        }
        PdfOutputIntent outputIntent = new PdfOutputIntent("Custom", "", "http://www.color.org", "sRGB IEC61966-2.1", @is);
        ocrPdfCreator.CreatePdfA(imageFiles, new PdfWriter(new MemoryStream()), outputIntent);
    }
    finally {
        // Release the profile file handle whether or not OCR succeeded.
        if (@is != null) {
            @is.Dispose();
        }
    }
}
/// <summary>
/// Retrieves text from the specified page of the PDF produced for the given
/// file, using a single font.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="file">input image file</param>
/// <param name="page">page to retrieve the text from</param>
/// <param name="languages">languages passed on to the list-based overload</param>
/// <param name="fontPath">path to the font, wrapped into a singleton list</param>
/// <returns>the result of the overload that accepts a list of fonts</returns>
protected internal virtual String GetTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, FileInfo file, int page, IList<String> languages, String fontPath) {
    IList<String> singleFont = JavaCollectionsUtil.SingletonList<String>(fontPath);
    return GetTextFromPdf(tesseractReader, file, page, languages, singleFont);
}
/// <summary>Runs OCR for a single image; the result of the call is discarded.</summary>
/// <param name="tesseractReader">OCR engine whose <c>DoImageOcr</c> is invoked</param>
/// <param name="imageFile">image to be processed</param>
private static void DoImageOcr(
    AbstractTesseract4OcrEngine tesseractReader,
    FileInfo imageFile) {
    tesseractReader.DoImageOcr(imageFile);
}
/// <summary>
/// Creates the test for the given reader type and stores the lower-cased
/// (invariant culture) name of that type as the test type.
/// </summary>
/// <param name="type">reader type the test is run with</param>
public ImageFormatIntegrationTest(IntegrationTestHelper.ReaderType type) {
    tesseractReader = GetTesseractReader(type);
    String readerTypeName = type.ToString();
    this.testType = readerTypeName.ToLowerInvariant();
}
/// <summary>Creates the test and obtains the reader for the given reader type.</summary>
/// <param name="type">reader type the test is run with</param>
public MultiThreadingTest(IntegrationTestHelper.ReaderType type) {
    // The reader is obtained once per test instance.
    tesseractReader = GetTesseractReader(type);
}
/// <summary>
/// Retrieves text from the first page of the PDF produced for the given
/// file, using the given font.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="file">input image file</param>
/// <param name="languages">languages passed on to the page-based overload</param>
/// <param name="fontPath">path to the font</param>
/// <returns>the result of the page-based overload called with page 1</returns>
protected internal virtual String GetTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, FileInfo file, IList<String> languages, String fontPath) {
    const int firstPage = 1;
    return GetTextFromPdf(tesseractReader, file, firstPage, languages, fontPath);
}
/// <summary>
/// Performs OCR for the image at the provided path, saves the result to a
/// text file and reads the text back, without specifying languages.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="input">path to the input image</param>
/// <returns>the result of the overload that accepts languages, called with <c>null</c></returns>
protected internal virtual String GetRecognizedTextFromTextFile(AbstractTesseract4OcrEngine tesseractReader, String input) {
    IList<String> unspecifiedLanguages = null;
    return GetRecognizedTextFromTextFile(tesseractReader, input, unspecifiedLanguages);
}
/// <summary>
/// Retrieves text from the first page of the PDF produced for the given
/// file, with no languages and an empty font list.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="file">input image file</param>
/// <returns>the result of the full overload called with page 1</returns>
protected internal virtual String GetTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, FileInfo file) {
    const int firstPage = 1;
    IList<String> noFonts = new List<String>();
    return GetTextFromPdf(tesseractReader, file, firstPage, null, noFonts);
}
/// <summary>
/// Retrieves text from the required page of the PDF produced for the given
/// file, with an empty font list.
/// </summary>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="file">input image file</param>
/// <param name="page">page to retrieve the text from</param>
/// <param name="languages">languages passed on to the full overload</param>
/// <returns>the result of the overload that accepts a list of fonts</returns>
protected internal virtual String GetTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, FileInfo file, int page, IList<String> languages) {
    IList<String> noFonts = new List<String>();
    return GetTextFromPdf(tesseractReader, file, page, languages, noFonts);
}
/// <summary>Creates the test and obtains the reader for the given reader type.</summary>
/// <param name="type">reader type the test is run with</param>
public BasicTesseractIntegrationTest(IntegrationTestHelper.ReaderType type) {
    // The reader is obtained once per test instance.
    tesseractReader = GetTesseractReader(type);
}
/// <summary>
/// Performs OCR for the image at <paramref name="imgPath"/> and saves the
/// resulting PDF document to <paramref name="pdfPath"/>, without a text color.
/// </summary>
/// <remarks>
/// (Text will be invisible)
/// <para>Delegates to the overload that accepts a color, passing <c>null</c>.</para>
/// </remarks>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="imgPath">path to the input image</param>
/// <param name="pdfPath">path the PDF document is written to</param>
/// <param name="languages">languages passed on unchanged</param>
/// <param name="fonts">font paths passed on unchanged</param>
protected internal virtual void DoOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath, String pdfPath, IList<String> languages, IList<String> fonts) {
    DoOcrAndSavePdfToPath(
        tesseractReader,
        imgPath,
        pdfPath,
        languages,
        fonts,
        null);
}
/// <summary>Creates the test and obtains the reader for the given reader type.</summary>
/// <param name="type">reader type the test is run with</param>
public PdfA3UIntegrationTest(IntegrationTestHelper.ReaderType type) {
    // The reader is obtained once per test instance.
    tesseractReader = GetTesseractReader(type);
}
/// <summary>
/// Performs OCR for the image at <paramref name="imgPath"/> and saves the
/// resulting PDF document to <paramref name="pdfPath"/>, without languages,
/// fonts or text color.
/// </summary>
/// <remarks>
/// (Method is used for compare tool)
/// <para>Delegates to the full overload, passing <c>null</c> for languages, fonts and color.</para>
/// </remarks>
/// <param name="tesseractReader">OCR engine used for recognition</param>
/// <param name="imgPath">path to the input image</param>
/// <param name="pdfPath">path the PDF document is written to</param>
protected internal virtual void DoOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath, String pdfPath) {
    DoOcrAndSavePdfToPath(
        tesseractReader,
        imgPath,
        pdfPath,
        null,   // languages
        null,   // fonts
        null);  // color
}