/// <summary>
/// Runs OCR with the "spa" language on the "spanish_01" JPG image, builds a PDF/A document
/// through <c>OcrPdfCreator.CreatePdfA</c> using the output intent returned by
/// <c>GetRGBPdfOutputIntent()</c>, and asserts that the result matches the stored
/// "_a3u.pdf" reference document by content.
/// </summary>
public virtual void ComparePdfA3uRGBSpanishJPG() {
    String testName = "comparePdfA3uRGBSpanishJPG";
    String filename = "spanish_01";
    String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_a3u.pdf";
    String resultPdfPath = GetTargetDirectory() + filename + "_" + testName + "_a3u.pdf";

    // Engine configuration: start from the reader's current properties, then set tess data path and language.
    Tesseract4OcrEngineProperties engineProperties = new Tesseract4OcrEngineProperties(
        tesseractReader.GetTesseract4OcrEngineProperties());
    engineProperties.SetPathToTessData(GetTessDataDirectory());
    engineProperties.SetLanguages(JavaCollectionsUtil.SingletonList<String>("spa"));
    tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);

    // PDF creation settings: document language, empty title and black text.
    OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
    creatorProperties.SetPdfLang("en-US");
    creatorProperties.SetTitle("");
    creatorProperties.SetTextColor(DeviceRgb.BLACK);

    OcrPdfCreator pdfCreator = new OcrPdfCreator(tesseractReader, creatorProperties);
    FileInfo inputImage = new FileInfo(TEST_IMAGES_DIRECTORY + filename + ".jpg");
    PdfDocument pdfDocument = pdfCreator.CreatePdfA(
        JavaCollectionsUtil.SingletonList<FileInfo>(inputImage),
        GetPdfWriter(resultPdfPath),
        GetRGBPdfOutputIntent());
    NUnit.Framework.Assert.IsNotNull(pdfDocument);
    pdfDocument.Close();

    // The assertion expects a null comparison result.
    String comparisonResult = new CompareTool().CompareByContent(resultPdfPath, expectedPdfPath,
        GetTargetDirectory(), "diff_");
    NUnit.Framework.Assert.IsNull(comparisonResult);
}
/// <summary>
/// Runs OCR on the image located at <paramref name="imgPath"/> and writes the result
/// to the text file at <paramref name="txtPath"/>.
/// </summary>
/// <remarks>
/// When <paramref name="languages"/> is not null, the list is set on the engine properties
/// before OCR, and after OCR the method asserts that the engine still reports the same
/// number of languages.
/// </remarks>
/// <param name="tesseractReader">OCR engine used to create the text file</param>
/// <param name="imgPath">path to the input image</param>
/// <param name="txtPath">path of the text file to create</param>
/// <param name="languages">languages to set on the engine, or null to leave them untouched</param>
protected internal virtual void DoOcrAndSaveToTextFile(AbstractTesseract4OcrEngine tesseractReader, String imgPath,
    String txtPath, IList<String> languages) {
    bool languagesProvided = languages != null;
    if (languagesProvided) {
        Tesseract4OcrEngineProperties engineProperties = tesseractReader.GetTesseract4OcrEngineProperties();
        engineProperties.SetLanguages(languages);
        tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);
    }

    FileInfo inputImage = new FileInfo(imgPath);
    FileInfo outputTextFile = new FileInfo(txtPath);
    tesseractReader.CreateTxtFile(JavaCollectionsUtil.SingletonList<FileInfo>(inputImage), outputTextFile);

    if (languagesProvided) {
        int configuredLanguageCount = tesseractReader.GetTesseract4OcrEngineProperties().GetLanguages().Count;
        NUnit.Framework.Assert.AreEqual(languages.Count, configuredLanguageCount);
    }
}
/// <summary>
/// Runs OCR on the image located at <paramref name="imgPath"/> and saves the resulting
/// PDF document to <paramref name="pdfPath"/>.
/// </summary>
/// <remarks>
/// Runs OCR on the image located at <paramref name="imgPath"/> and saves the resulting
/// PDF document to <paramref name="pdfPath"/>.
/// (Method is used for compare tool)
/// <para>
/// An <see cref="System.IO.IOException"/> raised while writing the document is logged
/// through <c>LOGGER</c> and not rethrown.
/// </para>
/// </remarks>
/// <param name="tesseractReader">OCR engine used to recognize the image</param>
/// <param name="imgPath">path to the input image</param>
/// <param name="pdfPath">path of the PDF file to create</param>
/// <param name="languages">languages to set on the engine, or null to leave them untouched</param>
/// <param name="fonts">font paths to register in a font provider; ignored when null or empty</param>
/// <param name="color">text color to set on the creator properties; ignored when null</param>
protected internal virtual void DoOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath,
    String pdfPath, IList<String> languages, IList<String> fonts, Color color) {
    bool languagesProvided = languages != null;
    if (languagesProvided) {
        Tesseract4OcrEngineProperties engineProperties = tesseractReader.GetTesseract4OcrEngineProperties();
        engineProperties.SetLanguages(languages);
        tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);
    }

    OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
    creatorProperties.SetPdfLang("en-US");
    creatorProperties.SetTitle("");

    bool fontsProvided = fonts != null && fonts.Count > 0;
    if (fontsProvided) {
        FontProvider customFontProvider = new FontProvider();
        for (int i = 0; i < fonts.Count; i++) {
            String fontPath = fonts[i];
            // The font name is looked up by path in the shared path-to-name map.
            String fontName = FONT_PATH_TO_FONT_NAME_MAP.Get(fontPath);
            customFontProvider.GetFontSet().AddFont(fontPath, PdfEncodings.IDENTITY_H, fontName);
        }
        creatorProperties.SetFontProvider(customFontProvider);
    }

    if (color != null) {
        creatorProperties.SetTextColor(color);
    }

    if (languagesProvided) {
        int configuredLanguageCount = tesseractReader.GetTesseract4OcrEngineProperties().GetLanguages().Count;
        NUnit.Framework.Assert.AreEqual(languages.Count, configuredLanguageCount);
    }

    OcrPdfCreator pdfCreator = new OcrPdfCreator(tesseractReader, creatorProperties);
    try {
        using (PdfWriter pdfWriter = GetPdfWriter(pdfPath)) {
            FileInfo inputImage = new FileInfo(imgPath);
            PdfDocument pdfDocument = pdfCreator.CreatePdf(
                JavaCollectionsUtil.SingletonList<FileInfo>(inputImage), pdfWriter);
            NUnit.Framework.Assert.IsNotNull(pdfDocument);
            pdfDocument.Close();
        }
    }
    catch (System.IO.IOException e) {
        // Only the message is logged; the caller is not notified of the failure.
        LOGGER.Error(e.Message);
    }
}
/// <summary>
/// Runs OCR with the "tha" and "eng" languages on the "invoice_front_thai" JPG image,
/// saves the result as a PDF with red text, and asserts that it matches at least one of
/// the two stored reference documents (Java or .NET) by content.
/// </summary>
/// <remarks>
/// Comparison artifacts are written to the target directory (the same directory that
/// receives the result PDF) rather than to the directory holding the reference documents.
/// </remarks>
public virtual void CompareInvoiceFrontThaiImage() {
    String testName = "compareInvoiceFrontThaiImage";
    String filename = "invoice_front_thai";
    // Tesseract for Java and Tesseract for .NET give different output,
    // so we cannot use one reference pdf file for them.
    String expectedPdfPathJava = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_java.pdf";
    String expectedPdfPathDotNet = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_dotnet.pdf";
    String targetDirectory = GetTargetDirectory();
    String resultPdfPath = targetDirectory + filename + "_" + testName + "_" + testFileTypeName + ".pdf";

    Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties();
    properties.SetTextPositioning(TextPositioning.BY_WORDS_AND_LINES);
    properties.SetPathToTessData(GetTessDataDirectory());
    properties.SetLanguages(JavaUtil.ArraysAsList("tha", "eng"));
    tesseractReader.SetTesseract4OcrEngineProperties(properties);

    DoOcrAndSavePdfToPath(tesseractReader, TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath,
        JavaUtil.ArraysAsList("tha", "eng"),
        JavaUtil.ArraysAsList(NOTO_SANS_THAI_FONT_PATH, NOTO_SANS_FONT_PATH), DeviceRgb.RED);

    // Both comparisons are always executed; the test passes when either reference matches.
    // The output path is the target directory so diff files do not land next to the references.
    bool javaTest = new CompareTool().CompareByContent(resultPdfPath, expectedPdfPathJava,
        targetDirectory, "diff_") == null;
    bool dotNetTest = new CompareTool().CompareByContent(resultPdfPath, expectedPdfPathDotNet,
        targetDirectory, "diff_") == null;
    NUnit.Framework.Assert.IsTrue(javaTest || dotNetTest);
}
/// <summary>
/// Runs OCR with the "spa" and "spa_old" languages on the "scanned_spa_01" PNG image using
/// word-level text positioning, then asserts that the text read back from the result PDF
/// contains one of the two accepted phrases.
/// </summary>
/// <remarks>
/// Regardless of the text assertion outcome, the method also asserts that the engine's
/// text positioning is still <c>TextPositioning.BY_WORDS</c>.
/// </remarks>
public virtual void TestSpanishPNG() {
    String testName = "compareSpanishPNG";
    String filename = "scanned_spa_01";
    String expectedText1 = "¿Y SI ENSAYARA COMO ACTUAR?";
    String expectedText2 = "¿Y SI ENSAYARA ACTUAR?";
    String resultPdfPath = GetTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf";
    IList<String> ocrLanguages = JavaUtil.ArraysAsList("spa", "spa_old");

    Tesseract4OcrEngineProperties engineProperties = tesseractReader.GetTesseract4OcrEngineProperties();
    if (isExecutableReaderType) {
        engineProperties.SetPreprocessingImages(false);
    }
    // locate text by words
    engineProperties.SetTextPositioning(TextPositioning.BY_WORDS);
    engineProperties.SetLanguages(ocrLanguages);
    tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);

    OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
    creatorProperties.SetTextColor(DeviceCmyk.BLACK);
    OcrPdfCreator pdfCreator = new OcrPdfCreator(tesseractReader, creatorProperties);

    using (PdfWriter pdfWriter = GetPdfWriter(resultPdfPath)) {
        FileInfo inputImage = new FileInfo(TEST_IMAGES_DIRECTORY + filename + ".png");
        PdfDocument pdfDocument = pdfCreator.CreatePdf(
            JavaCollectionsUtil.SingletonList<FileInfo>(inputImage), pdfWriter);
        pdfDocument.Close();
    }

    try {
        // Line breaks are replaced with spaces so each phrase can be matched as one string.
        String extractedText = GetTextFromPdfLayer(resultPdfPath, null, 1).Replace("\n", " ");
        bool containsExpectedPhrase = extractedText.Contains(expectedText1)
            || extractedText.Contains(expectedText2);
        NUnit.Framework.Assert.IsTrue(containsExpectedPhrase);
    }
    finally {
        NUnit.Framework.Assert.AreEqual(TextPositioning.BY_WORDS,
            tesseractReader.GetTesseract4OcrEngineProperties().GetTextPositioning());
    }
}