/// <summary>
/// Runs OCR on "spanish_01.jpg" with the "spa" language, writes the result via
/// <c>CreatePdfA</c> using the RGB output intent and compares the produced file
/// by content with the expected "spanish_01_a3u.pdf" document.
/// </summary>
/// <remarks>
/// The engine properties that were installed on <c>tesseractReader</c> before the
/// test are put back in a <c>finally</c> block, so the Spanish configuration does
/// not stay on the reader field after this method returns or fails.
/// </remarks>
public virtual void ComparePdfA3uRGBSpanishJPG() {
    String testName = "comparePdfA3uRGBSpanishJPG";
    String filename = "spanish_01";
    String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_a3u.pdf";
    String resultPdfPath = GetTargetDirectory() + filename + "_" + testName + "_a3u.pdf";
    // Keep the configuration that was active before this test so it can be reinstated.
    Tesseract4OcrEngineProperties originalProperties = tesseractReader.GetTesseract4OcrEngineProperties();
    try {
        // Work on a copy: only the copy receives the Spanish-specific settings.
        Tesseract4OcrEngineProperties properties = new Tesseract4OcrEngineProperties(originalProperties);
        properties.SetPathToTessData(GetTessDataDirectory());
        properties.SetLanguages(JavaCollectionsUtil.SingletonList<String>("spa"));
        tesseractReader.SetTesseract4OcrEngineProperties(properties);
        OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();
        ocrPdfCreatorProperties.SetPdfLang("en-US");
        ocrPdfCreatorProperties.SetTitle("");
        ocrPdfCreatorProperties.SetTextColor(DeviceRgb.BLACK);
        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
        PdfDocument doc = ocrPdfCreator.CreatePdfA(JavaCollectionsUtil.SingletonList<FileInfo>(new FileInfo(TEST_IMAGES_DIRECTORY
             + filename + ".jpg")), GetPdfWriter(resultPdfPath), GetRGBPdfOutputIntent());
        NUnit.Framework.Assert.IsNotNull(doc);
        // The document has to be closed before the result file is compared.
        doc.Close();
        NUnit.Framework.Assert.IsNull(new CompareTool().CompareByContent(resultPdfPath, expectedPdfPath, GetTargetDirectory
            (), "diff_"));
    }
    finally {
        tesseractReader.SetTesseract4OcrEngineProperties(originalProperties);
    }
}
/// <summary>
/// Runs OCR on "numbers_01.jpg" with text positioned by words, writes the result
/// via <c>CreatePdfA</c> using the CMYK output intent and compares the produced
/// file by content with the expected "numbers_01_a3u.pdf" document.
/// </summary>
/// <remarks>
/// Also checks that the creator returns the engine it was constructed with and
/// that the engine still reports <c>BY_WORDS</c> after the OCR run. The
/// <c>finally</c> block only switches text positioning back to <c>BY_LINES</c>;
/// it contains no assertions, so a failure inside the <c>try</c> body is never
/// replaced by a second failure and the reset is always executed.
/// </remarks>
public virtual void ComparePdfA3uCMYKColorSpaceJPG() {
    String testName = "comparePdfA3uCMYKColorSpaceJPG";
    String filename = "numbers_01";
    String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_a3u.pdf";
    String resultPdfPath = GetTargetDirectory() + filename + "_" + testName + "_a3u.pdf";
    try {
        OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();
        ocrPdfCreatorProperties.SetPdfLang("en-US");
        ocrPdfCreatorProperties.SetTitle("");
        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
        tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetTextPositioning
            (TextPositioning.BY_WORDS));
        // The creator must hand back the very engine it was constructed with.
        NUnit.Framework.Assert.AreEqual(tesseractReader, ocrPdfCreator.GetOcrEngine());
        ocrPdfCreator.SetOcrEngine(tesseractReader);
        PdfDocument doc = ocrPdfCreator.CreatePdfA(JavaCollectionsUtil.SingletonList<FileInfo>(new FileInfo(TEST_IMAGES_DIRECTORY
             + filename + ".jpg")), GetPdfWriter(resultPdfPath), GetCMYKPdfOutputIntent());
        NUnit.Framework.Assert.IsNotNull(doc);
        // The document has to be closed before the result file is compared.
        doc.Close();
        NUnit.Framework.Assert.IsNull(new CompareTool().CompareByContent(resultPdfPath, expectedPdfPath, GetTargetDirectory
            (), "diff_"));
        // The OCR run must not have altered the positioning mode configured above.
        NUnit.Framework.Assert.AreEqual(TextPositioning.BY_WORDS, tesseractReader.GetTesseract4OcrEngineProperties
            ().GetTextPositioning());
    }
    finally {
        // Cleanup only: switch the reader back to line-based positioning.
        tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetTextPositioning
            (TextPositioning.BY_LINES));
    }
}
/// <summary>
/// Creates one PDF from four input images with the layer names "image" and
/// "text", then verifies the page count, the number and names of the layers and
/// the text extracted from each layer.
/// </summary>
/// <remarks>
/// The page count assertion passes the number of input files as the expected
/// value and the document's page count as the actual value. The document is
/// closed in a <c>finally</c> block so it is released even when a layer
/// assertion fails; the text checks read the file after it has been closed.
/// </remarks>
public virtual void TestTextFromPdfLayersFromMultiPagePdf() {
    String testName = "testTextFromPdfLayersFromMultiPagePdf";
    String pdfPath = GetTargetDirectory() + testName + ".pdf";
    IList<FileInfo> files = JavaUtil.ArraysAsList(new FileInfo(TEST_IMAGES_DIRECTORY + "german_01.jpg"), new FileInfo
        (TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png"), new FileInfo(TEST_IMAGES_DIRECTORY + "nümbérs.jpg"), new 
        FileInfo(TEST_IMAGES_DIRECTORY + "example_04.png"));
    OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
    properties.SetImageLayerName("image");
    properties.SetTextLayerName("text");
    OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
    PdfDocument doc = ocrPdfCreator.CreatePdf(files, GetPdfWriter(pdfPath));
    NUnit.Framework.Assert.IsNotNull(doc);
    try {
        int numOfPages = doc.GetNumberOfPages();
        // Expected value first: one page per input file.
        NUnit.Framework.Assert.AreEqual(files.Count, numOfPages);
        IList<PdfLayer> layers = doc.GetCatalog().GetOCProperties(true).GetLayers();
        // Two layers (image + text) are expected for every page.
        NUnit.Framework.Assert.AreEqual(numOfPages * 2, layers.Count);
        NUnit.Framework.Assert.AreEqual("image", layers[2].GetPdfObject().Get(PdfName.Name).ToString());
        NUnit.Framework.Assert.AreEqual("text", layers[3].GetPdfObject().Get(PdfName.Name).ToString());
    }
    finally {
        doc.Close();
    }
    // Text layer should contain all text
    // Image layer shouldn't contain any text
    String expectedOutput = "619121";
    NUnit.Framework.Assert.AreEqual(expectedOutput, GetTextFromPdfLayer(pdfPath, "text", 3));
    NUnit.Framework.Assert.AreEqual("", GetTextFromPdfLayer(pdfPath, "image", 3));
}
/// <summary>
/// Creates a PDF from "multîpage.tiff" with the text layer "Text1" and magenta
/// (CMYK) text color, then re-opens the result and checks that the fill color
/// collected by the extraction strategy on page 1 equals the configured color.
/// </summary>
/// <remarks>
/// Image preprocessing is switched off for the OCR run; the value that was
/// configured before the test is remembered and reinstated in a <c>finally</c>
/// block. The re-opened document is likewise closed in a <c>finally</c> block.
/// The color assertion passes the configured color as the expected value.
/// </remarks>
public virtual void TestFontColorInMultiPagePdf() {
    String testName = "testFontColorInMultiPagePdf";
    String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff";
    String pdfPath = GetTargetDirectory() + testName + ".pdf";
    FileInfo file = new FileInfo(path);
    // Remember the current flag so it can be reinstated after the test.
    bool preprocess = tesseractReader.GetTesseract4OcrEngineProperties().IsPreprocessingImages();
    try {
        tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
            (false));
        OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();
        ocrPdfCreatorProperties.SetTextLayerName("Text1");
        Color color = DeviceCmyk.MAGENTA;
        ocrPdfCreatorProperties.SetTextColor(color);
        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
        PdfDocument doc = ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(file), GetPdfWriter(
            pdfPath));
        NUnit.Framework.Assert.IsNotNull(doc);
        // Close the written document before the file is opened again for reading.
        doc.Close();
        PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath));
        try {
            IntegrationTestHelper.ExtractionStrategy strategy = new IntegrationTestHelper.ExtractionStrategy("Text1");
            PdfCanvasProcessor processor = new PdfCanvasProcessor(strategy);
            processor.ProcessPageContent(pdfDocument.GetPage(1));
            Color fillColor = strategy.GetFillColor();
            // Expected value first: the color configured on the creator properties.
            NUnit.Framework.Assert.AreEqual(color, fillColor);
        }
        finally {
            pdfDocument.Close();
        }
    }
    finally {
        tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
            (preprocess));
    }
}
/// <summary>
/// Runs OCR on a single input image with a <c>CustomOcrEngine</c> and the given
/// creator properties, and writes the resulting PDF to the given path.
/// </summary>
/// <param name="pdfPath">path passed to <c>GetPdfWriter</c> for the output file</param>
/// <param name="inputFile">image file that is handed to the OCR creator</param>
/// <param name="properties">properties used to construct the <c>OcrPdfCreator</c></param>
/// <remarks>
/// A <c>System.IO.IOException</c> raised while writing is not rethrown; only its
/// message is sent to <c>LOGGER.Error</c>.
/// </remarks>
public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties) {
    OcrPdfCreator creator = new OcrPdfCreator(new CustomOcrEngine(), properties);
    try {
        using (PdfWriter writer = GetPdfWriter(pdfPath)) {
            PdfDocument document = creator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(inputFile), writer);
            // Closing the document is done while the writer is still open.
            document.Close();
        }
    }
    catch (System.IO.IOException ioException) {
        LOGGER.Error(ioException.Message);
    }
}
/// <summary>
/// Creates a PDF from "multîpage.tiff" with the layer names "Text Layer" and
/// "Image Layer" while image preprocessing is switched off, then verifies the
/// number and names of the layers and the text extracted from each layer.
/// </summary>
/// <remarks>
/// The preprocessing flag that was configured before the test is reinstated in a
/// <c>finally</c> block, so it is put back even when one of the assertions
/// fails. The document is closed in a <c>finally</c> block as well; the text
/// checks read the file after it has been closed.
/// </remarks>
public virtual void TestTextFromPdfLayersFromMultiPageTiff() {
    String testName = "testTextFromPdfLayersFromMultiPageTiff";
    // Remember the current flag so it can be reinstated after the test.
    bool preprocess = tesseractReader.GetTesseract4OcrEngineProperties().IsPreprocessingImages();
    String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff";
    String pdfPath = GetTargetDirectory() + testName + ".pdf";
    FileInfo file = new FileInfo(path);
    try {
        tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
            (false));
        OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
        properties.SetTextLayerName("Text Layer");
        properties.SetImageLayerName("Image Layer");
        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
        PdfDocument doc = ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(file), GetPdfWriter(
            pdfPath));
        NUnit.Framework.Assert.IsNotNull(doc);
        try {
            int numOfPages = doc.GetNumberOfPages();
            IList<PdfLayer> layers = doc.GetCatalog().GetOCProperties(true).GetLayers();
            // Two layers (image + text) are expected for every page.
            NUnit.Framework.Assert.AreEqual(numOfPages * 2, layers.Count);
            NUnit.Framework.Assert.AreEqual("Image Layer", layers[2].GetPdfObject().Get(PdfName.Name).ToString());
            NUnit.Framework.Assert.AreEqual("Text Layer", layers[3].GetPdfObject().Get(PdfName.Name).ToString());
        }
        finally {
            doc.Close();
        }
        // Text layer should contain all text
        // Image layer shouldn't contain any text
        String expectedOutput = "Multipage\nTIFF\nExample\nPage 5";
        NUnit.Framework.Assert.AreEqual(expectedOutput, GetTextFromPdfLayer(pdfPath, "Text Layer", 5));
        NUnit.Framework.Assert.AreEqual("", GetTextFromPdfLayer(pdfPath, "Image Layer", 5));
        // The OCR run must not have switched preprocessing back on.
        NUnit.Framework.Assert.IsFalse(tesseractReader.GetTesseract4OcrEngineProperties().IsPreprocessingImages());
    }
    finally {
        tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
            (preprocess));
    }
}
/// <summary>
/// Runs OCR on "scanned_spa_01.png" with the languages "spa" and "spa_old" and
/// word-based text positioning, then checks that the text extracted from the
/// produced PDF contains one of the two expected Spanish phrases.
/// </summary>
/// <remarks>
/// When <c>isExecutableReaderType</c> is set, image preprocessing is switched
/// off for the run. After the content check the test also verifies that the
/// engine still reports <c>BY_WORDS</c>. A copy of the engine properties is
/// taken before they are modified and is reinstalled in a <c>finally</c> block;
/// the <c>finally</c> block contains no assertions, so it cannot replace a
/// failure raised in the <c>try</c> body.
/// </remarks>
public virtual void TestSpanishPNG() {
    String testName = "compareSpanishPNG";
    String filename = "scanned_spa_01";
    String expectedText1 = "¿Y SI ENSAYARA COMO ACTUAR?";
    String expectedText2 = "¿Y SI ENSAYARA ACTUAR?";
    String resultPdfPath = GetTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf";
    IList<String> languages = JavaUtil.ArraysAsList("spa", "spa_old");
    // Snapshot the configuration before it is modified so it can be put back afterwards.
    Tesseract4OcrEngineProperties originalProperties = new Tesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties
        ());
    try {
        Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties();
        if (isExecutableReaderType) {
            properties.SetPreprocessingImages(false);
        }
        // locate text by words
        properties.SetTextPositioning(TextPositioning.BY_WORDS);
        properties.SetLanguages(languages);
        tesseractReader.SetTesseract4OcrEngineProperties(properties);
        OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();
        ocrPdfCreatorProperties.SetTextColor(DeviceCmyk.BLACK);
        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
        using (PdfWriter pdfWriter = GetPdfWriter(resultPdfPath)) {
            ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(new FileInfo(TEST_IMAGES_DIRECTORY + filename
                 + ".png")), pdfWriter).Close();
        }
        String result = GetTextFromPdfLayer(resultPdfPath, null, 1).Replace("\n", " ");
        NUnit.Framework.Assert.IsTrue(result.Contains(expectedText1) || result.Contains(expectedText2));
        // The OCR run must not have altered the positioning mode configured above.
        NUnit.Framework.Assert.AreEqual(TextPositioning.BY_WORDS, tesseractReader.GetTesseract4OcrEngineProperties
            ().GetTextPositioning());
    }
    finally {
        // Cleanup only: reinstall the configuration captured before the test changed it.
        tesseractReader.SetTesseract4OcrEngineProperties(originalProperties);
    }
}