Exemplo n.º 1
0
        /// <summary>
        /// Runs OCR on the "spanish_01" JPG with the "spa" language, writes the result through
        /// <c>CreatePdfA</c> with the RGB output intent and compares the produced file by content
        /// with the stored "_a3u" reference PDF.
        /// </summary>
        public virtual void ComparePdfA3uRGBSpanishJPG()
        {
            string caseName      = "comparePdfA3uRGBSpanishJPG";
            string baseName      = "spanish_01";
            string referencePath = TEST_DOCUMENTS_DIRECTORY + baseName + "_a3u.pdf";
            string outputPath    = GetTargetDirectory() + baseName + "_" + caseName + "_a3u.pdf";

            // Build the engine properties from the reader's current ones, then point them
            // at the tess data directory and switch the language to Spanish.
            Tesseract4OcrEngineProperties engineProperties =
                new Tesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties());
            engineProperties.SetPathToTessData(GetTessDataDirectory());
            engineProperties.SetLanguages(JavaCollectionsUtil.SingletonList<String>("spa"));
            tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);

            // PDF-level settings: document language, empty title and black text.
            OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
            creatorProperties.SetPdfLang("en-US");
            creatorProperties.SetTitle("");
            creatorProperties.SetTextColor(DeviceRgb.BLACK);

            OcrPdfCreator creator = new OcrPdfCreator(tesseractReader, creatorProperties);
            FileInfo inputImage   = new FileInfo(TEST_IMAGES_DIRECTORY + baseName + ".jpg");
            var inputs            = JavaCollectionsUtil.SingletonList<FileInfo>(inputImage);
            PdfDocument document  = creator.CreatePdfA(inputs, GetPdfWriter(outputPath), GetRGBPdfOutputIntent());

            NUnit.Framework.Assert.IsNotNull(document);
            document.Close();

            // CompareByContent result is asserted to be null, i.e. no difference message.
            string comparison = new CompareTool().CompareByContent(outputPath, referencePath, GetTargetDirectory(), "diff_");
            NUnit.Framework.Assert.IsNull(comparison);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs OCR on the "numbers_01" JPG with text positioned by words, writes the result through
        /// <c>CreatePdfA</c> with the CMYK output intent and compares the produced file by content
        /// with the stored "_a3u" reference PDF.
        /// </summary>
        /// <remarks>
        /// The shared reader is switched to <c>TextPositioning.BY_WORDS</c> for the duration of the
        /// test and is always switched back to <c>TextPositioning.BY_LINES</c> afterwards, even when
        /// one of the assertions fails.
        /// </remarks>
        public virtual void ComparePdfA3uCMYKColorSpaceJPG()
        {
            String testName        = "comparePdfA3uCMYKColorSpaceJPG";
            String filename        = "numbers_01";
            String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_a3u.pdf";
            String resultPdfPath   = GetTargetDirectory() + filename + "_" + testName + "_a3u.pdf";

            try {
                OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();
                ocrPdfCreatorProperties.SetPdfLang("en-US");
                ocrPdfCreatorProperties.SetTitle("");
                OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
                tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetTextPositioning
                                                                     (TextPositioning.BY_WORDS));
                // The creator must expose the engine it was constructed with; setting it again
                // exercises the setter with the same engine.
                NUnit.Framework.Assert.AreEqual(tesseractReader, ocrPdfCreator.GetOcrEngine());
                ocrPdfCreator.SetOcrEngine(tesseractReader);
                PdfDocument doc = ocrPdfCreator.CreatePdfA(JavaCollectionsUtil.SingletonList <FileInfo>(new FileInfo(TEST_IMAGES_DIRECTORY
                                                                                                                     + filename + ".jpg")), GetPdfWriter(resultPdfPath), GetCMYKPdfOutputIntent());
                NUnit.Framework.Assert.IsNotNull(doc);
                doc.Close();
                NUnit.Framework.Assert.IsNull(new CompareTool().CompareByContent(resultPdfPath, expectedPdfPath, GetTargetDirectory
                                                                                     (), "diff_"));
            }
            finally {
                try {
                    NUnit.Framework.Assert.AreEqual(TextPositioning.BY_WORDS, tesseractReader.GetTesseract4OcrEngineProperties
                                                        ().GetTextPositioning());
                }
                finally {
                    // Restore in a nested finally: a failing assertion above must not leave the
                    // shared reader in BY_WORDS mode for whatever runs next.
                    tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetTextPositioning
                                                                         (TextPositioning.BY_LINES));
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Creates one PDF from four input images with named image and text layers, then checks the
        /// page count, the number and names of the layers, and the text extracted from each layer
        /// on page 3.
        /// </summary>
        public virtual void TestTextFromPdfLayersFromMultiPagePdf()
        {
            String           testName = "testTextFromPdfLayersFromMultiPagePdf";
            String           pdfPath  = GetTargetDirectory() + testName + ".pdf";
            IList <FileInfo> files    = JavaUtil.ArraysAsList(new FileInfo(TEST_IMAGES_DIRECTORY + "german_01.jpg"), new FileInfo
                                                                  (TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png"), new FileInfo(TEST_IMAGES_DIRECTORY + "nümbérs.jpg"), new
                                                              FileInfo(TEST_IMAGES_DIRECTORY + "example_04.png"));
            OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();

            properties.SetImageLayerName("image");
            properties.SetTextLayerName("text");
            OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
            PdfDocument   doc           = ocrPdfCreator.CreatePdf(files, GetPdfWriter(pdfPath));

            NUnit.Framework.Assert.IsNotNull(doc);
            try {
                int numOfPages = doc.GetNumberOfPages();

                // Expected value goes first so a failure message reports the values the right way round.
                NUnit.Framework.Assert.AreEqual(files.Count, numOfPages);
                IList <PdfLayer> layers = doc.GetCatalog().GetOCProperties(true).GetLayers();

                // Two layers (image + text) are expected per page; indices 2 and 3 are the second pair.
                NUnit.Framework.Assert.AreEqual(numOfPages * 2, layers.Count);
                NUnit.Framework.Assert.AreEqual("image", layers[2].GetPdfObject().Get(PdfName.Name).ToString());
                NUnit.Framework.Assert.AreEqual("text", layers[3].GetPdfObject().Get(PdfName.Name).ToString());
            }
            finally {
                // Close even when an assertion fails so the document is not left open.
                doc.Close();
            }
            // Text layer should contain all text
            // Image layer shouldn't contain any text
            String expectedOutput = "619121";

            NUnit.Framework.Assert.AreEqual(expectedOutput, GetTextFromPdfLayer(pdfPath, "text", 3));
            NUnit.Framework.Assert.AreEqual("", GetTextFromPdfLayer(pdfPath, "image", 3));
        }
        /// <summary>
        /// Creates a PDF from the multi-page TIFF with a magenta CMYK text colour and a text layer
        /// named "Text1", then re-opens the file and checks that the fill colour reported by the
        /// extraction strategy for page 1 equals the configured colour.
        /// </summary>
        /// <remarks>
        /// Image preprocessing is switched off on the shared reader before OCR runs.
        /// </remarks>
        public virtual void TestFontColorInMultiPagePdf()
        {
            String   testName = "testFontColorInMultiPagePdf";
            String   path     = TEST_IMAGES_DIRECTORY + "multîpage.tiff";
            String   pdfPath  = GetTargetDirectory() + testName + ".pdf";
            FileInfo file     = new FileInfo(path);

            tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
                                                                 (false));
            OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();

            ocrPdfCreatorProperties.SetTextLayerName("Text1");
            Color color = DeviceCmyk.MAGENTA;

            ocrPdfCreatorProperties.SetTextColor(color);
            OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
            PdfDocument   doc           = ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList <FileInfo>(file), GetPdfWriter(
                                                                      pdfPath));

            NUnit.Framework.Assert.IsNotNull(doc);
            doc.Close();
            PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath));

            try {
                IntegrationTestHelper.ExtractionStrategy strategy = new IntegrationTestHelper.ExtractionStrategy("Text1");
                PdfCanvasProcessor processor = new PdfCanvasProcessor(strategy);

                processor.ProcessPageContent(pdfDocument.GetPage(1));
                Color fillColor = strategy.GetFillColor();

                // Expected value goes first so a failure message reports the values the right way round.
                NUnit.Framework.Assert.AreEqual(color, fillColor);
            }
            finally {
                // Close the re-opened document even when processing or the assertion fails.
                pdfDocument.Close();
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Runs OCR on a single input image with a new <c>CustomOcrEngine</c> and the supplied
        /// properties, and writes the resulting PDF to the given path.
        /// </summary>
        /// <remarks>
        /// A <see cref="System.IO.IOException"/> raised inside the method is not rethrown; only its
        /// message is logged.
        /// </remarks>
        /// <param name="pdfPath">path of the PDF file to write</param>
        /// <param name="inputFile">image file to run OCR on</param>
        /// <param name="properties">properties passed to the <c>OcrPdfCreator</c></param>
        public static void CreatePdf(String pdfPath, FileInfo inputFile, OcrPdfCreatorProperties properties)
        {
            OcrPdfCreator creator = new OcrPdfCreator(new CustomOcrEngine(), properties);

            try
            {
                using (PdfWriter writer = GetPdfWriter(pdfPath))
                {
                    var singleInput = JavaCollectionsUtil.SingletonList<FileInfo>(inputFile);
                    var document    = creator.CreatePdf(singleInput, writer);

                    document.Close();
                }
            }
            catch (System.IO.IOException ioFailure)
            {
                LOGGER.Error(ioFailure.Message);
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Creates a PDF from the multi-page TIFF with named text and image layers and image
        /// preprocessing switched off, then checks the number and names of the layers and the text
        /// extracted from each layer on page 5.
        /// </summary>
        /// <remarks>
        /// The reader's preprocessing flag is read before the test changes it and is written back
        /// afterwards, even when one of the assertions fails.
        /// </remarks>
        public virtual void TestTextFromPdfLayersFromMultiPageTiff()
        {
            String   testName   = "testTextFromPdfLayersFromMultiPageTiff";
            bool     preprocess = tesseractReader.GetTesseract4OcrEngineProperties().IsPreprocessingImages();
            String   path       = TEST_IMAGES_DIRECTORY + "multîpage.tiff";
            String   pdfPath    = GetTargetDirectory() + testName + ".pdf";
            FileInfo file       = new FileInfo(path);

            tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
                                                                 (false));
            try {
                OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();

                properties.SetTextLayerName("Text Layer");
                properties.SetImageLayerName("Image Layer");
                OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties);
                PdfDocument   doc           = ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList <FileInfo>(file), GetPdfWriter(
                                                                          pdfPath));

                NUnit.Framework.Assert.IsNotNull(doc);
                try {
                    int numOfPages          = doc.GetNumberOfPages();
                    IList <PdfLayer> layers = doc.GetCatalog().GetOCProperties(true).GetLayers();

                    // Two layers (image + text) are expected per page; indices 2 and 3 are the second pair.
                    NUnit.Framework.Assert.AreEqual(numOfPages * 2, layers.Count);
                    NUnit.Framework.Assert.AreEqual("Image Layer", layers[2].GetPdfObject().Get(PdfName.Name).ToString());
                    NUnit.Framework.Assert.AreEqual("Text Layer", layers[3].GetPdfObject().Get(PdfName.Name).ToString());
                }
                finally {
                    // Close even when an assertion fails so the document is not left open.
                    doc.Close();
                }
                // Text layer should contain all text
                // Image layer shouldn't contain any text
                String expectedOutput = "Multipage\nTIFF\nExample\nPage 5";

                NUnit.Framework.Assert.AreEqual(expectedOutput, GetTextFromPdfLayer(pdfPath, "Text Layer", 5));
                NUnit.Framework.Assert.AreEqual("", GetTextFromPdfLayer(pdfPath, "Image Layer", 5));
                NUnit.Framework.Assert.IsFalse(tesseractReader.GetTesseract4OcrEngineProperties().IsPreprocessingImages());
            }
            finally {
                // Write the saved flag back in finally so a failing assertion cannot leave the
                // shared reader with preprocessing disabled.
                tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
                                                                     (preprocess));
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Runs OCR on the scanned Spanish PNG with the "spa" and "spa_old" languages and text
        /// positioned by words, writes the result to a PDF and checks that the extracted text of
        /// page 1 contains one of the two accepted phrases.
        /// </summary>
        /// <remarks>
        /// Whether or not the text check passes, the reader is finally asserted to still report
        /// <c>TextPositioning.BY_WORDS</c>.
        /// </remarks>
        public virtual void TestSpanishPNG()
        {
            string caseName         = "compareSpanishPNG";
            string baseName         = "scanned_spa_01";
            string acceptedPhrase   = "¿Y SI ENSAYARA COMO ACTUAR?";
            string acceptedFallback = "¿Y SI ENSAYARA ACTUAR?";
            string outputPath       = GetTargetDirectory() + baseName + "_" + caseName + "_" + testFileTypeName + ".pdf";
            IList<String> ocrLanguages = JavaUtil.ArraysAsList("spa", "spa_old");

            // The reader's own properties object is reconfigured and then set back on the reader.
            Tesseract4OcrEngineProperties engineProperties = tesseractReader.GetTesseract4OcrEngineProperties();
            if (isExecutableReaderType)
            {
                engineProperties.SetPreprocessingImages(false);
            }
            // locate text by words
            engineProperties.SetTextPositioning(TextPositioning.BY_WORDS);
            engineProperties.SetLanguages(ocrLanguages);
            tesseractReader.SetTesseract4OcrEngineProperties(engineProperties);

            OcrPdfCreatorProperties creatorProperties = new OcrPdfCreatorProperties();
            creatorProperties.SetTextColor(DeviceCmyk.BLACK);
            OcrPdfCreator creator = new OcrPdfCreator(tesseractReader, creatorProperties);

            using (PdfWriter writer = GetPdfWriter(outputPath))
            {
                FileInfo inputImage = new FileInfo(TEST_IMAGES_DIRECTORY + baseName + ".png");
                var document        = creator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(inputImage), writer);

                document.Close();
            }

            try
            {
                // Line breaks are flattened to spaces before searching for the phrases.
                string extracted = GetTextFromPdfLayer(outputPath, null, 1).Replace("\n", " ");
                bool   matched   = extracted.Contains(acceptedPhrase) || extracted.Contains(acceptedFallback);

                NUnit.Framework.Assert.IsTrue(matched);
            }
            finally
            {
                TextPositioning currentPositioning = tesseractReader.GetTesseract4OcrEngineProperties().GetTextPositioning();

                NUnit.Framework.Assert.AreEqual(TextPositioning.BY_WORDS, currentPositioning);
            }
        }