Exemplo n.º 1
0
        /// <summary>
        /// Runs OCR on the Spanish sample JPG, writes the result through <c>CreatePdfA</c>
        /// with the RGB output intent, and compares it by content with the "_a3u" reference PDF.
        /// </summary>
        public virtual void ComparePdfA3uRGBSpanishJPG()
        {
            String caseName     = "comparePdfA3uRGBSpanishJPG";
            String baseName     = "spanish_01";
            String referencePdf = TEST_DOCUMENTS_DIRECTORY + baseName + "_a3u.pdf";
            String producedPdf  = GetTargetDirectory() + baseName + "_" + caseName + "_a3u.pdf";

            // Engine setup: build new properties from the reader's current ones, then switch to Spanish.
            Tesseract4OcrEngineProperties engineProps =
                new Tesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties());
            engineProps.SetPathToTessData(GetTessDataDirectory());
            engineProps.SetLanguages(JavaCollectionsUtil.SingletonList <String>("spa"));
            tesseractReader.SetTesseract4OcrEngineProperties(engineProps);

            // PDF creator setup: document language, empty title, black RGB text.
            OcrPdfCreatorProperties creatorProps = new OcrPdfCreatorProperties();
            creatorProps.SetPdfLang("en-US");
            creatorProps.SetTitle("");
            creatorProps.SetTextColor(DeviceRgb.BLACK);
            OcrPdfCreator creator = new OcrPdfCreator(tesseractReader, creatorProps);

            PdfDocument producedDoc = creator.CreatePdfA(
                JavaCollectionsUtil.SingletonList <FileInfo>(new FileInfo(TEST_IMAGES_DIRECTORY + baseName + ".jpg")),
                GetPdfWriter(producedPdf),
                GetRGBPdfOutputIntent());

            NUnit.Framework.Assert.IsNotNull(producedDoc);
            producedDoc.Close();

            // CompareByContent is expected to return null here; the assertion fails otherwise.
            String mismatch = new CompareTool().CompareByContent(producedPdf, referencePdf, GetTargetDirectory(), "diff_");
            NUnit.Framework.Assert.IsNull(mismatch);
        }
        /// <summary>
        /// Replaces the reader's engine properties with a new instance whose tess data path
        /// is the value returned by <c>GetTessDataDirectory()</c>.
        /// </summary>
        public virtual void InitTesseractProperties()
        {
            tesseractReader.SetTesseract4OcrEngineProperties(
                new Tesseract4OcrEngineProperties().SetPathToTessData(GetTessDataDirectory()));
        }
Exemplo n.º 3
0
        /// <summary>
        /// Replaces the reader's engine properties with a new instance whose tess data path
        /// is <c>sourceFolder + "../../tessdata"</c>.
        /// </summary>
        public virtual void InitTesseractProperties()
        {
            FileInfo tessDataLocation = new FileInfo(sourceFolder + "../../tessdata");

            tesseractReader.SetTesseract4OcrEngineProperties(
                new Tesseract4OcrEngineProperties().SetPathToTessData(tessDataLocation));
        }
Exemplo n.º 4
0
        /// <summary>
        /// Runs OCR on the "multilang" JPG with the languages eng, deu and spa, then compares the
        /// produced PDF by content with the reference document for the current
        /// <c>testFileTypeName</c>.
        /// </summary>
        /// <remarks>
        /// The <c>finally</c> block checks that the reader still reports the text positioning
        /// (BY_WORDS) and page segmentation mode (3) configured at the start of the test.
        /// </remarks>
        public virtual void CompareMultiLangImage()
        {
            String testName        = "compareMultiLangImage";
            String filename        = "multilang";
            String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + ".pdf";
            String resultPdfPath   = GetTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf";

            try {
                Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties();
                properties.SetTextPositioning(TextPositioning.BY_WORDS);
                properties.SetPathToTessData(GetTessDataDirectory());
                properties.SetPageSegMode(3);
                tesseractReader.SetTesseract4OcrEngineProperties(properties);
                DoOcrAndSavePdfToPath(tesseractReader, TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath,
                                      JavaUtil.ArraysAsList("eng", "deu", "spa"), DeviceCmyk.BLACK);
                // Diff artifacts go to the target directory (next to the produced PDF) rather than
                // into the directory that holds the reference documents.
                NUnit.Framework.Assert.IsNull(new CompareTool().CompareByContent(resultPdfPath, expectedPdfPath,
                                                                                 GetTargetDirectory(), "diff_"));
            }
            finally {
                NUnit.Framework.Assert.AreEqual(TextPositioning.BY_WORDS,
                                                tesseractReader.GetTesseract4OcrEngineProperties().GetTextPositioning());
                NUnit.Framework.Assert.AreEqual(3,
                                                tesseractReader.GetTesseract4OcrEngineProperties().GetPageSegMode().Value);
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Creates the library-based and the executable-based OCR engines from one shared
        /// properties object whose tess data path comes from <c>GetTessDataDirectory()</c>.
        /// </summary>
        public IntegrationTestHelper()
        {
            Tesseract4OcrEngineProperties sharedProperties = new Tesseract4OcrEngineProperties();
            sharedProperties.SetPathToTessData(GetTessDataDirectory());

            tesseractLibReader = new Tesseract4LibOcrEngine(sharedProperties);
            tesseractExecutableReader =
                new Tesseract4ExecutableOcrEngine(GetTesseractDirectory(), sharedProperties);
        }
 /// <summary>
 /// Points the reader at "." as its tess data location and expects the OCR call to throw a
 /// <c>Tesseract4OcrException</c> whose message is INCORRECT_LANGUAGE formatted with
 /// "eng.traineddata" and the full name of ".".
 /// </summary>
 public virtual void TestEmptyPathToTessData()
 {
     String expectedMessage = MessageFormatUtil.Format(Tesseract4OcrException.INCORRECT_LANGUAGE,
                                                       "eng.traineddata", new FileInfo(".").FullName);

     NUnit.Framework.Assert.That(() => {
         FileInfo image = new FileInfo(TEST_IMAGES_DIRECTORY + "spanish_01.jpg");

         Tesseract4OcrEngineProperties current    = tesseractReader.GetTesseract4OcrEngineProperties();
         Tesseract4OcrEngineProperties redirected = current.SetPathToTessData(new FileInfo("."));
         tesseractReader.SetTesseract4OcrEngineProperties(redirected);

         GetTextFromPdf(tesseractReader, image);

         FileInfo configuredPath = tesseractReader.GetTesseract4OcrEngineProperties().GetPathToTessData();
         NUnit.Framework.Assert.AreEqual(new FileInfo("").FullName, configuredPath.FullName);
     }, NUnit.Framework.Throws.InstanceOf <Tesseract4OcrException>().With.Message.EqualTo(expectedMessage));
 }
Exemplo n.º 7
0
        /// <summary>
        /// Runs OCR on the image at <paramref name="imgPath"/> and writes the resulting PDF
        /// document to <paramref name="pdfPath"/>. Used as the producer step for compare-tool tests.
        /// </summary>
        /// <param name="tesseractReader">OCR engine to use; its properties are updated when languages are given.</param>
        /// <param name="imgPath">Path of the input image.</param>
        /// <param name="pdfPath">Path the PDF is written to.</param>
        /// <param name="languages">Languages to set on the engine, or <c>null</c> to leave them untouched.</param>
        /// <param name="fonts">Font paths to register with a font provider; ignored when <c>null</c> or empty.</param>
        /// <param name="color">Text color for the PDF, or <c>null</c> to keep the creator's default.</param>
        /// <remarks>
        /// An <c>IOException</c> raised while writing is logged by message and not rethrown.
        /// </remarks>
        protected internal virtual void DoOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String
                                                              imgPath, String pdfPath, IList <String> languages, IList <String> fonts, Color color)
        {
            bool languagesGiven = languages != null;

            if (languagesGiven)
            {
                Tesseract4OcrEngineProperties engineProps = tesseractReader.GetTesseract4OcrEngineProperties();
                engineProps.SetLanguages(languages);
                tesseractReader.SetTesseract4OcrEngineProperties(engineProps);
            }

            OcrPdfCreatorProperties creatorProps = new OcrPdfCreatorProperties();
            creatorProps.SetPdfLang("en-US");
            creatorProps.SetTitle("");

            bool fontsGiven = fonts != null && fonts.Count > 0;
            if (fontsGiven)
            {
                FontProvider provider = new FontProvider();
                foreach (String path in fonts)
                {
                    // The font name registered for each path comes from the shared lookup map.
                    provider.GetFontSet().AddFont(path, PdfEncodings.IDENTITY_H, FONT_PATH_TO_FONT_NAME_MAP.Get(path));
                }
                creatorProps.SetFontProvider(provider);
            }

            if (color != null)
            {
                creatorProps.SetTextColor(color);
            }

            if (languagesGiven)
            {
                // The engine must report as many languages as were passed in.
                int configuredCount = tesseractReader.GetTesseract4OcrEngineProperties().GetLanguages().Count;
                NUnit.Framework.Assert.AreEqual(languages.Count, configuredCount);
            }

            OcrPdfCreator creator = new OcrPdfCreator(tesseractReader, creatorProps);

            try {
                using (PdfWriter writer = GetPdfWriter(pdfPath)) {
                    PdfDocument produced = creator.CreatePdf(
                        JavaCollectionsUtil.SingletonList <FileInfo>(new FileInfo(imgPath)), writer);
                    NUnit.Framework.Assert.IsNotNull(produced);
                    produced.Close();
                }
            }
            catch (System.IO.IOException e) {
                LOGGER.Error(e.Message);
            }
        }
Exemplo n.º 8
0
 /// <summary>
 /// Runs OCR on the image at <paramref name="imgPath"/> and stores the recognized text
 /// in the file at <paramref name="txtPath"/>.
 /// </summary>
 /// <param name="tesseractReader">OCR engine to use; its properties are updated when languages are given.</param>
 /// <param name="imgPath">Path of the input image.</param>
 /// <param name="txtPath">Path of the text file to create.</param>
 /// <param name="languages">Languages to set on the engine, or <c>null</c> to leave them untouched.</param>
 protected internal virtual void DoOcrAndSaveToTextFile(AbstractTesseract4OcrEngine tesseractReader, String
                                                        imgPath, String txtPath, IList <String> languages)
 {
     bool languagesGiven = languages != null;

     if (languagesGiven)
     {
         Tesseract4OcrEngineProperties engineProps = tesseractReader.GetTesseract4OcrEngineProperties();
         engineProps.SetLanguages(languages);
         tesseractReader.SetTesseract4OcrEngineProperties(engineProps);
     }

     tesseractReader.CreateTxtFile(
         JavaCollectionsUtil.SingletonList <FileInfo>(new FileInfo(imgPath)),
         new FileInfo(txtPath));

     if (languagesGiven)
     {
         // After OCR the engine must still report as many languages as were passed in.
         int configuredCount = tesseractReader.GetTesseract4OcrEngineProperties().GetLanguages().Count;
         NUnit.Framework.Assert.AreEqual(languages.Count, configuredCount);
     }
 }
        /// <summary>
        /// Runs OCR on the Thai invoice JPG (languages tha and eng, Noto Sans Thai / Noto Sans fonts,
        /// red text) and accepts the produced PDF if it matches either the Java or the .NET
        /// reference document.
        /// </summary>
        public virtual void CompareInvoiceFrontThaiImage()
        {
            String testName = "compareInvoiceFrontThaiImage";
            String filename = "invoice_front_thai";
            //Tesseract for Java and Tesseract for .NET give different output
            //So we cannot use one reference pdf file for them
            String expectedPdfPathJava               = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_java.pdf";
            String expectedPdfPathDotNet             = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_dotnet.pdf";
            String resultPdfPath                     = GetTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf";
            Tesseract4OcrEngineProperties properties = tesseractReader.GetTesseract4OcrEngineProperties();

            properties.SetTextPositioning(TextPositioning.BY_WORDS_AND_LINES);
            properties.SetPathToTessData(GetTessDataDirectory());
            properties.SetLanguages(JavaUtil.ArraysAsList("tha", "eng"));
            tesseractReader.SetTesseract4OcrEngineProperties(properties);
            DoOcrAndSavePdfToPath(tesseractReader, TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath,
                                  JavaUtil.ArraysAsList("tha", "eng"),
                                  JavaUtil.ArraysAsList(NOTO_SANS_THAI_FONT_PATH, NOTO_SANS_FONT_PATH), DeviceRgb.RED);

            // Both comparisons always run. Diff artifacts go to the target directory (next to the
            // produced PDF) rather than into the directory that holds the reference documents.
            bool javaTest = new CompareTool().CompareByContent(resultPdfPath, expectedPdfPathJava,
                                                               GetTargetDirectory(), "diff_") == null;
            bool dotNetTest = new CompareTool().CompareByContent(resultPdfPath, expectedPdfPathDotNet,
                                                                 GetTargetDirectory(), "diff_") == null;

            NUnit.Framework.Assert.IsTrue(javaTest || dotNetTest);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Runs OCR on the scanned Spanish PNG with the "spa" and "spa_old" languages, then checks
        /// that the text extracted from page 1 of the produced PDF contains one of two accepted phrases.
        /// </summary>
        /// <remarks>
        /// Image preprocessing is switched off when <c>isExecutableReaderType</c> is set. The
        /// <c>finally</c> block checks that the reader still reports BY_WORDS text positioning.
        /// </remarks>
        public virtual void TestSpanishPNG()
        {
            String caseName      = "compareSpanishPNG";
            String baseName      = "scanned_spa_01";
            String phraseWithComo    = "¿Y SI ENSAYARA COMO ACTUAR?";
            String phraseWithoutComo = "¿Y SI ENSAYARA ACTUAR?";
            String producedPdf   = GetTargetDirectory() + baseName + "_" + caseName + "_" + testFileTypeName + ".pdf";
            IList <String> ocrLanguages = JavaUtil.ArraysAsList("spa", "spa_old");

            Tesseract4OcrEngineProperties engineProps = tesseractReader.GetTesseract4OcrEngineProperties();
            if (isExecutableReaderType)
            {
                engineProps.SetPreprocessingImages(false);
            }
            // Text is positioned word by word.
            engineProps.SetTextPositioning(TextPositioning.BY_WORDS);
            engineProps.SetLanguages(ocrLanguages);
            tesseractReader.SetTesseract4OcrEngineProperties(engineProps);

            OcrPdfCreatorProperties creatorProps = new OcrPdfCreatorProperties();
            creatorProps.SetTextColor(DeviceCmyk.BLACK);
            OcrPdfCreator creator = new OcrPdfCreator(tesseractReader, creatorProps);

            using (PdfWriter writer = GetPdfWriter(producedPdf)) {
                PdfDocument produced = creator.CreatePdf(
                    JavaCollectionsUtil.SingletonList <FileInfo>(new FileInfo(TEST_IMAGES_DIRECTORY + baseName + ".png")),
                    writer);
                produced.Close();
            }

            try {
                // Line breaks are replaced with spaces so each phrase can be matched as one line.
                String extracted = GetTextFromPdfLayer(producedPdf, null, 1).Replace("\n", " ");
                bool phraseFound = extracted.Contains(phraseWithComo) || extracted.Contains(phraseWithoutComo);
                NUnit.Framework.Assert.IsTrue(phraseFound);
            }
            finally {
                NUnit.Framework.Assert.AreEqual(TextPositioning.BY_WORDS,
                                                tesseractReader.GetTesseract4OcrEngineProperties().GetTextPositioning());
            }
        }