// Runs tesseractReader.DoTesseractOcr on "halftone.jpg" with OutputFormat.HOCR, then parses
// the produced .hocr file twice: once with TextPositioning.BY_WORDS and once with
// TextPositioning.BY_LINES. Every expected fragment must be found under key 1 of the parsed
// page data (presumably the first page; confirm against FindTextInPageData).
public virtual void HocrOutputFromHalftoneFile() {
    FileInfo sourceImage = new FileInfo(TEST_IMAGES_DIRECTORY + "halftone.jpg");
    FileInfo hocrFile = new FileInfo(GetTargetDirectory() + "hocrOutputFromHalftoneFile.hocr");

    // Fragments searched for in the word-positioned parse, in assertion order.
    String[] expectedWords = new String[] {
        "Silliness",
        "Enablers",
        "You",
        "Middle",
        "André",
        "QUANTITY",
        "DESCRIPTION"
    };
    // Fragments searched for in the line-positioned parse, in assertion order.
    String[] expectedLines = new String[] {
        "Silliness Enablers",
        "QUANTITY DESCRIPTION UNIT PRICE TOTAL"
    };

    tesseractReader.DoTesseractOcr(sourceImage, hocrFile, OutputFormat.HOCR);

    IDictionary<int, IList<TextInfo>> wordData = TesseractHelper.ParseHocrFile(
        JavaCollectionsUtil.SingletonList<FileInfo>(hocrFile), TextPositioning.BY_WORDS);
    foreach (String word in expectedWords) {
        NUnit.Framework.Assert.IsTrue(FindTextInPageData(wordData, 1, word));
    }

    // The same output file is parsed again, this time positioned by lines.
    IDictionary<int, IList<TextInfo>> lineData = TesseractHelper.ParseHocrFile(
        JavaCollectionsUtil.SingletonList<FileInfo>(hocrFile), TextPositioning.BY_LINES);
    foreach (String line in expectedLines) {
        NUnit.Framework.Assert.IsTrue(FindTextInPageData(lineData, 1, line));
    }
}
// Runs tesseractReader.DoTesseractOcr on "numbers_01.jpg" with OutputFormat.HOCR, parses the
// produced .hocr file using the text positioning configured in the reader's engine
// properties, and asserts that the trimmed text obtained for key 1 of the parsed page data
// equals "619121".
public virtual void TestTesseract4OcrForOnePageWithHocrFormat() {
    FileInfo sourceImage = new FileInfo(TEST_IMAGES_DIRECTORY + "numbers_01.jpg");
    FileInfo hocrFile = new FileInfo(GetTargetDirectory() + "testTesseract4OcrForOnePage.hocr");

    tesseractReader.DoTesseractOcr(sourceImage, hocrFile, OutputFormat.HOCR);

    // Build the parser arguments in the same order the original nested call evaluated them.
    var hocrFiles = JavaCollectionsUtil.SingletonList<FileInfo>(hocrFile);
    var positioning = tesseractReader.GetTesseract4OcrEngineProperties().GetTextPositioning();
    IDictionary<int, IList<TextInfo>> pageData = TesseractHelper.ParseHocrFile(hocrFiles, positioning);

    // Surrounding whitespace is stripped before the comparison.
    String recognizedText = GetTextFromPage(pageData.Get(1)).Trim();
    NUnit.Framework.Assert.AreEqual("619121", recognizedText);
}