/// <summary>
/// Runs OCR on a single page image, appends the page's hOCR markup to a
/// per-page ".xhtml" file located next to the source image, and writes the
/// mean confidence, the recognised text and a block-by-block walk of the
/// layout to the console.
/// </summary>
/// <param name="engine">Tesseract engine used to process <paramref name="img"/>.</param>
/// <param name="img">Image of the page to recognise.</param>
/// <param name="testImagePath">
/// Path of the source image. Only its directory and its file name without
/// extension are used, to build the hOCR output path
/// "&lt;directory&gt;/&lt;name&gt;_p&lt;pageNum&gt;.xhtml".
/// </param>
/// <param name="pageNum">Page number; used only in the output file name.</param>
public static void processImage(Tesseract.TesseractEngine engine, Tesseract.Pix img, string testImagePath, int pageNum)
{
    // Path.Combine picks the platform's separator and avoids a doubled
    // separator when the directory is a drive root.
    System.IO.FileInfo mfile = new System.IO.FileInfo(testImagePath);
    string hocrFileName = System.IO.Path.GetFileNameWithoutExtension(testImagePath) + "_p" + pageNum.ToString() + ".xhtml";
    string hocrPath = System.IO.Path.Combine(mfile.DirectoryName, hocrFileName);

    using (var page = engine.Process(img, PageSegMode.AutoOsd))
    {
        // The layout iterator returned by AnalyseLayout is not consumed here;
        // dispose it immediately instead of leaving it for the finalizer.
        using (page.AnalyseLayout())
        {
        }

        // Appends (does not overwrite): repeated runs for the same page
        // accumulate in the same file.
        string hocr = page.GetHOCRText(0, true);
        System.IO.File.AppendAllText(hocrPath, hocr);

        var text = page.GetText();
        Console.WriteLine("Mean confidence: {0}", page.GetMeanConfidence());
        Console.WriteLine("Text (GetText): \r\n{0}", text);
        Console.WriteLine("Text (iterator):");

        using (var iter = page.GetIterator())
        {
            iter.Begin();

            // Nested walk: block -> paragraph -> text line -> word.
            // Each inner loop advances one level until the end of its parent.
            do
            {
                do
                {
                    do
                    {
                        do
                        {
                            if (iter.IsAtBeginningOf(PageIteratorLevel.Block))
                            {
                                Console.WriteLine("<BLOCK>");
                                Console.WriteLine(iter.BlockType.ToString());

                                // Only print coordinates when the iterator could
                                // actually provide a bounding box for the block.
                                Rect currentBlock;
                                if (iter.TryGetBoundingBox(PageIteratorLevel.Block, out currentBlock))
                                {
                                    Console.WriteLine(
                                        "({0},{1}) ({2},{3})",
                                        currentBlock.X1,
                                        currentBlock.Y1,
                                        currentBlock.X2,
                                        currentBlock.Y2);
                                }
                                else
                                {
                                    Console.WriteLine("(bounding box unavailable)");
                                }

                                Console.WriteLine("");
                            }

                            Console.Write(iter.GetText(PageIteratorLevel.Word));
                            Console.Write(" ");

                            // Last word of the line: terminate the line with the block type.
                            if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                            {
                                Console.WriteLine(iter.BlockType.ToString());
                            }
                        }
                        while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                        // Last line of the paragraph: blank line as paragraph separator.
                        if (iter.IsAtFinalOf(PageIteratorLevel.Para, PageIteratorLevel.TextLine))
                        {
                            Console.WriteLine();
                        }
                    }
                    while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                }
                while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
            }
            while (iter.Next(PageIteratorLevel.Block));
        }
    }
}
/// <summary>
/// Performs OCR on an image with Tesseract and returns the result as a PDF.
/// Every frame of the image in the page dimension is recognised and added as
/// one page. Tesseract renders to a temporary PDF file, which is then loaded
/// and re-saved through <c>DocumentCore</c> to produce the returned bytes.
/// </summary>
/// <param name="image">Encoded image bytes, loaded via <c>System.Drawing.Image.FromStream</c>.</param>
/// <returns>The bytes of the resulting PDF document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="image"/> is null.</exception>
/// <exception cref="Exception">
/// Any failure during OCR or PDF conversion, rethrown with the message prefix
/// "Error Tesseract: " and the original exception as <c>InnerException</c>.
/// </exception>
public static byte[] PerformOCRTesseract(byte[] image)
{
    // Reject null up front so it is not reported as a misleading Tesseract failure.
    if (image == null)
    {
        throw new ArgumentNullException("image");
    }

    // Specify that Tesseract uses three languages: Russian, English and Vietnamese.
    string tesseractLanguages = "rus+eng+vie";

    // A path to a folder which contains language data files and the font file "pdf.ttf".
    // Language data files can be found here:
    // Good and fast: https://github.com/tesseract-ocr/tessdata_fast
    // or
    // Best and slow: https://github.com/tesseract-ocr/tessdata_best
    // This folder must also have write permissions (the temporary PDF is created in it).
    string tesseractData = Path.GetFullPath(@"..\..\tessdata\");

    // A path for a temporary PDF file (because Tesseract returns the OCR result as a PDF document).
    // The renderer is given the name without extension; the file on disk is "<tempFile>.pdf".
    string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());
    string tempPdf = tempFile + @".pdf";

    try
    {
        // The renderer must be disposed before the PDF is read back below,
        // so all OCR work lives inside this using chain.
        using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, true))
        using (renderer.BeginDocument("Serachablepdf"))
        using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
        using (MemoryStream msImg = new MemoryStream(image))
        using (System.Drawing.Image imgWithText = System.Drawing.Image.FromStream(msImg))
        {
            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

            int frameCount = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page);
            for (int i = 0; i < frameCount; i++)
            {
                imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, i);

                // Re-encode the active frame as PNG so Tesseract can load it from memory.
                using (MemoryStream ms = new MemoryStream())
                {
                    imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
                    byte[] imgBytes = ms.ToArray();

                    using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                    using (var page = engine.Process(img, "Serachablepdf"))
                    {
                        renderer.AddPage(page);
                    }
                }
            }
        }

        PdfLoadOptions pl = new PdfLoadOptions();
        pl.ShowInvisibleText = true;
        pl.PreserveEmbeddedFonts = false;
        pl.ConversionMode = PdfConversionMode.Continuous;

        // Keep the file stream open for both Load and Save, then close it
        // deterministically so the temporary file can be deleted in finally.
        using (FileStream pdfStream = File.OpenRead(tempPdf))
        using (MemoryStream ms = new MemoryStream())
        {
            DocumentCore dc = DocumentCore.Load(pdfStream, pl);
            PdfSaveOptions ps = new PdfSaveOptions();
            dc.Save(ms, ps);
            return ms.ToArray();
        }
    }
    catch (Exception e)
    {
        Console.WriteLine();
        Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
        Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");

        // ReadKey throws when stdin is redirected, which would hide the real error.
        if (!Console.IsInputRedirected)
        {
            Console.ReadKey();
        }

        // Preserve the original exception (and its stack trace) as InnerException.
        throw new Exception("Error Tesseract: " + e.Message, e);
    }
    finally
    {
        if (File.Exists(tempPdf))
        {
            File.Delete(tempPdf);
        }
    }
}