/// <summary>
/// Runs Tesseract OCR over a single image file (one page) and returns the recognised
/// text together with the mean confidence reported for that page.
/// </summary>
/// <param name="PathToPngFile">Path of the image file passed to <c>Pix.LoadFromFile</c>.</param>
/// <param name="Debug">When true, the mean confidence is also written to the console.</param>
/// <returns>
/// A <c>TesseractOutput</c> built from the mean confidence and the page text, or
/// <c>null</c> when any exception is raised while loading or processing the image
/// (the exception is written to the log instead of being rethrown).
/// </returns>
public TesseractOutput OCRImageFile(String PathToPngFile, Boolean Debug)
{
    try
    {
        w.WriteLine("DEBUG - Tesseract Engine Loading...");
        using (var engine = new TesseractEngine(this.TessDataPath, this.TessLanguage, EngineMode.Default))
        {
            w.WriteLine("DEBUG - Tesseract Image Loading...");
            using (var img = Pix.LoadFromFile(PathToPngFile))
            {
                w.WriteLine("DEBUG - Tesseract Page Loading...");
                using (var page = engine.Process(img))
                {
                    w.WriteLine("DEBUG - Tesseract Get Page Content...");
                    var text = page.GetText();

                    // Read the confidence once and reuse it for the console, the log and the result.
                    var meanConfidence = page.GetMeanConfidence();

                    if (Debug)
                    {
                        Console.WriteLine("\nDEBUG: Tesseract Mean Confidence: {0}", meanConfidence);
                    }
                    w.WriteLine("DEBUG - Tesseract Confidence: " + meanConfidence);

                    return new TesseractOutput(meanConfidence, text);
                }
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: callers treat a null result as "page could not be OCRed".
        w.WriteLine("DEBUG - Tesseract Error: " + e.Message);
        w.WriteLine("DEBUG - Tesseract Error Details: " + e.ToString());
        return null;
    }
    finally
    {
        // Flush on both the success and the failure path.
        this.w.Flush();
    }
}
/// <summary>
/// OCRs a PDF page by page and returns the recognised text keyed by page number.
/// </summary>
/// <remarks>
/// Work is done in a temporary folder named RootPath + guid, which also receives a
/// <c>run.log</c> file. The PDF is split with <c>PdfSharpUtils.SplitAllPDFPages</c>, every
/// resulting <c>*.pdf</c> is converted to <c>.png</c> with <c>ImageMagickUtils.ConvertPDFToPng</c>,
/// and every <c>*.png</c> is passed to <c>TesseractUtils.OCRImageFile</c>.
/// The page number is the part of the PNG file name before the first '_'.
/// The temporary folder is deleted after a successful run unless <paramref name="Debug"/> is true.
/// Requires PDFSharp, Magick.NET (plus a GhostScript install for PDF conversion) and
/// Tesseract with its language data files available locally.
/// </remarks>
/// <param name="RootPath">Folder containing the PDF; a trailing separator is added when missing.</param>
/// <param name="FileName">Name of the PDF file, passed through to the PDF splitter.</param>
/// <param name="PathToTessData">Tesseract data path handed to <c>TesseractUtils</c>.</param>
/// <param name="Language">Tesseract language handed to <c>TesseractUtils</c>.</param>
/// <param name="Debug">When true, progress is echoed to the console and the temporary folder is kept.</param>
/// <returns>
/// Page number to recognised text. Pages whose file name does not start with an integer,
/// whose OCR returned null, or whose page number was already added are logged and skipped.
/// </returns>
public Dictionary<int, String> GetDictionaryFromPdf(String RootPath, String FileName, String PathToTessData, String Language, Boolean Debug)
{
    Dictionary<int, String> DocumentContent = new Dictionary<int, String>();

    // Accept both C:\mypath and C:\mypath\ (or the forward-slash equivalents).
    if (!RootPath.EndsWith(@"\", StringComparison.Ordinal) && !RootPath.EndsWith("/", StringComparison.Ordinal))
    {
        RootPath = RootPath + "/";
    }

    String TempPath = RootPath + guid + @"\";
    String LogFile = TempPath + "run.log";
    System.IO.Directory.CreateDirectory(TempPath);

    // The using block closes the log even when a step below throws, so the file
    // handle is released and the log written so far is not lost.
    using (StreamWriter w = File.AppendText(LogFile))
    {
        // Split the PDF into individual pages (PDFSharp).
        PdfSharpUtils psu = new PdfSharpUtils(w);
        psu.SplitAllPDFPages(RootPath, FileName, TempPath, Debug);

        // Convert each single-page PDF into a PNG (Magick.NET; needs GhostScript on the host).
        // NOTE(review): 300, 300 is presumably the render resolution - confirm against ImageMagickUtils.
        ImageMagickUtils imu = new ImageMagickUtils(300, 300, w);
        string[] PdfFiles = Directory.GetFiles(TempPath, "*.pdf", SearchOption.TopDirectoryOnly);
        w.WriteLine("DEBUG - Number of PDF Files Found: " + PdfFiles.Length);
        w.Flush();
        foreach (string pdfPath in PdfFiles)
        {
            imu.ConvertPDFToPng(pdfPath, pdfPath + ".png");
        }

        // OCR every PNG with Tesseract.
        TesseractUtils tu = new TesseractUtils(PathToTessData, Language, w);
        string[] PngFiles = Directory.GetFiles(TempPath, "*.png", SearchOption.TopDirectoryOnly);
        foreach (string pngPath in PngFiles)
        {
            if (Debug) { Console.WriteLine("\nDEBUG: Processing Temp Image File: " + pngPath); }
            w.WriteLine("DEBUG: Processing Temp Image File: " + pngPath);

            // The page number is the file-name prefix before the first underscore.
            String PageNum = Path.GetFileName(pngPath).Split('_')[0];
            if (Debug) { Console.WriteLine("\nDEBUG: Current Page Number: " + PageNum); }
            w.WriteLine("DEBUG: Current Page Number: " + PageNum);

            TesseractOutput to = tu.OCRImageFile(pngPath, Debug);

            int PageNumAsInt;
            if (!Int32.TryParse(PageNum, out PageNumAsInt))
            {
                w.WriteLine("DEBUG: Error in Page Number Retrieved from File Name (it isnt a valid Integer?): " + PageNum);
            }
            else if (to == null)
            {
                w.WriteLine("DEBUG: ERROR, Could not add Document to Dictionary (Tesseract Output seems empty?): " + PageNumAsInt);
            }
            else if (DocumentContent.ContainsKey(PageNumAsInt))
            {
                // Keep the first entry; a repeated page number is reported as such
                // instead of being mislabelled as an invalid integer.
                w.WriteLine("DEBUG: ERROR, Page Number already present in Dictionary, skipping: " + PageNumAsInt);
            }
            else
            {
                w.WriteLine("DEBUG: Adding Document to Dictionary: " + PageNumAsInt);
                DocumentContent.Add(PageNumAsInt, to.getText());
            }

            w.Flush();
        }
    }

    // Delete the temp folder (log included) only after a successful, non-debug run;
    // after a failure it is left in place so the log can be inspected.
    if (!Debug)
    {
        System.IO.Directory.Delete(TempPath, true);
    }

    return DocumentContent;
}