Example #1
0
 /// <summary>
 /// Runs Tesseract OCR over exactly one image file (one page) and returns the
 /// recognized text together with the mean confidence reported for that page.
 /// Progress and errors are written to the log writer <c>w</c>.
 /// </summary>
 /// <param name="PathToPngFile">Path of the image file passed to <c>Pix.LoadFromFile</c>.</param>
 /// <param name="Debug">When true, the mean confidence is additionally echoed to the console.</param>
 /// <returns>
 /// A <c>TesseractOutput</c> built from the page's mean confidence and text, or
 /// <c>null</c> if any exception occurs (the exception is logged, not rethrown).
 /// </returns>
 public TesseractOutput OCRImageFile(String PathToPngFile, Boolean Debug)
 {
     try
     {
         w.WriteLine("DEBUG - Tesseract Engine Loading...");
         using (var engine = new TesseractEngine(this.TessDataPath, this.TessLanguage, EngineMode.Default))
         {
             w.WriteLine("DEBUG - Tesseract Image Loading...");
             using (var img = Pix.LoadFromFile(PathToPngFile))
             {
                 w.WriteLine("DEBUG - Tesseract Page Loading...");
                 using (var page = engine.Process(img))
                 {
                     w.WriteLine("DEBUG - Tesseract Get Page Content...");
                     var text = page.GetText();

                     // Read the confidence once and reuse it for the console,
                     // the log and the result instead of querying the page three times.
                     var meanConfidence = page.GetMeanConfidence();
                     if (Debug)
                     {
                         Console.WriteLine("\nDEBUG: Tesseract Mean Confidence: {0}", meanConfidence);
                     }
                     w.WriteLine("DEBUG - Tesseract Confidence: " + meanConfidence);

                     TesseractOutput to = new TesseractOutput(meanConfidence, text);
                     this.w.Flush();
                     return(to);
                 }
             }
         }
     }
     catch (Exception e)
     {
         // Deliberate best-effort: callers receive null and the failure is only logged.
         w.WriteLine("DEBUG - Tesseract Error: " + e.Message);
         w.WriteLine("DEBUG - Tesseract Error Details: " + e.ToString());
         this.w.Flush();
         return(null);
     }
 }
        /// <summary>
        /// Extracts the text of a PDF page by page via OCR. The PDF is split into
        /// per-page PDF files inside a temporary folder, every PDF found there is
        /// converted to a PNG, and each PNG is OCRed with Tesseract. A log file
        /// <c>run.log</c> is written into the temporary folder.
        /// </summary>
        /// <param name="RootPath">Folder containing the PDF; a trailing separator is appended when missing.</param>
        /// <param name="FileName">Name of the PDF file, passed on to <c>SplitAllPDFPages</c>.</param>
        /// <param name="PathToTessData">Tesseract data path handed to <c>TesseractUtils</c>.</param>
        /// <param name="Language">Tesseract language handed to <c>TesseractUtils</c>.</param>
        /// <param name="Debug">
        /// When true, progress is echoed to the console and the temporary folder is kept;
        /// otherwise the temporary folder is deleted after successful processing.
        /// </param>
        /// <returns>
        /// A dictionary mapping page number to recognized text. The page number is the part
        /// of each PNG file name before the first underscore. Pages whose number cannot be
        /// parsed, whose OCR result is null, or whose number was already added are skipped
        /// and reported in the log.
        /// </returns>
        public Dictionary <int, String> GetDictionaryFromPdf(String RootPath, String FileName, String PathToTessData, String Language, Boolean Debug)
        {
            Dictionary <int, String> DocumentContent = new Dictionary <int, String>();

            // accounting for C:\mypath and C:\mypath\
            if (!RootPath.EndsWith(@"\", StringComparison.Ordinal) && !RootPath.EndsWith("/", StringComparison.Ordinal))
            {
                RootPath = RootPath + "/";
            }

            // TempPath keeps its trailing separator because it is handed to helpers
            // as a folder prefix (NOTE(review): confirm SplitAllPDFPages relies on this).
            String TempPath = RootPath + guid + @"\";
            String LogFile  = TempPath + "run.log";

            System.IO.Directory.CreateDirectory(TempPath);

            // 'using' guarantees the log file handle is released even if a step below throws.
            using (StreamWriter w = File.AppendText(LogFile))
            {
                // Split PDF into individual Pages using PDFSharp (PDFSharp in NuGet Package Manager)
                PdfSharpUtils psu = new PdfSharpUtils(w);
                psu.SplitAllPDFPages(RootPath, FileName, TempPath, Debug);

                // Now convert each pdf file into a png file
                // requires ImageMagick Wrapper for C# (Magick.NET Q16 Any CPU in NuGet Package manager)
                // because we do PDF conversions, the target system on which this is running requires install GhostScript
                ImageMagickUtils imu = new ImageMagickUtils(300, 300, w);

                string[] PdfFiles = Directory.GetFiles(TempPath, "*.pdf", SearchOption.TopDirectoryOnly);
                w.WriteLine("DEBUG - Number of PDF Files Found: " + PdfFiles.Length);
                w.Flush();
                foreach (string filepath in PdfFiles)
                {
                    imu.ConvertPDFToPng(filepath, filepath + ".png");
                }

                // Now using Tesseract to OCR the single pdf page turned into a png file (Tesseract in NuGet Package Manager)
                // this involves downloading the Tesseract language data files locally.
                TesseractUtils tu = new TesseractUtils(PathToTessData, Language, w);

                string[] PngFiles = Directory.GetFiles(TempPath, "*.png", SearchOption.TopDirectoryOnly);
                foreach (string filepath in PngFiles)
                {
                    if (Debug)
                    {
                        Console.WriteLine("\nDEBUG: Processing Temp Image File: " + filepath);
                    }
                    w.WriteLine("DEBUG: Processing Temp Image File: " + filepath);

                    // The page number is the file-name segment before the first '_'.
                    String PageNum = Path.GetFileName(filepath).Split('_')[0];
                    if (Debug)
                    {
                        Console.WriteLine("\nDEBUG: Current Page Number: " + PageNum);
                    }
                    w.WriteLine("DEBUG: Current Page Number: " + PageNum);

                    TesseractOutput to = tu.OCRImageFile(filepath, Debug);

                    int PageNumAsInt;
                    if (!Int32.TryParse(PageNum, out PageNumAsInt))
                    {
                        w.WriteLine("DEBUG: Error in Page Number Retrieved from File Name (it isnt a valid Integer?): " + PageNum);
                    }
                    else if (to == null)
                    {
                        w.WriteLine("DEBUG: ERROR, Could not add Document to Dictionary (Tesseract Output seems empty?): " + PageNumAsInt);
                    }
                    else if (DocumentContent.ContainsKey(PageNumAsInt))
                    {
                        // Dictionary.Add would throw here; keep the first page and report the real cause.
                        w.WriteLine("DEBUG: ERROR, Duplicate Page Number, Document not added to Dictionary: " + PageNumAsInt);
                    }
                    else
                    {
                        w.WriteLine("DEBUG: Adding Document to Dictionary: " + PageNumAsInt);
                        DocumentContent.Add(PageNumAsInt, to.getText());
                    }

                    w.Flush();
                }
            }

            // Delete Temp folder if not in Debug mode. This is only reached on success,
            // so after a failure the folder and its run.log remain for diagnosis.
            if (!Debug)
            {
                System.IO.Directory.Delete(TempPath, true);
            }

            return(DocumentContent);
        }