/// <summary>
/// Runs OCR over the scanned invoice PDF and writes every recognized text
/// line to <c>data.txt</c> in the current working directory.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Initialize the OCR processor; the using block releases it even when OCR fails.
    using (OCRProcessor processor = new OCRProcessor(@"../../../../../../Data/TesseractBinaries/3.02/"))
    {
        // Load the PDF document to be processed.
        PdfLoadedDocument lDoc = new PdfLoadedDocument("../../../../../../Data/Invoice_scanned.pdf");
        try
        {
            // Set the OCR language to process.
            processor.Settings.Language = Languages.English;

            OCRLayoutResult hocrBounds;
            processor.PerformOCR(lDoc, @"../../../../../../Data/Tessdata/", out hocrBounds);

            // Dump the recognized text, one output line per recognized line.
            // The using block flushes and closes the file even if enumeration throws.
            using (StreamWriter writer = new StreamWriter("data.txt"))
            {
                foreach (Page page in hocrBounds.Pages)
                {
                    foreach (Line line in page.Lines)
                    {
                        writer.WriteLine(line.Text);
                    }
                }
            }
        }
        finally
        {
            // Always close the document so it is not left open after a failure.
            lDoc.Close(true);
        }
    }
}
/// <summary>
/// Runs OCR on the PDF at <paramref name="filename"/> and saves the processed
/// document into the "out" directory that sits beside the source file's
/// parent directory. The outcome (return code, message, saved path) is
/// reported through the <c>scanPageStatus</c> member rather than a return value.
/// </summary>
/// <param name="filename">Path of the PDF file to run OCR on.</param>
/// <remarks>
/// Exceptions are not propagated: any failure is caught and recorded in
/// <c>scanPageStatus</c> with <c>rc = -1</c>.
/// </remarks>
private void CreateTxtFromPDF(string filename)
{
    string tesseractPath = Path.Combine(AssemblyDirectory(), GetConstants.TesseractBinaries());
    string tesseractData = Path.Combine(AssemblyDirectory(), GetConstants.TesseractData());

    try
    {
        using (OCRProcessor processor = new OCRProcessor(tesseractPath))
        {
            // Read in the PDF image file and run OCR over it.
            PdfLoadedDocument loadedDocument = new PdfLoadedDocument(filename);
            try
            {
                processor.Settings.Language = Languages.English;
                processor.Settings.Performance = Performance.Slow;
                processor.PerformOCR(loadedDocument, tesseractData);

                // NOTE(review): GetFileName keeps the extension, so the result looks like
                // "name.pdf_OCR.pdf". Kept as-is because callers may rely on this naming —
                // confirm whether GetFileNameWithoutExtension was intended.
                string outFileName = Path.GetFileName(filename) + "_OCR" + Path.GetExtension(filename);
                string homePath = Path.GetDirectoryName(Path.GetDirectoryName(filename));
                string savePath = Path.Combine(homePath, GetConstants.Directory("out"), outFileName);

                // If the target already exists, clear its attributes (e.g. read-only) and delete it first.
                if (File.Exists(savePath))
                {
                    File.SetAttributes(savePath, FileAttributes.Normal);
                    File.Delete(savePath);
                }

                loadedDocument.Save(savePath);

                scanPageStatus.scannedFileName = savePath;
                scanPageStatus.rc = 0;
                scanPageStatus.statusMessage = String.Format("File {0} scanned and saved to {1}", filename, scanPageStatus.scannedFileName);
            }
            finally
            {
                // Release the document on every path; previously a failure in OCR or Save
                // skipped Close/Dispose and left the source PDF open.
                loadedDocument.Close(true);
                loadedDocument.Dispose();
            }
        }
    }
    catch (Exception ex)
    {
        // Report the failure through the status object instead of throwing.
        // This also overrides a success status if closing the document failed.
        scanPageStatus.scannedFileName = "";
        scanPageStatus.statusMessage = String.Format("Error {0} when running OCR on source file {1}", ex, filename);
        scanPageStatus.rc = -1;
    }
}