/// <summary>
/// Batch entry point: enhances every image found under <paramref name="FileDirectory"/>,
/// then OCRs every image found under <c>C:\OCR\EnhancedImage</c>, converts each OCR result
/// into an <c>Invoice</c>, and writes a plain-text summary of all invoices to
/// <c>C:\OCR\EnhancedImage\OcrResult.txt</c>. Progress is echoed to the console and the
/// method blocks on <c>Console.ReadLine()</c> before returning.
/// </summary>
/// <param name="FileDirectory">Root directory that is searched recursively for source images.</param>
public static void GetIMagesOnly(string FileDirectory)
{
    // Pass 1: run the enhancement step over every source image.
    // NOTE(review): presumably EnhanceImageQuality writes its output into C:\OCR\EnhancedImage,
    // which is what pass 2 reads - confirm against ImagePreProcessing.
    foreach (string sourceImage in FindImageFiles(FileDirectory))
    {
        Console.WriteLine("Image file '{0}'.", sourceImage);
        ImagePreProcessing.EnhanceImageQuality(sourceImage);
    }

    // Pass 2: OCR every image in the enhanced-image folder and build one invoice per file.
    var invoices = new List<Invoice>();
    foreach (string enhancedImage in FindImageFiles(@"C:\OCR\EnhancedImage"))
    {
        Console.WriteLine("Image file '{0}'.", enhancedImage);

        var pageItem = PerformingOCR.DoOCR1(enhancedImage);
        Identify removeSpace = new Identify();
        pageItem = removeSpace.RemoveSpace(pageItem);

        invoices.Add(PopInvoice.poplateInvoice(pageItem, enhancedImage));
    }

    // Build the report. The label texts (including their spelling) are left exactly as they
    // were so the layout of the generated file does not change.
    var lines = new List<String>();
    foreach (Invoice invoice in invoices)
    {
        lines.Add("Invocie File Name" + " " + invoice.InvoiceID);
        lines.Add("Invoice Date" + " " + invoice.InvoiceDate);
        lines.Add("VendorName" + " " + invoice.VendorName);
        lines.Add(" " + "QTY" + " " + " " + "Amount " + " " + " " + "Item" + " ");
        foreach (Lineitem lineitem in invoice.Lineitems)
        {
            lines.Add(" " + lineitem.ItemQty + " " + " " + lineitem.ItemAmount + " " + lineitem.ItemName);
        }
        lines.Add("Total Amount" + " " + invoice.TotalAmount);
        lines.Add("****************************************************************************");
    }

    // WriteAllLines creates the file, writes every line and closes it again;
    // no explicit Flush() or Close() is needed.
    System.IO.File.WriteAllLines(@"C:\OCR\EnhancedImage\OcrResult.txt", lines);

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}

/// <summary>
/// Recursively lists the files under <paramref name="directory"/> whose extension is
/// .jpg, .gif, .png or .tif.
/// </summary>
/// <param name="directory">Directory to search, including all sub-directories.</param>
/// <returns>The matching file paths, fully enumerated before the method returns.</returns>
private static List<string> FindImageFiles(string directory)
{
    // Compare the real extension, ignoring case: the previous suffix test used "tif" without
    // the dot (accepting any name that merely ended in "tif") and skipped upper-case
    // extensions such as ".JPG".
    var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        ".jpg", ".gif", ".png", ".tif"
    };

    return Directory.GetFiles(directory, "*.*", SearchOption.AllDirectories)
        .Where(path => imageExtensions.Contains(Path.GetExtension(path)))
        .ToList();
}
/// <summary>
/// Runs Tesseract OCR (English, <c>./tessdata</c>) over a single image file and returns one
/// <c>row</c> per recognised word. The mean confidence, the full page text and every
/// line/word visited are also written to the console.
/// </summary>
/// <param name="imageDir">
/// Path of the image file to recognise. Despite the name it is opened as a file
/// (it is passed to the <c>System.Drawing.Bitmap</c> constructor), not as a directory.
/// </param>
/// <returns>
/// The recognised words in reading order. For each entry <c>line</c> is a 1-based counter of
/// text lines that keeps running across paragraphs and blocks, <c>colomun</c> is the 1-based
/// position of the word within its line, <c>type</c> is the result of
/// <c>Identify.StringType</c> converted to <c>Int16</c>, and <c>word</c> is the recognised text.
/// Returns <c>null</c> when <paramref name="imageDir"/> is <c>null</c>.
/// </returns>
public static List<row> DoOCR1(string imageDir)
{
    // Existing contract: a null path yields null rather than an exception.
    // The existence of the file itself is not checked here.
    if (imageDir == null)
    {
        return null;
    }

    var rows = new List<row>();

    using (var engine = new TesseractEngine(@"./tessdata", "eng", EngineMode.Default))
    {
        // NOTE(review): the whitelist literal looks like sample text, and the "[email protected]"
        // fragment looks like an e-mail address that was obfuscated when the code was copied -
        // confirm the intended character set. It is left untouched here.
        engine.SetVariable("tessedit_char_whitelist", "16.00ABCDEFGHIJKLMNOPQRSTUVWXYZ(quick) brown { fox} jumps!over the $3,456.78 < lazy >: #90 dog & duck/goose, as 12.5% of Email from [email protected] is spam?");

        // The image is loaded as a Bitmap and converted to a Pix for the engine.
        // (An additional Pix.LoadFromFile call used to sit here; its result was never used
        // and never disposed, so it has been removed.)
        using (var image = new System.Drawing.Bitmap(imageDir))
        using (var pix = PixConverter.ToPix(image))
        using (var page = engine.Process(pix))
        {
            Console.WriteLine("Mean confidence: {0:p}", page.GetMeanConfidence());
            Console.WriteLine(page.GetText());

            var lineNumber = 1;

            using (var iter = page.GetIterator())
            {
                iter.Begin();

                // Walk the layout hierarchy block -> paragraph -> text line -> word.
                // Each two-argument Next(parent, child) call advances to the next child
                // and reports false once the current parent element is exhausted.
                do
                {
                    do
                    {
                        do
                        {
                            Console.WriteLine("Line {0}", lineNumber);
                            var wordNumber = 1;

                            do
                            {
                                // Read the word once; the iterator does not move until Next().
                                var word = iter.GetText(PageIteratorLevel.Word);
                                Console.WriteLine("word:{0} ", word);

                                Identify strType = new Identify();
                                rows.Add(new row
                                {
                                    line = lineNumber,
                                    colomun = wordNumber,
                                    type = Convert.ToInt16(strType.StringType(word)),
                                    word = word
                                });

                                wordNumber++;
                            } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));

                            lineNumber++;
                        } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                    } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                } while (iter.Next(PageIteratorLevel.Block));
            }
        }
    }

    return rows;
}