/// <summary>
/// Runs OCR over every image returned by <c>GetImageList</c> for
/// <c>data.DocumentToProcess</c> and writes the recognized text, one
/// <c>WriteLine</c> per image, to "{DocumentToProcess}.ocr".
/// </summary>
/// <param name="data">
/// Work item. <c>DocumentToProcess</c> is read as the input path; <c>WFState.Value</c>
/// is set to <c>WFStateFail</c> on entry and to <c>WFStateSuccess</c> only when the
/// image list was loaded and iterated without an unhandled exception.
/// </param>
/// <remarks>
/// A recognition error on a single image is logged and the remaining images are still
/// processed; it does not change the final state. No output file is created when the
/// document yields no images.
/// </remarks>
public override void Process(TextExtractorOcrData data)
{
    WFLogger.NLogger.Info("DocumentToProcess={0}", data.DocumentToProcess);
    tessocr.Clear();

    // Pessimistic default: only a run that reaches the end of the try block is a success.
    data.WFState.Value = WFState.WFStateFail;

    IList<Image> images = null;
    try
    {
        // Load the images once; the same list drives both the "anything to do?" check and the loop.
        images = GetImageList(data.DocumentToProcess);

        if (images != null && images.Count > 0)
        {
            using (System.IO.StreamWriter file = new System.IO.StreamWriter(string.Format(@"{0}.ocr", data.DocumentToProcess)))
            {
                foreach (System.Drawing.Image image in images)
                {
                    System.Drawing.Image deskewedimage = image;
                    try
                    {
                        // Deskew the image, but only when the measured angle exceeds the threshold.
                        gmseDeskew deskew = new gmseDeskew((System.Drawing.Bitmap)image);
                        double imageSkewAngle = deskew.GetSkewAngle();
                        if (imageSkewAngle > MINIMUM_DESKEW_THRESHOLD || imageSkewAngle < -(MINIMUM_DESKEW_THRESHOLD))
                        {
                            deskewedimage = ImageHelper.Rotate(image, imageSkewAngle);
                        }

                        try
                        {
                            file.WriteLine(tessocr.Recognize(deskewedimage));
                        }
                        catch (Exception ex)
                        {
                            // Best effort per image: log and continue with the next one.
                            WFLogger.NLogger.ErrorException("ERROR: Could not instantiate TesseractProcessor.", ex);
                        }
                    }
                    finally
                    {
                        // The rotated copy is a separate object and must be released as well.
                        if (deskewedimage != null && !object.ReferenceEquals(deskewedimage, image))
                        {
                            deskewedimage.Dispose();
                        }

                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }

        data.WFState.Value = WFState.WFStateSuccess;
    }
    catch (Exception ex)
    {
        WFLogger.NLogger.ErrorException("ERROR: GetImageList failed.", ex);
    }
    finally
    {
        // Sweep the whole list so images that were never reached (loop aborted early)
        // are released too; disposing an already-disposed Image is harmless.
        if (images != null)
        {
            foreach (System.Drawing.Image leftover in images)
            {
                if (leftover != null)
                {
                    leftover.Dispose();
                }
            }
        }
    }
}
/// <summary>
/// Loads every image of <c>FileToProcess</c>, measures its skew angle and runs
/// Tesseract layout analysis on it. Each image is disposed once it has been handled.
/// </summary>
/// <returns>
/// A new <c>SFWorkflow.WFState</c> whose <c>Value</c> is always
/// <c>WFStateSuccess</c>; errors are logged, not reflected in the returned state.
/// </returns>
public override SFWorkflow.WFState Run()
{
    SFWorkflow.WFState retval = new SFWorkflow.WFState();

    IList<Image> images = null;
    try
    {
        images = GetImageList(this.FileToProcess);
        if (images != null)
        {
            foreach (System.Drawing.Image image in images)
            {
                try
                {
                    // The skew angle is measured but not applied: rotation is currently disabled here.
                    // NOTE(review): the result is unused — confirm whether this call can be dropped.
                    gmseDeskew deskew = new gmseDeskew((System.Drawing.Bitmap)image);
                    double imageSkewAngle = deskew.GetSkewAngle();

                    // NOTE(review): if TesseractProcessor is IDisposable it should be wrapped in a using — confirm.
                    OCR.TesseractWrapper.TesseractProcessor tessocr = new OCR.TesseractWrapper.TesseractProcessor();
                    try
                    {
                        string tessdata = System.IO.Path.Combine(basedir, TESSDATA);
                        if (true == tessocr.Init(tessdata, "eng", (int)OCR.TesseractWrapper.ePageSegMode.PSM_AUTO))
                        {
                            // The layout is computed but nothing consumes it yet.
                            OCR.TesseractWrapper.DocumentLayout dl = tessocr.AnalyseLayout((System.Drawing.Bitmap)image);
                        }
                    }
                    catch (Exception ex)
                    {
                        WFLogger.NLogger.ErrorException("ERROR: Could not instantiate TesseractProcessor.", ex);
                    }
                }
                finally
                {
                    if (image != null)
                    {
                        image.Dispose();
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        WFLogger.NLogger.ErrorException("ERROR: GetImageList failed.", ex);
    }
    finally
    {
        // Release images the loop never reached; a second Dispose on an Image is harmless.
        if (images != null)
        {
            foreach (System.Drawing.Image leftover in images)
            {
                if (leftover != null)
                {
                    leftover.Dispose();
                }
            }
        }
    }

    // NOTE(review): success is reported even when an exception was logged above — confirm this is intended.
    retval.Value = SFWorkflow.WFState.WFStateSuccess;
    return retval;
}