Пример #1
0
        /// <summary>
        /// Runs OCR over every image returned by <c>GetImageList</c> for
        /// <c>data.DocumentToProcess</c> and writes the recognized text, one
        /// <c>WriteLine</c> per image, to a file named "&lt;DocumentToProcess&gt;.ocr".
        /// </summary>
        /// <param name="data">
        /// Work item supplying the document path (<c>DocumentToProcess</c>) and receiving the
        /// resulting workflow state (<c>WFState.Value</c>).
        /// </param>
        /// <remarks>
        /// All exceptions are logged and swallowed; nothing is rethrown to the caller.
        /// The output file is only created when at least one image is available.
        /// Every image in the list, and any rotated copy made while deskewing, is disposed
        /// before the method returns.
        /// </remarks>
        public override void Process(TextExtractorOcrData data)
        {
            WFLogger.NLogger.Info("DocumentToProcess={0}", data.DocumentToProcess);

            tessocr.Clear();

            data.WFState.Value = WFState.WFStateFail;

            System.IO.StreamWriter file   = null;
            IList<Image>           images = null;
            try
            {
                // Load the images exactly once: the same list decides whether the output file
                // is created and is the one that gets iterated, so 'file' can never be null
                // inside the loop and no second set of images is left undisposed.
                images = GetImageList(data.DocumentToProcess);

                if (images != null && images.Count > 0)
                {
                    file = new System.IO.StreamWriter(string.Format(@"{0}.ocr", data.DocumentToProcess));

                    foreach (System.Drawing.Image image in images)
                    {
                        System.Drawing.Image deskewedimage = image;
                        try
                        {
                            // deskew the image
                            gmseDeskew deskew         = new gmseDeskew((System.Drawing.Bitmap)image);
                            double     imageSkewAngle = deskew.GetSkewAngle();

                            // Only rotate when the detected skew exceeds the threshold in either direction.
                            if (imageSkewAngle > MINIMUM_DESKEW_THRESHOLD || imageSkewAngle < -(MINIMUM_DESKEW_THRESHOLD))
                            {
                                deskewedimage = ImageHelper.Rotate(image, imageSkewAngle);
                            }

                            try
                            {
                                // Whole-image recognition; one output line group per image.
                                file.WriteLine(tessocr.Recognize(deskewedimage));
                            }
                            catch (Exception ex)
                            {
                                // A failure on one image is logged and the remaining images are still processed.
                                WFLogger.NLogger.ErrorException("ERROR: Could not instantiate TesseractProcessor.", ex);
                            }
                        }
                        finally
                        {
                            // Release the rotated copy (if one was made) and the source image as soon
                            // as this image is done, even when deskewing throws.
                            if (deskewedimage != null && !object.ReferenceEquals(deskewedimage, image))
                            {
                                deskewedimage.Dispose();
                            }
                            if (image != null)
                            {
                                image.Dispose();
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                WFLogger.NLogger.ErrorException("ERROR: GetImageList failed.", ex);
            }
            finally
            {
                // If the loop was aborted by an exception, the images not yet reached are still
                // open; disposing an already-disposed Image is harmless.
                if (images != null)
                {
                    foreach (System.Drawing.Image leftover in images)
                    {
                        if (leftover != null)
                        {
                            leftover.Dispose();
                        }
                    }
                }

                // Dispose flushes and closes the writer.
                if (file != null)
                {
                    file.Dispose();
                }
            }

            // NOTE(review): the state is set to success unconditionally, so the WFStateFail
            // assigned above is never observable by the caller, even when an error was logged.
            // Left unchanged because downstream workflow steps may rely on it - confirm intent.
            data.WFState.Value = WFState.WFStateSuccess;
        }
Пример #2
0
        /// <summary>
        /// Iterates over the images returned by <c>GetImageList</c> for <c>FileToProcess</c>,
        /// measures each image's skew, initializes a Tesseract processor for it and runs
        /// layout analysis on it.
        /// </summary>
        /// <returns>
        /// A new <c>SFWorkflow.WFState</c> whose <c>Value</c> is always
        /// <c>WFStateSuccess</c>; errors are logged and swallowed rather than reported
        /// through the return value.
        /// </returns>
        /// <remarks>
        /// Each image is disposed as soon as it has been handled, including when handling it throws.
        /// </remarks>
        public override SFWorkflow.WFState Run()
        {
            SFWorkflow.WFState retval = new SFWorkflow.WFState();

            try
            {
                foreach (System.Drawing.Image image in GetImageList(this.FileToProcess))
                {
                    try
                    {
                        // NOTE(review): the skew angle is computed but not applied - the rotation
                        // step is disabled. The call is kept so that any exception it raises
                        // still aborts the run exactly as before.
                        gmseDeskew deskew = new gmseDeskew((System.Drawing.Bitmap)image);
                        deskew.GetSkewAngle();

                        // NOTE(review): a new processor is created per image and never released here;
                        // if TesseractProcessor is disposable it should be disposed - confirm against the wrapper.
                        OCR.TesseractWrapper.TesseractProcessor tessocr = new OCR.TesseractWrapper.TesseractProcessor();
                        try
                        {
                            string tessdata = System.IO.Path.Combine(basedir, TESSDATA);

                            if (true == tessocr.Init(tessdata, "eng", (int)OCR.TesseractWrapper.ePageSegMode.PSM_AUTO))
                            {
                                // NOTE(review): the layout result is currently discarded; the per-word
                                // recognition pass that consumed it was compiled out.
                                tessocr.AnalyseLayout((System.Drawing.Bitmap)image);
                            }
                        }
                        catch (Exception ex)
                        {
                            // A failure on one image is logged and the remaining images are still processed.
                            WFLogger.NLogger.ErrorException("ERROR: Could not instantiate TesseractProcessor.", ex);
                        }
                    }
                    finally
                    {
                        // Release the image once it has been handled, even if deskewing threw.
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                WFLogger.NLogger.ErrorException("ERROR: GetImageList failed.", ex);
            }

            retval.Value = SFWorkflow.WFState.WFStateSuccess;

            return(retval);
        }