/// <summary> /// Extract text from document with specific Ocr Mode /// </summary> /// <param name="inputDocument"></param> /// <param name="oCRMode"></param> /// <returns></returns> private static string _ExtractTextWithSpecificOCRMode(string inputDocument, OCRMode ocrMode) { // Location of language data files string ocrLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\"; // OCR language string ocrLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder // Find more language files at https://github.com/bytescout/ocrdata // Create TextExtractor instance using (TextExtractor textExtractor = new TextExtractor("demo", "demo")) { // Load document to TextExtractor textExtractor.LoadDocumentFromFile(inputDocument); // Specify Ocr Mode textExtractor.OCRMode = ocrMode; // Ocr language data folder path and language textExtractor.OCRLanguageDataFolder = ocrLanguageDataFolder; textExtractor.OCRLanguage = ocrLanguage; // Return extracted text return(textExtractor.GetText()); } }
protected void Page_Load(object sender, EventArgs e) { try { string strImageName = ""; int iFileLength; HttpFileCollection files = HttpContext.Current.Request.Files; HttpPostedFile uploadfile = files["RemoteFile"]; if (uploadfile != null) { strImageName = uploadfile.FileName; iFileLength = uploadfile.ContentLength; Byte[] inputBuffer = new Byte[iFileLength]; System.IO.Stream inputStream; inputStream = uploadfile.InputStream; inputStream.Read(inputBuffer, 0, iFileLength); string strLanguage = "0"; try { strLanguage = HttpContext.Current.Request.Form["OCRLanguage"].ToString(); } catch { } string strFormat = "0"; try { strFormat = HttpContext.Current.Request.Form["FileFormat"].ToString(); } catch { } string strReturnValue = ""; try { byte[] content = OCRMode.OCR(inputBuffer, strLanguage, Convert.ToInt32(strFormat)); string strFileName = SaveOCRContentAsFile(content, Convert.ToInt32(strFormat)); strReturnValue = "OK;" + "DownLoadOCR.aspx?FileId=" + Server.UrlEncode(strFileName); } catch (Exception exp) { strReturnValue = "EXP;" + exp.Message.ToString(); } finally { } Response.Write(strReturnValue); } } catch (Exception exp) { Response.Write("EXP;" + exp.Message.ToString() + "; 0;"); } }
public static void OCRSetup(OCRMode mode) { string dir = Environment.CurrentDirectory + @"\Modules\OCR\tessdata"; switch (mode) { case OCRMode.NUMBERS: _ocr = new Tesseract(dir, "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_ONLY); _ocr.SetVariable("tessedit_char_whitelist", "1234567890"); break; case OCRMode.COMBINED: _ocr = new Tesseract(dir, "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED); break; } OCRRoutine.DoWork += new DoWorkEventHandler(OCRRoutine_doWork); OCRRoutine.ProgressChanged += new ProgressChangedEventHandler(OCRRoutine_ProgressChanged); OCRRoutine.RunWorkerCompleted += new RunWorkerCompletedEventHandler(OCRRoutine_WorkerCompleted); OCRRoutine.WorkerReportsProgress = true; }
private static String OcrImage(Tesseract ocr, Mat image, OCRMode mode, Mat imageColor) { Bgr drawCharColor = new Bgr(Color.Red); if (image.NumberOfChannels == 1) { CvInvoke.CvtColor(image, imageColor, ColorConversion.Gray2Bgr); } else { image.CopyTo(imageColor); } if (mode == OCRMode.FullPage) { ocr.SetImage(imageColor); if (ocr.Recognize() != 0) { throw new Exception("Failed to recognizer image"); } Tesseract.Character[] characters = ocr.GetCharacters(); if (characters.Length == 0) { Mat imgGrey = new Mat(); CvInvoke.CvtColor(image, imgGrey, ColorConversion.Bgr2Gray); Mat imgThresholded = new Mat(); CvInvoke.Threshold(imgGrey, imgThresholded, 65, 255, ThresholdType.Binary); ocr.SetImage(imgThresholded); characters = ocr.GetCharacters(); imageColor = imgThresholded; if (characters.Length == 0) { CvInvoke.Threshold(image, imgThresholded, 190, 255, ThresholdType.Binary); ocr.SetImage(imgThresholded); characters = ocr.GetCharacters(); imageColor = imgThresholded; } } foreach (Tesseract.Character c in characters) { CvInvoke.Rectangle(imageColor, c.Region, drawCharColor.MCvScalar); } return(ocr.GetUTF8Text()); } else { bool checkInvert = true; Rectangle[] regions; using ( ERFilterNM1 er1 = new ERFilterNM1("trained_classifierNM1.xml", 8, 0.00025f, 0.13f, 0.4f, true, 0.1f)) using (ERFilterNM2 er2 = new ERFilterNM2("trained_classifierNM2.xml", 0.3f)) { int channelCount = image.NumberOfChannels; UMat[] channels = new UMat[checkInvert ? channelCount * 2 : channelCount]; for (int i = 0; i < channelCount; i++) { UMat c = new UMat(); CvInvoke.ExtractChannel(image, c, i); channels[i] = c; } if (checkInvert) { for (int i = 0; i < channelCount; i++) { UMat c = new UMat(); CvInvoke.BitwiseNot(channels[i], c); channels[i + channelCount] = c; } } VectorOfERStat[] regionVecs = new VectorOfERStat[channels.Length]; for (int i = 0; i < regionVecs.Length; i++) { regionVecs[i] = new VectorOfERStat(); } try { for (int i = 0; i < channels.Length; i++) { er1.Run(channels[i], regionVecs[i]); er2.Run(channels[i], regionVecs[i]); } using (VectorOfUMat vm = new VectorOfUMat(channels)) { regions = ERFilter.ERGrouping(image, vm, regionVecs, ERFilter.GroupingMethod.OrientationHoriz, "trained_classifier_erGrouping.xml", 0.5f); } } finally { foreach (UMat tmp in channels) { if (tmp != null) { tmp.Dispose(); } } foreach (VectorOfERStat tmp in regionVecs) { if (tmp != null) { tmp.Dispose(); } } } Rectangle imageRegion = new Rectangle(Point.Empty, imageColor.Size); for (int i = 0; i < regions.Length; i++) { Rectangle r = ScaleRectangle(regions[i], 1.1); r.Intersect(imageRegion); regions[i] = r; } } List <Tesseract.Character> allChars = new List <Tesseract.Character>(); String allText = String.Empty; foreach (Rectangle rect in regions) { using (Mat region = new Mat(image, rect)) { ocr.SetImage(region); if (ocr.Recognize() != 0) { throw new Exception("Failed to recognize image"); } Tesseract.Character[] characters = ocr.GetCharacters(); //convert the coordinates from the local region to global for (int i = 0; i < characters.Length; i++) { Rectangle charRegion = characters[i].Region; charRegion.Offset(rect.Location); characters[i].Region = charRegion; } allChars.AddRange(characters); allText += ocr.GetUTF8Text() + Environment.NewLine; } } Bgr drawRegionColor = new Bgr(Color.Red); foreach (Rectangle rect in regions) { CvInvoke.Rectangle(imageColor, rect, drawRegionColor.MCvScalar); } foreach (Tesseract.Character c in allChars) { CvInvoke.Rectangle(imageColor, c.Region, drawCharColor.MCvScalar); } return(allText); } }