示例#1
0
        /// <summary>
        /// Runs OCR on an image file and writes the recognized text to <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="imageFile">Path of the image file to recognize.</param>
        /// <param name="outputFile">Path of the result file; it is overwritten and encoded as UTF-8.</param>
        /// <param name="langCode">Language code passed to the OCR engine and to the post-processor.</param>
        /// <param name="pageSegMode">Page segmentation mode assigned to the engine.</param>
        /// <param name="hocr">Assigned to the engine's Hocr flag; when true, text post-corrections are skipped.</param>
        public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, bool hocr)
        {
            IList<Image> imageList = null;

            try
            {
                imageList = ImageIOHelper.GetImageList(new FileInfo(imageFile));

                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PageSegMode = pageSegMode;
                ocrEngine.Hocr = hocr;
                string result = ocrEngine.RecognizeText(imageList, langCode);

                // skip post-corrections if hocr output
                if (!hocr)
                {
                    // postprocess to correct common OCR errors
                    result = Processor.PostProcess(result, langCode);
                    // correct common errors caused by OCR
                    result = TextUtilities.CorrectOCRErrors(result);
                    // correct letter cases
                    result = TextUtilities.CorrectLetterCases(result);
                }

                using (StreamWriter sw = new StreamWriter(outputFile, false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            finally
            {
                // The images were loaded by this method and are not handed to anyone else,
                // so release them here instead of waiting for finalization.
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }
示例#2
0
        /// <summary>
        /// DoWork handler for the OCR background worker. Recognizes the images of the
        /// <c>OCRImageEntity</c> passed in <c>e.Argument</c> one at a time and hands each
        /// result to <c>ReportProgress</c>; stops early when cancellation is pending.
        /// </summary>
        /// <param name="sender">The BackgroundWorker that raised the event.</param>
        /// <param name="e">Event data; <c>Argument</c> carries the entity, <c>Cancel</c> is set on cancellation.</param>
        private void backgroundWorkerOcr_DoWork(object sender, DoWorkEventArgs e)
        {
            // Direct cast: a sender of the wrong type surfaces here as InvalidCastException
            // rather than as a NullReferenceException inside the loop.
            BackgroundWorker worker = (BackgroundWorker)sender;

            OCRImageEntity entity = (OCRImageEntity)e.Argument;
            OCR<Image> ocrEngine = new OCRImages();

            ocrEngine.PageSegMode = selectedPSM;
            ocrEngine.Language = entity.Language;

            IList<Image> images = entity.ClonedImages;

            for (int i = 0; i < images.Count; i++)
            {
                if (worker.CancellationPending)
                {
                    e.Cancel = true;
                    break;
                }

                // Recognize a single page per call. The one-element list is built through the
                // IList indexer so this works for any IList implementation, not only List<Image>.
                List<Image> page = new List<Image>(1);
                page.Add(images[i]);

                string result = ocrEngine.RecognizeText(page, entity.Inputfilename, entity.Rect, worker, e);
                worker.ReportProgress(i, result); // i is the page index, not really a percentage
            }
        }
示例#3
0
        /// <summary>
        /// Performs OCR for bulk/batch and console operations.
        /// Creates the output directory if it does not exist, converts a PDF input to TIFF
        /// first, and writes the recognized text as UTF-8.
        /// </summary>
        /// <param name="imageFile">Image file (a path ending in ".pdf" is converted to TIFF before recognition)</param>
        /// <param name="outputFile">Output file without extension</param>
        /// <param name="langCode">language code</param>
        /// <param name="pageSegMode">page segmentation mode</param>
        /// <param name="outputFormat">format of output file. Possible values: <code>text</code>, <code>text+</code> (with post-corrections), <code>hocr</code></param>
        public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, string outputFormat)
        {
            IList<Image> imageList = null;

            try
            {
                DirectoryInfo dir = Directory.GetParent(outputFile);
                if (dir != null && !dir.Exists)
                {
                    dir.Create();
                }

                bool postprocess = "text+" == outputFormat;

                // "text+" is only a request for post-corrections; the engine and the
                // file extension both work with the format name without the '+'.
                string format = outputFormat.Replace("+", string.Empty);

                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PageSegMode = pageSegMode;
                ocrEngine.Language = langCode;
                ocrEngine.OutputFormat = format;

                // convert PDF to TIFF (ordinal, case-insensitive extension check)
                if (imageFile.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
                {
                    imageFile = PdfUtilities.ConvertPdf2Tiff(imageFile);
                }

                imageList = ImageIOHelper.GetImageList(new FileInfo(imageFile));
                string result = ocrEngine.RecognizeText(imageList, imageFile);

                // post-corrections for text+ output
                if (postprocess)
                {
                    // postprocess to correct common OCR errors
                    result = Processor.PostProcess(result, langCode);
                    // correct common errors caused by OCR
                    result = TextUtilities.CorrectOCRErrors(result);
                    // correct letter cases
                    result = TextUtilities.CorrectLetterCases(result);
                }

                // PDF output is not yet supported; every format is written as a text file.
                string filename = outputFile + "." + format.Replace("text", "txt").Replace("hocr", "html");
                using (StreamWriter sw = new StreamWriter(filename, false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            finally
            {
                // The images were loaded by this method; release them deterministically.
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }
示例#4
0
        /// <summary>
        /// Processes the next file name in the queue: runs OCR with the current language and
        /// page segmentation mode, applies the text post-corrections, and writes the result to
        /// a ".txt" file named after the image in the output folder. Progress and failures
        /// are posted to the status form's text box.
        /// </summary>
        private void AutoOCR()
        {
            FileInfo imageFile;

            try
            {
                imageFile = new FileInfo(queue.Dequeue());
            }
            catch
            {
                // Nothing usable could be dequeued; there is no work to do.
                return;
            }

            this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { imageFile.FullName });

            if (curLangCode == null)
            {
                this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { "    **  " + Properties.Resources.selectLanguage + "  **" });
                return;
            }

            IList<Image> imageList = ImageIOHelper.GetImageList(imageFile);

            if (imageList == null)
            {
                this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { "    **  " + Properties.Resources.Cannotprocess + imageFile.Name + "  **" });
                return;
            }

            try
            {
                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PSM = selectedPSM;
                string result = ocrEngine.RecognizeText(imageList, curLangCode);

                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, curLangCode);
                // correct common errors caused by OCR
                result = TextUtilities.CorrectOCRErrors(result);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(Path.Combine(outputFolder, imageFile.Name + ".txt"), false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            catch (Exception e)
            {
                // Sets the UI culture to the selected language so the resource message below is localized.
                Thread.CurrentThread.CurrentUICulture = new CultureInfo(selectedUILanguage);

                this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { "    **  " + Properties.Resources.Cannotprocess + imageFile.Name + "  **" });
                Console.WriteLine(e.StackTrace);
            }
            finally
            {
                // This method runs once per queued file; dispose the images it loaded
                // so they do not accumulate until finalization.
                foreach (Image image in imageList)
                {
                    if (image != null)
                    {
                        image.Dispose();
                    }
                }
            }
        }
示例#5
0
        /// <summary>
        /// Console entry point: <c>vietocr imagefile outputfile [-l langcode]</c>.
        /// Recognizes the image, applies the text post-corrections, and writes the result
        /// to <c>outputfile + ".txt"</c> as UTF-8. Errors are reported on the console.
        /// </summary>
        /// <param name="args">Command-line arguments; the language defaults to "eng" when only two are given.</param>
        void PerformOCR(string[] args)
        {
            IList<Image> imageList = null;

            try
            {
                // Check the length before touching args[0] so an empty command line prints
                // the usage text instead of an index-out-of-range error.
                if (args == null || args.Length == 0 || args[0] == "-?" || args[0] == "-help" || args.Length == 1 || args.Length == 3)
                {
                    Console.WriteLine("Usage: vietocr imagefile outputfile [-l langcode]");
                    return;
                }

                FileInfo imageFile = new FileInfo(args[0]);
                FileInfo outputFile = new FileInfo(args[1]);

                if (!imageFile.Exists)
                {
                    Console.WriteLine("Input file does not exist.");
                    return;
                }

                string curLangCode;

                if (args.Length == 2)
                {
                    curLangCode = "eng"; //default language
                }
                else
                {
                    curLangCode = args[3];
                }

                imageList = ImageIOHelper.GetImageList(imageFile);

                OCR<Image> ocrEngine = new OCRImages();
                string result = ocrEngine.RecognizeText(imageList, curLangCode);

                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, curLangCode);
                // correct common errors caused by OCR
                result = TextUtilities.CorrectOCRErrors(result);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(outputFile.FullName + ".txt", false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Error: " + e.Message);
            }
            finally
            {
                // Release the images loaded above.
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }
示例#6
0
        /// <summary>
        /// Performs OCR for bulk/batch and console operations.
        /// Creates the output directory if needed, configures the engine (including its
        /// output file) and lets it process the input. For <code>text+</code>, the
        /// <code>.txt</code> file at <paramref name="outputFile"/> is then read back,
        /// post-corrected and rewritten as UTF-8.
        /// </summary>
        /// <param name="imageFile">Image file (a path ending in ".pdf" is converted to TIFF before processing)</param>
        /// <param name="outputFile">Output file without extension</param>
        /// <param name="langCode">language code</param>
        /// <param name="pageSegMode">page segmentation mode</param>
        /// <param name="outputFormat">format of output file. Possible values: <code>text</code>, <code>text+</code> (with post-corrections), <code>hocr</code></param>
        public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, string outputFormat)
        {
            DirectoryInfo dir = Directory.GetParent(outputFile);

            if (dir != null && !dir.Exists)
            {
                dir.Create();
            }

            bool postprocess = "text+" == outputFormat;

            OCR<Image> ocrEngine = new OCRImages();

            ocrEngine.PageSegMode = pageSegMode;
            ocrEngine.Language = langCode;
            // The '+' only requests post-corrections; the engine gets the plain format name.
            ocrEngine.OutputFormat = outputFormat.Replace("+", string.Empty);
            ocrEngine.OutputFile = outputFile;

            // convert PDF to TIFF (ordinal, case-insensitive extension check)
            if (imageFile.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
            {
                imageFile = PdfUtilities.ConvertPdf2Tiff(imageFile);
            }

            ocrEngine.ProcessFile(imageFile);

            // post-corrections for text+ output
            if (postprocess)
            {
                // NOTE(review): presumably ProcessFile wrote this file from OutputFile - confirm.
                string filename = outputFile + ".txt";
                string result = File.ReadAllText(filename);
                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, langCode);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(filename, false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
        }
示例#7
0
        /// <summary>
        /// Rebuilds the picture box's segmented-region overlay from the checked menu items.
        /// Each iterator level (block, paragraph, text line, word, symbol) has its own color;
        /// a checked level is computed once and kept in the map, an unchecked level is removed.
        /// The overlay is cleared when the feature is off, no image is loaded, or the
        /// "actual size" button is enabled.
        /// </summary>
        protected void setSegmentedRegions()
        {
            if (!this.segmentedRegionsToolStripMenuItem.Checked || imageList == null || this.toolStripBtnActualSize.Enabled)
            {
                pictureBox1.SegmentedRegions = null;
                pictureBox1.Refresh();
                return;
            }

            OCR<Image> ocrEngine = new OCRImages();
            Dictionary<Color, List<Rectangle>> map = pictureBox1.SegmentedRegions;

            if (map == null)
            {
                map = new Dictionary<Color, List<Rectangle>>();
            }

            Bitmap image = (Bitmap)imageList[imageIndex];

            ToggleSegmentedRegionLayer(map, toolStripMenuItemBlock.Checked, Color.Gray, PageIteratorLevel.Block, ocrEngine, image);
            ToggleSegmentedRegionLayer(map, toolStripMenuItemPara.Checked, Color.Green, PageIteratorLevel.Para, ocrEngine, image);
            ToggleSegmentedRegionLayer(map, toolStripMenuItemTextLine.Checked, Color.Red, PageIteratorLevel.TextLine, ocrEngine, image);
            ToggleSegmentedRegionLayer(map, toolStripMenuItemWord.Checked, Color.Blue, PageIteratorLevel.Word, ocrEngine, image);
            ToggleSegmentedRegionLayer(map, toolStripMenuItemSymbol.Checked, Color.Magenta, PageIteratorLevel.Symbol, ocrEngine, image);

            pictureBox1.SegmentedRegions = map;
            pictureBox1.Refresh();
        }

        /// <summary>
        /// Adds the regions of one iterator level under <paramref name="color"/> when the level
        /// is shown and not yet present in the map; removes the entry when the level is hidden.
        /// </summary>
        private static void ToggleSegmentedRegionLayer(Dictionary<Color, List<Rectangle>> map, bool show, Color color, PageIteratorLevel level, OCR<Image> ocrEngine, Bitmap image)
        {
            if (!show)
            {
                map.Remove(color);
                return;
            }

            // Regions already in the map are reused; segmentation only runs for a missing level.
            if (!map.ContainsKey(color))
            {
                List<Rectangle> regions = ocrEngine.GetSegmentedRegions(image, level);
                map.Add(color, regions);
            }
        }
示例#8
0
        /// <summary>
        /// Console entry point: <c>vietocr imagefile outputfile [-l lang] [-psm pagesegmode]</c>.
        /// Recognizes the image, applies the text post-corrections, and writes the result
        /// to <c>outputfile + ".txt"</c> as UTF-8. Errors are reported on the console.
        /// </summary>
        /// <param name="args">Command-line arguments; language defaults to "eng" and page segmentation mode to "3".</param>
        void PerformOCR(string[] args)
        {
            IList<Image> imageList = null;

            try
            {
                // Check the length before touching args[0] so an empty command line prints
                // the usage text instead of an index-out-of-range error.
                if (args == null || args.Length == 0 || args[0] == "-?" || args[0] == "-help" || args.Length == 1 || args.Length == 3 || args.Length == 5)
                {
                    Console.WriteLine("Usage: vietocr imagefile outputfile [-l lang] [-psm pagesegmode]");
                    return;
                }

                FileInfo imageFile = new FileInfo(args[0]);
                FileInfo outputFile = new FileInfo(args[1]);

                if (!imageFile.Exists)
                {
                    Console.WriteLine("Input file does not exist.");
                    return;
                }

                string curLangCode = "eng"; //default language
                string psm = "3";           // or alternatively, "PSM_AUTO"; // 3 - Fully automatic page segmentation, but no OSD (default)

                if (args.Length == 4)
                {
                    if (args[2].Equals("-l"))
                    {
                        curLangCode = args[3];
                    }
                    else if (args[2].Equals("-psm"))
                    {
                        psm = args[3];
                    }
                }
                else if (args.Length == 6)
                {
                    // Accept the two options in either order. Anything other than
                    // "-psm ... -l ..." keeps the documented positional reading (-l first).
                    if (args[2].Equals("-psm") && args[4].Equals("-l"))
                    {
                        psm = args[3];
                        curLangCode = args[5];
                    }
                    else
                    {
                        curLangCode = args[3];
                        psm = args[5];
                    }

                    // As before, only the two-option form requires a numeric mode.
                    short parsedPsm;
                    if (!Int16.TryParse(psm, out parsedPsm))
                    {
                        Console.WriteLine("Invalid input value.");
                        return;
                    }
                }

                imageList = ImageIOHelper.GetImageList(imageFile);

                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PSM = psm;
                string result = ocrEngine.RecognizeText(imageList, curLangCode);

                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, curLangCode);
                // correct common errors caused by OCR
                result = TextUtilities.CorrectOCRErrors(result);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(outputFile.FullName + ".txt", false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Error: " + e.Message);
            }
            finally
            {
                // Release the images loaded above.
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }