Example #1
0
        /// <summary>
        /// Runs OCR over the pages of an image file and writes the result to a UTF-8 encoded file.
        /// </summary>
        /// <param name="imageFile">Path of the image file to recognize.</param>
        /// <param name="outputFile">Path of the file the result is written to; an existing file is overwritten.</param>
        /// <param name="langCode">Language code handed to the OCR engine and to the post-processor.</param>
        /// <param name="pageSegMode">Page segmentation mode assigned to the engine.</param>
        /// <param name="hocr">Assigned to the engine's Hocr property; when true the text post-corrections are skipped.</param>
        public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, bool hocr)
        {
            IList<Image> imageList = null;

            try
            {
                imageList = ImageIOHelper.GetImageList(new FileInfo(imageFile));
                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PageSegMode = pageSegMode;
                ocrEngine.Hocr = hocr;
                string result = ocrEngine.RecognizeText(imageList, langCode);

                // skip post-corrections if hocr output
                if (!hocr)
                {
                    // postprocess to correct common OCR errors
                    result = Processor.PostProcess(result, langCode);
                    // correct common errors caused by OCR
                    result = TextUtilities.CorrectOCRErrors(result);
                    // correct letter cases
                    result = TextUtilities.CorrectLetterCases(result);
                }

                using (StreamWriter sw = new StreamWriter(outputFile, false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            finally
            {
                // Nulling the local released nothing; dispose the pages so their
                // unmanaged image resources are freed deterministically.
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }
Example #2
0
        /// <summary>
        /// Performs OCR for bulk/batch and console operations.
        /// </summary>
        /// <remarks>
        /// The output directory is created when it does not exist. A PDF input is first converted
        /// to TIFF through <c>PdfUtilities.ConvertPdf2Tiff</c>. The result is written as UTF-8 to
        /// <paramref name="outputFile"/> plus an extension derived from the format
        /// (<c>text</c>/<c>text+</c> become <c>.txt</c>, <c>hocr</c> becomes <c>.html</c>).
        /// </remarks>
        /// <param name="imageFile">Image file</param>
        /// <param name="outputFile">Output file without extension</param>
        /// <param name="langCode">language code</param>
        /// <param name="pageSegMode">page segmentation mode</param>
        /// <param name="outputFormat">format of output file. Possible values: <code>text</code>, <code>text+</code> (with post-corrections), <code>hocr</code></param>
        public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, string outputFormat)
        {
            IList<Image> imageList = null;

            try
            {
                DirectoryInfo dir = Directory.GetParent(outputFile);
                if (dir != null && !dir.Exists)
                {
                    dir.Create();
                }

                bool postprocess = "text+" == outputFormat;

                // The engine format and the file extension both use the format name without the "+" marker
                string engineFormat = outputFormat.Replace("+", string.Empty);

                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PageSegMode = pageSegMode;
                ocrEngine.Language = langCode;
                ocrEngine.OutputFormat = engineFormat;

                // convert PDF to TIFF; the extension test is ordinal so it does not depend on the current culture
                if (imageFile.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
                {
                    imageFile = PdfUtilities.ConvertPdf2Tiff(imageFile);
                }

                imageList = ImageIOHelper.GetImageList(new FileInfo(imageFile));
                string result = ocrEngine.RecognizeText(imageList, imageFile);

                // post-corrections for text+ output
                if (postprocess)
                {
                    // postprocess to correct common OCR errors
                    result = Processor.PostProcess(result, langCode);
                    // correct common errors caused by OCR
                    result = TextUtilities.CorrectOCRErrors(result);
                    // correct letter cases
                    result = TextUtilities.CorrectLetterCases(result);
                }

                // NOTE: "pdf" output is not yet supported; only textual results are written.
                string filename = outputFile + "." + engineFormat.Replace("text", "txt").Replace("hocr", "html");
                using (StreamWriter sw = new StreamWriter(filename, false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            finally
            {
                // Nulling the local released nothing; dispose the pages so their
                // unmanaged image resources are freed deterministically.
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }
Example #3
0
        /// <summary>
        /// Console entry point: recognizes an image file, applies the text post-corrections and
        /// writes the result as UTF-8 to the output file name with ".txt" appended.
        /// </summary>
        /// <param name="args">Command-line arguments: imagefile outputfile [-l langcode].</param>
        void PerformOCR(string[] args)
        {
            IList<Image> imageList = null;

            try
            {
                // Check the argument count before reading args[0]; otherwise an empty command line
                // ends up in the catch block as an index error instead of showing the usage text.
                if (args == null || args.Length == 0 || args[0] == "-?" || args[0] == "-help" || args.Length == 1 || args.Length == 3)
                {
                    Console.WriteLine("Usage: vietocr imagefile outputfile [-l langcode]");
                    return;
                }
                FileInfo imageFile = new FileInfo(args[0]);
                FileInfo outputFile = new FileInfo(args[1]);

                if (!imageFile.Exists)
                {
                    Console.WriteLine("Input file does not exist.");
                    return;
                }

                string curLangCode;

                if (args.Length == 2)
                {
                    curLangCode = "eng"; //default language
                }
                else
                {
                    // value that follows the option flag at args[2]
                    curLangCode = args[3];
                }

                imageList = ImageIOHelper.GetImageList(imageFile);

                OCR<Image> ocrEngine = new OCRImages();
                string result = ocrEngine.RecognizeText(imageList, curLangCode);

                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, curLangCode);
                // correct common errors caused by OCR
                result = TextUtilities.CorrectOCRErrors(result);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(outputFile.FullName + ".txt", false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Error: " + e.Message);
            }
            finally
            {
                // Free the unmanaged image resources of the loaded pages
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }
Example #4
0
 public void RecognizeTextTest()
 {
     // Recognizes the bundled sample TIFF and checks that the output contains a known phrase.
     const string samplePath = "samples/vietsample1.tif";
     const string phrase = "Đôi Mắt Người Sơn Tây";

     // Load the pages and store them in the test's entity field
     IList<Image> loadedPages = ImageIOHelper.GetImageList(new FileInfo(samplePath));
     entity = new OCRImageEntity(loadedPages, samplePath, -1, Rectangle.Empty, lang);

     // The engine takes its language from the entity
     OCRImages engine = new OCRImages();
     engine.Language = entity.Language;

     // Recognition runs on the entity's cloned images
     string recognized = engine.RecognizeText(entity.ClonedImages, samplePath);
     Assert.IsTrue(recognized.Contains(phrase));
 }
Example #5
0
        public void RecognizeTextTest()
        {
            // Arrange: load the sample pages into the entity field and configure the engine from it
            string tiffPath = "samples/vietsample1.tif";
            IList<Image> pages = ImageIOHelper.GetImageList(new FileInfo(tiffPath));
            entity = new OCRImageEntity(pages, tiffPath, -1, Rectangle.Empty, lang);

            OCRImages recognizer = new OCRImages();
            recognizer.Language = entity.Language;
            IList<Image> clones = entity.ClonedImages;

            // Act: recognize the cloned pages
            string text = recognizer.RecognizeText(clones, tiffPath);

            // Assert: the known phrase appears in the recognized text
            bool found = text.Contains("Đôi Mắt Người Sơn Tây");
            Assert.IsTrue(found);
        }
Example #6
0
        /// <summary>
        /// Rebuilds the segmented-region overlay of <c>pictureBox1</c> from the checked region-level menu items.
        /// </summary>
        /// <remarks>
        /// The overlay is cleared when the segmented-regions menu item is unchecked, no image list is
        /// loaded, or the actual-size button is enabled. Otherwise each level (block, paragraph, text
        /// line, word, symbol) owns one colour key in the map: the key is filled on demand when the
        /// level is checked and removed when it is unchecked.
        /// </remarks>
        void setSegmentedRegions()
        {
            if (!this.segmentedRegionsToolStripMenuItem.Checked || imageList == null || this.toolStripBtnActualSize.Enabled)
            {
                pictureBox1.SegmentedRegions = null;
                pictureBox1.Refresh();
                return;
            }

            OCR<Image> ocrEngine = new OCRImages();

            // Reuse the regions already held by the picture box so computed levels are not recomputed
            Dictionary<Color, List<Rectangle>> map = pictureBox1.SegmentedRegions ?? new Dictionary<Color, List<Rectangle>>();

            Bitmap image = (Bitmap)imageList[imageIndex];

            syncRegionLayer(map, ocrEngine, image, toolStripMenuItemBlock.Checked, Color.Gray, PageIteratorLevel.Block);
            syncRegionLayer(map, ocrEngine, image, toolStripMenuItemPara.Checked, Color.Green, PageIteratorLevel.Para);
            syncRegionLayer(map, ocrEngine, image, toolStripMenuItemTextLine.Checked, Color.Red, PageIteratorLevel.TextLine);
            syncRegionLayer(map, ocrEngine, image, toolStripMenuItemWord.Checked, Color.Blue, PageIteratorLevel.Word);
            syncRegionLayer(map, ocrEngine, image, toolStripMenuItemSymbol.Checked, Color.Magenta, PageIteratorLevel.Symbol);

            pictureBox1.SegmentedRegions = map;
            pictureBox1.Refresh();
        }

        /// <summary>
        /// Adds or removes the regions of a single iterator level in the overlay map.
        /// </summary>
        /// <param name="map">Overlay map keyed by the colour of each level.</param>
        /// <param name="ocrEngine">Engine used to compute the regions when they are missing.</param>
        /// <param name="image">Image whose regions are computed.</param>
        /// <param name="enabled">Whether the level is currently checked in the menu.</param>
        /// <param name="color">Colour key that identifies the level in the map.</param>
        /// <param name="level">Page iterator level to segment at.</param>
        private static void syncRegionLayer(Dictionary<Color, List<Rectangle>> map, OCR<Image> ocrEngine, Bitmap image, bool enabled, Color color, PageIteratorLevel level)
        {
            if (!enabled)
            {
                map.Remove(color);
                return;
            }

            // Segment only when this level has no entry yet
            if (!map.ContainsKey(color))
            {
                map.Add(color, ocrEngine.GetSegmentedRegions(image, level));
            }
        }
Example #7
0
        /// <summary>
        /// DoWork handler of the OCR background worker: recognizes the entity's cloned images one
        /// at a time and reports each page's text through <c>ReportProgress</c>.
        /// </summary>
        /// <param name="sender">The BackgroundWorker that raised the event.</param>
        /// <param name="e">Event data; <c>Argument</c> carries the <c>OCRImageEntity</c> to process and
        /// <c>Cancel</c> is set when a pending cancellation is observed.</param>
        private void backgroundWorkerOcr_DoWork(object sender, DoWorkEventArgs e)
        {
            // Direct cast: an unexpected sender fails here instead of as a null dereference inside the loop.
            BackgroundWorker worker = (BackgroundWorker)sender;

            OCRImageEntity entity = (OCRImageEntity)e.Argument;
            OCR<Image> ocrEngine = new OCRImages();
            ocrEngine.PageSegMode = selectedPSM;
            ocrEngine.Language = entity.Language;

            IList<Image> images = entity.ClonedImages;

            for (int i = 0; i < images.Count; i++)
            {
                if (worker.CancellationPending)
                {
                    e.Cancel = true;
                    break;
                }

                // Wrap the current page in its own list; this works for any IList implementation,
                // whereas the previous downcast to List<Image> required that exact type.
                List<Image> singlePage = new List<Image>(1);
                singlePage.Add(images[i]);

                string result = ocrEngine.RecognizeText(singlePage, entity.Inputfilename, entity.Rect, worker, e);
                worker.ReportProgress(i, result); // i is the page index, not really a percentage
            }
        }
Example #8
0
        /// <summary>
        /// Takes the next file name from the queue, recognizes it and writes the corrected text as
        /// UTF-8 to "&lt;image name&gt;.txt" in the output folder, posting progress to the status form.
        /// </summary>
        private void AutoOCR()
        {
            FileInfo imageFile;
            try
            {
                imageFile = new FileInfo(queue.Dequeue());
            }
            catch
            {
                // Best effort: if no usable entry can be dequeued there is nothing to process.
                return;
            }

            this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { imageFile.FullName });

            if (curLangCode == null)
            {
                this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { "    **  " + Properties.Resources.selectLanguage + "  **" });
                //queue.Clear();
                return;
            }

            IList<Image> imageList = ImageIOHelper.GetImageList(imageFile);
            if (imageList == null)
            {
                this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { "    **  " + Properties.Resources.Cannotprocess + imageFile.Name + "  **" });
                return;
            }

            try
            {
                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PSM = selectedPSM;
                string result = ocrEngine.RecognizeText(imageList, curLangCode);

                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, curLangCode);
                // correct common errors caused by OCR
                result = TextUtilities.CorrectOCRErrors(result);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(Path.Combine(outputFolder, imageFile.Name + ".txt"), false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            catch (Exception e)
            {
                // Sets the UI culture to the selected language.
                Thread.CurrentThread.CurrentUICulture = new CultureInfo(selectedUILanguage);

                this.statusForm.TextBox.BeginInvoke(new UpdateStatusEvent(this.WorkerUpdate), new Object[] { "    **  " + Properties.Resources.Cannotprocess + imageFile.Name + "  **" });
                Console.WriteLine(e.StackTrace);
            }
            finally
            {
                // Free the unmanaged image resources of the processed pages; without this every
                // processed file leaks its images until the garbage collector finalizes them.
                foreach (Image image in imageList)
                {
                    if (image != null)
                    {
                        image.Dispose();
                    }
                }
            }
        }
Example #9
0
        /// <summary>
        /// Console entry point: recognizes an image file with an optional language and page
        /// segmentation mode, applies the text post-corrections and writes the result as UTF-8 to
        /// the output file name with ".txt" appended.
        /// </summary>
        /// <param name="args">Command-line arguments: imagefile outputfile [-l lang] [-psm pagesegmode].</param>
        void PerformOCR(string[] args)
        {
            IList<Image> imageList = null;

            try
            {
                // Check the argument count before reading args[0]; otherwise an empty command line
                // ends up in the catch block as an index error instead of showing the usage text.
                if (args == null || args.Length == 0 || args[0] == "-?" || args[0] == "-help" || args.Length == 1 || args.Length == 3 || args.Length == 5)
                {
                    Console.WriteLine("Usage: vietocr imagefile outputfile [-l lang] [-psm pagesegmode]");
                    return;
                }
                FileInfo imageFile = new FileInfo(args[0]);
                FileInfo outputFile = new FileInfo(args[1]);

                if (!imageFile.Exists)
                {
                    Console.WriteLine("Input file does not exist.");
                    return;
                }

                string curLangCode = "eng"; //default language
                string psm = "3"; // or alternatively, "PSM_AUTO"; // 3 - Fully automatic page segmentation, but no OSD (default)

                // Read the "flag value" pairs by flag name, so the options work in either order
                // instead of assuming "-l" always precedes "-psm".
                for (int i = 2; i + 1 < args.Length; i += 2)
                {
                    if (args[i].Equals("-l"))
                    {
                        curLangCode = args[i + 1];
                    }
                    else if (args[i].Equals("-psm"))
                    {
                        psm = args[i + 1];
                    }
                }

                // As before, the numeric check is applied only when both options are supplied.
                if (args.Length == 6)
                {
                    short parsedPsm;
                    if (!Int16.TryParse(psm, out parsedPsm))
                    {
                        Console.WriteLine("Invalid input value.");
                        return;
                    }
                }

                imageList = ImageIOHelper.GetImageList(imageFile);

                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PSM = psm;
                string result = ocrEngine.RecognizeText(imageList, curLangCode);

                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, curLangCode);
                // correct common errors caused by OCR
                result = TextUtilities.CorrectOCRErrors(result);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(outputFile.FullName + ".txt", false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Error: " + e.Message);
            }
            finally
            {
                // Free the unmanaged image resources of the loaded pages
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }
            }
        }