コード例 #1
0
ファイル: OCRHelper.cs プロジェクト: hoanganhmta/OCRSoft
        /// <summary>
        /// Performs OCR for bulk/batch and console operations.
        /// The input is loaded into a list of images, recognized, optionally post-corrected,
        /// and the resulting text is written to <paramref name="outputFile"/> plus an extension
        /// derived from <paramref name="outputFormat"/> (<c>text</c> -> <c>.txt</c>, <c>hocr</c> -> <c>.html</c>).
        /// </summary>
        /// <param name="imageFile">Image file; a file ending in <c>.pdf</c> (any letter case) is first converted to TIFF</param>
        /// <param name="outputFile">Output file without extension; its parent directory is created if missing</param>
        /// <param name="langCode">language code</param>
        /// <param name="pageSegMode">page segmentation mode</param>
        /// <param name="outputFormat">format of output file. Possible values: <code>text</code>, <code>text+</code> (with post-corrections), <code>hocr</code></param>
        public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, string outputFormat)
        {
            IList<Image> imageList = null;

            // Set only when a PDF input was converted, so the intermediate TIFF can be removed afterwards.
            string workingTiffFile = null;

            try
            {
                DirectoryInfo dir = Directory.GetParent(outputFile);
                if (dir != null && !dir.Exists)
                {
                    dir.Create();
                }

                // "text+" means plain text output followed by post-corrections.
                bool postprocess = "text+" == outputFormat;
                string engineFormat = outputFormat.Replace("+", string.Empty);

                OCR<Image> ocrEngine = new OCRImages();
                ocrEngine.PageSegMode  = pageSegMode;
                ocrEngine.Language     = langCode;
                ocrEngine.OutputFormat = engineFormat;

                // convert PDF to TIFF; ordinal comparison keeps the extension check culture-independent
                if (imageFile.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
                {
                    workingTiffFile = PdfUtilities.ConvertPdf2Tiff(imageFile);
                    imageFile       = workingTiffFile;
                }

                imageList = ImageIOHelper.GetImageList(new FileInfo(imageFile));
                string result = ocrEngine.RecognizeText(imageList, imageFile);

                // post-corrections for text+ output
                if (postprocess)
                {
                    // postprocess to correct common OCR errors
                    result = Processor.PostProcess(result, langCode);
                    // correct common errors caused by OCR
                    result = TextUtilities.CorrectOCRErrors(result);
                    // correct letter cases
                    result = TextUtilities.CorrectLetterCases(result);
                }

                // NOTE: "pdf" output is not yet supported; every format is written as UTF-8 text.
                string filename = outputFile + "." + engineFormat.Replace("text", "txt").Replace("hocr", "html");
                using (StreamWriter sw = new StreamWriter(filename, false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
            finally
            {
                // Image wraps native GDI+ handles; nulling the reference alone does not release them.
                if (imageList != null)
                {
                    foreach (Image image in imageList)
                    {
                        if (image != null)
                        {
                            image.Dispose();
                        }
                    }
                }

                // Remove the TIFF generated from the PDF so batch runs do not accumulate working files.
                if (workingTiffFile != null && File.Exists(workingTiffFile))
                {
                    File.Delete(workingTiffFile);
                }
            }
        }
コード例 #2
0
ファイル: GUIWithTools.cs プロジェクト: hoanganhmta/OCRSoft
        /// <summary>
        /// Background-worker handler that merges several PDF files into one.
        /// </summary>
        /// <param name="sender">Event source (not used)</param>
        /// <param name="e">
        /// Event data; <c>Argument</c> must be an <c>ArrayList</c> holding the input file names
        /// (<c>string[]</c>) at index 0 and the output file name (<c>string</c>) at index 1.
        /// <c>Result</c> is set to the output file name once the merge call returns.
        /// </param>
        private void backgroundWorkerMergePdf_DoWork(object sender, DoWorkEventArgs e)
        {
            // Unpack the positional arguments supplied by the caller.
            ArrayList workerArgs  = (ArrayList)e.Argument;
            string[]  sourcePdfs  = (string[])workerArgs[0];
            string    mergedPdf   = (string)workerArgs[1];

            PdfUtilities.MergePdf(sourcePdfs, mergedPdf);

            // Hand the merged file name back through the event result.
            e.Result = mergedPdf;
        }
コード例 #3
0
        /// <summary>
        /// Performs OCR for bulk/batch and console operations.
        /// The OCR engine writes the output file itself; for <c>text+</c> the produced
        /// <c>.txt</c> file is read back, post-corrected, and rewritten as UTF-8.
        /// </summary>
        /// <param name="imageFile">Image file; a file ending in <c>.pdf</c> (any letter case) is first converted to TIFF</param>
        /// <param name="outputFile">Output file without extension; its parent directory is created if missing</param>
        /// <param name="langCode">language code</param>
        /// <param name="pageSegMode">page segmentation mode</param>
        /// <param name="outputFormat">format of output file. Possible values: <code>text</code>, <code>text+</code> (with post-corrections), <code>hocr</code></param>
        public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, string outputFormat)
        {
            DirectoryInfo dir = Directory.GetParent(outputFile);

            if (dir != null && !dir.Exists)
            {
                dir.Create();
            }

            // "text+" means plain text output followed by post-corrections.
            bool postprocess = "text+" == outputFormat;

            OCR<Image> ocrEngine = new OCRImages();

            ocrEngine.PageSegMode  = pageSegMode;
            ocrEngine.Language     = langCode;
            ocrEngine.OutputFormat = outputFormat.Replace("+", string.Empty);
            ocrEngine.OutputFile   = outputFile;

            // Set only when a PDF input was converted, so the intermediate TIFF can be removed afterwards.
            string workingTiffFile = null;

            try
            {
                // convert PDF to TIFF; ordinal comparison keeps the extension check culture-independent
                if (imageFile.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
                {
                    workingTiffFile = PdfUtilities.ConvertPdf2Tiff(imageFile);
                    imageFile       = workingTiffFile;
                }

                ocrEngine.ProcessFile(imageFile);
            }
            finally
            {
                // Remove the TIFF generated from the PDF so batch runs do not accumulate working files.
                if (workingTiffFile != null && File.Exists(workingTiffFile))
                {
                    File.Delete(workingTiffFile);
                }
            }

            // post-corrections for text+ output
            if (postprocess)
            {
                string filename = outputFile + ".txt";
                string result   = File.ReadAllText(filename);
                // postprocess to correct common OCR errors
                result = Processor.PostProcess(result, langCode);
                // correct letter cases
                result = TextUtilities.CorrectLetterCases(result);

                using (StreamWriter sw = new StreamWriter(filename, false, new System.Text.UTF8Encoding()))
                {
                    sw.Write(result);
                }
            }
        }
コード例 #4
0
ファイル: GUIWithTools.cs プロジェクト: hoanganhmta/OCRSoft
        /// <summary>
        /// Background-worker handler that splits a PDF file.
        /// When <c>args.Pages</c> is set, a single page range (<c>FromPage</c>..<c>ToPage</c>) is extracted
        /// into <c>OutputFilename</c>. Otherwise the document is cut into consecutive chunks of
        /// <c>NumOfPages</c> pages, each written to "&lt;output name without .pdf&gt;&lt;first page&gt;.pdf".
        /// </summary>
        /// <param name="sender">Event source (not used)</param>
        /// <param name="e">
        /// Event data; <c>Argument</c> must be a <c>SplitPdfArgs</c>. <c>Result</c> is set to
        /// <c>args.OutputFilename</c> when splitting completes.
        /// </param>
        /// <exception cref="ApplicationException">
        /// The page count of the input is reported as zero, or <c>NumOfPages</c> is not greater than zero.
        /// </exception>
        private void backgroundWorkerSplitPdf_DoWork(object sender, DoWorkEventArgs e)
        {
            SplitPdfArgs args = (SplitPdfArgs)e.Argument;

            if (args.Pages)
            {
                PdfUtilities.SplitPdf(args.InputFilename, args.OutputFilename, args.FromPage, args.ToPage);
            }
            else
            {
                const string pdfExtension = ".pdf";

                // Use the output name as the chunk prefix; strip a trailing ".pdf" (any letter case).
                // Falling back to the full name avoids writing "1.pdf", "2.pdf", ... into the
                // working directory when the name has no .pdf extension.
                string outputFilename = args.OutputFilename;

                if (outputFilename.EndsWith(pdfExtension, StringComparison.OrdinalIgnoreCase))
                {
                    outputFilename = outputFilename.Substring(0, outputFilename.Length - pdfExtension.Length);
                }

                int pageCount = PdfUtilities.GetPdfPageCount(args.InputFilename);
                if (pageCount == 0)
                {
                    throw new ApplicationException("Split PDF failed.");
                }

                int pageRange = Int32.Parse(args.NumOfPages);
                if (pageRange <= 0)
                {
                    // A non-positive chunk size would never advance startPage and loop forever.
                    throw new ApplicationException("Split PDF failed. The number of pages per file must be greater than zero.");
                }

                int startPage = 1;

                while (startPage <= pageCount)
                {
                    // Clamp the last chunk so the requested range never runs past the final page.
                    int    endPage    = Math.Min(startPage + pageRange - 1, pageCount);
                    string outputFile = outputFilename + startPage + ".pdf";
                    PdfUtilities.SplitPdf(args.InputFilename, outputFile, startPage.ToString(), endPage.ToString());
                    startPage = endPage + 1;
                }
            }

            e.Result = args.OutputFilename;
        }
コード例 #5
0
        /// <summary>
        /// Get image(s) from file. Every frame (animated GIF) or page (e.g. multi-page TIFF) is
        /// re-encoded as PNG in memory and returned as a separate image. A PDF input is first
        /// converted to a working TIFF, which is deleted before returning.
        /// </summary>
        /// <param name="imageFile">file name</param>
        /// <returns>list of images; the caller owns them and should dispose them when done</returns>
        /// <exception cref="ApplicationException">
        /// Wraps an <see cref="OutOfMemoryException"/> or a
        /// <see cref="System.Runtime.InteropServices.ExternalException"/> raised while reading or re-encoding the image.
        /// </exception>
        public static IList <Image> GetImageList(FileInfo imageFile)
        {
            string workingTiffFileName = null;

            Image image = null;

            IList<Image> images = new List<Image>();
            bool succeeded = false;

            try
            {
                // convert PDF to TIFF; ordinal comparison keeps the extension check culture-independent
                if (imageFile.Name.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                {
                    workingTiffFileName = PdfUtilities.ConvertPdf2Tiff(imageFile.FullName);
                    imageFile           = new FileInfo(workingTiffFileName);
                }

                // read in the image
                image = Image.FromFile(imageFile.FullName);

                // GIF frames live in the Time dimension, everything else in Page. The same
                // dimension must be used for both counting and selecting frames.
                FrameDimension dimension = image.RawFormat.Equals(ImageFormat.Gif)
                    ? FrameDimension.Time
                    : FrameDimension.Page;

                int count = image.GetFrameCount(dimension);

                for (int i = 0; i < count; i++)
                {
                    // Save each frame to a byte stream and create a new Image from it.
                    // The stream is intentionally NOT disposed: GDI+ requires the stream to stay
                    // open for the lifetime of an Image created by Image.FromStream. MemoryStream
                    // holds no unmanaged resources, so it is reclaimed together with the image.
                    MemoryStream byteStream = new MemoryStream();
                    image.SelectActiveFrame(dimension, i);
                    image.Save(byteStream, ImageFormat.Png);

                    images.Add(Image.FromStream(byteStream));
                }

                succeeded = true;
                return(images);
            }
            catch (OutOfMemoryException e)
            {
                throw new ApplicationException(e.Message, e);
            }
            catch (System.Runtime.InteropServices.ExternalException e)
            {
                throw new ApplicationException(e.Message + "\nIt might have run out of memory due to handling too many images or too large a file.", e);
            }
            finally
            {
                // On failure the partial list never reaches the caller, so release its frames here.
                if (!succeeded)
                {
                    foreach (Image frame in images)
                    {
                        frame.Dispose();
                    }
                }

                if (image != null)
                {
                    image.Dispose();
                }

                if (workingTiffFileName != null && File.Exists(workingTiffFileName))
                {
                    File.Delete(workingTiffFileName);
                }
            }
        }