/// <summary>
/// Performs OCR for bulk/batch and console operations.
/// </summary>
/// <remarks>
/// The parent directory of <paramref name="outputFile"/> is created when it does not exist.
/// A PDF input is first converted to TIFF; that intermediate TIFF and all images loaded
/// for recognition are released before the method returns, whether or not OCR succeeds.
/// </remarks>
/// <param name="imageFile">Image file</param>
/// <param name="outputFile">Output file without extension</param>
/// <param name="langCode">language code</param>
/// <param name="pageSegMode">page segmentation mode</param>
/// <param name="outputFormat">format of output file. Possible values: <code>text</code>, <code>text+</code> (with post-corrections), <code>hocr</code></param>
public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, string outputFormat)
{
    IList<Image> imageList = null;
    string workingTiffFile = null;

    try
    {
        DirectoryInfo dir = Directory.GetParent(outputFile);
        if (dir != null && !dir.Exists)
        {
            dir.Create();
        }

        bool postprocess = "text+" == outputFormat;

        OCR<Image> ocrEngine = new OCRImages();
        ocrEngine.PageSegMode = pageSegMode;
        ocrEngine.Language = langCode;

        // "text+" is only a request for post-corrections; the engine itself sees "text".
        string engineFormat = outputFormat.Replace("+", string.Empty);
        ocrEngine.OutputFormat = engineFormat;

        // convert PDF to TIFF (extension check is ordinal, independent of the current culture)
        if (imageFile.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
        {
            workingTiffFile = PdfUtilities.ConvertPdf2Tiff(imageFile);
            imageFile = workingTiffFile;
        }

        imageList = ImageIOHelper.GetImageList(new FileInfo(imageFile));
        string result = ocrEngine.RecognizeText(imageList, imageFile);

        // post-corrections for text+ output
        if (postprocess)
        {
            // postprocess to correct common OCR errors
            result = Processor.PostProcess(result, langCode);
            // correct common errors caused by OCR
            result = TextUtilities.CorrectOCRErrors(result);
            // correct letter cases
            result = TextUtilities.CorrectLetterCases(result);
        }

        // NOTE: "pdf" output is not yet supported; only text-based formats are written.
        // Map the format name to its file extension: text -> txt, hocr -> html.
        string filename = outputFile + "." + engineFormat.Replace("text", "txt").Replace("hocr", "html");
        using (StreamWriter sw = new StreamWriter(filename, false, new System.Text.UTF8Encoding()))
        {
            sw.Write(result);
        }
    }
    finally
    {
        // Images wrap native GDI+ resources; clearing the reference alone does not free them.
        if (imageList != null)
        {
            foreach (Image img in imageList)
            {
                if (img != null)
                {
                    img.Dispose();
                }
            }
        }

        // The TIFF produced from a PDF is an intermediate working file; remove it once done.
        if (workingTiffFile != null && File.Exists(workingTiffFile))
        {
            File.Delete(workingTiffFile);
        }
    }
}
/// <summary>
/// DoWork handler for the merge-PDF background worker: unpacks the work item and
/// hands the input files and output file name to <c>PdfUtilities.MergePdf</c>.
/// </summary>
/// <param name="sender">event source (not used)</param>
/// <param name="e">
/// <c>Argument</c> must be an <c>ArrayList</c> whose element 0 is the <c>string[]</c> of input
/// files and element 1 is the output file name; <c>Result</c> is set to that output file name.
/// </param>
private void backgroundWorkerMergePdf_DoWork(object sender, DoWorkEventArgs e)
{
    ArrayList workItem = (ArrayList)e.Argument;

    // Positional payload: [0] = files to merge, [1] = destination file.
    string[] sourcePdfs = (string[])workItem[0];
    string mergedPdf = (string)workItem[1];

    PdfUtilities.MergePdf(sourcePdfs, mergedPdf);

    // Report the destination back to the completion handler.
    e.Result = mergedPdf;
}
/// <summary>
/// Performs OCR for bulk/batch and console operations.
/// </summary>
/// <remarks>
/// The parent directory of <paramref name="outputFile"/> is created when it does not exist.
/// A PDF input is first converted to TIFF; that intermediate TIFF is deleted once the engine
/// has processed it, whether or not processing succeeds. For <code>text+</code>, the
/// <code>.txt</code> file written by the engine is read back, corrected and overwritten.
/// </remarks>
/// <param name="imageFile">Image file</param>
/// <param name="outputFile">Output file without extension</param>
/// <param name="langCode">language code</param>
/// <param name="pageSegMode">page segmentation mode</param>
/// <param name="outputFormat">format of output file. Possible values: <code>text</code>, <code>text+</code> (with post-corrections), <code>hocr</code></param>
public static void PerformOCR(string imageFile, string outputFile, string langCode, string pageSegMode, string outputFormat)
{
    DirectoryInfo dir = Directory.GetParent(outputFile);
    if (dir != null && !dir.Exists)
    {
        dir.Create();
    }

    bool postprocess = "text+" == outputFormat;

    OCR<Image> ocrEngine = new OCRImages();
    ocrEngine.PageSegMode = pageSegMode;
    ocrEngine.Language = langCode;
    // "text+" is only a request for post-corrections; the engine itself sees "text".
    ocrEngine.OutputFormat = outputFormat.Replace("+", string.Empty);
    ocrEngine.OutputFile = outputFile;

    string workingTiffFile = null;
    try
    {
        // convert PDF to TIFF (extension check is ordinal, independent of the current culture)
        if (imageFile.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase))
        {
            workingTiffFile = PdfUtilities.ConvertPdf2Tiff(imageFile);
            imageFile = workingTiffFile;
        }

        ocrEngine.ProcessFile(imageFile);
    }
    finally
    {
        // The TIFF produced from a PDF is an intermediate working file; do not leave it behind.
        if (workingTiffFile != null && File.Exists(workingTiffFile))
        {
            File.Delete(workingTiffFile);
        }
    }

    // post-corrections for text+ output
    if (postprocess)
    {
        string filename = outputFile + ".txt";
        string result = File.ReadAllText(filename);

        // postprocess to correct common OCR errors
        result = Processor.PostProcess(result, langCode);
        // correct letter cases
        result = TextUtilities.CorrectLetterCases(result);

        // Overwrite the engine output with the corrected text (UTF-8, no byte-order mark).
        File.WriteAllText(filename, result, new System.Text.UTF8Encoding());
    }
}
/// <summary>
/// DoWork handler for the split-PDF background worker. Either extracts a single page range
/// or cuts the input into consecutive parts of a fixed number of pages.
/// </summary>
/// <param name="sender">event source (not used)</param>
/// <param name="e">
/// <c>Argument</c> must be a <c>SplitPdfArgs</c>. When <c>Pages</c> is set, pages
/// <c>FromPage</c>..<c>ToPage</c> are written to <c>OutputFilename</c>; otherwise the input is
/// split into parts of <c>NumOfPages</c> pages, named after the output file with the part's
/// first page number appended. <c>Result</c> is set to <c>OutputFilename</c>.
/// </param>
/// <exception cref="ApplicationException">
/// The page count of the input is reported as zero, or the number of pages per part is less than one.
/// </exception>
private void backgroundWorkerSplitPdf_DoWork(object sender, DoWorkEventArgs e)
{
    SplitPdfArgs args = (SplitPdfArgs)e.Argument;

    if (args.Pages)
    {
        PdfUtilities.SplitPdf(args.InputFilename, args.OutputFilename, args.FromPage, args.ToPage);
    }
    else
    {
        const string pdfExtension = ".pdf";

        // Base name for the parts. Start from the full output name so that a name without a
        // (lower-case) ".pdf" suffix no longer degenerates to an empty prefix, which used to
        // drop the parts into the current directory as "1.pdf", "2.pdf", ...
        string outputBase = args.OutputFilename;
        if (outputBase.EndsWith(pdfExtension, StringComparison.OrdinalIgnoreCase))
        {
            outputBase = outputBase.Substring(0, outputBase.Length - pdfExtension.Length);
        }

        int pageCount = PdfUtilities.GetPdfPageCount(args.InputFilename);
        if (pageCount == 0)
        {
            throw new ApplicationException("Split PDF failed.");
        }

        int pageRange = Int32.Parse(args.NumOfPages);
        if (pageRange < 1)
        {
            // A zero or negative step would never advance startPage and loop forever.
            throw new ApplicationException("Split PDF failed: number of pages per file must be at least 1.");
        }

        int startPage = 1;
        while (startPage <= pageCount)
        {
            // Clamp the final part so it never asks for pages past the end of the document.
            int endPage = Math.Min(startPage + pageRange - 1, pageCount);
            string outputFile = outputBase + startPage + ".pdf";

            PdfUtilities.SplitPdf(args.InputFilename, outputFile, startPage.ToString(), endPage.ToString());

            startPage = endPage + 1;
        }
    }

    e.Result = args.OutputFilename;
}
/// <summary>
/// Get image(s) from file
/// </summary>
/// <remarks>
/// A PDF input is first converted to a working TIFF, which is deleted before returning.
/// Every frame of the source (GIF time frames, or pages otherwise) is re-encoded as PNG and
/// returned as an independent image. The caller owns the returned images and should dispose them.
/// </remarks>
/// <param name="imageFile">file name</param>
/// <returns>list of images</returns>
/// <exception cref="ApplicationException">
/// Wraps an <c>OutOfMemoryException</c> or GDI+ <c>ExternalException</c> raised while loading
/// or converting the image.
/// </exception>
public static IList<Image> GetImageList(FileInfo imageFile)
{
    string workingTiffFileName = null;
    Image image = null;
    IList<Image> images = new List<Image>();
    bool succeeded = false;

    try
    {
        // convert PDF to TIFF (extension check is ordinal, independent of the current culture)
        if (imageFile.Name.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
        {
            workingTiffFileName = PdfUtilities.ConvertPdf2Tiff(imageFile.FullName);
            imageFile = new FileInfo(workingTiffFileName);
        }

        // read in the image
        image = Image.FromFile(imageFile.FullName);

        // GIF frames live in the time dimension, other formats in the page dimension.
        // The same dimension must be used for counting and for selecting frames.
        FrameDimension dimension = image.RawFormat.Equals(ImageFormat.Gif)
            ? FrameDimension.Time
            : FrameDimension.Page;
        int count = image.GetFrameCount(dimension);

        for (int i = 0; i < count; i++)
        {
            image.SelectActiveFrame(dimension, i);

            // Save the frame to a byte stream and create a new Image from it.
            // The stream is deliberately NOT disposed here: Image.FromStream requires the
            // stream to stay open for the lifetime of the image. A MemoryStream holds no
            // unmanaged resources, so it is simply collected together with the image.
            MemoryStream byteStream = new MemoryStream();
            image.Save(byteStream, ImageFormat.Png);
            images.Add(Image.FromStream(byteStream));
        }

        succeeded = true;
        return images;
    }
    catch (OutOfMemoryException e)
    {
        throw new ApplicationException(e.Message, e);
    }
    catch (System.Runtime.InteropServices.ExternalException e)
    {
        throw new ApplicationException(e.Message + "\nIt might have run out of memory due to handling too many images or too large a file.", e);
    }
    finally
    {
        if (!succeeded)
        {
            // Nobody will receive the partial list; release the frames created so far.
            foreach (Image frame in images)
            {
                frame.Dispose();
            }
        }

        if (image != null)
        {
            image.Dispose();
        }

        if (workingTiffFileName != null && File.Exists(workingTiffFileName))
        {
            File.Delete(workingTiffFileName);
        }
    }
}