/// <summary>
/// Extracts all text from the given file using the extractor's automatic OCR mode.
/// </summary>
/// <param name="fileInfo">File to read; its full path is passed to the extractor.</param>
/// <returns>The text returned by the extractor for the whole document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fileInfo"/> is null.</exception>
private static string _GetImageContet(FileInfo fileInfo)
{
    // Fail with a descriptive exception instead of a NullReferenceException further down.
    if (fileInfo == null)
    {
        throw new ArgumentNullException("fileInfo");
    }

    using (TextExtractor extractor = new TextExtractor())
    {
        // Load document
        extractor.LoadDocumentFromFile(fileInfo.FullName);

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not).
        // NOTE: an earlier assignment of OCRMode.TextFromImagesAndVectorsAndRepairedFonts was
        // immediately overwritten by this one and therefore never took effect; it has been removed.
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
        // according to files in "ocrdata" folder.
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // Read all text
        return extractor.GetText();
    }
}
/// <summary>
/// Click handler: for every selected region in the viewer, extracts the text of that region
/// from the viewer's current page and shows the labelled results in a message box.
/// </summary>
private void btnGetData_Click(object sender, EventArgs e)
{
    RectangleF[] regions = pdfViewerControl1.SelectionInPoints;
    StringBuilder report = new StringBuilder();

    using (TextExtractor regionReader = new TextExtractor())
    {
        regionReader.RegistrationName = "demo";
        regionReader.RegistrationKey = "demo";

        // Open the same file the viewer is displaying.
        regionReader.LoadDocumentFromFile(pdfViewerControl1.InputFile);

        // OCR configuration
        regionReader.OCRMode = OCRMode.Auto;
        regionReader.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";
        regionReader.OCRResolution = 300;

        // Regions and labels are matched by position: region N is reported under _dataLabels[N].
        int labelIndex = 0;
        foreach (RectangleF region in regions)
        {
            regionReader.SetExtractionArea(region);
            report.AppendLine(_dataLabels[labelIndex]);

            // Start and end page are both the current page, so only that page is read.
            var currentPage = pdfViewerControl1.CurrentPageIndex;
            report.AppendLine(regionReader.GetText(currentPage, currentPage));
            report.AppendLine();

            labelIndex++;
        }
    }

    MessageBox.Show(report.ToString());
}
/// <summary>
/// Runs text extraction over a document with the OCR mode chosen by the caller.
/// </summary>
/// <param name="inputDocument">Path of the document to load into the extractor.</param>
/// <param name="ocrMode">OCR mode assigned to the extractor before the text is read.</param>
/// <returns>The text returned by the extractor for the whole document.</returns>
private static string _ExtractTextWithSpecificOCRMode(string inputDocument, OCRMode ocrMode)
{
    // Folder holding the OCR language data files.
    const string languageDataDirectory = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

    // Language code: "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
    // according to files in "ocrdata" folder.
    // Find more language files at https://github.com/bytescout/ocrdata
    const string languageCode = "eng";

    using (TextExtractor ocrReader = new TextExtractor("demo", "demo"))
    {
        ocrReader.LoadDocumentFromFile(inputDocument);

        // Apply the caller's mode, then point the extractor at the language data.
        ocrReader.OCRMode = ocrMode;
        ocrReader.OCRLanguageDataFolder = languageDataDirectory;
        ocrReader.OCRLanguage = languageCode;

        string extracted = ocrReader.GetText();
        return extracted;
    }
}
/// <summary>
/// Page load handler: runs OCR-enabled text extraction on the sample PDF and writes the
/// HTML-encoded result to the response inside a &lt;pre&gt; element, then ends the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    String inputFile = Server.MapPath(@".\bin\sample_ocr.pdf");

    // Location of language files
    String ocrLanguageDataFolder = Server.MapPath(@".\tessdata");

    // Create Bytescout.PDFExtractor.TextExtractor instance
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of "tessdata" folder containing language data file
        extractor.OCRLanguageDataFolder = ocrLanguageDataFolder;

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
        // according to files in /tessdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Load PDF document
        extractor.LoadDocumentFromFile(inputFile);

        // Write extracted text to output stream
        Response.Clear();
        Response.ContentType = "text/html";
        Response.Write("<pre>");

        // The response is served as text/html, so the extracted text must be HTML-encoded:
        // otherwise any '<', '>' or '&' in the document would be interpreted as markup.
        Response.Write(Server.HtmlEncode(extractor.GetText()));

        Response.Write("</pre>");
        Response.End();
    }
}
/// <summary>
/// Click handler: puts the text of the first page into <c>textBox1</c>.
/// </summary>
private void Button_Extract(object sender, RoutedEventArgs e)
{
    // Nothing to do while _pdfFile is null.
    if (_pdfFile == null)
    {
        return;
    }

    // Page range 0..0: extract from the first page only (for demonstration purposes).
    textBox1.Text = extractor.GetText(0, 0);
}
/// <summary>
/// Reads the full text of a PDF file.
/// </summary>
/// <param name="fileInfo">File to read; its full path is passed to the extractor.</param>
/// <returns>The text returned by the extractor for the whole document.</returns>
private static string _GetPdfFileContent(FileInfo fileInfo)
{
    using (TextExtractor pdfReader = new TextExtractor("demo", "demo"))
    {
        // Open the document by its absolute path.
        string documentPath = fileInfo.FullName;
        pdfReader.LoadDocumentFromFile(documentPath);

        string content = pdfReader.GetText();
        return content;
    }
}
/// <summary>
/// Console entry point: extracts the text of "sample.png" with OCR in automatic mode,
/// reporting progress through <c>Extractor_ProgressChanged</c>, and prints the result.
/// Any exception is reported by printing its message.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    try
    {
        using (TextExtractor extractor = new TextExtractor())
        {
            // Load document
            extractor.LoadDocumentFromFile("sample.png");

            // Extractor Progress event
            Console.WriteLine("Text Extraction in progress: \n");
            extractor.ProgressChanged += Extractor_ProgressChanged;

            // Enable Optical Character Recognition (OCR)
            // in .Auto mode (SDK automatically checks if needs to use OCR or not).
            // NOTE: an earlier assignment of OCRMode.TextFromImagesAndVectorsAndRepairedFonts was
            // immediately overwritten by this one and therefore never took effect; it has been removed.
            extractor.OCRMode = OCRMode.Auto;

            // Set the location of OCR language data files
            extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

            // Set OCR language:
            // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder.
            // Find more language files at https://github.com/bytescout/ocrdata
            extractor.OCRLanguage = "eng";

            // Set PDF document rendering resolution
            extractor.OCRResolution = 300;

            // Read all text
            var allExtractedText = extractor.GetText();

            Console.WriteLine("\n\nExtracted Text:\n\n{0}", allExtractedText);
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }

    Console.WriteLine("Press enter key to exit...");
    Console.ReadLine();
}
/// <summary>
/// Reports whether OCR is recommended for the first page of a document, enables OCR on the
/// extractor when it is, and prints the extracted text of the whole document.
/// </summary>
/// <param name="filePath">Path of the document to inspect.</param>
private static void _CheckOCRRequired(string filePath)
{
    using (TextExtractor ocrProbe = new TextExtractor())
    {
        ocrProbe.RegistrationKey = "demo";
        ocrProbe.RegistrationName = "demo";

        ocrProbe.LoadDocumentFromFile(filePath);

        Console.WriteLine("\n*******************\n\nFilePath: {0}", filePath);

        // Only the first page (index 0) is consulted for the recommendation.
        const int firstPage = 0;
        bool needsOcr = ocrProbe.IsOCRRecommendedForPage(firstPage);

        if (!needsOcr)
        {
            Console.WriteLine("\nOCR Recommended: False");
        }
        else
        {
            Console.WriteLine("\nOCR Recommended: True");

            // Turn on OCR in .Auto mode (SDK automatically checks if needs to use OCR or not).
            ocrProbe.OCRMode = OCRMode.Auto;

            // Where the language data files live.
            ocrProbe.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

            // Language code: "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder.
            // Find more language files at https://github.com/bytescout/ocrdata
            ocrProbe.OCRLanguage = "eng";

            // Rendering resolution used for OCR.
            ocrProbe.OCRResolution = 300;
        }

        // The text is read and printed in both cases.
        var documentText = ocrProbe.GetText();
        Console.WriteLine("\nExtracted Text:\n{0}\n\n", documentText);
    }
}
/// <summary>
/// Console entry point: extracts the text of "sample.pdf" using the font-repairing OCR mode
/// and prints it. Any exception is reported by printing its message.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    try
    {
        using (TextExtractor pdfReader = new TextExtractor())
        {
            pdfReader.LoadDocumentFromFile("sample.pdf");

            // Font-repairing OCR mode.
            pdfReader.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts;

            // Where the OCR language data files live.
            pdfReader.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

            // Language code: "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder.
            // Find more language files at https://github.com/bytescout/ocrdata
            pdfReader.OCRLanguage = "eng";

            // Rendering resolution used for OCR.
            pdfReader.OCRResolution = 300;

            string documentText = pdfReader.GetText();
            Console.WriteLine("Extracted Text: \n\n" + documentText);
        }
    }
    catch (Exception failure)
    {
        Console.WriteLine(failure.Message);
    }

    // Keep the console window open until Enter is pressed.
    Console.ReadLine();
}
/// <summary>
/// Console entry point: turns a PDF containing Hindi text and an image into a searchable PDF.
/// Steps: (1) extract the text of the source PDF, (2) extract its first image, (3) build a new
/// PDF with that image plus the extracted text drawn with a fully transparent brush, (4) run
/// SearchablePDFMaker over the new PDF with the "hin" OCR language, (5) open the result.
/// Any exception is reported as "ERROR:" followed by its message.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    try
    {
        // Files
        string fileName = "hindi_text_with_image.pdf";
        string destFileName = "output_hindi_text_with_image.pdf";
        string destFileNameSearchable = "output_hindi_text_with_image_searchable.pdf";

        // Read all text from pdf file
        string allTextExtracted;
        using (TextExtractor textExtractor = new TextExtractor())
        {
            // Load PDF document
            textExtractor.LoadDocumentFromFile(fileName);

            // Read all text directly
            allTextExtracted = textExtractor.GetText();
        }

        // Get image from pdf file. The stream is kept open (and disposed afterwards) for the whole
        // lifetime of the System.Drawing.Image created from it.
        using (MemoryStream memoryStream = new MemoryStream())
        {
            bool imageFound;
            using (ImageExtractor imageExtractor = new ImageExtractor())
            {
                // Load PDF document
                imageExtractor.LoadDocumentFromFile(fileName);

                imageFound = imageExtractor.GetFirstImage();
                if (imageFound)
                {
                    imageExtractor.SaveCurrentImageToStream(memoryStream, ImageFormat.Png);
                }
            }

            // Without an image the stream is empty and decoding it would fail with an unhelpful
            // error; report the real cause instead.
            if (!imageFound)
            {
                throw new InvalidOperationException("No image found in '" + fileName + "'.");
            }

            // Rewind so decoding starts at the first byte regardless of where the write left the position.
            memoryStream.Position = 0;

            // Load image from stream to System.Drawing.Image object (we need it to get the image resolution)
            using (System.Drawing.Image sysImage = System.Drawing.Image.FromStream(memoryStream))
            {
                // Compute image size in PDF units (Points)
                float widthInPoints = sysImage.Width / sysImage.HorizontalResolution * 72f;
                float heightInPoints = sysImage.Height / sysImage.VerticalResolution * 72f;

                // Create new PDF document
                using (Document outPdfDocument = new Document())
                {
                    outPdfDocument.RegistrationName = "demo";
                    outPdfDocument.RegistrationKey = "demo";

                    // Create page of computed size
                    Page page = new Page(widthInPoints, heightInPoints);

                    // Add page to the document
                    outPdfDocument.Pages.Add(page);

                    Canvas canvas = page.Canvas;

                    // Create Bytescout.PDF.Image object from loaded image
                    Image pdfImage = new Image(sysImage);

                    // Draw the image.
                    // (sysImage is released by its enclosing using block; no explicit Dispose is needed.)
                    canvas.DrawImage(pdfImage, 0, 0, widthInPoints, heightInPoints);

                    // Create brush
                    SolidBrush transparentBrush = new SolidBrush(new ColorGray(0));

                    // ... and make it transparent
                    transparentBrush.Opacity = 0;

                    // Draw text with transparent brush.
                    // Need to set Font which supports hindi characters.
                    Font font16 = new Font("Arial Unicode MS", 16);
                    canvas.DrawString(allTextExtracted, font16, transparentBrush, 40, 40);

                    // Save document to file
                    outPdfDocument.Save(destFileName);
                }
            }
        }

        // Make PDF file with hindi text searchable to OCR.
        using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
        {
            // Load PDF document
            searchablePDFMaker.LoadDocumentFromFile(destFileName);

            // Set the location of "tessdata" folder containing language data files
            /*
             * It used following files for hindi language support. Need to put these files into "tessdata" folder.
             * Below location contains these files.
             * https://github.com/tesseract-ocr/tessdata/tree/3.04.00
             *   hin.traineddata
             *   hin.cube.bigrams
             *   hin.cube.lm
             *   hin.cube.nn
             *   hin.cube.params
             *   hin.cube.word-freq
             *   hin.tesseract_cube.nn
             */
            searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

            // Set OCR language
            searchablePDFMaker.OCRLanguage = "hin";

            // Need to set Font which supports hindi characters
            searchablePDFMaker.LabelingFont = "Arial Unicode MS";

            // Set PDF document rendering resolution
            searchablePDFMaker.OCRResolution = 300;

            searchablePDFMaker.MakePDFSearchable(destFileNameSearchable);
        }

        // Open document in default PDF viewer app
        Process.Start(destFileNameSearchable);
    }
    catch (Exception ex)
    {
        Console.WriteLine("ERROR:" + ex.Message);
    }

    Console.ReadLine();
}
/// <summary>
/// Console entry point: OCRs the noisy image "sample.png" after applying a chain of image
/// preprocessing filters (deskew, dilate, median, gamma correction) and prints the recognized
/// text. Any exception is reported as "Exception: " followed by its message.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    try
    {
        using (TextExtractor noisyReader = new TextExtractor())
        {
            noisyReader.LoadDocumentFromFile("sample.png");

            // Font-repairing OCR mode.
            noisyReader.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts;

            // Where the OCR language data files live.
            noisyReader.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // Language code: "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder.
            // Find more language files at https://github.com/bytescout/ocrdata
            noisyReader.OCRLanguage = "eng";

            // Rendering resolution used for OCR.
            noisyReader.OCRResolution = 300;

            Console.WriteLine("Please wait while PDF Extractor SDK is processing noisy image to read data...");

            // Preprocessing filters, meant to improve recognition on low-quality scans.
            // They are added in this order: deskew, dilate, median, gamma correction.
            var preprocessing = noisyReader.OCRImagePreprocessingFilters;
            preprocessing.AddDeskew();           // automatically deskew skewed scans
            preprocessing.AddDilate();           // repair broken letters
            preprocessing.AddMedian();           // remove noise
            preprocessing.AddGammaCorrection();  // apply gamma correction

            // Other filters that are available but not enabled here:
            //   AddVerticalLinesRemover() / AddHorizontalLinesRemover() - remove lines (sometimes helps
            //     to avoid OCR engine's page segmentation errors)
            //   AddContrast(20) - add contrast
            // (!) The OCRAnalyser class can be used to find an optimal set of image preprocessing
            // filters for a specific document. See "OCR Analyser" example.

            string recognizedText = noisyReader.GetText();

            Console.Clear();
            Console.WriteLine("Extracted Text: \n\n" + recognizedText);
        }
    }
    catch (Exception failure)
    {
        Console.Clear();
        Console.WriteLine("Exception: " + failure.Message);
    }

    // Keep the console window open until Enter is pressed.
    Console.ReadLine();
}