Exemplo n.º 1
        /// <summary>
        /// Creates a TextExtractor and configures its OCR settings (mode, language data folder,
        /// language and resolution) for reading the content of an image file.
        /// </summary>
        /// <param name="fileInfo">File to read. NOTE(review): not referenced by any statement visible in this listing.</param>
        /// <returns>A string. NOTE(review): no return statement is visible in this listing.</returns>
        private static string _GetImageContet(FileInfo fileInfo)
            // NOTE(review): braces and some statements (document load, text read, return) appear
            // to be missing from this listing; only the extractor configuration is visible.
            // Read all file content...
            using (TextExtractor extractor = new TextExtractor())
                // Load document

                // Set option to repair text
                // NOTE(review): this value is replaced by the OCRMode.Auto assignment a few lines
                // below, with no extraction call visible in between — confirm which mode is intended.
                extractor.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts;

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // Read all text
Exemplo n.º 2
        /// <summary>
        /// Click handler that appends text extracted from the viewer's current page to a
        /// StringBuilder, once per entry in the viewer's selection array.
        /// </summary>
        /// <param name="sender">Event source; not referenced in the visible statements.</param>
        /// <param name="e">Event data; not referenced in the visible statements.</param>
        private void btnGetData_Click(object sender, EventArgs e)
            // NOTE(review): braces and some statements (e.g. loading a document into the extractor)
            // appear to be missing from this listing.
            StringBuilder result = new StringBuilder();

            // Selection rectangles reported by the viewer control
            RectangleF[] selection = pdfViewerControl1.SelectionInPoints;

            using (TextExtractor extractor = new TextExtractor())
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // OCR configuration; no OCRLanguage is assigned in this snippet
                extractor.OCRMode = OCRMode.Auto;
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";
                extractor.OCRResolution         = 300;

                // NOTE(review): selection[i] is never used in the visible loop body, so each iteration
                // issues the same GetText call for the current page. A statement restricting extraction
                // to selection[i] may be missing — confirm against the original sample.
                for (int i = 0; i < selection.Length; i++)
                    result.AppendLine(extractor.GetText(pdfViewerControl1.CurrentPageIndex, pdfViewerControl1.CurrentPageIndex));

        /// <summary>
        /// Extract text from a document using a specific OCR mode
        /// </summary>
        /// <param name="inputDocument">Document to extract from. NOTE(review): not referenced by any statement visible in this listing.</param>
        /// <param name="ocrMode">OCR mode assigned to the extractor's OCRMode property</param>
        /// <returns>A string. NOTE(review): no return statement is visible in this listing.</returns>
        private static string _ExtractTextWithSpecificOCRMode(string inputDocument, OCRMode ocrMode)
            // NOTE(review): braces and some statements (document load, return) appear to be
            // missing from this listing.
            // Location of language data files
            string ocrLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // OCR language
            string ocrLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder

            // Find more language files at https://github.com/bytescout/ocrdata

            // Create TextExtractor instance, passing "demo" for both constructor arguments
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
                // Load document to TextExtractor

                // Apply the caller-supplied OCR mode
                textExtractor.OCRMode = ocrMode;

                // OCR language data folder path and language
                textExtractor.OCRLanguageDataFolder = ocrLanguageDataFolder;
                textExtractor.OCRLanguage           = ocrLanguage;

                // Return extracted text
        /// <summary>
        /// Page load handler that resolves the sample PDF and OCR language data paths under the
        /// site root, configures a TextExtractor for OCR and sets the response content type.
        /// </summary>
        /// <param name="sender">Event source; not referenced in the visible statements.</param>
        /// <param name="e">Event data; not referenced in the visible statements.</param>
        protected void Page_Load(object sender, EventArgs e)
            // NOTE(review): braces and several statements (preprocessing filters, document load,
            // writing the text to the response) appear to be missing from this listing; the
            // comments for those steps remain but no code follows them.
            // Input PDF, resolved relative to the site root
            String inputFile = Server.MapPath(@".\bin\sample_ocr.pdf");

            // Location of language files
            String ocrLanguageDataFolder = Server.MapPath(@".\tessdata");

            // Create Bytescout.PDFExtractor.TextExtractor instance
            using (TextExtractor extractor = new TextExtractor())
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;
                // Set the location of "tessdata" folder containing language data file
                extractor.OCRLanguageDataFolder = ocrLanguageDataFolder;
                // Set OCR language
                extractor.OCRLanguage = "eng";         // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans

                // Repair broken letters

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentations errors)

                // Remove noise

                // Apply Gamma Correction

                // Load PDF document
                // NOTE(review): inputFile is declared above but not used by any visible statement.

                // Set the response content type
                Response.ContentType = "text/html";

                // Write extracted text to output stream

        /// <summary>
        /// Click handler that calls GetText(0, 0) on the extractor and shows the result in textBox1.
        /// </summary>
        /// <param name="sender">Event source; not referenced in the visible statements.</param>
        /// <param name="e">Event data; not referenced in the visible statements.</param>
        private void Button_Extract(object sender, RoutedEventArgs e)
            // Only proceed when the _pdfFile field has been set
            if (_pdfFile != null)
                // NOTE(review): "extractor" is not declared in this listing; its creation and the
                // document load were presumably on lines that are missing here — confirm.
                string text = extractor.GetText(0, 0);                 // extract from the first page only (for demonstration purposes)

                // Show the extracted text in the UI
                textBox1.Text = text;
Exemplo n.º 6
        /// <summary>
        /// Get PDF file content
        /// </summary>
        /// <param name="fileInfo">File to read. NOTE(review): not referenced by any statement visible in this listing.</param>
        /// <returns>A string. NOTE(review): no return statement is visible in this listing.</returns>
        private static string _GetPdfFileContent(FileInfo fileInfo)
            // NOTE(review): only the extractor construction is visible; braces, the document
            // load and the return appear to be missing from this listing.
            // Read all file content...
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
                // Load document

Exemplo n.º 7
        /// <summary>
        /// Console entry point: subscribes to extractor progress events, configures OCR,
        /// extracts all text and prints it to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not referenced in the visible statements.</param>
        static void Main(string[] args)
            // NOTE(review): the deeper indentation below and the trailing catch suggest an enclosing
            // try block whose opening line, like all braces, is missing from this listing.
                // Read all file content...
                using (TextExtractor extractor = new TextExtractor())
                    // Load document

                    // Extractor Progress event
                    // (the Extractor_ProgressChanged handler is not defined in this listing)
                    Console.WriteLine("Text Extraction in progress: \n");
                    extractor.ProgressChanged += Extractor_ProgressChanged;

                    // Set option to repair text
                    // NOTE(review): this value is replaced by the OCRMode.Auto assignment a few lines
                    // below, before GetText is called — confirm which mode is intended.
                    extractor.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts;

                    // Enable Optical Character Recognition (OCR)
                    // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                    extractor.OCRMode = OCRMode.Auto;

                    // Set the location of OCR language data files
                    extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                    // Set OCR language
                    extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                    // Find more language files at https://github.com/bytescout/ocrdata

                    // Set PDF document rendering resolution
                    extractor.OCRResolution = 300;

                    // Read all text and print it
                    var allExtractedText = extractor.GetText();
                    Console.WriteLine("\n\nExtracted Text:\n\n{0}", allExtractedText);
            // NOTE(review): no handler body is visible for this catch.
            catch (Exception ex)

            Console.WriteLine("Press enter key to exit...");
        /// <summary>
        /// Checks whether OCR is recommended for the first page of a document, configures
        /// OCR when it is, then extracts all text and prints it to the console.
        /// </summary>
        /// <param name="filePath">Path of the document; printed to the console. NOTE(review): no load call using it is visible in this listing.</param>
        private static void _CheckOCRRequired(string filePath)
            // NOTE(review): braces and some statements (document load, an else) appear to be
            // missing from this listing.
            // Read all file content...
            using (TextExtractor extractor = new TextExtractor())
                extractor.RegistrationKey  = "demo";
                extractor.RegistrationName = "demo";

                // Load document
                Console.WriteLine("\n*******************\n\nFilePath: {0}", filePath);

                // Only the first page (index 0) is checked
                int pageIndex = 0;

                // Identify OCR operation is recommended for page
                if (extractor.IsOCRRecommendedForPage(pageIndex))
                    Console.WriteLine("\nOCR Recommended: True");

                    // Enable Optical Character Recognition (OCR)
                    // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                    extractor.OCRMode = OCRMode.Auto;

                    // Set the location of language data files
                    extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                    // Set OCR language
                    extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                    // Find more language files at https://github.com/bytescout/ocrdata

                    // Set PDF document rendering resolution
                    extractor.OCRResolution = 300;
                    // NOTE(review): this message contradicts the branch condition above; an else
                    // separating it from the preceding statements appears to be missing here.
                    Console.WriteLine("\nOCR Recommended: False");

                // Read all text and print it
                var allExtractedText = extractor.GetText();
                Console.WriteLine("\nExtracted Text:\n{0}\n\n", allExtractedText);
Exemplo n.º 9
        /// <summary>
        /// Console entry point: configures a TextExtractor with the font-repairing OCR mode,
        /// extracts all text and prints it to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not referenced in the visible statements.</param>
        static void Main(string[] args)
            // NOTE(review): the deeper indentation below and the trailing catch suggest an enclosing
            // try block whose opening line, like all braces, is missing from this listing.
                // Read all text from pdf file
                using (TextExtractor extractor = new TextExtractor())
                    // Load PDF document

                    // Set the font repairing OCR mode
                    extractor.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts;

                    // Set the location of OCR language data files
                    extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                    // Set OCR language
                    extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                    // Find more language files at https://github.com/bytescout/ocrdata

                    // Set PDF document rendering resolution
                    extractor.OCRResolution = 300;

                    // Read all text
                    string allText = extractor.GetText();

                    // Print the extracted text
                    Console.WriteLine("Extracted Text: \n\n" + allText);
            // NOTE(review): no handler body is visible for this catch.
            catch (Exception ex)

Exemplo n.º 10
        /// <summary>
        /// Console entry point for the Hindi sample: extracts the text and the first image from a
        /// PDF, draws the image plus the text (with a fully transparent brush) onto a new PDF page,
        /// then configures a SearchablePDFMaker for Hindi OCR.
        /// </summary>
        /// <param name="args">Command-line arguments; not referenced in the visible statements.</param>
        static void Main(string[] args)
            // NOTE(review): braces, the opening try, and several statements (document loads, adding
            // the page, saving, opening the result) appear to be missing from this listing; the
            // comments for those steps remain but no code follows them.
                // Files
                // NOTE(review): none of these three names is used by a visible statement.
                string fileName                = "hindi_text_with_image.pdf";
                string destFileName            = "output_hindi_text_with_image.pdf";
                string destFileName_serachable = "output_hindi_text_with_image_searchable.pdf";

                // Read all text from pdf file
                string allTextExtracted = "";
                using (TextExtractor extractor = new TextExtractor())
                    // Load PDF document

                    // Read all text directly
                    allTextExtracted = extractor.GetText();

                // Get image from pdf file
                // NOTE(review): unlike the extractors, this stream is not created in a using statement
                // and no Dispose call for it is visible.
                MemoryStream memoryStream = new MemoryStream();
                using (ImageExtractor extractor = new ImageExtractor())
                    // Load PDF document

                    // Save the first image, when there is one, into the memory stream as PNG
                    if (extractor.GetFirstImage())
                        extractor.SaveCurrentImageToStream(memoryStream, ImageFormat.Png);

                // Load image from the memory stream to System.Drawing.Image object (we need it to get the image resolution)
                using (System.Drawing.Image sysImage = System.Drawing.Image.FromStream(memoryStream))
                    // Compute image size in PDF units (Points): pixels / DPI * 72
                    float widthInPoints  = sysImage.Width / sysImage.HorizontalResolution * 72f;
                    float heightInPoints = sysImage.Height / sysImage.VerticalResolution * 72f;

                    // Create new PDF document
                    using (Document outPdfDocument = new Document())
                        outPdfDocument.RegistrationName = "demo";
                        outPdfDocument.RegistrationKey  = "demo";

                        // Create page of computed size
                        Page page = new Page(widthInPoints, heightInPoints);

                        // Add page to the document

                        Canvas canvas = page.Canvas;

                        // Create Bytescout.PDF.Image object from loaded image
                        Image pdfImage = new Image(sysImage);

                        // Draw the image so that it covers the whole page
                        canvas.DrawImage(pdfImage, 0, 0, widthInPoints, heightInPoints);

                        // Dispose the System.Drawing.Image object to free resources

                        // Create brush
                        SolidBrush transparentBrush = new SolidBrush(new ColorGray(0));

                        // ... and make it transparent
                        transparentBrush.Opacity = 0;

                        // Draw text with transparent brush
                        // Need to set Font which supports hindi characters.
                        Font font16 = new Font("Arial Unicode MS", 16);
                        canvas.DrawString(allTextExtracted, font16, transparentBrush, 40, 40);

                        // Save document to file

                // Make PDF file with hindi text searchable to OCR.
                using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
                    // Load PDF document

                    // Set the location of "tessdata" folder containing language data files

                    // NOTE(review): the lines starting with "*" below read like the interior of a
                    // block comment whose opening and closing delimiters are missing from this listing.
                     * It used following files for hindi language support. Need to put these files into "testdata" folder. Below location contains these files.
                     * https://github.com/tesseract-ocr/tessdata/tree/3.04.00
                     * hin.traineddata
                     * hin.cube.bigrams
                     * hin.cube.lm
                     * hin.cube.nn
                     * hin.cube.params
                     * hin.cube.word-freq
                     * hin.tesseract_cube.nn
                    searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

                    // Set OCR language
                    searchablePDFMaker.OCRLanguage = "hin";

                    // Need to set Font which supports hindi characters
                    searchablePDFMaker.LabelingFont = "Arial Unicode MS";

                    // Set PDF document rendering resolution
                    searchablePDFMaker.OCRResolution = 300;


                // Open document in default PDF viewer app
            // Report any failure on the console
            catch (Exception ex)
                Console.WriteLine("ERROR:" + ex.Message);

        /// <summary>
        /// Console entry point for the noisy-image sample: configures a TextExtractor with the
        /// font-repairing OCR mode, extracts all text and prints it to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not referenced in the visible statements.</param>
        static void Main(string[] args)
            // NOTE(review): braces, the opening try, the document load and the preprocessing filter
            // calls appear to be missing from this listing; the comments for those steps remain
            // but no code follows them.
                // Read all text from noisy image file
                using (TextExtractor extractor = new TextExtractor())
                    // Load noisy image document

                    // Set the font repairing OCR mode
                    extractor.OCRMode = OCRMode.TextFromImagesAndVectorsAndRepairedFonts;

                    // Set the location of OCR language data files
                    extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                    // Set OCR language
                    extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                                                   // Find more language files at https://github.com/bytescout/ocrdata

                    // Set document rendering resolution
                    extractor.OCRResolution = 300;

                    // You can also apply various preprocessing filters
                    // to improve the recognition on low-quality scans.

                    Console.WriteLine("Please wait while PDF Extractor SDK is processing noisy image to read data...");

                    // Automatically deskew skewed scans

                    // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)

                    // Repair broken letters

                    // Remove noise

                    // Apply Gamma Correction

                    // Add Contrast

                    // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
                    // filters for your specific document.
                    // See "OCR Analyser" example.

                    // Read all text
                    string allText = extractor.GetText();

                    // Print the extracted text
                    Console.WriteLine("Extracted Text: \n\n" + allText);
            // Report any failure on the console
            catch (Exception ex)
                Console.WriteLine("Exception: " + ex.Message);
