SearchablePDFMaker C# (CSharp) Code Examples

Example #1

0

Show file

File: Program.cs Project: babylon3389/ByteScout-SDK-SourceCode

        /// <summary>
        /// Makes "sample_ocr.pdf" searchable via OCR, writes the result to "output.pdf"
        /// and opens it in the default associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create the SearchablePDFMaker inside a using block so that it is disposed
            // even when loading the document or running OCR throws.
            using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
            {
                searchablePDFMaker.RegistrationName = "demo";
                searchablePDFMaker.RegistrationKey  = "demo";

                // Load sample PDF document
                searchablePDFMaker.LoadDocumentFromFile("sample_ocr.pdf");

                // Set the location of "tessdata" folder containing language data files
                searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

                // Set OCR language
                searchablePDFMaker.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata

                // Set PDF document rendering resolution
                searchablePDFMaker.OCRResolution = 300;

                // Write the searchable PDF to file
                searchablePDFMaker.MakePDFSearchable("output.pdf");
            }

            // Open output file in default associated application.
            // UseShellExecute is set explicitly because opening a document (not an executable)
            // needs the shell, and the property's default differs between .NET Framework and .NET Core.
            System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("output.pdf");
            processStartInfo.UseShellExecute = true;
            System.Diagnostics.Process.Start(processStartInfo);
        }

Example #2

0

Show file

        /// <summary>
        /// Makes "sample_ocr.pdf" searchable via OCR, writes the result to "output.pdf"
        /// and opens it in the default associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create the SearchablePDFMaker inside a using block so that cleanup happens
            // even when loading the document or running OCR throws.
            using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
            {
                searchablePDFMaker.RegistrationName = "demo";
                searchablePDFMaker.RegistrationKey  = "demo";

                // Load sample PDF document
                searchablePDFMaker.LoadDocumentFromFile("sample_ocr.pdf");

                // Set the location of language data files
                searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                // Set OCR language
                searchablePDFMaker.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder

                // Set PDF document rendering resolution
                searchablePDFMaker.OCRResolution = 300;

                // Write the searchable PDF to file
                searchablePDFMaker.MakePDFSearchable("output.pdf");
            }

            // Open output file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.pdf");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

Example #3

0

Show file

File: Program.cs Project: bytescout/pdf-extractor-sdk-samples-c-sharp

        /// <summary>
        /// Makes "sample_ocr.pdf" searchable via OCR, writes "output.pdf", opens it,
        /// and then waits for Enter. Any exception is reported by its message only.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                /*
                 * Font note: by default "SearchablePDFMaker" overlays the recognized text using one of
                 * the standard PDF fonts, which only cover basic ISO-8859-1 characters.
                 * When the OCR language needs characters outside that encoding, name a font that
                 * contains them through the ".LabelingFont" property.
                 * On Windows with a locale matching the OCR language, plain "Arial" is enough. In an
                 * unknown environment (e.g. a virtual machine) install a full Unicode font
                 * (e.g. "Arial Unicode MS") and assign it:
                 *
                 * //maker.LabelingFont = "Arial Unicode MS";
                 */
                using (SearchablePDFMaker maker = new SearchablePDFMaker("demo", "demo"))
                {
                    // Source document
                    maker.LoadDocumentFromFile("sample_ocr.pdf");

                    // Announce the run, then hook up progress reporting
                    Console.WriteLine("Searchable PDF making in progress: \n");
                    maker.ProgressChanged += SearchablePDF_ProgressChanged;

                    // OCR configuration: language data folder, language and rendering resolution.
                    // Language codes follow the files in the data folder ("eng", "deu", "fra", "spa", ...);
                    // more language files are available at https://github.com/bytescout/ocrdata
                    maker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";
                    maker.OCRLanguage = "eng";
                    maker.OCRResolution = 300;

                    // Produce the searchable PDF
                    maker.MakePDFSearchable("output.pdf");

                    // Show the result in the default associated application (for demo purpose)
                    Process.Start(new ProcessStartInfo("output.pdf") { UseShellExecute = true });
                }
            }
            catch (Exception failure)
            {
                Console.WriteLine(failure.Message);
            }

            Console.WriteLine("\n\n Press enter key to exit...");
            Console.ReadLine();
        }

Example #4

0

Show file

File: Program.cs Project: atkins126/ByteScout-SDK-SourceCode

        /// <summary>
        /// Re-runs OCR over "sample_ocr_withText.pdf" with the existing document text discarded,
        /// writes the result to "output.pdf" and opens it in the default associated application.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            /*
             * By default, "SearchablePDFMaker" uses one of the standard PDF fonts to apply
             * recognized text over the scanned document. Such fonts contain only basic characters
             * from ISO-8859-1 charset.
             * If you run OCR for one of the languages with characters that are not present in the default
             * encoding, you should explicitly specify the font that contains the required characters
             * using ".LabelingFont" property.
             * If you run the application in Windows with a selected locale that matches OCR language,
             * it will be enough to specify the usual font "Arial". But if your app will run in an unknown
             * environment (for example, in some virtual machine) you will need to install some full Unicode
             * font (e.g. "Arial Unicode MS") and then use it with SearchablePDFMaker:
             *
             * //searchablePDFMaker.LabelingFont = "Arial Unicode MS";
             */

            // Create the SearchablePDFMaker inside a using block so that cleanup happens
            // even when loading the document or running OCR throws.
            using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
            {
                searchablePDFMaker.RegistrationName = "demo";
                searchablePDFMaker.RegistrationKey  = "demo";

                // Load sample PDF document
                searchablePDFMaker.LoadDocumentFromFile("sample_ocr_withText.pdf");

                // Set the location of language data files
                searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language
                searchablePDFMaker.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder

                // Set PDF document rendering resolution
                searchablePDFMaker.OCRResolution = 300;

                // Discard Existing Text in document
                searchablePDFMaker.DiscardExistingDocumentText = true;

                // Write the searchable PDF to file
                searchablePDFMaker.MakePDFSearchable("output.pdf");
            }

            // Open output file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.pdf");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

Example #5

0

Show file

File: Program.cs Project: jboddiford/ByteScout-SDK-SourceCode

        /// <summary>
        /// Worker routine: extracts a page range of the input PDF into a temporary chunk file,
        /// makes that chunk searchable via OCR, and writes it to the given output file.
        /// Always signals the completion event and releases <c>ThreadLimiter</c> when done.
        /// </summary>
        /// <param name="stateInfo">
        /// An <c>object[]</c> holding, in order: thread index (int), completion event
        /// (ManualResetEvent), input file (string), output file (string), start page (int),
        /// end page (int).
        /// </param>
        private static void ThreadProc(object stateInfo)
        {
            // Unpack the state array once instead of re-casting it for every field
            object[] state = (object[])stateInfo;

            int              threadIndex = (int)state[0];
            ManualResetEvent doneEvent   = (ManualResetEvent)state[1];
            string           inputFile   = (string)state[2];
            string           outputFile  = (string)state[3];
            int              startPage   = (int)state[4];
            int              endPage     = (int)state[5];

            try
            {
                Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

                Stopwatch stopwatch = Stopwatch.StartNew();

                string chunk = string.Format("temp-{0}-{1}", startPage, endPage);

                try
                {
                    // Extract a piece of document.
                    // NOTE(review): the "+ 1" suggests startPage/endPage are 0-based while
                    // ExtractPageRange expects 1-based page numbers - confirm against the caller.
                    using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
                    {
                        splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1);
                    }

                    // Process the piece
                    using (SearchablePDFMaker searchablePdfMaker = new SearchablePDFMaker("demo", "demo"))
                    {
                        searchablePdfMaker.OCRDetectPageRotation = true;
                        searchablePdfMaker.OCRLanguageDataFolder = @"C:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata";
                        searchablePdfMaker.LoadDocumentFromFile(chunk);

                        // 300 DPI resolution is recommended.
                        // Using of higher values will slow down the processing but does not guarantee the higher quality.
                        searchablePdfMaker.OCRResolution = 300;

                        searchablePdfMaker.MakePDFSearchable(outputFile);
                    }
                }
                finally
                {
                    // Remove the temporary chunk even when splitting or OCR failed, so failed
                    // runs do not leave temp files behind. File.Delete does not throw when the
                    // file does not exist.
                    File.Delete(chunk);
                }

                Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed);
            }
            finally
            {
                // Signal the thread is finished
                doneEvent.Set();

                // Release semaphore
                ThreadLimiter.Release();
            }
        }

Example #6

0

Show file

File: Program.cs Project: bytescout/data-extraction-suite-samples-c-sharp

        /// <summary>
        /// Makes "sample_ocr.pdf" searchable via OCR, writes "output.pdf", opens it,
        /// and then waits for Enter. Any exception is reported by its message only.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                using (SearchablePDFMaker maker = new SearchablePDFMaker("demo", "demo"))
                {
                    // Source document
                    maker.LoadDocumentFromFile("sample_ocr.pdf");

                    // Announce the run, then hook up progress reporting
                    Console.WriteLine("Searchable PDF making in progress: \n");
                    maker.ProgressChanged += SearchablePDF_ProgressChanged;

                    // OCR configuration: language data folder, language and rendering resolution.
                    // Language codes follow the files in the data folder ("eng", "deu", "fra", "spa", ...);
                    // more language files are available at https://github.com/bytescout/ocrdata
                    maker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";
                    maker.OCRLanguage = "eng";
                    maker.OCRResolution = 300;

                    // Produce the searchable PDF
                    maker.MakePDFSearchable("output.pdf");

                    // Show the result in the default associated application (for demo purpose)
                    Process.Start(new ProcessStartInfo("output.pdf") { UseShellExecute = true });
                }
            }
            catch (Exception failure)
            {
                Console.WriteLine(failure.Message);
            }

            Console.WriteLine("\n\n Press enter key to exit...");
            Console.ReadLine();
        }

Example #7

0

Show file

File: Program.cs Project: bytescout/bytescout-showcases

        /// <summary>
        /// Rebuilds "hindi_text_with_image.pdf" as a one-page PDF containing its first image with the
        /// extracted text drawn invisibly on top, then runs Hindi OCR over that PDF to produce a
        /// searchable copy and opens it. Errors are printed to the console; waits for Enter before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                // Files
                string fileName               = "hindi_text_with_image.pdf";
                string destFileName           = "output_hindi_text_with_image.pdf";
                string destFileNameSearchable = "output_hindi_text_with_image_searchable.pdf";

                // Read all text from pdf file
                string allTextExtracted = "";
                using (TextExtractor extractor = new TextExtractor())
                {
                    // Load PDF document
                    extractor.LoadDocumentFromFile(fileName);

                    // Read all text directly
                    allTextExtracted = extractor.GetText();
                }

                // Get image from pdf file. The stream must stay open for as long as the
                // System.Drawing.Image created from it is alive, so it encloses the image's using block.
                using (MemoryStream memoryStream = new MemoryStream())
                {
                    bool imageFound = false;
                    using (ImageExtractor extractor = new ImageExtractor())
                    {
                        // Load PDF document
                        extractor.LoadDocumentFromFile(fileName);

                        imageFound = extractor.GetFirstImage();
                        if (imageFound)
                        {
                            extractor.SaveCurrentImageToStream(memoryStream, ImageFormat.Png);
                        }
                    }

                    // Fail with a clear message instead of letting Image.FromStream choke on an empty stream
                    if (!imageFound)
                    {
                        throw new InvalidOperationException("No image found in " + fileName);
                    }

                    // Rewind so the image is decoded from the start of the stream, not from
                    // wherever the extractor left the position after writing
                    memoryStream.Position = 0;

                    // Load image from stream to System.Drawing.Image object (we need it to get the image resolution)
                    using (System.Drawing.Image sysImage = System.Drawing.Image.FromStream(memoryStream))
                    {
                        // Compute image size in PDF units (Points)
                        float widthInPoints  = sysImage.Width / sysImage.HorizontalResolution * 72f;
                        float heightInPoints = sysImage.Height / sysImage.VerticalResolution * 72f;

                        // Create new PDF document
                        using (Document outPdfDocument = new Document())
                        {
                            outPdfDocument.RegistrationName = "demo";
                            outPdfDocument.RegistrationKey  = "demo";

                            // Create page of computed size
                            Page page = new Page(widthInPoints, heightInPoints);

                            // Add page to the document
                            outPdfDocument.Pages.Add(page);

                            Canvas canvas = page.Canvas;

                            // Create Bytescout.PDF.Image object from loaded image
                            Image pdfImage = new Image(sysImage);

                            // Draw the image
                            canvas.DrawImage(pdfImage, 0, 0, widthInPoints, heightInPoints);

                            // sysImage is released by its enclosing using block; it is intentionally
                            // not disposed here, before the document has been saved.

                            // Create brush
                            SolidBrush transparentBrush = new SolidBrush(new ColorGray(0));

                            // ... and make it transparent
                            transparentBrush.Opacity = 0;

                            // Draw text with transparent brush
                            // Need to set Font which supports hindi characters.
                            Font font16 = new Font("Arial Unicode MS", 16);
                            canvas.DrawString(allTextExtracted, font16, transparentBrush, 40, 40);

                            // Save document to file
                            outPdfDocument.Save(destFileName);
                        }
                    }
                }


                // Make PDF file with hindi text searchable to OCR.
                using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
                {
                    //Load PDF document
                    searchablePDFMaker.LoadDocumentFromFile(destFileName);

                    // Set the location of "tessdata" folder containing language data files

                    /*
                     * The following files are used for hindi language support and need to be put into
                     * the "tessdata" folder. They are available at:
                     * https://github.com/tesseract-ocr/tessdata/tree/3.04.00
                     * hin.traineddata
                     * hin.cube.bigrams
                     * hin.cube.lm
                     * hin.cube.nn
                     * hin.cube.params
                     * hin.cube.word-freq
                     * hin.tesseract_cube.nn
                     */
                    searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

                    // Set OCR language
                    searchablePDFMaker.OCRLanguage = "hin";

                    // Need to set Font which supports hindi characters
                    searchablePDFMaker.LabelingFont = "Arial Unicode MS";

                    // Set PDF document rendering resolution
                    searchablePDFMaker.OCRResolution = 300;

                    searchablePDFMaker.MakePDFSearchable(destFileNameSearchable);
                }

                // Open document in default PDF viewer app
                Process.Start(destFileNameSearchable);
            }
            catch (Exception ex)
            {
                Console.WriteLine("ERROR:" + ex.Message);
            }

            Console.ReadLine();
        }

Example #8

0

Show file

        /// <summary>
        /// Makes a scanned PDF searchable in memory, finds e-mail addresses in it with a regular
        /// expression, removes the matches, saves the result to "result1.pdf" and opens it.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // STEP-1: Make Searchable PDF
            // STEP-2: Get search text result from that searchable PDF
            // STEP-3: Remove sensitive data

            // The intermediate searchable PDF lives only in memory; the using block releases
            // the stream once every consumer below has been disposed.
            using (MemoryStream searchablePDFStream = new MemoryStream())
            {
                // Create Bytescout.PDFExtractor.SearchablePDFMaker instance
                using (var searchablePDFMaker = new SearchablePDFMaker("demo", "demo"))
                {
                    // Load sample PDF document
                    searchablePDFMaker.LoadDocumentFromFile("sampleScannedPDF_EmailAddress.pdf");

                    // Set the location of language data files
                    searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                    // Set OCR language
                    searchablePDFMaker.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder

                    // Set PDF document rendering resolution
                    searchablePDFMaker.OCRResolution = 300;

                    // Write the searchable PDF into the memory stream
                    searchablePDFMaker.MakePDFSearchable(searchablePDFStream);

                    // Prepare TextExtractor
                    using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
                    {
                        // Load stream into TextExtractor
                        textExtractor.LoadDocumentFromStream(searchablePDFStream);

                        // Search email Addresses
                        // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
                        string regexPattern = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b";

                        // Enable RegexSearch
                        textExtractor.RegexSearch = true;

                        // Set word matching options
                        textExtractor.WordMatchingMode = WordMatchingMode.None;

                        ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

                        // Create Bytescout.PDFExtractor.Remover2 instance
                        using (var remover = new Remover2("demo", "demo"))
                        {
                            // Load the searchable PDF from the same memory stream
                            remover.LoadDocumentFromStream(searchablePDFStream);

                            // Mask removed text
                            remover.MaskRemovedText = true;

                            // Make output file unsearchable
                            remover.MakePDFUnsearchable = true;

                            // Provide text to remove
                            remover.AddTextToRemove(searchResults);

                            // Remove the text objects found by the search and save the result
                            remover.PerformRemoval("result1.pdf");
                        }
                    }
                }
            }

            // Open output file in default application
            ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }

Example #9

0

Show file

        /// <summary>
        /// Worker routine: extracts a page range of the input PDF into a temporary chunk file,
        /// makes that chunk searchable via OCR, and writes it to the given output file.
        /// Always signals the completion event and releases <c>ThreadLimiter</c> when done.
        /// </summary>
        /// <param name="stateInfo">
        /// An <c>object[]</c> holding, in order: thread index (int), completion event
        /// (ManualResetEvent), input file (string), output file (string), start page (int),
        /// end page (int).
        /// </param>
        private static void ThreadProc(object stateInfo)
        {
            // Unpack the state array once instead of re-casting it for every field
            object[] state = (object[])stateInfo;

            int              threadIndex = (int)state[0];
            ManualResetEvent doneEvent   = (ManualResetEvent)state[1];
            string           inputFile   = (string)state[2];
            string           outputFile  = (string)state[3];
            int              startPage   = (int)state[4];
            int              endPage     = (int)state[5];

            try
            {
                Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

                Stopwatch stopwatch = Stopwatch.StartNew();

                string chunk = string.Format("temp-{0}-{1}", startPage, endPage);

                try
                {
                    // Extract a piece of document.
                    // NOTE(review): the "+ 1" suggests startPage/endPage are 0-based while
                    // ExtractPageRange expects 1-based page numbers - confirm against the caller.
                    using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
                    {
                        splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1);
                    }

                    /*
                     * By default, "SearchablePDFMaker" uses one of the standard PDF fonts to apply
                     * recognized text over the scanned document. Such fonts contain only basic characters
                     * from ISO-8859-1 charset.
                     * If you run OCR for one of the languages with characters that are not present in the default
                     * encoding, you should explicitly specify the font that contains the required characters
                     * using ".LabelingFont" property.
                     * If you run the application in Windows with a selected locale that matches OCR language,
                     * it will be enough to specify the usual font "Arial". But if your app will run in an unknown
                     * environment (for example, in some virtual machine) you will need to install some full Unicode
                     * font (e.g. "Arial Unicode MS") and then use it with SearchablePDFMaker:
                     *
                     * //searchablePdfMaker.LabelingFont = "Arial Unicode MS";
                     */
                    // Process the piece
                    using (SearchablePDFMaker searchablePdfMaker = new SearchablePDFMaker("demo", "demo"))
                    {
                        searchablePdfMaker.OCRDetectPageRotation = true;
                        searchablePdfMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";
                        searchablePdfMaker.LoadDocumentFromFile(chunk);

                        // 300 DPI resolution is recommended.
                        // Using of higher values will slow down the processing but does not guarantee the higher quality.
                        searchablePdfMaker.OCRResolution = 300;

                        searchablePdfMaker.MakePDFSearchable(outputFile);
                    }
                }
                finally
                {
                    // Remove the temporary chunk even when splitting or OCR failed, so failed
                    // runs do not leave temp files behind. File.Delete does not throw when the
                    // file does not exist.
                    File.Delete(chunk);
                }

                Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed);
            }
            finally
            {
                // Signal the thread is finished
                doneEvent.Set();

                // Release semaphore
                ThreadLimiter.Release();
            }
        }

C# (CSharp) SearchablePDFMaker Examples