예제 #1
0
        /// <summary>
        /// Extracts text from "sample_ocr.pdf" twice: first with predefined profiles,
        /// then, after resetting the extractor, with custom profiles loaded from "profiles.json".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block disposes the extractor even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName      = "demo";
                extractor.RegistrationKey       = "demo";
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net2.00\tessdata";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Apply predefined profiles
                extractor.Profiles = "scanned, no-layout";
                // Extract text to file
                extractor.SaveTextToFile("result1.txt");

                // Reset the extractor before loading the next document
                extractor.Reset();

                // Load another document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Load and apply custom profiles
                extractor.LoadProfiles("profiles.json");
                extractor.Profiles = "keep-formatting, ocr-forced-200dpi";
                // Extract text to file
                extractor.SaveTextToFile("result2.txt");
            }
        }
        /// <summary>
        /// Extracts text from a mixed English/Arabic PDF with right-to-left text
        /// auto-detection enabled, then opens the resulting text file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample_english_arabic.pdf");

                // Enable Arabic (and other RTL languages) text detection
                extractor.RTLTextAutoDetectionEnabled = true;

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
예제 #3
0
        /// <summary>
        /// Converts every "*.pdf" file found in the current directory to a ".txt" file
        /// with the same base name, reusing one extractor instance for all files.
        /// </summary>
        static void Main()
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if one of the files fails to convert.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Get PDF files
                string[] pdfFiles = Directory.GetFiles(".", "*.pdf");

                foreach (string file in pdfFiles)
                {
                    // Load document
                    extractor.LoadDocumentFromFile(file);

                    // Save extracted text to .txt file
                    extractor.SaveTextToFile(Path.ChangeExtension(file, ".txt"));

                    // Reset the extractor before loading another file
                    extractor.Reset();
                }
            }
        }
        /// <summary>
        /// Extracts text from "sample2.pdf" with page data caching disabled and opens
        /// the result. If the extractor reports an error, the error is printed and the
        /// result file is not opened.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // When processing huge PDF documents you may run into OutOfMemoryException.
            // This example demonstrates a way to spare the memory by disabling page data caching.

            // Create Bytescout.PDFExtractor.TextExtractor instance
            using (TextExtractor extractor = new TextExtractor("demo", "demo"))
            {
                try
                {
                    // Load sample PDF document
                    extractor.LoadDocumentFromFile("sample2.pdf");

                    // Disable page data caching, so processed pages will be disposed automatically
                    extractor.PageDataCaching = PageDataCaching.None;

                    // Save extracted text to file
                    extractor.SaveTextToFile("output.txt");
                }
                catch (PDFExtractorException exception)
                {
                    Console.Write(exception.ToString());

                    // Extraction failed: "output.txt" is either missing or left over from
                    // an earlier run, so do not try to open it.
                    return;
                }
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Worker callback that converts a single PDF file to text.
        /// </summary>
        /// <param name="state">
        /// An <c>object[]</c> holding the input file path at index 0 and the
        /// <see cref="ManualResetEvent"/> to signal on completion at index 1.
        /// </param>
        private static void ConvertPdfToTxt(object state)
        {
            // Unpack the work item arguments
            object[] workItem = (object[])state;
            string sourcePath = (string)workItem[0];
            ManualResetEvent completion = (ManualResetEvent)workItem[1];

            // The directory part of the input path is dropped; ".txt" is appended to the file name
            string targetName = Path.GetFileName(sourcePath) + ".txt";

            try
            {
                Console.WriteLine("Converting " + sourcePath);

                using (TextExtractor pdfToText = new TextExtractor("demo", "demo"))
                {
                    pdfToText.LoadDocumentFromFile(sourcePath);
                    pdfToText.SaveTextToFile(targetName);
                }

                Console.WriteLine("Finished " + targetName);
            }
            finally
            {
                // Always report completion first, then give the semaphore slot back,
                // whether or not the conversion succeeded
                completion.Set();
                ThreadLimiter.Release();
            }
        }
        /// <summary>
        /// Extracts text from "DocumentWithWatermark.pdf" after registering a
        /// case-sensitive regex text filter for "^COPY$", then opens the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\DocumentWithWatermark.pdf");

                // Filter text using text filter
                extractor.AddFilter(@"^COPY$", caseSensitive: true, useRegex: true);

                // Filter text using appearance filter
                // extractor.AddFilter("Arial", fontSize: 203, exclude: true);

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Extracts text from "columns.pdf" column by column and opens the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\columns.pdf");

                // Extract text by columns (useful if PDF document is designed in column layout like a newspaper)
                extractor.ExtractColumnByColumn = true;

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Downloads a sample PDF through <c>GetStreamFromUrl</c>, extracts its text
        /// to "result.txt" and opens the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Input file Url
            var inputUrl = @"https://bytescout-com.s3.amazonaws.com/files/demo-files/cloud-api/pdf-to-text/sample.pdf";

            // Get Input Stream. It is declared first so that it is disposed last,
            // i.e. only after the extractor that reads from it has been disposed.
            // NOTE(review): assumes GetStreamFromUrl returns a System.IO.Stream (IDisposable) - confirm.
            using (var inpStream = GetStreamFromUrl(inputUrl))
            // Create Bytescout.PDFExtractor.TextExtractor instance
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromStream(inpStream);

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            // Opening a document (not an executable) needs the OS shell; set it explicitly
            // because the default value of UseShellExecute differs between .NET runtimes.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Extracts all text from "hindiText.pdf" into "extractedText.txt" using
        /// Unicode encoding, prints any error message, then waits for console input.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            try
            {
                // Source PDF and destination text file
                string sourcePdf = "hindiText.pdf";
                string targetTxt = "extractedText.txt";

                using (TextExtractor hindiExtractor = new TextExtractor())
                {
                    hindiExtractor.LoadDocumentFromFile(sourcePdf);

                    // Option 1: write the whole document text straight to the destination file
                    hindiExtractor.SaveTextToFile(targetTxt, encoding: Encoding.Unicode);
                    Console.WriteLine("All extracted text (hindi) written successfully to destination text file.");

                    // Option 2: read the whole document text into a string instead
                    //string allText = hindiExtractor.GetText();
                }
            }
            catch (Exception failure)
            {
                // Only the message is reported
                Console.WriteLine(failure.Message);
            }

            // Keep the console window open until the user responds
            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadLine();
        }
예제 #10
0
        /// <summary>
        /// Extracts text from "sample_ocr.pdf" with OCR enabled in Auto mode
        /// (English language data, 300 DPI) and opens the resulting "output.txt".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block disposes the extractor even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of "tessdata" folder containing language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

                // Set OCR language
                extractor.OCRLanguage = "eng";             // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
                // Find more language files at https://github.com/tesseract-ocr/tessdata/tree/3.04.00

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;


                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans
                //extractor.OCRImagePreprocessingFilters.AddDeskew();

                // Repair broken letters
                //extractor.OCRImagePreprocessingFilters.AddDilate();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentations errors)
                //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Remove noise
                //extractor.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application
            System.Diagnostics.Process.Start("output.txt");
        }
예제 #11
0
        /// <summary>
        /// Worker callback that extracts text (with OCR in Auto mode) from one page
        /// range of a PDF file into its own output file.
        /// </summary>
        /// <param name="stateInfo">
        /// An <c>object[]</c> with, in order: thread index (int), the
        /// <see cref="ManualResetEvent"/> signalled when the last worker finishes,
        /// input file path (string), output file path (string), start page (int),
        /// end page (int).
        /// </param>
        private static void ThreadProc(object stateInfo)
        {
            // Unpack the work item arguments
            object[] parameters = (object[])stateInfo;
            int workerNumber = (int)parameters[0];
            ManualResetEvent everyoneDone = (ManualResetEvent)parameters[1];
            string sourcePdf = (string)parameters[2];
            string targetTxt = (string)parameters[3];
            int firstPage = (int)parameters[4];
            int lastPage = (int)parameters[5];

            try
            {
                Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", workerNumber, firstPage, lastPage);

                Stopwatch timer = Stopwatch.StartNew();

                // Each worker uses its own extractor instance for its piece of the document
                using (TextExtractor pieceExtractor = new TextExtractor("demo", "demo"))
                {
                    // Separate pages with a line break. Default is '\f' (Form Feed)
                    pieceExtractor.PageSeparator = Environment.NewLine;
                    // Only text is extracted, so caching is turned off to reduce memory usage
                    pieceExtractor.PageDataCaching = PageDataCaching.None;

                    pieceExtractor.OCRMode = OCRMode.Auto;
                    pieceExtractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata\";
                    pieceExtractor.OCRLanguage = "eng";
                    // 300 DPI resolution is recommended.
                    // Higher values slow down the processing without guaranteeing higher quality.
                    pieceExtractor.OCRResolution = 300;

                    pieceExtractor.LoadDocumentFromFile(sourcePdf);

                    pieceExtractor.SaveTextToFile(firstPage, lastPage, targetTxt);
                }

                Console.WriteLine("Thread #{0} finished in {1}.", workerNumber, timer.Elapsed);
            }
            finally
            {
                // The worker that brings the running counter down to zero is the last one:
                // it signals the main thread about the finish.
                int stillRunning = Interlocked.Decrement(ref _runningThreadsCounter);
                if (stillRunning == 0)
                {
                    everyoneDone.Set();
                }

                // Release semaphore
                _threadLimiter.Release();
            }
        }
예제 #12
0
        /// <summary>
        /// Extracts text from "sample2.pdf" to "output.txt" and opens the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block disposes the extractor (the original never released it)
            // before the output file is opened, and also if an exception is thrown.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample2.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application
            System.Diagnostics.Process.Start("output.txt");
        }
예제 #13
0
        /// <summary>
        /// Initializes the window, then extracts the text of a fixed PDF file to
        /// "output.txt" and opens the result file.
        /// </summary>
        public MainWindow()
        {
            InitializeComponent();

            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block disposes the extractor (the original never released it)
            // before the output file is opened, and also if an exception is thrown.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@"C:\Users\toky\Documents\Autogids_Autogids_20180131_008.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open output file in default associated application
            System.Diagnostics.Process.Start("output.txt");
        }
        /// <summary>
        /// Extracts text from the scanned image "InvoiceWithNoise.png" using OCR in Auto
        /// mode plus the custom "ocr-dateIssue" profile loaded from "profiles.json",
        /// then opens the resulting "output.txt".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample scanned document
                extractor.LoadDocumentFromFile("InvoiceWithNoise.png");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set document rendering resolution
                extractor.OCRResolution = 300;

                // Add profiles to fix issues with date.
                // To deal with wrong V in dates you can use a regular expression. The following will replace only V characters which are located between numbers:
                extractor.LoadProfiles("profiles.json");
                extractor.Profiles = "ocr-dateIssue";

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Extracts text from "sample_ocr.pdf" using OCR in Auto mode with
        /// <c>OCRMaximizeCPUUtilization</c> switched on, then opens the resulting "output.txt".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;

                // Enables max use of CPU and max use of multiple threads during OCR
                extractor.OCRMaximizeCPUUtilization = true;

                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
        /// <summary>
        /// Extracts text from "sample2.pdf" to "result.txt" and opens the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile(@".\sample2.pdf");

                // Save extracted text to file
                extractor.SaveTextToFile(@".\result.txt");
            }

            // Open result file in default associated application
            ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");

            // Opening a document (not an executable) needs the OS shell; set it explicitly
            // because the default value of UseShellExecute differs between .NET runtimes.
            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
예제 #17
0
        /// <summary>
        /// Extracts text from "sample_ocr.pdf" using OCR in Auto mode (English language
        /// data, 300 DPI) and opens the resulting "output.txt". Optional image
        /// preprocessing filters are listed as commented-out lines.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Create Bytescout.PDFExtractor.TextExtractor instance.
            // The using block performs the cleanup even if loading or extraction throws.
            using (TextExtractor extractor = new TextExtractor())
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey  = "demo";

                // Load sample PDF document
                extractor.LoadDocumentFromFile("sample_ocr.pdf");

                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if needs to use OCR or not)
                extractor.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

                // Set OCR language
                extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata

                // Set PDF document rendering resolution
                extractor.OCRResolution = 300;


                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans
                //extractor.OCRImagePreprocessingFilters.AddDeskew();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
                //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Repair broken letters
                //extractor.OCRImagePreprocessingFilters.AddDilate();

                // Remove noise
                //extractor.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

                // Add Contrast
                //extractor.OCRImagePreprocessingFilters.AddContrast(20);


                // (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
                // filters for your specific document.
                // See "OCR Analyser" example.


                // Save extracted text to file
                extractor.SaveTextToFile("output.txt");
            }

            // Open result document in default associated application (for demo purpose)
            ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");

            processStartInfo.UseShellExecute = true;
            Process.Start(processStartInfo);
        }
예제 #18
0
        /// <summary>
        /// Runs <c>OCRAnalyzer.AnalyzeByOCRConfidence</c> on the first page of
        /// "sample_ocr.pdf", applies the analysis results to a <c>TextExtractor</c>,
        /// extracts the text to "result.txt" and opens the result file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Document to analyze and the index of the page used for the analysis
            string sourcePdf = @".\sample_ocr.pdf";
            int analyzedPage = 0;

            // Optional page area for the analysis; RectangleF.Empty means the full page.
            // Example of a restricted area: new RectangleF(100, 50, 350, 250)
            RectangleF analysisArea = RectangleF.Empty;

            // Folder with the OCR language data files
            // (more language files: https://github.com/bytescout/ocrdata)
            string languageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

            // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder
            string languageCode = "eng";

            // Create OCRAnalyzer instance and activate it with your registration information
            using (OCRAnalyzer analyzer = new OCRAnalyzer("demo", "demo"))
            {
                // Print every progress message reported by the analyzer
                analyzer.ProgressChanged += (object sender, string message, double progress, ref bool cancel) =>
                {
                    Console.WriteLine(message);
                };

                // Load the document, then configure the analyzer
                analyzer.LoadDocumentFromFile(sourcePdf);
                analyzer.OCRLanguage           = languageCode;
                analyzer.OCRLanguageDataFolder = languageDataFolder;

                // Restrict the analysis to the chosen page area (optional)
                analyzer.SetExtractionArea(analysisArea);

                // Perform analysis and keep the results for the extraction step
                OCRAnalysisResults detectedSettings = analyzer.AnalyzeByOCRConfidence(analyzedPage);

                // Now extract the text using detected OCR parameters
                string resultTxt = @".\result.txt";

                using (TextExtractor ocrExtractor = new TextExtractor("demo", "demo"))
                {
                    // Load the same document, then configure OCR on the extractor
                    ocrExtractor.LoadDocumentFromFile(sourcePdf);
                    ocrExtractor.OCRMode = OCRMode.Auto;
                    ocrExtractor.OCRLanguageDataFolder = languageDataFolder;
                    ocrExtractor.OCRLanguage           = languageCode;

                    // Transfer the analysis results to the extractor
                    analyzer.ApplyResults(detectedSettings, ocrExtractor);

                    // Use the same page area for extraction (optional)
                    ocrExtractor.SetExtractionArea(analysisArea);

                    // Save extracted text to file
                    ocrExtractor.SaveTextToFile(resultTxt);

                    // Open result document in default associated application (for demo purpose)
                    ProcessStartInfo viewerStart = new ProcessStartInfo(resultTxt)
                    {
                        UseShellExecute = true
                    };
                    Process.Start(viewerStart);
                }
            }
        }