/// <summary>
/// Extracts text from "sample_ocr.pdf" twice: first with the predefined
/// "scanned, no-layout" profiles, then with custom profiles loaded from
/// "profiles.json". Results go to "result1.txt" and "result2.txt".
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block guarantees the extractor is disposed even when
    // loading or saving throws; the original only disposed on success.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net2.00\tessdata";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Apply predefined profiles
        extractor.Profiles = "scanned, no-layout";

        // Extract text to file
        extractor.SaveTextToFile("result1.txt");

        // Reset the extractor before loading the document again
        extractor.Reset();

        // Load another document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Load and apply custom profiles
        extractor.LoadProfiles("profiles.json");
        extractor.Profiles = "keep-formatting, ocr-forced-200dpi";

        // Extract text to file
        extractor.SaveTextToFile("result2.txt");
    }
}
/// <summary>
/// Extracts text from a mixed English/Arabic PDF with right-to-left text
/// auto-detection enabled, writes it to ".\result.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample_english_arabic.pdf");

        // Enable Arabic (and other RTL languages) text detection
        extractor.RTLTextAutoDetectionEnabled = true;

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }

    // Open result file in default associated application
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Converts every "*.pdf" file in the current directory to a ".txt" file
/// with the same base name, reusing a single extractor instance.
/// </summary>
static void Main()
{
    // The using block guarantees disposal even when one of the files
    // fails to load or save part-way through the batch.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Get PDF files
        string[] pdfFiles = Directory.GetFiles(".", "*.pdf");

        foreach (string file in pdfFiles)
        {
            // Load document
            extractor.LoadDocumentFromFile(file);

            // Save extracted text to .txt file
            extractor.SaveTextToFile(Path.ChangeExtension(file, ".txt"));

            // Reset the extractor before loading another file
            extractor.Reset();
        }
    }
}
/// <summary>
/// Extracts text from "sample2.pdf" to "output.txt" with page data caching
/// turned off, then opens the output file.
/// </summary>
/// <remarks>
/// Processing huge PDF documents may run into OutOfMemoryException; this
/// sample shows how to spare memory by disabling page data caching.
/// </remarks>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    using (var pdfText = new TextExtractor("demo", "demo"))
    {
        try
        {
            // Load sample PDF document
            pdfText.LoadDocumentFromFile("sample2.pdf");

            // Disable page data caching, so processed pages will be disposed automatically
            pdfText.PageDataCaching = PageDataCaching.None;

            // Save extracted text to file
            pdfText.SaveTextToFile("output.txt");
        }
        catch (PDFExtractorException extractionError)
        {
            // Report SDK failures on the console and continue
            Console.Write(extractionError.ToString());
        }
    }

    // Open result document in default associated application (for demo purpose)
    var viewerLaunch = new ProcessStartInfo("output.txt") { UseShellExecute = true };
    Process.Start(viewerLaunch);
}
/// <summary>
/// Worker callback: converts one PDF file to text, then signals completion
/// and releases the thread limiter.
/// </summary>
/// <param name="state">
/// An object[] whose element 0 is the input file path (string) and
/// element 1 is the ManualResetEvent to set when the work is done.
/// </param>
private static void ConvertPdfToTxt(object state)
{
    // Unpack the work item once
    object[] workItem = (object[])state;
    string file = (string)workItem[0];
    ManualResetEvent doneEvent = (ManualResetEvent)workItem[1];

    // Output name is the input file name with ".txt" appended
    string resultFileName = Path.GetFileName(file) + ".txt";

    try
    {
        string startMessage = "Converting " + file;
        Console.WriteLine(startMessage);

        using (var pdfText = new TextExtractor("demo", "demo"))
        {
            pdfText.LoadDocumentFromFile(file);
            pdfText.SaveTextToFile(resultFileName);
        }

        string finishMessage = "Finished " + resultFileName;
        Console.WriteLine(finishMessage);
    }
    finally
    {
        // Signal the thread is finished
        doneEvent.Set();

        // Release semaphore
        ThreadLimiter.Release();
    }
}
/// <summary>
/// Extracts text from ".\DocumentWithWatermark.pdf" with a regex text
/// filter for "^COPY$" applied, writes it to ".\result.txt" and opens it.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\DocumentWithWatermark.pdf");

        // Filter text using text filter
        extractor.AddFilter(@"^COPY$", caseSensitive: true, useRegex: true);

        // Filter text using appearance filter
        // extractor.AddFilter("Arial", fontSize: 203, exclude: true);

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }

    // Open result file in default associated application
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts text from ".\columns.pdf" column by column, writes it to
/// ".\result.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\columns.pdf");

        // Extract text by columns (useful if PDF document is designed in column layout like a newspaper)
        extractor.ExtractColumnByColumn = true;

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }

    // Open result file in default associated application
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Obtains a PDF as a stream via GetStreamFromUrl, extracts its text to
/// ".\result.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Input file Url
    var inputUrl = @"https://bytescout-com.s3.amazonaws.com/files/demo-files/cloud-api/pdf-to-text/sample.pdf";

    // Both the input stream and the extractor are released deterministically,
    // even if extraction throws. The stream is the outer resource so it stays
    // open for as long as the extractor exists.
    // NOTE(review): assumes GetStreamFromUrl returns a disposable Stream - confirm.
    using (var inpStream = GetStreamFromUrl(inputUrl))
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromStream(inpStream);

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }

    // Open result file in default associated application.
    // UseShellExecute is set explicitly (as the sibling samples do) so the
    // document is opened through the shell on runtimes where it defaults to false.
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts all text from "hindiText.pdf" and writes it to
/// "extractedText.txt" using Unicode encoding. Any exception message is
/// printed to the console; the program then waits for the user to press Enter.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    try
    {
        // Source PDF and destination text file
        string sourcePdf = "hindiText.pdf";
        string targetTxt = "extractedText.txt";

        using (var pdfText = new TextExtractor())
        {
            // Load PDF document
            pdfText.LoadDocumentFromFile(sourcePdf);

            // Option 1: extract all text and write it to the destination file
            pdfText.SaveTextToFile(targetTxt, encoding: Encoding.Unicode);
            Console.WriteLine("All extracted text (hindi) written successfully to destination text file.");

            // Option 2: read all text into a string variable instead
            //string allText = pdfText.GetText();
        }
    }
    catch (Exception failure)
    {
        Console.WriteLine(failure.Message);
    }

    Console.WriteLine();
    Console.WriteLine("Press any key...");
    Console.ReadLine();
}
/// <summary>
/// Extracts text from "sample_ocr.pdf" with OCR in Auto mode (English,
/// 300 DPI), writes it to "output.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or OCR throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of "tessdata" folder containing language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in /tessdata
        // Find more language files at https://github.com/tesseract-ocr/tessdata/tree/3.04.00
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open output file in default associated application
    System.Diagnostics.Process.Start("output.txt");
}
/// <summary>
/// Worker callback: extracts text for one page range of a PDF with OCR in
/// Auto mode, then decrements the running-thread counter (signalling the
/// main thread when it reaches zero) and releases the thread limiter.
/// </summary>
/// <param name="stateInfo">
/// An object[] laid out as: [0] thread index (int), [1] "all finished"
/// ManualResetEvent, [2] input file (string), [3] output file (string),
/// [4] start page (int), [5] end page (int).
/// </param>
private static void ThreadProc(object stateInfo)
{
    // Unpack the work item once
    object[] job = (object[])stateInfo;
    int threadIndex = (int)job[0];
    ManualResetEvent allFinishedEvent = (ManualResetEvent)job[1];
    string inputFile = (string)job[2];
    string outputFile = (string)job[3];
    int startPage = (int)job[4];
    int endPage = (int)job[5];

    try
    {
        Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

        Stopwatch timer = Stopwatch.StartNew();

        // Process the piece
        using (var pdfText = new TextExtractor("demo", "demo"))
        {
            // Set page separator. Default is '\f' (Form Feed)
            pdfText.PageSeparator = Environment.NewLine;

            // Since we are only extracting text, disable the caching to reduce memory usage
            pdfText.PageDataCaching = PageDataCaching.None;

            pdfText.OCRMode = OCRMode.Auto;
            pdfText.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata\";
            pdfText.OCRLanguage = "eng";

            // 300 DPI resolution is recommended.
            // Higher values slow down the processing but do not guarantee higher quality.
            pdfText.OCRResolution = 300;

            pdfText.LoadDocumentFromFile(inputFile);
            pdfText.SaveTextToFile(startPage, endPage, outputFile);
        }

        Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, timer.Elapsed);
    }
    finally
    {
        // If it was the last thread, signal the main thread about the finish.
        int stillRunning = Interlocked.Decrement(ref _runningThreadsCounter);
        if (stillRunning == 0)
        {
            allFinishedEvent.Set();
        }

        // Release semaphore
        _threadLimiter.Release();
    }
}
/// <summary>
/// Extracts text from "sample2.pdf" to "output.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The original never disposed the extractor; the using block releases
    // it deterministically, including when loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample2.pdf");

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open output file in default associated application
    System.Diagnostics.Process.Start("output.txt");
}
/// <summary>
/// Initializes the window, then extracts text from a hard-coded PDF path
/// to "output.txt" and opens that file.
/// </summary>
public MainWindow()
{
    InitializeComponent();

    // The original never disposed the extractor; the using block releases
    // it deterministically, including when loading or saving throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        // NOTE(review): machine-specific absolute path - replace with your own document.
        extractor.LoadDocumentFromFile(@"C:\Users\toky\Documents\Autogids_Autogids_20180131_008.pdf");

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open output file in default associated application
    System.Diagnostics.Process.Start("output.txt");
}
/// <summary>
/// Runs OCR (Auto mode, English, 300 DPI) on "InvoiceWithNoise.png" with
/// the "ocr-dateIssue" profile from "profiles.json" applied, writes the
/// text to "output.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or OCR throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample scanned document
        extractor.LoadDocumentFromFile("InvoiceWithNoise.png");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // Add profiles to fix issues with dates.
        // To deal with a wrong "V" in dates you can use a regular expression;
        // the profile is intended to replace only "V" characters located between numbers.
        // NOTE(review): the actual rule lives in profiles.json, which is not visible here.
        extractor.LoadProfiles("profiles.json");
        extractor.Profiles = "ocr-dateIssue";

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Runs OCR (Auto mode, English, 300 DPI) on "sample_ocr.pdf" with
/// OCRMaximizeCPUUtilization enabled, writes the text to "output.txt" and
/// opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or OCR throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // Enables max use of CPU and max use of multiple threads during OCR
        extractor.OCRMaximizeCPUUtilization = true;

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Extracts text from ".\sample2.pdf" to ".\result.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if extraction throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile(@".\sample2.pdf");

        // Save extracted text to file
        extractor.SaveTextToFile(@".\result.txt");
    }

    // Open result file in default associated application.
    // UseShellExecute is set explicitly (as the sibling samples do) so the
    // document is opened through the shell on runtimes where it defaults to false.
    ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Runs OCR (Auto mode, English, 300 DPI) on "sample_ocr.pdf", writes the
/// text to "output.txt" and opens that file. Optional image preprocessing
/// filters are listed as commented-out examples.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The using block releases the extractor even if loading or OCR throws.
    using (TextExtractor extractor = new TextExtractor())
    {
        extractor.RegistrationName = "demo";
        extractor.RegistrationKey = "demo";

        // Load sample PDF document
        extractor.LoadDocumentFromFile("sample_ocr.pdf");

        // Enable Optical Character Recognition (OCR)
        // in .Auto mode (SDK automatically checks if needs to use OCR or not)
        extractor.OCRMode = OCRMode.Auto;

        // Set the location of OCR language data files
        extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
        // Find more language files at https://github.com/bytescout/ocrdata
        extractor.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        extractor.OCRResolution = 300;

        // You can also apply various preprocessing filters
        // to improve the recognition on low-quality scans.

        // Automatically deskew skewed scans
        //extractor.OCRImagePreprocessingFilters.AddDeskew();

        // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
        //extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
        //extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

        // Repair broken letters
        //extractor.OCRImagePreprocessingFilters.AddDilate();

        // Remove noise
        //extractor.OCRImagePreprocessingFilters.AddMedian();

        // Apply Gamma Correction
        //extractor.OCRImagePreprocessingFilters.AddGammaCorrection();

        // Add Contrast
        //extractor.OCRImagePreprocessingFilters.AddContrast(20);

        // (!) You can use the OCRAnalyzer class to find an optimal set of image
        // preprocessing filters for your specific document.
        // See "OCR Analyser" example.

        // Save extracted text to file
        extractor.SaveTextToFile("output.txt");
    }

    // Open result document in default associated application (for demo purpose)
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Analyzes page 0 of ".\sample_ocr.pdf" with OCRAnalyzer, applies the
/// analysis results to a TextExtractor, saves the extracted text to
/// ".\result.txt" and opens that file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Input document
    string sourcePdf = @".\sample_ocr.pdf";

    // Document page index
    int pageToAnalyze = 0;

    // Area of the document page to perform the analysis (optional).
    // RectangleF.Empty means the full page; e.g. new RectangleF(100, 50, 350, 250).
    RectangleF analysisArea = RectangleF.Empty;

    // Location of language data files
    string languageDataDir = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

    // OCR language:
    // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
    // Find more language files at https://github.com/bytescout/ocrdata
    string languageCode = "eng";

    // Create OCRAnalyzer instance and activate it with your registration information
    using (var analyzer = new OCRAnalyzer("demo", "demo"))
    {
        // Display analysis progress
        analyzer.ProgressChanged += (object sender, string message, double progress, ref bool cancel) =>
        {
            Console.WriteLine(message);
        };

        // Load document to OCRAnalyzer
        analyzer.LoadDocumentFromFile(sourcePdf);

        // Setup OCRAnalyzer
        analyzer.OCRLanguage = languageCode;
        analyzer.OCRLanguageDataFolder = languageDataDir;

        // Set page area for analysis (optional)
        analyzer.SetExtractionArea(analysisArea);

        // Perform analysis and get results
        OCRAnalysisResults detected = analyzer.AnalyzeByOCRConfidence(pageToAnalyze);

        // Now extract the text using detected OCR parameters
        string resultPath = @".\result.txt";

        // Create TextExtractor instance
        using (var pdfText = new TextExtractor("demo", "demo"))
        {
            // Load document to TextExtractor
            pdfText.LoadDocumentFromFile(sourcePdf);

            // Setup TextExtractor
            pdfText.OCRMode = OCRMode.Auto;
            pdfText.OCRLanguageDataFolder = languageDataDir;
            pdfText.OCRLanguage = languageCode;

            // Apply analysis results to TextExtractor instance
            analyzer.ApplyResults(detected, pdfText);

            // Set extraction area (optional)
            pdfText.SetExtractionArea(analysisArea);

            // Save extracted text to file
            pdfText.SaveTextToFile(resultPath);

            // Open result document in default associated application (for demo purpose)
            var viewerLaunch = new ProcessStartInfo(resultPath) { UseShellExecute = true };
            Process.Start(viewerLaunch);
        }
    }
}