/// <summary>
/// Runs OCR over "sample_ocr.pdf", writes the searchable result to "output.pdf"
/// and opens that file in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create the SearchablePDFMaker instance. The using block guarantees that
    // Dispose() runs even when loading the document or the OCR pass throws.
    using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
    {
        searchablePDFMaker.RegistrationName = "demo";
        searchablePDFMaker.RegistrationKey = "demo";

        // Load sample PDF document
        searchablePDFMaker.LoadDocumentFromFile("sample_ocr.pdf");

        // Set the location of "tessdata" folder containing language data files
        searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
        // according to files in /tessdata
        searchablePDFMaker.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        searchablePDFMaker.OCRResolution = 300;

        // Run OCR and save the searchable PDF to file
        searchablePDFMaker.MakePDFSearchable("output.pdf");
    }

    // Open output file in default associated application.
    // UseShellExecute is set explicitly because opening a document (rather than
    // an executable) by path requires shell execution.
    System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("output.pdf");
    processStartInfo.UseShellExecute = true;
    System.Diagnostics.Process.Start(processStartInfo);
}
/// <summary>
/// Runs OCR over "sample_ocr.pdf", writes the searchable result to "output.pdf"
/// and opens that file in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create the SearchablePDFMaker instance. The using block replaces the manual
    // Dispose() call so cleanup also happens when loading or OCR throws.
    using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
    {
        searchablePDFMaker.RegistrationName = "demo";
        searchablePDFMaker.RegistrationKey = "demo";

        // Load sample PDF document
        searchablePDFMaker.LoadDocumentFromFile("sample_ocr.pdf");

        // Set the location of language data files
        searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
        // according to files in "ocrdata" folder
        searchablePDFMaker.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        searchablePDFMaker.OCRResolution = 300;

        // Run OCR and save the searchable PDF to file
        searchablePDFMaker.MakePDFSearchable("output.pdf");
    }

    // Open output file in default associated application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Makes "sample_ocr.pdf" searchable via OCR while reporting progress, saves the
/// result as "output.pdf", opens it, and waits for Enter before exiting.
/// Any exception is reported by printing its message to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string sourcePdf = "sample_ocr.pdf";
    const string resultPdf = "output.pdf";

    try
    {
        /*
         * Font note:
         * "SearchablePDFMaker" by default places the recognized text over the scan using
         * one of the standard PDF fonts, which contain only the basic characters of the
         * ISO-8859-1 charset.
         * When the OCR language uses characters outside that encoding, set a font that
         * contains them explicitly through the ".LabelingFont" property.
         * On Windows with a locale matching the OCR language, the usual "Arial" font is
         * enough. In an unknown environment (for example, some virtual machine) install a
         * full Unicode font (e.g. "Arial Unicode MS") and use it with SearchablePDFMaker:
         *
         * //searchablePDFMaker.LabelingFont = "Arial Unicode MS";
         */
        using (SearchablePDFMaker maker = new SearchablePDFMaker("demo", "demo"))
        {
            // Load the sample document
            maker.LoadDocumentFromFile(sourcePdf);

            // Announce the run, then hook up the progress handler
            Console.WriteLine("Searchable PDF making in progress: \n");
            maker.ProgressChanged += SearchablePDF_ProgressChanged;

            // Folder containing the OCR language data files
            maker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // OCR language: "eng" for english, "deu" for German, "fra" for French,
            // "spa" for Spanish etc - according to files in "ocrdata" folder.
            // Find more language files at https://github.com/bytescout/ocrdata
            maker.OCRLanguage = "eng";

            // Rendering resolution used for OCR
            maker.OCRResolution = 300;

            // Produce the searchable PDF
            maker.MakePDFSearchable(resultPdf);

            // Show the result in the default associated application (for demo purpose)
            Process.Start(new ProcessStartInfo(resultPdf) { UseShellExecute = true });
        }
    }
    catch (Exception failure)
    {
        Console.WriteLine(failure.Message);
    }

    Console.WriteLine("\n\n Press enter key to exit...");
    Console.ReadLine();
}
/// <summary>
/// Runs OCR over "sample_ocr_withText.pdf" with the document's existing text
/// discarded, writes the searchable result to "output.pdf" and opens that file
/// in the default associated application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    /*
     * By default, "SearchablePDFMaker" uses one of the standard PDF fonts to apply
     * recognized text over the scanned document. Such fonts contain only basic characters
     * from ISO-8859-1 charset.
     * If you run OCR for one of the languages with characters that are not present in the default
     * encoding, you should explicitly specify the font that contains the required characters
     * using ".LabelingFont" property.
     * If you run the application in Windows with a selected locale that matches OCR language,
     * it will be enough to specify the usual font "Arial". But if your app will run in an unknown
     * environment (for example, in some virtual machine) you will need to install some full Unicode
     * font (e.g. "Arial Unicode MS") and then use it with SearchablePDFMaker:
     *
     * //searchablePDFMaker.LabelingFont = "Arial Unicode MS";
     */

    // Create the SearchablePDFMaker instance. The using block replaces the manual
    // Dispose() call so cleanup also happens when loading or OCR throws.
    using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
    {
        searchablePDFMaker.RegistrationName = "demo";
        searchablePDFMaker.RegistrationKey = "demo";

        // Load sample PDF document
        searchablePDFMaker.LoadDocumentFromFile("sample_ocr_withText.pdf");

        // Set the location of language data files
        searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

        // Set OCR language:
        // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
        // according to files in "ocrdata" folder
        searchablePDFMaker.OCRLanguage = "eng";

        // Set PDF document rendering resolution
        searchablePDFMaker.OCRResolution = 300;

        // Discard existing text in document
        searchablePDFMaker.DiscardExistingDocumentText = true;

        // Run OCR and save the searchable PDF to file
        searchablePDFMaker.MakePDFSearchable("output.pdf");
    }

    // Open output file in default associated application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("output.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Worker routine: extracts one page range of the input PDF into a temporary file,
/// makes that piece searchable via OCR and writes it to the given output file.
/// </summary>
/// <param name="stateInfo">
/// An <c>object[]</c> holding, in order: thread index (int), completion event
/// (ManualResetEvent), input file path (string), output file path (string),
/// start page (int) and end page (int).
/// </param>
private static void ThreadProc(object stateInfo)
{
    // Unpack the state array once instead of re-casting it for every element.
    object[] state = (object[])stateInfo;
    int threadIndex = (int)state[0];
    ManualResetEvent doneEvent = (ManualResetEvent)state[1];
    string inputFile = (string)state[2];
    string outputFile = (string)state[3];
    int startPage = (int)state[4];
    int endPage = (int)state[5];

    try
    {
        Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

        Stopwatch stopwatch = Stopwatch.StartNew();

        // Name of the temporary file that receives the extracted piece
        string chunk = string.Format("temp-{0}-{1}", startPage, endPage);

        try
        {
            // Extract a piece of document (page numbers are shifted by one for the splitter call)
            using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
            {
                splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1);
            }

            // Process the piece
            using (SearchablePDFMaker searchablePdfMaker = new SearchablePDFMaker("demo", "demo"))
            {
                searchablePdfMaker.OCRDetectPageRotation = true;
                searchablePdfMaker.OCRLanguageDataFolder = @"C:\Program Files\Bytescout PDF Extractor SDK\net4.00\tessdata";

                searchablePdfMaker.LoadDocumentFromFile(chunk);

                // 300 DPI resolution is recommended.
                // Using of higher values will slow down the processing but does not guarantee the higher quality.
                searchablePdfMaker.OCRResolution = 300;

                searchablePdfMaker.MakePDFSearchable(outputFile);
            }
        }
        finally
        {
            // Remove the temporary piece even when extraction or OCR failed,
            // so failed runs do not leave temp files behind.
            File.Delete(chunk);
        }

        Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed);
    }
    finally
    {
        // Signal the thread is finished
        doneEvent.Set();

        // Release semaphore
        ThreadLimiter.Release();
    }
}
/// <summary>
/// Makes "sample_ocr.pdf" searchable via OCR while reporting progress, saves the
/// result as "output.pdf", opens it, and waits for Enter before exiting.
/// Any exception is reported by printing its message to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const string sourcePdf = "sample_ocr.pdf";
    const string resultPdf = "output.pdf";

    try
    {
        using (SearchablePDFMaker maker = new SearchablePDFMaker("demo", "demo"))
        {
            // Load the sample document
            maker.LoadDocumentFromFile(sourcePdf);

            // Announce the run, then hook up the progress handler
            Console.WriteLine("Searchable PDF making in progress: \n");
            maker.ProgressChanged += SearchablePDF_ProgressChanged;

            // Folder containing the OCR language data files
            maker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // OCR language: "eng" for english, "deu" for German, "fra" for French,
            // "spa" for Spanish etc - according to files in "ocrdata" folder.
            // Find more language files at https://github.com/bytescout/ocrdata
            maker.OCRLanguage = "eng";

            // Rendering resolution used for OCR
            maker.OCRResolution = 300;

            // Produce the searchable PDF
            maker.MakePDFSearchable(resultPdf);

            // Show the result in the default associated application (for demo purpose)
            Process.Start(new ProcessStartInfo(resultPdf) { UseShellExecute = true });
        }
    }
    catch (Exception failure)
    {
        Console.WriteLine(failure.Message);
    }

    Console.WriteLine("\n\n Press enter key to exit...");
    Console.ReadLine();
}
/// <summary>
/// Rebuilds a Hindi PDF as "first image + invisible extracted text", then runs OCR
/// on the rebuilt file to produce a searchable PDF and opens the result.
/// Any exception is reported on the console as "ERROR:" followed by its message.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        // Files
        string fileName = "hindi_text_with_image.pdf";
        string destFileName = "output_hindi_text_with_image.pdf";
        string destFileNameSearchable = "output_hindi_text_with_image_searchable.pdf";

        // Read all text from pdf file
        string allTextExtracted = "";
        using (TextExtractor extractor = new TextExtractor())
        {
            // Load PDF document
            extractor.LoadDocumentFromFile(fileName);

            // Read all text directly
            allTextExtracted = extractor.GetText();
        }

        // The stream must stay open for as long as the System.Drawing.Image created
        // from it is alive, so it wraps the whole image-processing section.
        using (MemoryStream memoryStream = new MemoryStream())
        {
            // Get image from pdf file
            bool imageFound;
            using (ImageExtractor extractor = new ImageExtractor())
            {
                // Load PDF document
                extractor.LoadDocumentFromFile(fileName);

                imageFound = extractor.GetFirstImage();
                if (imageFound)
                {
                    extractor.SaveCurrentImageToStream(memoryStream, ImageFormat.Png);
                }
            }

            // Fail with a clear message instead of letting Image.FromStream
            // choke on an empty stream.
            if (!imageFound)
            {
                throw new InvalidOperationException("No image found in \"" + fileName + "\".");
            }

            // Rewind so the image decoder reads from the beginning of the data
            memoryStream.Position = 0;

            // Load image to System.Drawing.Image object (we need it to get the image resolution)
            using (System.Drawing.Image sysImage = System.Drawing.Image.FromStream(memoryStream))
            {
                // Compute image size in PDF units (Points)
                float widthInPoints = sysImage.Width / sysImage.HorizontalResolution * 72f;
                float heightInPoints = sysImage.Height / sysImage.VerticalResolution * 72f;

                // Create new PDF document
                using (Document outPdfDocument = new Document())
                {
                    outPdfDocument.RegistrationName = "demo";
                    outPdfDocument.RegistrationKey = "demo";

                    // Create page of computed size
                    Page page = new Page(widthInPoints, heightInPoints);

                    // Add page to the document
                    outPdfDocument.Pages.Add(page);

                    Canvas canvas = page.Canvas;

                    // Create Bytescout.PDF.Image object from loaded image
                    Image pdfImage = new Image(sysImage);

                    // Draw the image
                    canvas.DrawImage(pdfImage, 0, 0, widthInPoints, heightInPoints);

                    // sysImage is released by its using block after the document is saved;
                    // no explicit Dispose() call is needed here.

                    // Create brush
                    SolidBrush transparentBrush = new SolidBrush(new ColorGray(0));

                    // ... and make it transparent
                    transparentBrush.Opacity = 0;

                    // Draw text with transparent brush
                    // Need to set Font which supports hindi characters.
                    Font font16 = new Font("Arial Unicode MS", 16);
                    canvas.DrawString(allTextExtracted, font16, transparentBrush, 40, 40);

                    // Save document to file
                    outPdfDocument.Save(destFileName);
                }
            }
        }

        // Make PDF file with hindi text searchable to OCR.
        using (SearchablePDFMaker searchablePDFMaker = new SearchablePDFMaker())
        {
            // Load PDF document
            searchablePDFMaker.LoadDocumentFromFile(destFileName);

            // Set the location of "tessdata" folder containing language data files
            /*
             * It used following files for hindi language support. Need to put these files into "tessdata" folder. Below location contains these files.
             * https://github.com/tesseract-ocr/tessdata/tree/3.04.00
             * hin.traineddata
             * hin.cube.bigrams
             * hin.cube.lm
             * hin.cube.nn
             * hin.cube.params
             * hin.cube.word-freq
             * hin.tesseract_cube.nn
             */
            searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\Redistributable\net2.00\tessdata\";

            // Set OCR language
            searchablePDFMaker.OCRLanguage = "hin";

            // Need to set Font which supports hindi characters
            searchablePDFMaker.LabelingFont = "Arial Unicode MS";

            // Set PDF document rendering resolution
            searchablePDFMaker.OCRResolution = 300;

            searchablePDFMaker.MakePDFSearchable(destFileNameSearchable);
        }

        // Open document in default PDF viewer app.
        // UseShellExecute is set explicitly because opening a document (rather than
        // an executable) by path requires shell execution.
        ProcessStartInfo processStartInfo = new ProcessStartInfo(destFileNameSearchable);
        processStartInfo.UseShellExecute = true;
        Process.Start(processStartInfo);
    }
    catch (Exception ex)
    {
        Console.WriteLine("ERROR:" + ex.Message);
    }

    Console.ReadLine();
}
/// <summary>
/// Three-step demo: (1) make a scanned PDF searchable into an in-memory stream,
/// (2) find e-mail addresses in that searchable PDF with a regular expression,
/// (3) remove the found text and save the result as "result1.pdf", which is then
/// opened in the default application.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // STEP-1: Make Searchable PDF
    // STEP-2: Get search text result from that searchable PDF
    // STEP-3: Remove sensitive data

    // Holds the intermediate searchable PDF; disposed once all three steps are done.
    using (MemoryStream searchablePDFStream = new MemoryStream())
    {
        // Create SearchablePDFMaker instance
        using (var searchablePDFMaker = new SearchablePDFMaker("demo", "demo"))
        {
            // Load sample PDF document
            searchablePDFMaker.LoadDocumentFromFile("sampleScannedPDF_EmailAddress.pdf");

            // Set the location of language data files
            searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // Set OCR language:
            // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc -
            // according to files in "ocrdata" folder
            searchablePDFMaker.OCRLanguage = "eng";

            // Set PDF document rendering resolution
            searchablePDFMaker.OCRResolution = 300;

            // Run OCR and write the searchable PDF into the memory stream
            searchablePDFMaker.MakePDFSearchable(searchablePDFStream);

            // Prepare TextExtractor
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            {
                // Load stream into TextExtractor
                textExtractor.LoadDocumentFromStream(searchablePDFStream);

                // Search email Addresses
                // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
                string regexPattern = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b";

                // Enable RegexSearch
                textExtractor.RegexSearch = true;

                // Set word matching options
                textExtractor.WordMatchingMode = WordMatchingMode.None;

                ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

                // Create Remover2 instance
                using (var remover = new Remover2("demo", "demo"))
                {
                    // Load the searchable PDF from the stream
                    remover.LoadDocumentFromStream(searchablePDFStream);

                    // Mask removed text
                    remover.MaskRemovedText = true;

                    // Make output file unsearchable
                    remover.MakePDFUnsearchable = true;

                    // Provide text to remove
                    remover.AddTextToRemove(searchResults);

                    // Remove text objects found by SearchResults.
                    remover.PerformRemoval("result1.pdf");
                }
            }
        }
    }

    // Open output file in default application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Worker routine: extracts one page range of the input PDF into a temporary file,
/// makes that piece searchable via OCR and writes it to the given output file.
/// </summary>
/// <param name="stateInfo">
/// An <c>object[]</c> holding, in order: thread index (int), completion event
/// (ManualResetEvent), input file path (string), output file path (string),
/// start page (int) and end page (int).
/// </param>
private static void ThreadProc(object stateInfo)
{
    // Unpack the state array once instead of re-casting it for every element.
    object[] state = (object[])stateInfo;
    int threadIndex = (int)state[0];
    ManualResetEvent doneEvent = (ManualResetEvent)state[1];
    string inputFile = (string)state[2];
    string outputFile = (string)state[3];
    int startPage = (int)state[4];
    int endPage = (int)state[5];

    try
    {
        Console.WriteLine("Thread #{0} started with the page range from {1} to {2}.", threadIndex, startPage, endPage);

        Stopwatch stopwatch = Stopwatch.StartNew();

        // Name of the temporary file that receives the extracted piece
        string chunk = string.Format("temp-{0}-{1}", startPage, endPage);

        try
        {
            // Extract a piece of document (page numbers are shifted by one for the splitter call)
            using (DocumentSplitter splitter = new DocumentSplitter("demo", "demo"))
            {
                splitter.ExtractPageRange(inputFile, chunk, startPage + 1, endPage + 1);
            }

            /*
             * By default, "SearchablePDFMaker" uses one of the standard PDF fonts to apply
             * recognized text over the scanned document. Such fonts contain only basic characters
             * from ISO-8859-1 charset.
             * If you run OCR for one of the languages with characters that are not present in the default
             * encoding, you should explicitly specify the font that contains the required characters
             * using ".LabelingFont" property.
             * If you run the application in Windows with a selected locale that matches OCR language,
             * it will be enough to specify the usual font "Arial". But if your app will run in an unknown
             * environment (for example, in some virtual machine) you will need to install some full Unicode
             * font (e.g. "Arial Unicode MS") and then use it with SearchablePDFMaker:
             *
             * //searchablePDFMaker.LabelingFont = "Arial Unicode MS";
             */

            // Process the piece
            using (SearchablePDFMaker searchablePdfMaker = new SearchablePDFMaker("demo", "demo"))
            {
                searchablePdfMaker.OCRDetectPageRotation = true;
                searchablePdfMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                searchablePdfMaker.LoadDocumentFromFile(chunk);

                // 300 DPI resolution is recommended.
                // Using of higher values will slow down the processing but does not guarantee the higher quality.
                searchablePdfMaker.OCRResolution = 300;

                searchablePdfMaker.MakePDFSearchable(outputFile);
            }
        }
        finally
        {
            // Remove the temporary piece even when extraction or OCR failed,
            // so failed runs do not leave temp files behind.
            File.Delete(chunk);
        }

        Console.WriteLine("Thread #{0} finished in {1}.", threadIndex, stopwatch.Elapsed);
    }
    finally
    {
        // Signal the thread is finished
        doneEvent.Set();

        // Release semaphore
        ThreadLimiter.Release();
    }
}