static void Main(string[] args) { // Create Bytescout.PDFExtractor.TextExtractor instance TextExtractor extractor = new TextExtractor(); extractor.RegistrationName = "demo"; extractor.RegistrationKey = "demo"; // Input file Url var inputUrl = @"https://bytescout-com.s3.amazonaws.com/files/demo-files/cloud-api/pdf-to-text/sample.pdf"; // Get Input Stream var inpStream = GetStreamFromUrl(inputUrl); // Load sample PDF document extractor.LoadDocumentFromStream(inpStream); // Save extracted text to file extractor.SaveTextToFile(@".\result.txt"); // Cleanup extractor.Dispose(); // Open result file in default associated application ProcessStartInfo processStartInfo = new ProcessStartInfo(@".\result.txt"); Process.Start(processStartInfo); }
static void Main(string[] args) { MemoryStream searchablePDFStream = new MemoryStream(); // STEP-1: Make Searchable PDF // STEP-2: Get search text result from that searchable PDF // STEP-3: Remove sensitive data // Create Bytescout.PDFExtractor.SearchablePDFMaker instance using (var searchablePDFMaker = new SearchablePDFMaker("demo", "demo")) { // Load sample PDF document searchablePDFMaker.LoadDocumentFromFile("sampleScannedPDF_EmailAddress.pdf"); // Set the location of language data files searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\"; // Set OCR language searchablePDFMaker.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder // Set PDF document rendering resolution searchablePDFMaker.OCRResolution = 300; // Save extracted text to file searchablePDFMaker.MakePDFSearchable(searchablePDFStream); // Prepare TextExtractor using (TextExtractor textExtractor = new TextExtractor("demo", "demo")) { // Load stream into TextExtractor textExtractor.LoadDocumentFromStream(searchablePDFStream); // Search email Addresses // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx string regexPattern = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b"; // Enable RegexSearch textExtractor.RegexSearch = true; // Set word matching options textExtractor.WordMatchingMode = WordMatchingMode.None; ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false); // Create Bytescout.PDFExtractor.Remover instance using (var remover = new Remover2("demo", "demo")) { // Load sample PDF document remover.LoadDocumentFromStream(searchablePDFStream); // Mask removed text remover.MaskRemovedText = true; // Make output file unsearchable remover.MakePDFUnsearchable = true; // Provide text to remove remover.AddTextToRemove(searchResults); // Remove text objects find by SearchResults. remover.PerformRemoval("result1.pdf"); } } } // Open output file in default application ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf"); processStartInfo.UseShellExecute = true; Process.Start(processStartInfo); }