/// <summary>
/// Searches page index 0 of "samplePDF_EmailSSN.pdf" for SSN-like numbers and
/// email addresses, removes the matches with masking enabled, writes the result
/// to "result1.pdf" and opens that file through the shell.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block releases the remover even when loading, searching or removal throws.
    using (Remover2 remover = new Remover2("demo", "demo"))
    {
        // Mask removed text, which ultimately blacks out the region
        remover.MaskRemovedText = true;

        // Load sample PDF document
        remover.LoadDocumentFromFile(@"samplePDF_EmailSSN.pdf");

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load the same document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"samplePDF_EmailSSN.pdf");

            // Treat the search strings as regular expressions
            textExtractor.RegexSearch = true;

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // SSN in format 202-55-0130
            string regexPatternSSN = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

            // Email addresses
            string regexPatternEmail = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b";

            // Both searches are limited to page index 0
            ISearchResult[] searchResultsSSN = textExtractor.FindAll(0, regexPatternSSN, caseSensitive: false);
            ISearchResult[] searchResultEmail = textExtractor.FindAll(0, regexPatternEmail, caseSensitive: false);

            // Queue the text found by both searches for removal
            remover.AddTextToRemove(searchResultsSSN);
            remover.AddTextToRemove(searchResultEmail);

            // Perform removal of the queued objects and save the output
            remover.PerformRemoval(@"result1.pdf");
        }
    }

    // Open output file in default application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Searches page index 0 of "sample1.pdf" for the keyword "Martian dichotomy",
/// removes the found text, writes the result to "result1.pdf" and opens that
/// file through the shell.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.Remover instance
    Remover remover = new Remover("demo", "demo");

    try
    {
        // Load sample PDF document
        remover.LoadDocumentFromFile(@"sample1.pdf");

        // Search keyword
        string searchKeyword = "Martian dichotomy";

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load the same document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"sample1.pdf");

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // The search is limited to page index 0
            ISearchResult[] searchResults = textExtractor.FindAll(0, searchKeyword, caseSensitive: false);

            // Remove the text objects found by the search.
            // NOTE: The removed text might be larger than the specified rectangle. Currently the Remover is unable
            // to split PDF text objects.
            remover.RemoveText(searchResults, @"result1.pdf");
        }
    }
    finally
    {
        // Clean up, even when loading, searching or removal throws.
        remover.Dispose();
    }

    // Open output file in default application. UseShellExecute is set explicitly
    // because its default is false on .NET Core / .NET 5+, where starting a
    // document (rather than an executable) would otherwise fail.
    System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    System.Diagnostics.Process.Start(processStartInfo);
}
/// <summary>
/// Searches page index 0 of "samplePDF_SSNNo.pdf" for SSN-like numbers, removes
/// the matches with masking enabled, writes the result to "result1.pdf" and
/// opens that file through the shell.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The using block releases the remover even when loading, searching or removal throws.
    using (Remover2 remover = new Remover2("demo", "demo"))
    {
        // Mask removed text, which ultimately blacks out the region
        remover.MaskRemovedText = true;

        // Load sample PDF document
        remover.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load the same document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

            // SSN in format 202-55-0130
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

            // Treat the search string as a regular expression
            textExtractor.RegexSearch = true;

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // The search is limited to page index 0
            ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

            // Queue the text found by the search for removal
            remover.AddTextToRemove(searchResults);

            // Perform removal of the queued objects and save the output
            remover.PerformRemoval(@"result1.pdf");
        }
    }

    // Open output file in default application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Searches page index 0 of "samplePDF_SSNNo.pdf" for SSN-like numbers, removes
/// the found text, writes the result to "result1.pdf" and opens that file
/// through the shell.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.Remover instance
    Remover remover = new Remover("demo", "demo");

    try
    {
        // Load sample PDF document
        remover.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load the same document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"samplePDF_SSNNo.pdf");

            // SSN in format 202-55-0130
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

            // Treat the search string as a regular expression
            textExtractor.RegexSearch = true;

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // The search is limited to page index 0
            ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

            // Remove the text objects found by the search.
            // NOTE: The removed text might be larger than the specified rectangle. Currently the Remover is unable
            // to split PDF text objects.
            remover.RemoveText(searchResults, @"result1.pdf");
        }
    }
    finally
    {
        // Clean up, even when loading, searching or removal throws.
        remover.Dispose();
    }

    // Open output file in default application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Searches page index 0 of "samplePDF_EmailAddress.pdf" for email addresses,
/// removes the found text, writes the result to "result1.pdf" and opens that
/// file through the shell.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create Bytescout.PDFExtractor.Remover instance
    Remover remover = new Remover("demo", "demo");

    try
    {
        // Load sample PDF document
        remover.LoadDocumentFromFile(@"samplePDF_EmailAddress.pdf");

        // Prepare TextExtractor
        using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
        {
            // Load the same document into TextExtractor
            textExtractor.LoadDocumentFromFile(@"samplePDF_EmailAddress.pdf");

            // Email addresses
            // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
            string regexPattern = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b";

            // Treat the search string as a regular expression
            textExtractor.RegexSearch = true;

            // Set word matching options
            textExtractor.WordMatchingMode = WordMatchingMode.None;

            // The search is limited to page index 0
            ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

            // Remove the text objects found by the search.
            // NOTE: The removed text might be larger than the specified rectangle. Currently the Remover is unable
            // to split PDF text objects.
            remover.RemoveText(searchResults, @"result1.pdf");
        }
    }
    finally
    {
        // Clean up, even when loading, searching or removal throws.
        remover.Dispose();
    }

    // Open output file in default application. UseShellExecute is set explicitly
    // because its default is false on .NET Core / .NET 5+, where starting a
    // document (rather than an executable) would otherwise fail.
    System.Diagnostics.ProcessStartInfo processStartInfo = new System.Diagnostics.ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    System.Diagnostics.Process.Start(processStartInfo);
}
/// <summary>
/// Click handler for the "find all" button: searches every page of the document
/// shown in the viewer for the expression typed by the user, records the bounds
/// of the matches per page, and selects the matches on the current page.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void BtnFindAll_Click(object sender, EventArgs e)
{
    // Expressions of one character or less are rejected up front.
    if (tbSearchExpression.Text.Length <= 1)
    {
        MessageBox.Show(@"Try larger search string");
        return;
    }

    using (TextExtractor extractor = new TextExtractor("demo", "demo"))
    {
        // Search the document currently displayed in the viewer
        extractor.LoadDocumentFromFile(pdfViewerControl1.InputFile);

        // Regex mode follows the checkbox; word matching is fixed
        extractor.RegexSearch = cbRegex.Checked;
        extractor.WordMatchingMode = WordMatchingMode.None;

        // Walk all pages and remember where the matches are
        for (int page = 0; page < extractor.GetPageCount(); page++)
        {
            ISearchResult[] matches = extractor.FindAll(page, tbSearchExpression.Text, caseSensitive: true);

            // Pages without matches leave their entry untouched
            if (matches.Length == 0)
            {
                continue;
            }

            var bounds = matches.Select(match => match.Bounds);
            _foundTextRectangles[page] = bounds.ToArray();
        }
    }

    // Highlight the matches recorded for the page currently shown in the viewer
    var shownPage = pdfViewerControl1.CurrentPageIndex;
    if (_foundTextRectangles.ContainsKey(shownPage))
    {
        pdfViewerControl1.SelectionInPoints = _foundTextRectangles[shownPage];
    }
}
/// <summary>
/// Three-step sample working on a scanned PDF:
/// 1. make "sampleScannedPDF_EmailAddress.pdf" searchable via OCR, writing the result to an in-memory stream;
/// 2. search page index 0 of that stream for email addresses;
/// 3. remove the matches with masking enabled and save "result1.pdf",
/// which is then opened through the shell.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The stream holding the intermediate searchable PDF is released when the block ends.
    using (MemoryStream searchablePDFStream = new MemoryStream())
    {
        // STEP-1: Make Searchable PDF
        // STEP-2: Get search text result from that searchable PDF
        // STEP-3: Remove sensitive data

        // Create Bytescout.PDFExtractor.SearchablePDFMaker instance
        using (var searchablePDFMaker = new SearchablePDFMaker("demo", "demo"))
        {
            // Load sample PDF document
            searchablePDFMaker.LoadDocumentFromFile("sampleScannedPDF_EmailAddress.pdf");

            // Set the location of language data files
            searchablePDFMaker.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

            // Set OCR language
            // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
            searchablePDFMaker.OCRLanguage = "eng";

            // Set PDF document rendering resolution
            searchablePDFMaker.OCRResolution = 300;

            // Write the searchable PDF into the in-memory stream
            searchablePDFMaker.MakePDFSearchable(searchablePDFStream);

            // Prepare TextExtractor
            using (TextExtractor textExtractor = new TextExtractor("demo", "demo"))
            {
                // Load stream into TextExtractor
                // NOTE(review): the stream is passed on without rewinding; presumably the SDK
                // positions it itself - confirm.
                textExtractor.LoadDocumentFromStream(searchablePDFStream);

                // Email addresses
                // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
                string regexPattern = @"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b";

                // Treat the search string as a regular expression
                textExtractor.RegexSearch = true;

                // Set word matching options
                textExtractor.WordMatchingMode = WordMatchingMode.None;

                // The search is limited to page index 0
                ISearchResult[] searchResults = textExtractor.FindAll(0, regexPattern, caseSensitive: false);

                // Create Bytescout.PDFExtractor.Remover2 instance
                using (var remover = new Remover2("demo", "demo"))
                {
                    // Load the searchable PDF from the stream
                    remover.LoadDocumentFromStream(searchablePDFStream);

                    // Mask removed text
                    remover.MaskRemovedText = true;

                    // Make output file unsearchable
                    remover.MakePDFUnsearchable = true;

                    // Provide text to remove
                    remover.AddTextToRemove(searchResults);

                    // Remove the text objects found by the search and save the output
                    remover.PerformRemoval("result1.pdf");
                }
            }
        }
    }

    // Open output file in default application
    ProcessStartInfo processStartInfo = new ProcessStartInfo("result1.pdf");
    processStartInfo.UseShellExecute = true;
    Process.Start(processStartInfo);
}
/// <summary>
/// Searches every page of the input document for SSN-like numbers, prints each
/// match, removes the matched regions with masking enabled, saves the edited
/// document and opens it through the shell. Any exception is reported on the
/// console; the program then waits for Enter before returning.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string inputDocument = @".\samplePDF_SSNNo.pdf";
    string outputDocument = @".\samplePDF_SSNNo_edited.pdf";

    // SSN in format 202-55-0130.
    // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
    const string regexPattern = "[0-9]{3}-[0-9]{2}-[0-9]{4}";

    try
    {
        // Create Bytescout.PDFExtractor.TextExtractor instance
        using (TextExtractor extractor = new TextExtractor("demo", "demo"))
        {
            // Create Bytescout.PDFExtractor.Remover2 instance
            using (Remover2 remover = new Remover2("demo", "demo"))
            {
                // Both objects load the same path so that the found bounds
                // always refer to the document being edited.
                extractor.LoadDocumentFromFile(inputDocument);
                remover.LoadDocumentFromFile(inputDocument);

                // Enable the regular expressions
                extractor.RegexSearch = true;

                int pageCount = extractor.GetPageCount();

                // Search through pages
                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    // Search each page for the pattern
                    ISearchResult[] searchResults = extractor.FindAll(pageIndex, regexPattern, caseSensitive: false);

                    foreach (var element in searchResults)
                    {
                        Console.WriteLine("Found SSN No: " + element.Text);

                        // Add rectangle of the found SSN to Remover
                        remover.AddTextToRemove(pageIndex, element.Bounds);
                    }
                }

                // Mask replaced text with black rectangle
                remover.MaskRemovedText = true;

                // Change the color of the mask rectangle, if necessary
                //remover.MaskColor = Color.Red;

                remover.PerformRemoval(outputDocument);

                Console.WriteLine("Found SSNs removed, result saved to file \"" + outputDocument + "\"");
            }
        }

        // Open result file in default associated application (for the demonstration purpose)
        var processStartInfo = new ProcessStartInfo(outputDocument) { UseShellExecute = true };
        Process.Start(processStartInfo);
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: " + ex.Message);
    }

    Console.WriteLine();
    Console.WriteLine("Press enter key to continue...");
    Console.ReadLine();
}