/// <summary>
/// RegexExtractText sample entry point. Searches every page of a PDF for text
/// matching a regular expression and writes each matched phrase, together with
/// the page number and corner coordinates of each of its quads, to a JSON file.
/// </summary>
/// <param name="args">
/// Optional command-line arguments. When present, <c>args[0]</c> replaces the
/// default input PDF path.
/// </param>
static void Main(string[] args)
{
    Console.WriteLine("RegexExtractText Sample:");

    using (Library lib = new Library())
    {
        Console.WriteLine("Initialized the library.");

        String sInput = Library.ResourceDirectory + "Sample_Input/RegexExtractText.pdf";
        String sOutput = "../RegexExtractText-out.json";

        // Uncomment only one regular expression you are interested in seeing the match information of (as a JSON file).
        // Phone numbers
        String sRegex = "((1-)?(\\()?\\d{3}(\\))?(\\s)?(-)?\\d{3}-\\d{4})";
        // Email addresses
        //String sRegex = "(\\b[\\w.!#$%&'*+\\/=?^`{|}~-]+@[\\w-]+(?:\\.[\\w-]+)*\\b)";
        // URLs
        //String sRegex = "((https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,}))";

        if (args.Length > 0)
        {
            sInput = args[0];
        }

        using (Document doc = new Document(sInput))
        {
            int nPages = doc.NumPages;
            Console.WriteLine("Input file: " + sInput);

            // This will hold the match data that is serialized to the output JSON file.
            DocTextFinderJson result = new DocTextFinderJson();
            result.documentJson = new List<MatchObject>();

            WordFinderConfig wordConfig = new WordFinderConfig();
            // Need to set this to true so phrases will be concatenated properly.
            wordConfig.NoHyphenDetection = true;

            // Create a DocTextFinder that uses the word finder configuration above.
            using (DocTextFinder docTextFinder = new DocTextFinder(doc, wordConfig))
            {
                // Retrieve the phrases matching the regular expression on pages 0 .. nPages - 1.
                IList<DocTextFinderMatch> docMatches = docTextFinder.GetMatchList(0, nPages - 1, sRegex);

                // Convert every match into a MatchObject and collect it in the result.
                foreach (DocTextFinderMatch wInfo in docMatches)
                {
                    // One entry (page number + quad corners) per quad of this match.
                    List<MatchQuadInformation> matchQuadInformationList = new List<MatchQuadInformation>();

                    foreach (DocTextFinderQuadInfo qInfo in wInfo.QuadInfo)
                    {
                        foreach (Quad quad in qInfo.Quads)
                        {
                            // Build a fresh entry for every quad. Reusing a single instance
                            // across this loop would add the same object to the list several
                            // times, leaving every entry with the last quad's coordinates.
                            MatchQuadInformation quadEntry = new MatchQuadInformation();
                            quadEntry.pageNumber = qInfo.PageNum;
                            quadEntry.quadLocation = new QuadLocation
                            {
                                topLeft = new TopLeft { x = quad.TopLeft.H, y = quad.TopLeft.V },
                                bottomLeft = new BottomLeft { x = quad.BottomLeft.H, y = quad.BottomLeft.V },
                                topRight = new TopRight { x = quad.TopRight.H, y = quad.TopRight.V },
                                bottomRight = new BottomRight { x = quad.BottomRight.H, y = quad.BottomRight.V }
                            };

                            matchQuadInformationList.Add(quadEntry);
                        }
                    }

                    // Store the matched phrase together with its quads.
                    MatchObject matchObject = new MatchObject();
                    matchObject.matchPhrase = wInfo.MatchString;
                    matchObject.matchQuads = matchQuadInformationList;
                    result.documentJson.Add(matchObject);
                }

                // Save the output JSON file.
                Console.WriteLine("Writing JSON to " + sOutput);
                string json = JsonConvert.SerializeObject(result.documentJson, Formatting.Indented);
                System.IO.File.WriteAllText(sOutput, json);
            }
        }
    }
}
/// <summary>
/// RegexTextSearch sample entry point. Searches every page of a PDF for text
/// matching a regular expression, prints each matched phrase, adds a highlight
/// annotation over the quads of every match, and saves the result as a new PDF.
/// </summary>
/// <param name="args">
/// Optional command-line arguments. When present, <c>args[0]</c> replaces the
/// default input PDF path.
/// </param>
static void Main(string[] args)
{
    Console.WriteLine("RegexTextSearch Sample:");

    using (Library lib = new Library())
    {
        Console.WriteLine("Initialized the library.");

        String sInput = Library.ResourceDirectory + "Sample_Input/RegexTextSearch.pdf";
        String sOutput = "RegexTextSearch-out.pdf";

        // Highlight occurrences of the phrases that match this regular expression.
        // Uncomment only the one you are interested in seeing displayed with highlights.
        // Phone numbers
        String sRegex = "((1-)?(\\()?\\d{3}(\\))?(\\s)?(-)?\\d{3}-\\d{4})";
        // Email addresses
        //String sRegex = "(\\b[\\w.!#$%&'*+\\/=?^`{|}~-]+@[\\w-]+(?:\\.[\\w-]+)*\\b)";
        // URLs
        //String sRegex = "((https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,}))";

        if (args.Length > 0)
        {
            sInput = args[0];
        }

        using (Document doc = new Document(sInput))
        {
            int nPages = doc.NumPages;
            Console.WriteLine("Input file: " + sInput);

            WordFinderConfig wordConfig = new WordFinderConfig();
            // Need to set this to true so phrases will be concatenated properly
            wordConfig.NoHyphenDetection = true;

            // Create a DocTextFinder that uses the word finder configuration above
            using (DocTextFinder docTextFinder = new DocTextFinder(doc, wordConfig))
            {
                // Retrieve the phrases matching the regular expression on pages 0 .. nPages - 1
                IList<DocTextFinderMatch> docMatches = docTextFinder.GetMatchList(0, nPages - 1, sRegex);

                foreach (DocTextFinderMatch wInfo in docMatches)
                {
                    // Show the matching phrase
                    Console.WriteLine(wInfo.MatchString);

                    // Iterate through the quad info and create highlights
                    foreach (DocTextFinderQuadInfo qInfo in wInfo.QuadInfo)
                    {
                        // A page object is fetched for every quad info; dispose it as soon
                        // as the annotation has been created so page objects do not pile
                        // up for the lifetime of the document.
                        using (Page docpage = doc.GetPage(qInfo.PageNum))
                        {
                            // Highlight the matched string words
                            var highlight = new HighlightAnnotation(docpage, qInfo.Quads);
                            highlight.NormalAppearance = highlight.GenerateAppearance();
                        }
                    }
                }

                // Save the document with the highlighted matched strings
                doc.Save(SaveFlags.Full, sOutput);
            }
        }
    }
}
/// <summary>
/// AddRegexRedaction sample entry point. Searches every page of a PDF for text
/// matching a regular expression, prints each matched phrase, and adds a red
/// redaction over the quads of every match. The document is saved twice: once
/// with the redactions still unapplied, and once after applying them.
/// </summary>
/// <param name="args">
/// Optional command-line arguments. When present, <c>args[0]</c> replaces the
/// default input PDF path.
/// </param>
static void Main(string[] args)
{
    Console.WriteLine("AddRegexRedaction Sample:");

    using (Library lib = new Library())
    {
        Console.WriteLine("Initialized the library.");

        String sInput = Library.ResourceDirectory + "Sample_Input/AddRegexRedaction.pdf";
        String sOutput1 = "AddRegexRedaction-out.pdf";
        String sOutput2 = "AddRegexRedaction-out-applied.pdf";

        // Highlight and redact occurrences of the phrases that match this regular expression.
        // Uncomment only the one you are interested in seeing displayed redacted.
        // Phone numbers
        String sRegex = "((1-)?(\\()?\\d{3}(\\))?(\\s)?(-)?\\d{3}-\\d{4})";
        // Email addresses
        //String sRegex = "(\\b[\\w.!#$%&'*+\\/=?^`{|}~-]+@[\\w-]+(?:\\.[\\w-]+)*\\b)";
        // URLs
        //String sRegex = "((https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,}))";

        if (args.Length > 0)
        {
            sInput = args[0];
        }

        using (Document doc = new Document(sInput))
        {
            int nPages = doc.NumPages;
            Console.WriteLine("Input file: " + sInput);

            // Create a WordFinder configuration
            WordFinderConfig wordConfig = new WordFinderConfig();
            // Need to set this to true so phrases will be concatenated properly
            wordConfig.NoHyphenDetection = true;

            // Create a DocTextFinder that uses the word finder configuration above
            using (DocTextFinder docTextFinder = new DocTextFinder(doc, wordConfig))
            {
                // Retrieve the phrases and words matching the regular expression on pages 0 .. nPages - 1
                IList<DocTextFinderMatch> docMatches = docTextFinder.GetMatchList(0, nPages - 1, sRegex);

                // Redaction color will be red
                Color red = new Color(1.0, 0.0, 0.0);

                foreach (DocTextFinderMatch wInfo in docMatches)
                {
                    // Show the matching phrase
                    Console.WriteLine(wInfo.MatchString);

                    // Iterate through the quad info and create redactions
                    foreach (DocTextFinderQuadInfo qInfo in wInfo.QuadInfo)
                    {
                        // A page object is fetched for every quad info; dispose it as soon
                        // as the redaction has been configured so page objects do not pile
                        // up for the lifetime of the document.
                        using (Page docpage = doc.GetPage(qInfo.PageNum))
                        {
                            Redaction red_fill = new Redaction(docpage, qInfo.Quads, red);

                            // Fill the "normal" appearance with 25% red
                            red_fill.FillNormal = true;
                            red_fill.SetFillColor(red, 0.25);
                        }
                    }
                }
            }

            // Save the document with the redactions marked but not yet applied
            doc.Save(SaveFlags.Full, sOutput1);
            Console.WriteLine("Wrote a PDF document with unapplied redactions.");

            // Apply all the redactions in the document
            doc.ApplyRedactions();

            // Save the document with the redacted matched strings
            doc.Save(SaveFlags.Full, sOutput2);
            Console.WriteLine("Wrote a redacted PDF document.");
        }
    }
}