/* This function is copied primarily from the TextExtract sample,
 * but modified to skip writing out the text that it finds */
/// <summary>
/// Rebuilds the text of every page of <paramref name="doc"/> from the WordFinder word list and
/// records each page whose text contains <paramref name="splitTextString"/>.
/// </summary>
/// <param name="doc">Document to search.</param>
/// <param name="splitTextString">
/// Text to look for. The comparison ignores case (ordinal), so callers that already pass an
/// upper-case string get the same matches as before.
/// </param>
/// <param name="listOfPageNumsToSplit">
/// Receives the zero-based index (the value passed to GetWordList) of every matching page.
/// </param>
static void FindTextUntagged(Document doc, String splitTextString, List<int> listOfPageNumsToSplit)
{
    // setup the WordFinderConfig
    WordFinderConfig wordConfig = new WordFinderConfig();
    wordConfig.IgnoreCharGaps = false;
    wordConfig.IgnoreLineGaps = false;
    wordConfig.NoAnnots = false;
    wordConfig.NoEncodingGuess = false;

    // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
    wordConfig.UnknownToStdEnc = false;

    wordConfig.DisableTaggedPDF = false; // legacy mode WordFinder creation
    wordConfig.NoXYSort = true;
    wordConfig.PreserveSpaces = false;
    wordConfig.NoLigatureExp = false;
    wordConfig.NoHyphenDetection = false;
    wordConfig.TrustNBSpace = false;
    wordConfig.NoExtCharOffset = false; // text extraction efficiency
    wordConfig.NoStyleInfo = false; // text extraction efficiency

    WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

    int nPages = doc.NumPages;

    for (int i = 0; i < nPages; i++)
    {
        IList<Word> pageWords = wordFinder.GetWordList(i);

        try
        {
            // A StringBuilder avoids re-copying the whole page text for every appended word.
            System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();

            // By default, this searches the entire page word list.
            // You could limit it to the first X (e.g. 200) number of words, by looping up to
            // Math.Min(pageWords.Count, 200) instead, if you know that the search string will
            // fall within a certain number of words. If you wanted to only look within
            // a specific quadrant of a page (e.g. lower right corner), you would need to get the
            // bounding box of each Word and compare that to your target area.
            for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
            {
                Word wInfo = pageWords[wordnum];
                string s = wInfo.Text;
                WordAttributeFlags attributes = wInfo.Attributes;

                bool hasSoftHyphen =
                    (attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;
                bool lastWordOnLine =
                    (attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;

                // Check for hyphenated words that break across a line.
                if (hasSoftHyphen && lastWordOnLine)
                {
                    // For the purposes of this sample, we'll remove all hyphens. In practice, you may
                    // need to check words against a dictionary to determine if the hyphenated word
                    // is actually one word or two.
                    //
                    // Joining every fragment (rather than indexing [0] and [1]) keeps text after a
                    // second hyphen and cannot throw when the word contains no hyphen character.
                    textToExtract.Append(string.Concat(s.Split(new Char[] { '-', '\u00ad' })));
                }
                else
                {
                    textToExtract.Append(s);
                }

                // Check for space adjacency and add a space if necessary.
                if ((attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                {
                    textToExtract.Append(" ");
                }

                // Check for a line break and add one if necessary
                if (lastWordOnLine)
                {
                    textToExtract.Append("\n");
                }
            }

            // Ordinal, case-insensitive search: independent of the current culture and of the
            // casing the caller used for the search string.
            if (textToExtract.ToString().IndexOf(splitTextString, StringComparison.OrdinalIgnoreCase) >= 0)
            {
                Console.WriteLine("Found " + splitTextString + " on page " + i);
                listOfPageNumsToSplit.Add(i);
            }
        }
        finally
        {
            // Release requested WordList, even if processing the page failed.
            for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
            {
                pageWords[wordnum].Dispose();
            }
        }
    }
}
/// <summary>
/// Writes the text of every page of an untagged document to "TextExtract-untagged-out.txt",
/// one "&lt;page N&gt;" header (1-based) followed by the reconstructed page text.
/// </summary>
/// <param name="doc">Document whose page count drives the extraction loop.</param>
/// <param name="wordFinder">WordFinder used to obtain the word list of each page.</param>
static void ExtractTextUntagged(Document doc, WordFinder wordFinder)
{
    int nPages = doc.NumPages;

    // The using block guarantees the file is flushed and closed even if extraction throws.
    using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-untagged-out.txt"))
    {
        Console.WriteLine("Writing TextExtract-untagged-out.txt");

        for (int i = 0; i < nPages; i++)
        {
            IList<Word> pageWords = wordFinder.GetWordList(i);

            try
            {
                // A StringBuilder avoids re-copying the whole page text for every appended word.
                System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();

                for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                {
                    Word wInfo = pageWords[wordnum];
                    string s = wInfo.Text;
                    WordAttributeFlags attributes = wInfo.Attributes;

                    bool hasSoftHyphen =
                        (attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;
                    bool lastWordOnLine =
                        (attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;

                    // Check for hyphenated words that break across a line.
                    if (hasSoftHyphen && lastWordOnLine)
                    {
                        // Remove the hyphen and combine the parts of the word before adding to the
                        // extracted text. Note that we pass in the Unicode character for soft hyphen
                        // as well as the regular hyphen.
                        //
                        // In untagged PDF, it's not uncommon to find a mixture of hard and soft
                        // hyphens that may not be used for their intended purposes.
                        // (Soft hyphens are intended only for words that break across lines.)
                        //
                        // For the purposes of this sample, we'll remove all hyphens. In practice, you
                        // may need to check words against a dictionary to determine if the hyphenated
                        // word is actually one word or two.
                        //
                        // Joining every fragment (rather than indexing [0] and [1]) keeps text after a
                        // second hyphen and cannot throw when the word contains no hyphen character.
                        textToExtract.Append(string.Concat(s.Split(new Char[] { '-', '\u00ad' })));
                    }
                    else
                    {
                        textToExtract.Append(s);
                    }

                    // Check for space adjacency and add a space if necessary.
                    if ((attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                    {
                        textToExtract.Append(" ");
                    }

                    // Check for a line break and add one if necessary
                    if (lastWordOnLine)
                    {
                        textToExtract.Append("\n");
                    }
                }

                logfile.WriteLine("<page " + (i + 1) + ">");
                logfile.WriteLine(textToExtract.ToString());
            }
            finally
            {
                // Release requested WordList, even if processing the page failed.
                for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                {
                    pageWords[wordnum].Dispose();
                }
            }
        }

        Console.WriteLine("Extracted " + nPages + " pages.");
    }
}
/// <summary>
/// Writes the text of every page of a tagged document to "TextExtract-tagged-out.txt",
/// one "&lt;page N&gt;" header (1-based) followed by the reconstructed page text.
/// </summary>
/// <param name="doc">Document whose page count drives the extraction loop.</param>
/// <param name="wordFinder">WordFinder used to obtain the word list of each page.</param>
static void ExtractTextTagged(Document doc, WordFinder wordFinder)
{
    int nPages = doc.NumPages;

    // The using block guarantees the file is flushed and closed even if extraction throws.
    using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-tagged-out.txt"))
    {
        Console.WriteLine("Writing TextExtract-tagged-out.txt");

        for (int i = 0; i < nPages; i++)
        {
            IList<Word> pageWords = wordFinder.GetWordList(i);

            try
            {
                // A StringBuilder avoids re-copying the whole page text for every appended word.
                System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();

                for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                {
                    Word wInfo = pageWords[wordnum];
                    string s = wInfo.Text;
                    WordAttributeFlags attributes = wInfo.Attributes;

                    // In most tagged PDFs, soft hyphens are used only to break words across lines,
                    // so we'll check for any soft hyphens and remove them from our text output.
                    //
                    // Note that we're not checking for the LastWordOnLine flag, unlike untagged PDF.
                    // For Tagged PDF, words are not flagged as being the last on the line if they
                    // are not at the end of a sentence.
                    if ((attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen)
                    {
                        // Remove the hyphen and combine the parts of the word before adding to the
                        // extracted text. Note that we pass in the Unicode character for soft hyphen.
                        //
                        // Joining every fragment (rather than indexing [0] and [1]) keeps text after a
                        // second soft hyphen and cannot throw when the word text contains none.
                        textToExtract.Append(string.Concat(s.Split(new Char[] { '\u00ad' })));
                    }
                    else
                    {
                        textToExtract.Append(s);
                    }

                    // Check for space adjacency and add a space if necessary.
                    if ((attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                    {
                        textToExtract.Append(" ");
                    }

                    // Check for a line break and add one if necessary.
                    // Normally this is accomplished using WordAttributeFlags.LastWordOnLine,
                    // but for tagged PDFs, the LastWordOnLine flag is set according to the
                    // tags in the PDF, not according to visual line breaks in the document.
                    //
                    // To preserve the visual line breaks in the document, we'll check whether
                    // the word is the last word in the region. If you instead prefer to
                    // break lines according to the tags in the PDF, use
                    // (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine,
                    // similar to the untagged case.
                    if (wInfo.IsLastWordInRegion)
                    {
                        textToExtract.Append("\n");
                    }
                }

                logfile.WriteLine("<page " + (i + 1) + ">");
                logfile.WriteLine(textToExtract.ToString());
            }
            finally
            {
                // Release requested WordList, even if processing the page failed.
                for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                {
                    pageWords[wordnum].Dispose();
                }
            }
        }

        Console.WriteLine("Extracted " + nPages + " pages.");
    }
}
/// <summary>
/// Redactions sample entry point: marks every "cloudy" and "rain" word on the first page for
/// redaction, saves the document with the redactions unapplied, then applies them and saves again.
/// </summary>
/// <param name="args">Optional: args[0] overrides the input PDF path.</param>
static void Main(string[] args)
{
    Console.WriteLine("Redactions Sample:");

    using (Library lib = new Library())
    {
        Console.WriteLine("Initialized the library.");

        String sInput = "../../Resources/Sample_Input/sample.pdf";
        String sOutput1 = "../Redactions-out.pdf";
        String sOutput2 = "../Redactions-out-applied.pdf";

        if (args.Length > 0)
        {
            sInput = args[0];
        }

        Console.WriteLine("Input file: " + sInput);

        // Dispose the document before the enclosing Library scope ends.
        using (Document doc = new Document(sInput))
        {
            Page docpage = doc.GetPage(0);

            //
            // Redact occurrences of the word "rain" on the page.
            // Redact occurrences of the word "cloudy" on the page, changing the display details.
            //
            // For a more in-depth example of using the WordFinder, see the TextExtract sample.
            //
            // The TextExtract sample is described here.
            // http://dev.datalogics.com/adobe-pdf-library/sample-program-descriptions/net-sample-programs/extracting-text-from-pdf-files
            //

            List<Quad> cloudyQuads = new List<Quad>();
            List<Quad> rainQuads = new List<Quad>();

            WordFinderConfig wordConfig = new WordFinderConfig();
            WordFinder wf = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

            IList<Word> words = wf.GetWordList(docpage.PageNumber);

            foreach (Word w in words)
            {
                string text = w.Text;
                Console.WriteLine(" " + text.ToLower());

                bool hasTrailingPunctuation =
                    (w.Attributes & WordAttributeFlags.HasTrailingPunctuation) ==
                    WordAttributeFlags.HasTrailingPunctuation;

                // Ordinal, case-insensitive comparisons: lower-casing with the current culture
                // would miss e.g. "RAIN" under Turkish casing rules.

                // Store the Quads of all "Cloudy" words in a list for later use in
                // creating the redaction object.
                if (string.Equals(text, "cloudy", StringComparison.OrdinalIgnoreCase) ||
                    (hasTrailingPunctuation && text.StartsWith("cloudy", StringComparison.OrdinalIgnoreCase)))
                {
                    cloudyQuads.AddRange(w.Quads);
                }

                // Store the Quads of all "Rain" words
                if (string.Equals(text, "rain", StringComparison.OrdinalIgnoreCase) ||
                    (hasTrailingPunctuation && text.StartsWith("rain", StringComparison.OrdinalIgnoreCase)))
                {
                    rainQuads.AddRange(w.Quads);
                }
            }

            Console.WriteLine("Found Cloudy instances: " + cloudyQuads.Count);
            Color red = new Color(1.0, 0.0, 0.0);
            Redaction not_cloudy = new Redaction(docpage, cloudyQuads, red);

            Console.WriteLine("Found rain instances: " + rainQuads.Count);
            Redaction no_rain = new Redaction(docpage, rainQuads);
            no_rain.InternalColor = new Color(0.0, 1.0, 0.0);

            doc.Save(SaveFlags.Full, sOutput1);

            Console.WriteLine("Wrote a pdf doc with unapplied redactions.");

            // actually apply all the redactions in the document
            doc.ApplyRedactions();

            doc.Save(SaveFlags.Full, sOutput2);

            Console.WriteLine("Wrote a redacted pdf doc.");
        }
    }
}
/// <summary>
/// UnderlinesAndHighlights sample entry point: highlights every "cloudy" word and underlines every
/// "rain" word on the first page, prints the annotated text, and saves the result.
/// </summary>
/// <param name="args">
/// Optional: args[0] overrides the input PDF path, args[1] overrides the output PDF path.
/// </param>
static void Main(string[] args)
{
    Console.WriteLine("UnderlinesAndHighlights Sample:");

    // ReSharper disable once UnusedVariable
    using (Library lib = new Library())
    {
        Console.WriteLine("Initialized the library.");

        String sInput = Library.ResourceDirectory + "Sample_Input/sample.pdf";
        String sOutput = "UnderlinesAndHighlights-out.pdf";

        if (args.Length > 0)
        {
            sInput = args[0];
        }

        if (args.Length > 1)
        {
            sOutput = args[1];
        }

        // Dispose the document before the enclosing Library scope ends.
        using (Document doc = new Document(sInput))
        {
            Console.WriteLine("Opened a document " + sInput);

            Page docpage = doc.GetPage(0);

            //
            // Highlight occurrences of the word "cloudy" on the page.
            // Underline occurrences of the word "rain" on the page.
            //
            // For a more in-depth example of using the WordFinder, see the TextExtraction sample.
            //
            List<Quad> cloudyQuads = new List<Quad>();
            List<Quad> rainQuads = new List<Quad>();

            WordFinderConfig wfc = new WordFinderConfig();
            WordFinder wf = new WordFinder(doc, WordFinderVersion.Latest, wfc);

            IList<Word> words = wf.GetWordList(docpage.PageNumber);

            foreach (Word w in words)
            {
                string text = w.Text;

                bool hasTrailingPunctuation =
                    (w.Attributes & WordAttributeFlags.HasTrailingPunctuation) ==
                    WordAttributeFlags.HasTrailingPunctuation;

                // Ordinal, case-insensitive comparisons: lower-casing with the current culture
                // would miss e.g. "RAIN" under Turkish casing rules.

                // Store the Quads of all "Cloudy" words in a list for later use in
                // creating the annotation.
                if (string.Equals(text, "cloudy", StringComparison.OrdinalIgnoreCase) ||
                    (hasTrailingPunctuation && text.StartsWith("cloudy", StringComparison.OrdinalIgnoreCase)))
                {
                    cloudyQuads.AddRange(w.Quads);
                }

                // Store the Quads of all "Rain" words
                if (string.Equals(text, "rain", StringComparison.OrdinalIgnoreCase) ||
                    (hasTrailingPunctuation && text.StartsWith("rain", StringComparison.OrdinalIgnoreCase)))
                {
                    rainQuads.AddRange(w.Quads);
                }
            }

            HighlightAnnotation highlights = new HighlightAnnotation(docpage, cloudyQuads);
            highlights.Color = new Color(1.0, 0.75, 1.0);
            highlights.NormalAppearance = highlights.GenerateAppearance();

            UnderlineAnnotation underlines = new UnderlineAnnotation(docpage, rainQuads);
            underlines.Color = new Color(0.0, 0.0, 0.0);
            underlines.NormalAppearance = underlines.GenerateAppearance();

            // Read back the text that was annotated.
            Console.WriteLine("Cloudy text: {0}", highlights.GetAnnotatedText(true));
            Console.WriteLine("Rainy text: {0}", underlines.GetAnnotatedText(false));

            doc.Save(SaveFlags.Full, sOutput);
        }
    }
}
/// <summary>
/// ListWords sample entry point: for every word on every page, prints its quads, its style
/// transitions, its attribute flags and its text, then prints the page count.
/// </summary>
/// <param name="args">Optional: args[0] overrides the input PDF path.</param>
static void Main(string[] args)
{
    Console.WriteLine("ListWords Sample:");

    using (Library lib = new Library())
    {
        Console.WriteLine("Initialized the library.");

        String sInput = Library.ResourceDirectory + "Sample_Input/sample.pdf";

        if (args.Length > 0)
        {
            sInput = args[0];
        }

        Console.WriteLine("Input file: " + sInput);

        // Dispose the document before the enclosing Library scope ends.
        using (Document doc = new Document(sInput))
        {
            int nPages = doc.NumPages;

            WordFinderConfig wordConfig = new WordFinderConfig();
            wordConfig.IgnoreCharGaps = true;
            wordConfig.IgnoreLineGaps = false;
            wordConfig.NoAnnots = true;
            wordConfig.NoEncodingGuess = true; // leave non-Roman single-byte font alone

            // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
            wordConfig.UnknownToStdEnc = false;

            wordConfig.DisableTaggedPDF = true; // legacy mode WordFinder creation
            wordConfig.NoXYSort = false;
            wordConfig.PreserveSpaces = false;
            wordConfig.NoLigatureExp = false;
            wordConfig.NoHyphenDetection = false;
            wordConfig.TrustNBSpace = false;
            wordConfig.NoExtCharOffset = false; // text extraction efficiency
            wordConfig.NoStyleInfo = false; // text extraction efficiency

            WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

            for (int i = 0; i < nPages; i++)
            {
                IList<Word> pageWords = wordFinder.GetWordList(i);

                try
                {
                    foreach (Word wInfo in pageWords)
                    {
                        string s = wInfo.Text;

                        IList<Quad> QuadList = wInfo.Quads;
                        foreach (Quad Q in QuadList)
                        {
                            Console.WriteLine(Q);
                        }

                        // NOTE(review): the style transitions are printed twice, once straight
                        // from the property and once via a local list. This looks like an
                        // accidental duplication; it is kept so the sample's output is unchanged.
                        foreach (StyleTransition st in wInfo.StyleTransitions)
                        {
                            Console.WriteLine(st);
                        }

                        IList<StyleTransition> styleList = wInfo.StyleTransitions;
                        foreach (StyleTransition st in styleList)
                        {
                            Console.WriteLine(st);
                        }

                        Console.WriteLine(wInfo.Attributes);
                        Console.WriteLine(s);
                    }
                }
                finally
                {
                    // Release the requested word list once the page has been printed.
                    for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                    {
                        pageWords[wordnum].Dispose();
                    }
                }
            }

            Console.WriteLine("Pages=" + nPages);
        }
    }
}