/* This function is copied primarily from the TextExtract sample,
         * but modified to skip writing out the text that it finds
         */
        /// <summary>
        /// Searches the text of every page of <paramref name="doc"/> for <paramref name="splitTextString"/>
        /// and appends the loop index (starting at 0) of each matching page to
        /// <paramref name="listOfPageNumsToSplit"/>.
        /// </summary>
        /// <remarks>
        /// The page text is upper-cased before the search, but the search string is used exactly as passed,
        /// so a search string containing lower-case letters will generally not be found.
        /// </remarks>
        /// <param name="doc">Document whose pages are scanned.</param>
        /// <param name="splitTextString">Text to look for in the upper-cased text of each page.</param>
        /// <param name="listOfPageNumsToSplit">List that receives the index of every page containing the text.</param>
        static void FindTextUntagged(Document doc, String splitTextString, List <int> listOfPageNumsToSplit)
        {
            // setup the WordFinderConfig
            WordFinderConfig wordConfig = new WordFinderConfig();

            wordConfig.IgnoreCharGaps  = false;
            wordConfig.IgnoreLineGaps  = false;
            wordConfig.NoAnnots        = false;
            wordConfig.NoEncodingGuess = false;
            // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
            wordConfig.UnknownToStdEnc   = false;
            wordConfig.DisableTaggedPDF  = false;   // legacy mode WordFinder creation
            wordConfig.NoXYSort          = true;
            wordConfig.PreserveSpaces    = false;
            wordConfig.NoLigatureExp     = false;
            wordConfig.NoHyphenDetection = false;
            wordConfig.TrustNBSpace      = false;
            wordConfig.NoExtCharOffset   = false;   // text extraction efficiency
            wordConfig.NoStyleInfo       = false;   // text extraction efficiency

            WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

            int nPages = doc.NumPages;

            for (int i = 0; i < nPages; i++)
            {
                IList <Word> pageWords = wordFinder.GetWordList(i);

                try
                {
                    // A StringBuilder avoids re-copying the whole page text for every appended word.
                    System.Text.StringBuilder pageText = new System.Text.StringBuilder();

                    // This searches the entire page word list.
                    // If you know that the search string will fall within a certain number of words, you could
                    // limit the loop bound to Math.Min(pageWords.Count, X) (e.g. X = 200) instead.  If you wanted
                    // to only look within a specific quadrant of a page (e.g. lower right corner), you would need
                    // to get the bounding box of each Word and compare that to your target area.
                    for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                    {
                        Word               wInfo      = pageWords[wordnum];
                        string             s          = wInfo.Text;
                        WordAttributeFlags attributes = wInfo.Attributes;

                        bool lastWordOnLine =
                            (attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;
                        bool hasSoftHyphen =
                            (attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;

                        // Check for hyphenated words that break across a line.
                        if (hasSoftHyphen && lastWordOnLine)
                        {
                            // For the purposes of this sample, we'll remove all hyphens.  In practice, you may need
                            // to check words against a dictionary to determine if the hyphenated word is actually
                            // one word or two.
                            //
                            // Joining every piece (rather than indexing pieces 0 and 1) keeps words that contain
                            // no hyphen character, or more than one, from throwing or losing text.
                            pageText.Append(String.Concat(s.Split(new Char[] { '-', '\u00ad' })));
                        }
                        else
                        {
                            pageText.Append(s);
                        }

                        // Check for space adjacency and add a space if necessary.
                        if ((attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                        {
                            pageText.Append(" ");
                        }

                        // Check for a line break and add one if necessary
                        if (lastWordOnLine)
                        {
                            pageText.Append("\n");
                        }
                    }

                    // Only the page text is upper-cased; the search string is compared as given.
                    if (pageText.ToString().ToUpper().Contains(splitTextString))
                    {
                        Console.WriteLine("Found " + splitTextString + " on page " + i);
                        listOfPageNumsToSplit.Add(i);
                    }
                }
                finally
                {
                    // Release requested WordList, even if processing this page failed.
                    for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                    {
                        pageWords[wordnum].Dispose();
                    }
                }
            }
        }
        /// <summary>
        /// Writes the text of every page of <paramref name="doc"/> to the file
        /// "TextExtract-untagged-out.txt", one "&lt;page N&gt;" header per page (N starts at 1).
        /// </summary>
        /// <param name="doc">Document whose page count drives the extraction loop.</param>
        /// <param name="wordFinder">WordFinder used to obtain the word list of each page.</param>
        static void ExtractTextUntagged(Document doc, WordFinder wordFinder)
        {
            int nPages = doc.NumPages;

            // The using block closes the output file even when extraction throws part-way through.
            using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-untagged-out.txt"))
            {
                Console.WriteLine("Writing TextExtract-untagged-out.txt");

                for (int i = 0; i < nPages; i++)
                {
                    IList <Word> pageWords = wordFinder.GetWordList(i);

                    try
                    {
                        // A StringBuilder avoids re-copying the whole page text for every appended word.
                        System.Text.StringBuilder pageText = new System.Text.StringBuilder();

                        for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                        {
                            Word               wInfo      = pageWords[wordnum];
                            string             s          = wInfo.Text;
                            WordAttributeFlags attributes = wInfo.Attributes;

                            bool lastWordOnLine =
                                (attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;
                            bool hasSoftHyphen =
                                (attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;

                            // Check for hyphenated words that break across a line.
                            if (hasSoftHyphen && lastWordOnLine)
                            {
                                // Remove the hyphen and combine the parts of the word before adding to the
                                // extracted text.  Note that we pass in the Unicode character for soft hyphen
                                // as well as the regular hyphen.
                                //
                                // In untagged PDF, it's not uncommon to find a mixture of hard and soft hyphens
                                // that may not be used for their intended purposes.
                                // (Soft hyphens are intended only for words that break across lines.)
                                //
                                // For the purposes of this sample, we'll remove all hyphens.  In practice, you
                                // may need to check words against a dictionary to determine if the hyphenated
                                // word is actually one word or two.
                                //
                                // Joining every piece (rather than indexing pieces 0 and 1) keeps words that
                                // contain no hyphen character, or more than one, from throwing or losing text.
                                pageText.Append(String.Concat(s.Split(new Char[] { '-', '\u00ad' })));
                            }
                            else
                            {
                                pageText.Append(s);
                            }

                            // Check for space adjacency and add a space if necessary.
                            if ((attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                            {
                                pageText.Append(" ");
                            }

                            // Check for a line break and add one if necessary
                            if (lastWordOnLine)
                            {
                                pageText.Append("\n");
                            }
                        }

                        logfile.WriteLine("<page " + (i + 1) + ">");
                        logfile.WriteLine(pageText.ToString());
                    }
                    finally
                    {
                        // Release requested WordList, even if processing this page failed.
                        for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                        {
                            pageWords[wordnum].Dispose();
                        }
                    }
                }

                Console.WriteLine("Extracted " + nPages + " pages.");
            }
        }
        /// <summary>
        /// Writes the text of every page of <paramref name="doc"/> to the file
        /// "TextExtract-tagged-out.txt", one "&lt;page N&gt;" header per page (N starts at 1).
        /// Line breaks are emitted after the last word of each region rather than after the last word on a line.
        /// </summary>
        /// <param name="doc">Document whose page count drives the extraction loop.</param>
        /// <param name="wordFinder">WordFinder used to obtain the word list of each page.</param>
        static void ExtractTextTagged(Document doc, WordFinder wordFinder)
        {
            int nPages = doc.NumPages;

            // The using block closes the output file even when extraction throws part-way through.
            using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-tagged-out.txt"))
            {
                Console.WriteLine("Writing TextExtract-tagged-out.txt");

                for (int i = 0; i < nPages; i++)
                {
                    IList <Word> pageWords = wordFinder.GetWordList(i);

                    try
                    {
                        // A StringBuilder avoids re-copying the whole page text for every appended word.
                        System.Text.StringBuilder pageText = new System.Text.StringBuilder();

                        for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                        {
                            Word               wInfo      = pageWords[wordnum];
                            string             s          = wInfo.Text;
                            WordAttributeFlags attributes = wInfo.Attributes;

                            // In most tagged PDFs, soft hyphens are used only to break words across lines, so
                            // we'll check for any soft hyphens and remove them from our text output.
                            //
                            // Note that we're not checking for the LastWordOnLine flag, unlike untagged PDF.
                            // For Tagged PDF, words are not flagged as being the last on the line if they are
                            // not at the end of a sentence.
                            if ((attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen)
                            {
                                // Remove the hyphen and combine the parts of the word before adding to the
                                // extracted text.  Note that we pass in the Unicode character for soft hyphen.
                                //
                                // Joining every piece (rather than indexing pieces 0 and 1) keeps words that
                                // contain no soft hyphen character, or more than one, from throwing or losing text.
                                pageText.Append(String.Concat(s.Split(new Char[] { '\u00ad' })));
                            }
                            else
                            {
                                pageText.Append(s);
                            }

                            // Check for space adjacency and add a space if necessary.
                            if ((attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                            {
                                pageText.Append(" ");
                            }

                            // Check for a line break and add one if necessary.
                            // Normally this is accomplished using WordAttributeFlags.LastWordOnLine,
                            // but for tagged PDFs, the LastWordOnLine flag is set according to the
                            // tags in the PDF, not according to visual line breaks in the document.
                            //
                            // To preserve the visual line breaks in the document, we'll check whether
                            // the word is the last word in the region.  If you instead prefer to
                            // break lines according to the tags in the PDF, use
                            // (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine,
                            // similar to the untagged case.
                            if (wInfo.IsLastWordInRegion)
                            {
                                pageText.Append("\n");
                            }
                        }

                        logfile.WriteLine("<page " + (i + 1) + ">");
                        logfile.WriteLine(pageText.ToString());
                    }
                    finally
                    {
                        // Release requested WordList, even if processing this page failed.
                        for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                        {
                            pageWords[wordnum].Dispose();
                        }
                    }
                }

                Console.WriteLine("Extracted " + nPages + " pages.");
            }
        }
        /// <summary>
        /// Entry point of the Redactions sample.  Creates redactions over every "cloudy" and "rain" word found
        /// on page 0 of the input PDF, saves the document with the redactions still unapplied, then applies
        /// them and saves a second copy.
        /// </summary>
        /// <param name="args">Optional command line; <c>args[0]</c>, when present, replaces the default input path.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Redactions Sample:");

            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");
                String sInput   = "../../Resources/Sample_Input/sample.pdf";
                String sOutput1 = "../Redactions-out.pdf";
                String sOutput2 = "../Redactions-out-applied.pdf";

                if (args.Length > 0)
                {
                    sInput = args[0];
                }

                Console.WriteLine("Input file: " + sInput);

                // Dispose the document before the enclosing Library is disposed.
                using (Document doc = new Document(sInput))
                {
                    Page docpage = doc.GetPage(0);
                    //
                    // Redact occurrences of the word "rain" on the page.
                    // Redact occurrences of the word "cloudy" on the page, changing the display details.
                    //
                    // For a more in-depth example of using the WordFinder, see the TextExtract sample.
                    //
                    // The TextExtract sample is described here.
                    // http://dev.datalogics.com/adobe-pdf-library/sample-program-descriptions/net-sample-programs/extracting-text-from-pdf-files
                    //

                    List <Quad> cloudyQuads = new List <Quad>();
                    List <Quad> rainQuads   = new List <Quad>();

                    WordFinderConfig wordConfig = new WordFinderConfig();
                    WordFinder       wf         = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

                    IList <Word> words = wf.GetWordList(docpage.PageNumber);

                    foreach (Word w in words)
                    {
                        string text = w.Text;
                        Console.WriteLine(" " + text.ToLower());

                        bool hasTrailingPunctuation =
                            (w.Attributes & WordAttributeFlags.HasTrailingPunctuation) ==
                            WordAttributeFlags.HasTrailingPunctuation;

                        // Ordinal, case-insensitive matching keeps the result independent of the current
                        // culture (a culture-sensitive ToLower() does not map "I" to "i" under Turkish rules).

                        // Store the Quads of all "Cloudy" words in a list for later use in
                        // creating the redaction object.
                        if (text.Equals("cloudy", StringComparison.OrdinalIgnoreCase) ||
                            (hasTrailingPunctuation && text.StartsWith("cloudy", StringComparison.OrdinalIgnoreCase)))
                        {
                            cloudyQuads.AddRange(w.Quads);
                        }

                        // Store the Quads of all "Rain" words
                        if (text.Equals("rain", StringComparison.OrdinalIgnoreCase) ||
                            (hasTrailingPunctuation && text.StartsWith("rain", StringComparison.OrdinalIgnoreCase)))
                        {
                            rainQuads.AddRange(w.Quads);
                        }
                    }

                    Console.WriteLine("Found Cloudy instances: " + cloudyQuads.Count);
                    Color red = new Color(1.0, 0.0, 0.0);

                    // The object is not used again below; it is constructed for the redaction it creates
                    // on the page.
                    Redaction not_cloudy = new Redaction(docpage, cloudyQuads, red);

                    Console.WriteLine("Found rain instances: " + rainQuads.Count);
                    Redaction no_rain = new Redaction(docpage, rainQuads);
                    no_rain.InternalColor = new Color(0.0, 1.0, 0.0);
                    doc.Save(SaveFlags.Full, sOutput1);

                    Console.WriteLine("Wrote a pdf doc with unapplied redactions.");

                    // Actually apply all the redactions in the document.
                    doc.ApplyRedactions();

                    doc.Save(SaveFlags.Full, sOutput2);

                    Console.WriteLine("Wrote a redacted pdf doc.");
                }
            }
        }
        // Example no. 5
        /// <summary>
        /// Entry point of the UnderlinesAndHighlights sample.  Highlights every "cloudy" word and underlines
        /// every "rain" word found on page 0 of the input PDF, prints the annotated text, and saves the result.
        /// </summary>
        /// <param name="args">
        /// Optional command line; <c>args[0]</c>, when present, replaces the default input path and
        /// <c>args[1]</c>, when present, replaces the default output path.
        /// </param>
        static void Main(string[] args)
        {
            Console.WriteLine("UnderlinesAndHighlights Sample:");

            // ReSharper disable once UnusedVariable
            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");

                String sInput  = Library.ResourceDirectory + "Sample_Input/sample.pdf";
                String sOutput = "UnderlinesAndHighlights-out.pdf";

                if (args.Length > 0)
                {
                    sInput = args[0];
                }

                if (args.Length > 1)
                {
                    sOutput = args[1];
                }

                // Dispose the document before the enclosing Library is disposed.
                using (Document doc = new Document(sInput))
                {
                    Console.WriteLine("Opened a document " + sInput);

                    Page docpage = doc.GetPage(0);

                    //
                    // Highlight occurrences of the word "cloudy" on the page.
                    // Underline occurrences of the word "rain" on the page.
                    //
                    // For a more in-depth example of using the WordFinder, see the TextExtraction sample.
                    //
                    List <Quad>      cloudyQuads = new List <Quad>();
                    List <Quad>      rainQuads   = new List <Quad>();
                    WordFinderConfig wfc         = new WordFinderConfig();
                    WordFinder       wf          = new WordFinder(doc, WordFinderVersion.Latest, wfc);
                    IList <Word>     words       = wf.GetWordList(docpage.PageNumber);
                    foreach (Word w in words)
                    {
                        string text = w.Text;

                        bool hasTrailingPunctuation =
                            (w.Attributes & WordAttributeFlags.HasTrailingPunctuation) ==
                            WordAttributeFlags.HasTrailingPunctuation;

                        // Ordinal, case-insensitive matching keeps the result independent of the current
                        // culture (a culture-sensitive ToLower() does not map "I" to "i" under Turkish rules).

                        // Store the Quads of all "Cloudy" words in a list for later use in
                        // creating the annotation.
                        if (text.Equals("cloudy", StringComparison.OrdinalIgnoreCase) ||
                            (hasTrailingPunctuation && text.StartsWith("cloudy", StringComparison.OrdinalIgnoreCase)))
                        {
                            cloudyQuads.AddRange(w.Quads);
                        }

                        // Store the Quads of all "Rain" words
                        if (text.Equals("rain", StringComparison.OrdinalIgnoreCase) ||
                            (hasTrailingPunctuation && text.StartsWith("rain", StringComparison.OrdinalIgnoreCase)))
                        {
                            rainQuads.AddRange(w.Quads);
                        }
                    }

                    HighlightAnnotation highlights = new HighlightAnnotation(docpage, cloudyQuads);
                    highlights.Color            = new Color(1.0, 0.75, 1.0);
                    highlights.NormalAppearance = highlights.GenerateAppearance();

                    UnderlineAnnotation underlines = new UnderlineAnnotation(docpage, rainQuads);
                    underlines.Color            = new Color(0.0, 0.0, 0.0);
                    underlines.NormalAppearance = underlines.GenerateAppearance();

                    // Read back the text that was annotated.
                    Console.WriteLine("Cloudy text: {0}", highlights.GetAnnotatedText(true));
                    Console.WriteLine("Rainy text: {0}", underlines.GetAnnotatedText(false));

                    doc.Save(SaveFlags.Full, sOutput);
                }
            }
        }
        /// <summary>
        /// Entry point of the ListWords sample.  For every word on every page of the input PDF, prints the
        /// word's quads, its style transitions, its attributes and its text to the console, then prints the
        /// page count.
        /// </summary>
        /// <param name="args">Optional command line; <c>args[0]</c>, when present, replaces the default input path.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("ListWords Sample:");

            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");

                String sInput = Library.ResourceDirectory + "Sample_Input/sample.pdf";

                if (args.Length > 0)
                {
                    sInput = args[0];
                }

                Console.WriteLine("Input file: " + sInput);

                // Dispose the document before the enclosing Library is disposed.
                using (Document doc = new Document(sInput))
                {
                    int nPages = doc.NumPages;

                    WordFinderConfig wordConfig = new WordFinderConfig();
                    wordConfig.IgnoreCharGaps  = true;
                    wordConfig.IgnoreLineGaps  = false;
                    wordConfig.NoAnnots        = true;
                    wordConfig.NoEncodingGuess = true;              // leave non-Roman single-byte font alone

                    // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
                    wordConfig.UnknownToStdEnc = false;

                    wordConfig.DisableTaggedPDF  = true;    // legacy mode WordFinder creation
                    wordConfig.NoXYSort          = false;
                    wordConfig.PreserveSpaces    = false;
                    wordConfig.NoLigatureExp     = false;
                    wordConfig.NoHyphenDetection = false;
                    wordConfig.TrustNBSpace      = false;
                    wordConfig.NoExtCharOffset   = false;           // text extraction efficiency
                    wordConfig.NoStyleInfo       = false;           // text extraction efficiency

                    WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

                    for (int i = 0; i < nPages; i++)
                    {
                        IList <Word> pageWords = wordFinder.GetWordList(i);

                        try
                        {
                            foreach (Word wInfo in pageWords)
                            {
                                string       s        = wInfo.Text;
                                IList <Quad> QuadList = wInfo.Quads;

                                foreach (Quad Q in QuadList)
                                {
                                    Console.WriteLine(Q);
                                }

                                // The style transitions are printed twice: once by enumerating the property
                                // directly and once through a local list.  Both passes are kept so the
                                // console output is unchanged.
                                foreach (StyleTransition st in wInfo.StyleTransitions)
                                {
                                    Console.WriteLine(st);
                                }

                                IList <StyleTransition> styleList = wInfo.StyleTransitions;
                                foreach (StyleTransition st in styleList)
                                {
                                    Console.WriteLine(st);
                                }

                                Console.WriteLine(wInfo.Attributes);
                                Console.WriteLine(s);
                            }
                        }
                        finally
                        {
                            // Release requested WordList once the page has been printed (or printing failed).
                            for (int wordnum = 0; wordnum < pageWords.Count; wordnum++)
                            {
                                pageWords[wordnum].Dispose();
                            }
                        }
                    }

                    Console.WriteLine("Pages=" + nPages);
                }
            }
        }