C# (CSharp) WordFinder.GetWordList примеры использования

Язык программирования: C# (CSharp)

Класс/Тип: WordFinder

Метод/Функция: GetWordList

Примеров на hotexamples.com: 6

C# (CSharp) WordFinder.GetWordList - 6 примеров найдено. Это лучшие примеры C# (CSharp) кода для WordFinder.GetWordList, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Find(27)

GetWordList(6)

FindWords(4)

GetCoordinatesOfSearchTarget(4)

GetSearchResults(3)

GetWord(2)

RotateMatrix(2)

ParallelFind(2)

SolveConundrum(1)

SearchFaulknerByPage(1)

Matches(1)

GetWords(1)

GetWordStart(1)

ConductSearch(1)

GetSentence(1)

Exceed64x64(1)

GetNextWordStart(1)

GetCount(1)

For(1)

FindWordsWithPaths(1)

FindWordsInString(1)

FindUnscrambledWords(1)

FindAllWords(1)

ToList(1)

Пример #1

Показать файл

Файл: SplitPDFVariations.cs Проект: datalogics-seu/DLE-notshipped

        /* This function is copied primarily from the TextExtract sample,
         * but modified to skip writing out the text that it finds
         */
        /// <summary>
        /// Scans every page of <paramref name="doc"/> for <paramref name="splitTextString"/> and
        /// records the zero-based index of each page whose extracted text contains it.
        /// </summary>
        /// <param name="doc">Document whose pages are searched.</param>
        /// <param name="splitTextString">Text to look for; compared case-insensitively (ordinal).</param>
        /// <param name="listOfPageNumsToSplit">Receives the zero-based index of every matching page.</param>
        static void FindTextUntagged(Document doc, String splitTextString, List <int> listOfPageNumsToSplit)
        {
            // setup the WordFinderConfig
            WordFinderConfig wordConfig = new WordFinderConfig();

            wordConfig.IgnoreCharGaps  = false;
            wordConfig.IgnoreLineGaps  = false;
            wordConfig.NoAnnots        = false;
            wordConfig.NoEncodingGuess = false;
            // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
            wordConfig.UnknownToStdEnc   = false;
            wordConfig.DisableTaggedPDF  = false;   // legacy mode WordFinder creation
            wordConfig.NoXYSort          = true;
            wordConfig.PreserveSpaces    = false;
            wordConfig.NoLigatureExp     = false;
            wordConfig.NoHyphenDetection = false;
            wordConfig.TrustNBSpace      = false;
            wordConfig.NoExtCharOffset   = false;   // text extraction efficiency
            wordConfig.NoStyleInfo       = false;   // text extraction efficiency

            WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

            int nPages = doc.NumPages;

            for (int i = 0; i < nPages; i++)
            {
                IList <Word> pageWords = wordFinder.GetWordList(i);

                try
                {
                    // StringBuilder avoids re-copying the accumulated page text for every word.
                    System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();

                    // This searches the entire page word list.
                    // If you know that the search string will fall within the first X (e.g. 200) words,
                    // you could stop after Math.Min(pageWords.Count, X) words instead.  If you wanted to
                    // only look within a specific quadrant of a page (e.g. lower right corner), you would
                    // need to get the bounding box of each Word and compare that to your target area.
                    foreach (Word wInfo in pageWords)
                    {
                        string s = wInfo.Text;

                        bool hasSoftHyphen =
                            (wInfo.Attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;
                        bool lastWordOnLine =
                            (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;

                        // Check for hyphenated words that break across a line.
                        if (hasSoftHyphen && lastWordOnLine)
                        {
                            // For the purposes of this sample, we'll remove all hyphens.  In practice, you may need to check
                            // words against a dictionary to determine if the hyphenated word is actually one word or two.
                            // Joining every fragment (rather than indexing [0] and [1]) keeps words with
                            // several hyphens intact and cannot throw when the text contains no hyphen.
                            textToExtract.Append(string.Concat(s.Split(new Char[] { '-', '\u00ad' })));
                        }
                        else
                        {
                            textToExtract.Append(s);
                        }

                        // Check for space adjacency and add a space if necessary.
                        if ((wInfo.Attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                        {
                            textToExtract.Append(" ");
                        }
                        // Check for a line break and add one if necessary
                        if (lastWordOnLine)
                        {
                            textToExtract.Append("\n");
                        }
                    }

                    // Ordinal, case-insensitive search: the match no longer depends on the caller
                    // pre-uppercasing the search string or on the current culture's casing rules.
                    if (textToExtract.ToString().IndexOf(splitTextString, StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        Console.WriteLine("Found " + splitTextString + " on page " + i);
                        listOfPageNumsToSplit.Add(i);
                    }
                }
                finally
                {
                    // Release requested WordList, even if processing the page failed.
                    foreach (Word word in pageWords)
                    {
                        word.Dispose();
                    }
                }
            }
        }

Пример #2

Показать файл

Файл: TextExtract.cs Проект: yanrbts/adobe-pdf-library-samples

        /// <summary>
        /// Extracts the text of every page of <paramref name="doc"/> using untagged-PDF rules and
        /// writes it, one "&lt;page N&gt;" section per page, to TextExtract-untagged-out.txt.
        /// </summary>
        /// <param name="doc">Document to read; supplies the page count.</param>
        /// <param name="wordFinder">WordFinder used to obtain the word list of each page.</param>
        static void ExtractTextUntagged(Document doc, WordFinder wordFinder)
        {
            int nPages = doc.NumPages;

            // 'using' guarantees the output file is flushed and closed even if extraction throws.
            using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-untagged-out.txt"))
            {
                Console.WriteLine("Writing TextExtract-untagged-out.txt");

                for (int i = 0; i < nPages; i++)
                {
                    IList <Word> pageWords = wordFinder.GetWordList(i);

                    try
                    {
                        // StringBuilder avoids re-copying the accumulated page text for every word.
                        System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();

                        foreach (Word wInfo in pageWords)
                        {
                            string s = wInfo.Text;

                            bool hasSoftHyphen =
                                (wInfo.Attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen;
                            bool lastWordOnLine =
                                (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine;

                            // Check for hyphenated words that break across a line.
                            if (hasSoftHyphen && lastWordOnLine)
                            {
                                // Remove the hyphen and combine the parts of the word before adding to the extracted text.
                                // Note that we pass in the Unicode character for soft hyphen as well as the regular hyphen.
                                //
                                // In untagged PDF, it's not uncommon to find a mixture of hard and soft hyphens that may
                                // not be used for their intended purposes.
                                // (Soft hyphens are intended only for words that break across lines.)
                                //
                                // For the purposes of this sample, we'll remove all hyphens.  In practice, you may need to check
                                // words against a dictionary to determine if the hyphenated word is actually one word or two.
                                //
                                // Joining every fragment (rather than indexing [0] and [1]) keeps words with
                                // several hyphens intact and cannot throw when the text contains no hyphen.
                                textToExtract.Append(string.Concat(s.Split(new Char[] { '-', '\u00ad' })));
                            }
                            else
                            {
                                textToExtract.Append(s);
                            }

                            // Check for space adjacency and add a space if necessary.
                            if ((wInfo.Attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                            {
                                textToExtract.Append(" ");
                            }
                            // Check for a line break and add one if necessary
                            if (lastWordOnLine)
                            {
                                textToExtract.Append("\n");
                            }
                        }

                        // Page numbers in the output are one-based.
                        logfile.WriteLine("<page " + (i + 1) + ">");
                        logfile.WriteLine(textToExtract.ToString());
                    }
                    finally
                    {
                        // Release requested WordList, even if processing the page failed.
                        foreach (Word word in pageWords)
                        {
                            word.Dispose();
                        }
                    }
                }

                Console.WriteLine("Extracted " + nPages + " pages.");
            }
        }

Пример #3

Показать файл

Файл: TextExtract.cs Проект: yanrbts/adobe-pdf-library-samples

        /// <summary>
        /// Extracts the text of every page of <paramref name="doc"/> using tagged-PDF rules and
        /// writes it, one "&lt;page N&gt;" section per page, to TextExtract-tagged-out.txt.
        /// </summary>
        /// <param name="doc">Document to read; supplies the page count.</param>
        /// <param name="wordFinder">WordFinder used to obtain the word list of each page.</param>
        static void ExtractTextTagged(Document doc, WordFinder wordFinder)
        {
            int nPages = doc.NumPages;

            // 'using' guarantees the output file is flushed and closed even if extraction throws.
            using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-tagged-out.txt"))
            {
                Console.WriteLine("Writing TextExtract-tagged-out.txt");

                for (int i = 0; i < nPages; i++)
                {
                    IList <Word> pageWords = wordFinder.GetWordList(i);

                    try
                    {
                        // StringBuilder avoids re-copying the accumulated page text for every word.
                        System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();

                        foreach (Word wInfo in pageWords)
                        {
                            string s = wInfo.Text;

                            // In most tagged PDFs, soft hyphens are used only to break words across lines, so we'll
                            // check for any soft hyphens and remove them from our text output.
                            //
                            // Note that we're not checking for the LastWordOnLine flag, unlike untagged PDF.  For Tagged PDF,
                            // words are not flagged as being the last on the line if they are not at the end of a sentence.
                            if ((wInfo.Attributes & WordAttributeFlags.HasSoftHyphen) == WordAttributeFlags.HasSoftHyphen)
                            {
                                // Remove the soft hyphen(s) and combine the parts of the word before adding
                                // to the extracted text.  Joining every fragment (rather than indexing [0]
                                // and [1]) keeps words with several soft hyphens intact and cannot throw
                                // when the text contains no soft hyphen character.
                                textToExtract.Append(string.Concat(s.Split(new Char[] { '\u00ad' })));
                            }
                            else
                            {
                                textToExtract.Append(s);
                            }

                            // Check for space adjacency and add a space if necessary.
                            if ((wInfo.Attributes & WordAttributeFlags.AdjacentToSpace) == WordAttributeFlags.AdjacentToSpace)
                            {
                                textToExtract.Append(" ");
                            }
                            // Check for a line break and add one if necessary.
                            // Normally this is accomplished using WordAttributeFlags.LastWordOnLine,
                            // but for tagged PDFs, the LastWordOnLine flag is set according to the
                            // tags in the PDF, not according to visual line breaks in the document.
                            //
                            // To preserve the visual line breaks in the document, we'll check whether
                            // the word is the last word in the region.  If you instead prefer to
                            // break lines according to the tags in the PDF, use
                            // (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine,
                            // similar to the untagged case.
                            if (wInfo.IsLastWordInRegion)
                            {
                                textToExtract.Append("\n");
                            }
                        }

                        // Page numbers in the output are one-based.
                        logfile.WriteLine("<page " + (i + 1) + ">");
                        logfile.WriteLine(textToExtract.ToString());
                    }
                    finally
                    {
                        // Release requested WordList, even if processing the page failed.
                        foreach (Word word in pageWords)
                        {
                            word.Dispose();
                        }
                    }
                }

                Console.WriteLine("Extracted " + nPages + " pages.");
            }
        }

Пример #4

Показать файл

Файл: Redactions.cs Проект: yanrbts/adobe-pdf-library-samples

        /// <summary>
        /// Redactions sample entry point: marks every "cloudy" and "rain" word on the first page for
        /// redaction, saves the document with the redactions unapplied, then applies them and saves again.
        /// </summary>
        /// <param name="args">Optional; args[0] overrides the default input PDF path.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Redactions Sample:");

            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");
                String sInput   = "../../Resources/Sample_Input/sample.pdf";
                String sOutput1 = "../Redactions-out.pdf";
                String sOutput2 = "../Redactions-out-applied.pdf";

                if (args.Length > 0)
                {
                    sInput = args[0];
                }

                Console.WriteLine("Input file: " + sInput);

                Document doc = new Document(sInput);

                Page docpage = doc.GetPage(0);
                //
                // Redact occurrences of the word "rain" on the page.
                // Redact occurrences of the word "cloudy" on the page, changing the display details.
                //
                // For a more in-depth example of using the WordFinder, see the TextExtract sample.
                //
                // The TextExtract sample is described here.
                // http://dev.datalogics.com/adobe-pdf-library/sample-program-descriptions/net-sample-programs/extracting-text-from-pdf-files
                //

                List <Quad> cloudyQuads = new List <Quad>();

                List <Quad> rainQuads = new List <Quad>();

                WordFinderConfig wordConfig = new WordFinderConfig();
                WordFinder       wf         = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);

                IList <Word> words = wf.GetWordList(docpage.PageNumber);

                foreach (Word w in words)
                {
                    string text = w.Text;
                    Console.WriteLine(" " + text.ToLower());

                    bool hasTrailingPunctuation =
                        (w.Attributes & WordAttributeFlags.HasTrailingPunctuation) == WordAttributeFlags.HasTrailingPunctuation;

                    // Comparisons are ordinal and case-insensitive so that matching does not depend on the
                    // current culture's casing rules (e.g. under Turkish casing "RAIN" does not lower-case to "rain").

                    // Store the Quads of all "Cloudy" words in a list for later use in
                    // creating the redaction object.
                    if (text.Equals("cloudy", StringComparison.OrdinalIgnoreCase) ||
                        (hasTrailingPunctuation && text.StartsWith("cloudy", StringComparison.OrdinalIgnoreCase)))
                    {
                        cloudyQuads.AddRange(w.Quads);
                    }

                    // Store the Quads of all "Rain" words
                    if (text.Equals("rain", StringComparison.OrdinalIgnoreCase) ||
                        (hasTrailingPunctuation && text.StartsWith("rain", StringComparison.OrdinalIgnoreCase)))
                    {
                        rainQuads.AddRange(w.Quads);
                    }
                }

                Console.WriteLine("Found Cloudy instances: " + cloudyQuads.Count);
                Color red = new Color(1.0, 0.0, 0.0);

                // The Redaction objects are not used again below; they are constructed against docpage
                // before the document is saved.
                Redaction not_cloudy = new Redaction(docpage, cloudyQuads, red);

                Console.WriteLine("Found rain instances: " + rainQuads.Count);
                Redaction no_rain = new Redaction(docpage, rainQuads);
                no_rain.InternalColor = new Color(0.0, 1.0, 0.0);
                doc.Save(SaveFlags.Full, sOutput1);

                Console.WriteLine("Wrote a pdf doc with unapplied redactions.");

                // actually apply all the redactions in the document
                doc.ApplyRedactions();

                doc.Save(SaveFlags.Full, sOutput2);

                Console.WriteLine("Wrote a redacted pdf doc.");
            }
        }

Пример #5

Показать файл

        /// <summary>
        /// UnderlinesAndHighlights sample entry point: highlights every "cloudy" word and underlines
        /// every "rain" word on the first page, prints the annotated text, and saves the document.
        /// </summary>
        /// <param name="args">Optional; args[0] overrides the input PDF path, args[1] the output path.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("UnderlinesAndHighlights Sample:");

            // ReSharper disable once UnusedVariable
            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");

                String sInput  = Library.ResourceDirectory + "Sample_Input/sample.pdf";
                String sOutput = "UnderlinesAndHighlights-out.pdf";

                if (args.Length > 0)
                {
                    sInput = args[0];
                }

                if (args.Length > 1)
                {
                    sOutput = args[1];
                }

                Document doc = new Document(sInput);

                Console.WriteLine("Opened a document " + sInput);

                Page docpage = doc.GetPage(0);

                //
                // Highlight occurrences of the word "cloudy" on the page.
                // Underline occurrences of the word "rain" on the page.
                //
                // For a more in-depth example of using the WordFinder, see the TextExtraction sample.
                //
                List <Quad>      cloudyQuads = new List <Quad>();
                List <Quad>      rainQuads   = new List <Quad>();
                WordFinderConfig wfc         = new WordFinderConfig();
                WordFinder       wf          = new WordFinder(doc, WordFinderVersion.Latest, wfc);
                IList <Word>     words       = wf.GetWordList(docpage.PageNumber);
                foreach (Word w in words)
                {
                    string text = w.Text;

                    bool hasTrailingPunctuation =
                        (w.Attributes & WordAttributeFlags.HasTrailingPunctuation) ==
                        WordAttributeFlags.HasTrailingPunctuation;

                    // Comparisons are ordinal and case-insensitive so that matching does not depend on the
                    // current culture's casing rules (e.g. under Turkish casing "RAIN" does not lower-case to "rain").

                    // Store the Quads of all "Cloudy" words in a list for later use in
                    // creating the annotation.
                    if (text.Equals("cloudy", StringComparison.OrdinalIgnoreCase) ||
                        (hasTrailingPunctuation && text.StartsWith("cloudy", StringComparison.OrdinalIgnoreCase)))
                    {
                        cloudyQuads.AddRange(w.Quads);
                    }

                    // Store the Quads of all "Rain" words
                    if (text.Equals("rain", StringComparison.OrdinalIgnoreCase) ||
                        (hasTrailingPunctuation && text.StartsWith("rain", StringComparison.OrdinalIgnoreCase)))
                    {
                        rainQuads.AddRange(w.Quads);
                    }
                }

                HighlightAnnotation highlights = new HighlightAnnotation(docpage, cloudyQuads);
                highlights.Color            = new Color(1.0, 0.75, 1.0);
                highlights.NormalAppearance = highlights.GenerateAppearance();

                UnderlineAnnotation underlines = new UnderlineAnnotation(docpage, rainQuads);
                underlines.Color            = new Color(0.0, 0.0, 0.0);
                underlines.NormalAppearance = underlines.GenerateAppearance();

                // Read back the text that was annotated.
                Console.WriteLine("Cloudy text: {0}", highlights.GetAnnotatedText(true));
                Console.WriteLine("Rainy text: {0}", underlines.GetAnnotatedText(false));

                doc.Save(SaveFlags.Full, sOutput);
            }
        }

Пример #6

Показать файл

Файл: ListWords.cs Проект: vivekel015/adobe-pdf-library-samples

        /// <summary>
        /// ListWords sample entry point: for every word on every page, prints its quads, its style
        /// transitions (listed twice), its attribute flags and its text, then prints the page count.
        /// </summary>
        /// <param name="args">Optional; args[0] overrides the default input PDF path.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("ListWords Sample:");

            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");

                // Default input; replaced by the first command-line argument when one is given.
                String inputPath = Library.ResourceDirectory + "Sample_Input/sample.pdf";
                if (args.Length > 0)
                {
                    inputPath = args[0];
                }

                Console.WriteLine("Input file: " + inputPath);

                Document document  = new Document(inputPath);
                int      pageCount = document.NumPages;

                // Properties are assigned in the same order as a sequence of individual assignments.
                WordFinderConfig config = new WordFinderConfig
                {
                    IgnoreCharGaps    = true,
                    IgnoreLineGaps    = false,
                    NoAnnots          = true,
                    NoEncodingGuess   = true,    // leave non-Roman single-byte font alone
                    UnknownToStdEnc   = false,   // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
                    DisableTaggedPDF  = true,    // legacy mode WordFinder creation
                    NoXYSort          = false,
                    PreserveSpaces    = false,
                    NoLigatureExp     = false,
                    NoHyphenDetection = false,
                    TrustNBSpace      = false,
                    NoExtCharOffset   = false,   // text extraction efficiency
                    NoStyleInfo       = false    // text extraction efficiency
                };

                WordFinder finder = new WordFinder(document, WordFinderVersion.Latest, config);

                for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
                {
                    foreach (Word word in finder.GetWordList(pageIndex))
                    {
                        // Text is read up front and printed last, after the word's other details.
                        string wordText = word.Text;

                        foreach (Quad quad in word.Quads)
                        {
                            Console.WriteLine(quad);
                        }

                        // The style transitions are fetched and listed twice: once via a local
                        // list reference and once straight from the property.
                        IList <StyleTransition> transitions = word.StyleTransitions;
                        foreach (StyleTransition transition in transitions)
                        {
                            Console.WriteLine(transition);
                        }

                        foreach (StyleTransition transition in word.StyleTransitions)
                        {
                            Console.WriteLine(transition);
                        }

                        Console.WriteLine(word.Attributes);
                        Console.WriteLine(wordText);
                    }
                }

                Console.WriteLine("Pages=" + pageCount);
            }
        }