C# (CSharp) TextExtractor.GetWordCount Examples

Programming Language: C# (CSharp)

Class/Type: TextExtractor

Method/Function: GetWordCount

Examples at hotexamples.com: 2

C# (CSharp) TextExtractor.GetWordCount - 2 examples found. These are the top-rated real-world C# (CSharp) examples of TextExtractor.GetWordCount extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

LoadDocumentFromFile(30)

Extract(29)

GetPageCount(22)

Find(19)

SaveTextToFile(18)

FindNext(15)

ExtractText(15)

Dispose(13)

SetExtractionArea(13)

GetText(11)

GetTextFromPage(9)

FindAll(8)

IsValidFileType(6)

Begin(6)

Open(5)

SavePageTextToFile(5)

ExtractLine(4)

SaveTextToStream(4)

ExtractAll(4)

GetAsXML(4)

SavePageTextToStream(4)

GetFirstLine(3)

Reset(3)

TextExtractingWillBePotentiallySlow(3)

ResetExtractionArea(2)

PostImageAsync(2)

LoadProfiles(2)

ToString(2)

GetValue(2)

LoadDocumentFromStream(2)

GetPageRectangle(2)

Filter(2)

GetPageRect_Width(2)

GetPageRect_Height(2)

GetTextFromBitmapAsync(2)

GetWordCount(2)

Replace(1)

NextPage(1)

GetListValues(1)

IsOCRRecommendedForPage(1)

SelectStrategy(1)

ExtractFullText(1)

SupportedFormats(1)

SupportedLanguages(1)

CreateDocument(1)

AddFilter(1)

Example #1

Show file

File: PdfTron.cs Project: AbbasNaqvi/Projects

        /// <summary>
        /// Runs the enabled text-extraction examples on page 1 of a PDF and appends
        /// their output to <c>ConsoleLog</c>.
        /// </summary>
        /// <param name="input_path">Path of the PDF document to open.</param>
        /// <remarks>
        /// The <c>example1_basic</c> through <c>example5_low_level</c> members select which
        /// examples run. A <c>PDFNetException</c> raised by either part is caught and its
        /// message is appended to <c>ConsoleLog</c>. <c>PDFNet.Terminate()</c> is called on
        /// every exit path, including the early return taken when page 1 is missing.
        /// </remarks>
        public void ReadAdvanced(string input_path)
        {
            PDFNet.Initialize();

            try
            {
                // Sample code showing how to use high-level text extraction APIs.
                try
                {
                    // 'using' releases the document and the extractor even when an
                    // exception is thrown or the method returns early.
                    using (PDFDoc doc = new PDFDoc(input_path))
                    {
                        doc.InitSecurityHandler();

                        Page page = doc.GetPage(1);
                        if (page == null)
                        {
                            ConsoleLog += "Page not found.";
                            return;
                        }

                        using (TextExtractor txt = new TextExtractor())
                        {
                            txt.Begin(page); // Read the page.
                            // Other options you may want to consider...
                            // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                            // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                            // ...

                            // Example 1. Get all text on the page in a single string.
                            // Words will be separated with space or new line characters.
                            if (example1_basic)
                            {
                                // Get the word count.
                                ConsoleLog += "Word Count: " + txt.GetWordCount();

                                ConsoleLog += "\n\n- GetAsText --------------------------\n" + txt.GetAsText();
                                ConsoleLog += "-----------------------------------------------------------";
                            }

                            // Example 2. Get XML logical structure for the page.
                            if (example2_xml)
                            {
                                String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                                ConsoleLog += "\n\n- GetAsXML  --------------------------\n" + text;
                                ConsoleLog += "-----------------------------------------------------------";
                            }

                            // Example 3. Extract words one by one.
                            if (example3_wordlist)
                            {
                                TextExtractor.Word word;
                                for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                                {
                                    for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                    {
                                        ConsoleLog += word.GetString();
                                    }
                                }
                                ConsoleLog += "-----------------------------------------------------------";
                            }

                            // Example 4. A more advanced text extraction example.
                            // The output is XML structure containing paragraphs, lines, words,
                            // as well as style and positioning information.
                            if (example4_advanced)
                            {
                                Rect bbox;
                                int  cur_flow_id = -1, cur_para_id = -1;

                                TextExtractor.Line  line;
                                TextExtractor.Word  word;
                                TextExtractor.Style s, line_style;

                                // For each line on the page...
                                for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                                {
                                    if (line.GetNumWords() == 0)
                                    {
                                        continue;
                                    }

                                    // A new flow closes the open paragraph and flow, if any.
                                    if (cur_flow_id != line.GetFlowID())
                                    {
                                        if (cur_flow_id != -1)
                                        {
                                            if (cur_para_id != -1)
                                            {
                                                cur_para_id = -1;
                                                ConsoleLog += "</Para>";
                                            }
                                            ConsoleLog += "</Flow>";
                                        }
                                        cur_flow_id = line.GetFlowID();
                                        ConsoleLog += "<Flow id=\"" + cur_flow_id + "\">";
                                    }

                                    if (cur_para_id != line.GetParagraphID())
                                    {
                                        if (cur_para_id != -1)
                                        {
                                            ConsoleLog += "</Para>";
                                        }
                                        cur_para_id = line.GetParagraphID();
                                        ConsoleLog += "<Para id=\"" + cur_para_id + "\">";
                                    }

                                    bbox       = line.GetBBox();
                                    line_style = line.GetStyle();
                                    // Same x1, y1, x2, y2 order as the word boxes below; the tag stays
                                    // open so that the style output can land inside it.
                                    ConsoleLog += string.Format("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
                                    // NOTE(review): PrintStyle is defined elsewhere; presumably it emits the
                                    // style attribute for the open tag - confirm which sink it writes to.
                                    PrintStyle(line_style);
                                    ConsoleLog += ">\n";

                                    // For each word in the line...
                                    for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                    {
                                        // Skip empty words before anything is written, so no
                                        // unterminated <Word ...> tag is left in the output.
                                        int sz = word.GetStringLen();
                                        if (sz == 0)
                                        {
                                            continue;
                                        }

                                        // Output the bounding box for the word.
                                        bbox        = word.GetBBox();
                                        ConsoleLog += string.Format("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2);

                                        // If the word style is different from the parent style, output the new style.
                                        s = word.GetStyle();
                                        if (s != line_style)
                                        {
                                            PrintStyle(s);
                                        }

                                        ConsoleLog += ">\n" + word.GetString();
                                        ConsoleLog += "</Word>";
                                    }
                                    ConsoleLog += "</Line>";
                                }

                                // Close whatever paragraph/flow is still open after the last line.
                                if (cur_flow_id != -1)
                                {
                                    if (cur_para_id != -1)
                                    {
                                        cur_para_id = -1;
                                        ConsoleLog += "</Para>";
                                    }
                                    ConsoleLog += "</Flow>";
                                }
                            }
                        }
                    }
                    ConsoleLog += "Done.";
                }
                catch (PDFNetException e)
                {
                    ConsoleLog += e.Message;
                }

                // Sample code showing how to use low-level text extraction APIs.
                if (example5_low_level)
                {
                    try
                    {
                        LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                        using (PDFDoc doc = new PDFDoc(input_path))
                        {
                            doc.InitSecurityHandler();

                            // Example 1. Extract all text content from the document
                            using (ElementReader reader = new ElementReader())
                            {
                                PageIterator itr = doc.GetPageIterator();
                                // Only the iterator's current (first) page is read; the loop is disabled.
                                //for (; itr.HasNext(); itr.Next()) //  Read every page
                                {
                                    reader.Begin(itr.Current());

                                    // A separate helper instance, so that only this dump's log is appended.
                                    LowLevelTextExtractUtils u = new LowLevelTextExtractUtils();
                                    u.DumpAllText(reader);
                                    ConsoleLog += u.ConsoleLog;
                                    reader.End();
                                }

                                // Example 2. Extract text based on the selection rectangle.
                                ConsoleLog += "----------------------------------------------------";
                                ConsoleLog += "Extract text based on the selection rectangle.";
                                ConsoleLog += "----------------------------------------------------";

                                Page   first_page = doc.GetPage(1);
                                string field1     = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                                string field2     = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                                string field3     = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                                ConsoleLog += "Field 1: " + field1;
                                ConsoleLog += "Field 2: " + field2;
                                ConsoleLog += "Field 3: " + field3;
                                // ...
                            }
                        }
                        ConsoleLog += "Done.";
                    }
                    catch (PDFNetException e)
                    {
                        ConsoleLog += e.Message;
                    }
                }
            }
            finally
            {
                // Runs on the early return and on unexpected exceptions as well.
                PDFNet.Terminate();
            }
        }

Example #2

Show file

File: TextExtractTest.cs Project: sntshmani/jimbeam_mylabel

        /// <summary>
        /// Entry point of the text-extraction sample: opens "newsletter.pdf" from the test
        /// folder and prints the output of the enabled examples to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <remarks>
        /// The local <c>exampleN_*</c> flags select which examples run. A
        /// <c>PDFNetException</c> from either part is caught and its message is printed.
        /// </remarks>
        static void Main(string[] args)
        {
            PDFNet.Initialize();

            // Relative path to the folder containing test files.
            string input_path = "../../TestFiles/";

            bool example1_basic     = false;
            bool example2_xml       = false;
            bool example3_wordlist  = false;
            bool example4_advanced  = true;
            bool example5_low_level = false;

            // Sample code showing how to use high-level text extraction APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                {
                    doc.InitSecurityHandler();

                    Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        Console.WriteLine("Page not found.");
                        return;
                    }

                    using (TextExtractor txt = new TextExtractor())
                    {
                        txt.Begin(page);                          // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic)
                        {
                            // Get the word count.
                            Console.WriteLine("Word Count: {0}", txt.GetWordCount());

                            Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml)
                        {
                            String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                            Console.WriteLine("\n\n- GetAsXML  --------------------------\n{0}", text);
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist)
                        {
                            TextExtractor.Word word;
                            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    Console.WriteLine(word.GetString());
                                }
                            }
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 4. A more advanced text extraction example.
                        // The output is XML structure containing paragraphs, lines, words,
                        // as well as style and positioning information.
                        if (example4_advanced)
                        {
                            Rect bbox;
                            int  cur_flow_id = -1, cur_para_id = -1;

                            TextExtractor.Line  line;
                            TextExtractor.Word  word;
                            TextExtractor.Style s, line_style;

                            Console.WriteLine("<PDFText>");
                            // For each line on the page...
                            for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                if (line.GetNumWords() == 0)
                                {
                                    continue;
                                }

                                // A new flow closes the open paragraph and flow, if any.
                                if (cur_flow_id != line.GetFlowID())
                                {
                                    if (cur_flow_id != -1)
                                    {
                                        if (cur_para_id != -1)
                                        {
                                            cur_para_id = -1;
                                            Console.WriteLine("</Para>");
                                        }
                                        Console.WriteLine("</Flow>");
                                    }
                                    cur_flow_id = line.GetFlowID();
                                    Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
                                }

                                if (cur_para_id != line.GetParagraphID())
                                {
                                    if (cur_para_id != -1)
                                    {
                                        Console.WriteLine("</Para>");
                                    }
                                    cur_para_id = line.GetParagraphID();
                                    Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
                                }

                                bbox       = line.GetBBox();
                                line_style = line.GetStyle();
                                Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                // PrintStyle is called while the tag is still open; it is defined elsewhere.
                                PrintStyle(line_style);
                                Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

                                // For each word in the line...
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    // Skip empty words before anything is written, so no
                                    // unterminated <Word ...> tag is left in the output.
                                    int sz = word.GetStringLen();
                                    if (sz == 0)
                                    {
                                        continue;
                                    }

                                    // Output the bounding box for the word.
                                    bbox = word.GetBBox();
                                    Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                    Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                                    // If the word style is different from the parent style, output the new style.
                                    s = word.GetStyle();
                                    if (s != line_style)
                                    {
                                        PrintStyle(s);
                                    }

                                    Console.Write(">{0}", word.GetString());
                                    Console.WriteLine("</Word>");
                                }
                                Console.WriteLine("</Line>");
                            }

                            // Close whatever paragraph/flow is still open after the last line.
                            if (cur_flow_id != -1)
                            {
                                if (cur_para_id != -1)
                                {
                                    cur_para_id = -1;
                                    Console.WriteLine("</Para>");
                                }
                                Console.WriteLine("</Flow>");
                            }

                            // Closed here so the root element is only terminated when it was opened.
                            Console.WriteLine("</PDFText>");
                        }
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            // Sample code showing how to use low-level text extraction APIs.
            if (example5_low_level)
            {
                try
                {
                    LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                    using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                    {
                        doc.InitSecurityHandler();

                        // Example 1. Extract all text content from the document
                        using (ElementReader reader = new ElementReader())
                        {
                            PageIterator itr = doc.GetPageIterator();
                            // Only the iterator's current (first) page is read; the loop is disabled.
                            //for (; itr.HasNext(); itr.Next()) //  Read every page
                            {
                                reader.Begin(itr.Current());
                                LowLevelTextExtractUtils.DumpAllText(reader);
                                reader.End();
                            }

                            // Example 2. Extract text based on the selection rectangle.
                            Console.WriteLine("----------------------------------------------------");
                            Console.WriteLine("Extract text based on the selection rectangle.");
                            Console.WriteLine("----------------------------------------------------");

                            Page   first_page = doc.GetPage(1);
                            string field1     = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                            string field2     = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                            string field3     = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                            Console.WriteLine("Field 1: {0}", field1);
                            Console.WriteLine("Field 2: {0}", field2);
                            Console.WriteLine("Field 3: {0}", field3);
                            // ...

                            Console.WriteLine("Done.");
                        }
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
        }