Ejemplos de TextExtractor.GetWordCount en C# (CSharp)

Lenguaje de programación: C# (CSharp)

Clase / Tipo: TextExtractor

Método / Función: GetWordCount

Ejemplos en hotexamples.com: 2

C# (CSharp) TextExtractor.GetWordCount - 2 ejemplos encontrados. Estos son los ejemplos en C# (CSharp) del mundo real mejor valorados de TextExtractor.GetWordCount extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar / Ocultar

LoadDocumentFromFile(30)

Extract(29)

GetPageCount(22)

Find(19)

SaveTextToFile(18)

FindNext(15)

ExtractText(15)

Dispose(13)

SetExtractionArea(13)

GetText(11)

GetTextFromPage(9)

FindAll(8)

IsValidFileType(6)

Begin(6)

Open(5)

SavePageTextToFile(5)

ExtractLine(4)

SaveTextToStream(4)

ExtractAll(4)

GetAsXML(4)

SavePageTextToStream(4)

GetFirstLine(3)

Reset(3)

TextExtractingWillBePotentiallySlow(3)

ResetExtractionArea(2)

PostImageAsync(2)

LoadProfiles(2)

ToString(2)

GetValue(2)

LoadDocumentFromStream(2)

GetPageRectangle(2)

Filter(2)

GetPageRect_Width(2)

GetPageRect_Height(2)

GetTextFromBitmapAsync(2)

GetWordCount(2)

Replace(1)

NextPage(1)

GetListValues(1)

IsOCRRecommendedForPage(1)

SelectStrategy(1)

ExtractFullText(1)

SupportedFormats(1)

SupportedLanguages(1)

CreateDocument(1)

AddFilter(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: PdfTron.cs Proyecto: AbbasNaqvi/Projects

        /// <summary>
        /// Runs the text-extraction examples selected by the <c>example1_basic</c> ...
        /// <c>example5_low_level</c> flags (declared elsewhere in this class) against the PDF
        /// at <paramref name="input_path"/> and appends everything they produce to
        /// <c>ConsoleLog</c>.
        /// </summary>
        /// <param name="input_path">Path of the PDF document to open.</param>
        /// <remarks>
        /// A <c>PDFNetException</c> raised by either the high-level or the low-level part is
        /// caught and its message is appended to <c>ConsoleLog</c>. If page 1 cannot be obtained
        /// the method logs "Page not found." and returns without running the low-level example.
        /// <c>PDFNet.Terminate()</c> is called on every exit path after a successful
        /// <c>PDFNet.Initialize()</c>.
        /// Log entries are appended back to back; no line breaks are added beyond those
        /// embedded in the literals.
        /// </remarks>
        public void ReadAdvanced(string input_path)
        {
            PDFNet.Initialize();

            try
            {
                // A missing first page aborts the whole run, low-level example included.
                if (!RunHighLevelExamples(input_path))
                {
                    return;
                }

                // Sample code showing how to use low-level text extraction APIs.
                if (example5_low_level)
                {
                    RunLowLevelExample(input_path);
                }
            }
            finally
            {
                // Runs on the early return and on unexpected exceptions as well.
                PDFNet.Terminate();
            }
        }

        // Runs examples 1-4 (high-level TextExtractor API) on page 1 of the document.
        // Returns false only when page 1 is missing; PDFNet errors are logged and yield true.
        private bool RunHighLevelExamples(string input_path)
        {
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path))
                {
                    doc.InitSecurityHandler();

                    Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        ConsoleLog += "Page not found.";
                        return false;
                    }

                    // Disposing the extractor as soon as it is no longer needed can improve
                    // performance and lower memory consumption.
                    using (TextExtractor txt = new TextExtractor())
                    {
                        txt.Begin(page); // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic)
                        {
                            // Get the word count.
                            ConsoleLog += "Word Count: " + txt.GetWordCount();

                            ConsoleLog += "\n\n- GetAsText --------------------------\n" + txt.GetAsText();
                            ConsoleLog += "-----------------------------------------------------------";
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml)
                        {
                            string text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                            ConsoleLog += "\n\n- GetAsXML  --------------------------\n" + text;
                            ConsoleLog += "-----------------------------------------------------------";
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist)
                        {
                            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    ConsoleLog += word.GetString();
                                }
                            }
                            ConsoleLog += "-----------------------------------------------------------";
                        }

                        // Example 4. A more advanced text extraction example.
                        // The output is XML structure containing paragraphs, lines, words,
                        // as well as style and positioning information.
                        if (example4_advanced)
                        {
                            AppendStructuredText(txt);
                        }
                    }
                }

                ConsoleLog += "Done.";
            }
            catch (PDFNetException e)
            {
                ConsoleLog += e.Message;
            }

            return true;
        }

        // Appends a Flow/Para/Line/Word markup rendering of the extracted page to ConsoleLog.
        private void AppendStructuredText(TextExtractor txt)
        {
            // -1 means "no flow / paragraph element is currently open".
            int cur_flow_id = -1, cur_para_id = -1;

            // For each line on the page...
            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
            {
                if (line.GetNumWords() == 0)
                {
                    continue;
                }

                if (cur_flow_id != line.GetFlowID())
                {
                    if (cur_flow_id != -1)
                    {
                        if (cur_para_id != -1)
                        {
                            cur_para_id = -1;
                            ConsoleLog += "</Para>";
                        }
                        ConsoleLog += "</Flow>";
                    }
                    cur_flow_id = line.GetFlowID();
                    ConsoleLog += "<Flow id=\"" + cur_flow_id + "\">";
                }

                if (cur_para_id != line.GetParagraphID())
                {
                    if (cur_para_id != -1)
                    {
                        ConsoleLog += "</Para>";
                    }
                    cur_para_id = line.GetParagraphID();
                    ConsoleLog += "<Para id=\"" + cur_para_id + "\">";
                }

                Rect bbox = line.GetBBox();
                TextExtractor.Style line_style = line.GetStyle();

                // The tag is closed only after PrintStyle has run, mirroring the Word handling below.
                ConsoleLog += "<Line box=\"" + bbox.x1 + ", " + bbox.y1 + ", " + bbox.x2 + ", " + bbox.y2 + "\"";
                PrintStyle(line_style);
                ConsoleLog += ">";

                // For each word in the line...
                for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                {
                    // Skip empty words before emitting anything so no <Word ...> tag is left unclosed.
                    if (word.GetStringLen() == 0)
                    {
                        continue;
                    }

                    // Output the bounding box for the word.
                    bbox = word.GetBBox();
                    ConsoleLog += "<Word box=\"" + bbox.x1 + ", " + bbox.y1 + ", " + bbox.x2 + ", " + bbox.y2 + "\"";

                    // If the word style is different from the parent style, output the new style.
                    TextExtractor.Style s = word.GetStyle();
                    if (s != line_style)
                    {
                        PrintStyle(s);
                    }

                    ConsoleLog += ">\n" + word.GetString();
                    ConsoleLog += "</Word>";
                }
                ConsoleLog += "</Line>";
            }

            // Close whatever flow / paragraph is still open after the last line.
            if (cur_flow_id != -1)
            {
                if (cur_para_id != -1)
                {
                    ConsoleLog += "</Para>";
                }
                ConsoleLog += "</Flow>";
            }
        }

        // Runs the low-level (ElementReader based) examples and appends their output to ConsoleLog.
        private void RunLowLevelExample(string input_path)
        {
            try
            {
                LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                using (PDFDoc doc = new PDFDoc(input_path))
                {
                    doc.InitSecurityHandler();

                    using (ElementReader reader = new ElementReader())
                    {
                        // Example 1. Extract all text content from the document.
                        // Only the page the iterator initially points at is read; loop with
                        // itr.HasNext() / itr.Next() to read every page.
                        PageIterator itr = doc.GetPageIterator();
                        reader.Begin(itr.Current());

                        LowLevelTextExtractUtils u = new LowLevelTextExtractUtils();
                        u.DumpAllText(reader);
                        ConsoleLog += u.ConsoleLog;
                        reader.End();

                        // Example 2. Extract text based on the selection rectangle.
                        ConsoleLog += "----------------------------------------------------";
                        ConsoleLog += "Extract text based on the selection rectangle.";
                        ConsoleLog += "----------------------------------------------------";

                        Page   first_page = doc.GetPage(1);
                        string field1     = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                        string field2     = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                        string field3     = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                        ConsoleLog += "Field 1: " + field1;
                        ConsoleLog += "Field 2: " + field2;
                        ConsoleLog += "Field 3: " + field3;
                        // ...
                    }
                }

                ConsoleLog += "Done.";
            }
            catch (PDFNetException e)
            {
                ConsoleLog += e.Message;
            }
        }

Ejemplo n.º 2

Mostrar archivo

Archivo: TextExtractTest.cs Proyecto: sntshmani/jimbeam_mylabel

        /// <summary>
        /// Entry point of the text-extraction sample. Opens "newsletter.pdf" from the test-file
        /// folder and runs the examples enabled by the local <c>example*</c> flags, writing
        /// their output to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            PDFNet.Initialize();

            // Relative path to the folder containing test files.
            string input_path = "../../TestFiles/";

            bool example1_basic     = false;
            bool example2_xml       = false;
            bool example3_wordlist  = false;
            bool example4_advanced  = true;
            bool example5_low_level = false;

            // Sample code showing how to use high-level text extraction APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                {
                    doc.InitSecurityHandler();

                    Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        Console.WriteLine("Page not found.");
                        return;
                    }

                    using (TextExtractor txt = new TextExtractor())
                    {
                        txt.Begin(page);                          // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic)
                        {
                            // Get the word count.
                            Console.WriteLine("Word Count: {0}", txt.GetWordCount());

                            Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml)
                        {
                            String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                            Console.WriteLine("\n\n- GetAsXML  --------------------------\n{0}", text);
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist)
                        {
                            TextExtractor.Word word;
                            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    Console.WriteLine(word.GetString());
                                }
                            }
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 4. A more advanced text extraction example.
                        // The output is XML structure containing paragraphs, lines, words,
                        // as well as style and positioning information.
                        if (example4_advanced)
                        {
                            Rect bbox;
                            // -1 means "no flow / paragraph element is currently open".
                            int  cur_flow_id = -1, cur_para_id = -1;

                            TextExtractor.Line  line;
                            TextExtractor.Word  word;
                            TextExtractor.Style s, line_style;

                            Console.WriteLine("<PDFText>");
                            // For each line on the page...
                            for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                if (line.GetNumWords() == 0)
                                {
                                    continue;
                                }

                                if (cur_flow_id != line.GetFlowID())
                                {
                                    if (cur_flow_id != -1)
                                    {
                                        if (cur_para_id != -1)
                                        {
                                            cur_para_id = -1;
                                            Console.WriteLine("</Para>");
                                        }
                                        Console.WriteLine("</Flow>");
                                    }
                                    cur_flow_id = line.GetFlowID();
                                    Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
                                }

                                if (cur_para_id != line.GetParagraphID())
                                {
                                    if (cur_para_id != -1)
                                    {
                                        Console.WriteLine("</Para>");
                                    }
                                    cur_para_id = line.GetParagraphID();
                                    Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
                                }

                                bbox       = line.GetBBox();
                                line_style = line.GetStyle();
                                Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                PrintStyle(line_style);
                                Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

                                // For each word in the line...
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    // Skip empty words before emitting anything so no <Word ...> tag is left unclosed.
                                    int sz = word.GetStringLen();
                                    if (sz == 0)
                                    {
                                        continue;
                                    }

                                    // Output the bounding box for the word.
                                    bbox = word.GetBBox();
                                    Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                    Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                                    // If the word style is different from the parent style, output the new style.
                                    s = word.GetStyle();
                                    if (s != line_style)
                                    {
                                        PrintStyle(s);
                                    }

                                    Console.Write(">{0}", word.GetString());
                                    Console.WriteLine("</Word>");
                                }
                                Console.WriteLine("</Line>");
                            }

                            if (cur_flow_id != -1)
                            {
                                if (cur_para_id != -1)
                                {
                                    cur_para_id = -1;
                                    Console.WriteLine("</Para>");
                                }
                                Console.WriteLine("</Flow>");
                            }

                            // Closed here, next to its opening tag, so the other examples never
                            // emit a stray </PDFText>.
                            Console.WriteLine("</PDFText>");
                        }
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            // Sample code showing how to use low-level text extraction APIs.
            if (example5_low_level)
            {
                try
                {
                    LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                    using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                    {
                        doc.InitSecurityHandler();

                        // Example 1. Extract all text content from the document
                        using (ElementReader reader = new ElementReader())
                        {
                            PageIterator itr = doc.GetPageIterator();
                            // Only the page the iterator initially points at is read.
                            //for (; itr.HasNext(); itr.Next()) //  Read every page
                            {
                                reader.Begin(itr.Current());
                                LowLevelTextExtractUtils.DumpAllText(reader);
                                reader.End();
                            }

                            // Example 2. Extract text based on the selection rectangle.
                            Console.WriteLine("----------------------------------------------------");
                            Console.WriteLine("Extract text based on the selection rectangle.");
                            Console.WriteLine("----------------------------------------------------");

                            Page   first_page = doc.GetPage(1);
                            string field1     = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                            string field2     = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                            string field3     = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                            Console.WriteLine("Field 1: {0}", field1);
                            Console.WriteLine("Field 2: {0}", field2);
                            Console.WriteLine("Field 3: {0}", field3);
                            // ...

                            Console.WriteLine("Done.");
                        }
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
        }