Exemplo n.º 1
0
        /// <summary>
        /// Extracts the text of one page by concatenating every non-empty word
        /// reported by <c>TextExtractor</c>.
        /// </summary>
        /// <param name="pdfDoc">Document to read the page from.</param>
        /// <param name="pageNumber">Page number handed to <c>PDFDoc.GetPage</c>.</param>
        /// <returns>
        /// All words of the page joined without any separator (no spaces or line
        /// breaks are inserted), or <c>null</c> when <c>GetPage</c> returns
        /// <c>null</c> for <paramref name="pageNumber"/>.
        /// </returns>
        string GetPdfPageContent(PDFDoc pdfDoc, int pageNumber)
        {
            Page page = pdfDoc.GetPage(pageNumber);

            if (page == null)
            {
                return null;
            }

            StringBuilder pageText = new StringBuilder();

            // The using block releases the extractor even when extraction throws;
            // a trailing Dispose() call would be skipped in that case.
            using (TextExtractor txt = new TextExtractor())
            {
                txt.Begin(page);

                for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                {
                    if (line.GetNumWords() == 0)
                    {
                        continue;
                    }

                    for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                    {
                        if (word.GetStringLen() == 0)
                        {
                            continue;
                        }

                        // Append straight to the builder instead of growing a
                        // temporary string per line; the resulting text is the same.
                        pageText.Append(word.GetString());
                    }
                }
            }

            return pageText.ToString();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs the text-extraction examples selected by the <c>example*</c> flags
        /// against page 1 of a PDF and appends their output to <c>ConsoleLog</c>.
        /// </summary>
        /// <param name="input_path">Path of the PDF file to open.</param>
        /// <remarks>
        /// <c>PDFNetException</c>s are caught and their message is appended to
        /// <c>ConsoleLog</c>. When page 1 cannot be retrieved the method logs
        /// "Page not found." and returns without running the low-level example.
        /// <c>PDFNet.Terminate()</c> is called on every exit path.
        /// </remarks>
        public void ReadAdvanced(string input_path)
        {
            PDFNet.Initialize();

            try
            {
                // Sample code showing how to use high-level text extraction APIs.
                try
                {
                    using (PDFDoc doc = new PDFDoc(input_path))
                    {
                        doc.InitSecurityHandler();

                        Page page = doc.GetPage(1);
                        if (page == null)
                        {
                            ConsoleLog += "Page not found.";
                            return;
                        }

                        using (TextExtractor txt = new TextExtractor())
                        {
                            txt.Begin(page); // Read the page.
                            // Other options you may want to consider...
                            // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                            // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                            // ...

                            // Example 1. Get all text on the page in a single string.
                            // Words will be separated with space or new line characters.
                            if (example1_basic)
                            {
                                // Get the word count.
                                ConsoleLog += string.Format("Word Count: {0}", txt.GetWordCount());

                                ConsoleLog += string.Format("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                                ConsoleLog += "-----------------------------------------------------------";
                            }

                            // Example 2. Get XML logical structure for the page.
                            if (example2_xml)
                            {
                                String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                                ConsoleLog += string.Format("\n\n- GetAsXML  --------------------------\n{0}", text);
                                ConsoleLog += "-----------------------------------------------------------";
                            }

                            // Example 3. Extract words one by one.
                            if (example3_wordlist)
                            {
                                TextExtractor.Word word;
                                for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                                {
                                    for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                    {
                                        ConsoleLog += word.GetString();
                                    }
                                }
                                ConsoleLog += "-----------------------------------------------------------";
                            }

                            // Example 4. A more advanced text extraction example.
                            // The output is XML structure containing paragraphs, lines, words,
                            // as well as style and positioning information.
                            if (example4_advanced)
                            {
                                Rect bbox;
                                int  cur_flow_id = -1, cur_para_id = -1;

                                TextExtractor.Line  line;
                                TextExtractor.Word  word;
                                TextExtractor.Style s, line_style;

                                // For each line on the page...
                                for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                                {
                                    if (line.GetNumWords() == 0)
                                    {
                                        continue;
                                    }

                                    // A new flow closes any open paragraph and the previous flow.
                                    if (cur_flow_id != line.GetFlowID())
                                    {
                                        if (cur_flow_id != -1)
                                        {
                                            if (cur_para_id != -1)
                                            {
                                                cur_para_id = -1;
                                                ConsoleLog += "</Para>";
                                            }
                                            ConsoleLog += "</Flow>";
                                        }
                                        cur_flow_id = line.GetFlowID();
                                        ConsoleLog += string.Format("<Flow id=\"{0}\">", cur_flow_id);
                                    }

                                    if (cur_para_id != line.GetParagraphID())
                                    {
                                        if (cur_para_id != -1)
                                        {
                                            ConsoleLog += "</Para>";
                                        }
                                        cur_para_id = line.GetParagraphID();
                                        ConsoleLog += string.Format("<Para id=\"{0}\">", cur_para_id);
                                    }

                                    bbox       = line.GetBBox();
                                    line_style = line.GetStyle();

                                    // The tag goes to ConsoleLog like all other output of this
                                    // method, and is closed only after the style has been emitted.
                                    // NOTE(review): PrintStyle is defined elsewhere; presumably it
                                    // appends the style attributes to ConsoleLog - confirm.
                                    ConsoleLog += string.Format("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2);
                                    PrintStyle(line_style);
                                    ConsoleLog += ">";

                                    // For each word in the line...
                                    for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                    {
                                        // Skip empty words before writing anything, so that no
                                        // unterminated <Word ...> tag is left in the output.
                                        int sz = word.GetStringLen();
                                        if (sz == 0)
                                        {
                                            continue;
                                        }

                                        // Output the bounding box for the word.
                                        bbox        = word.GetBBox();
                                        ConsoleLog += string.Format("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1, bbox.y1, bbox.x2, bbox.y2);

                                        // If the word style is different from the parent style, output the new style.
                                        s = word.GetStyle();
                                        if (s != line_style)
                                        {
                                            PrintStyle(s);
                                        }

                                        ConsoleLog += ">\n" + word.GetString();
                                        ConsoleLog += "</Word>";
                                    }
                                    ConsoleLog += "</Line>";
                                }

                                // Close whatever is still open after the last line.
                                if (cur_flow_id != -1)
                                {
                                    if (cur_para_id != -1)
                                    {
                                        ConsoleLog += "</Para>";
                                    }
                                    ConsoleLog += "</Flow>";
                                }
                            }
                        }

                        ConsoleLog += "Done.";
                    }
                }
                catch (PDFNetException e)
                {
                    ConsoleLog += e.Message;
                }

                // Sample code showing how to use low-level text extraction APIs.
                if (example5_low_level)
                {
                    try
                    {
                        LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();

                        using (PDFDoc doc = new PDFDoc(input_path))
                        {
                            doc.InitSecurityHandler();

                            using (ElementReader reader = new ElementReader())
                            {
                                // Example 1. Extract all text content from the first page.
                                // Looping with itr.HasNext()/itr.Next() would read every page.
                                PageIterator itr = doc.GetPageIterator();
                                reader.Begin(itr.Current());

                                LowLevelTextExtractUtils u = new LowLevelTextExtractUtils();
                                u.DumpAllText(reader);
                                ConsoleLog += u.ConsoleLog;
                                reader.End();

                                // Example 2. Extract text based on the selection rectangle.
                                ConsoleLog += "----------------------------------------------------";
                                ConsoleLog += "Extract text based on the selection rectangle.";
                                ConsoleLog += "----------------------------------------------------";

                                Page   first_page = doc.GetPage(1);
                                string field1     = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                                string field2     = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                                string field3     = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                                ConsoleLog += string.Format("Field 1: {0}", field1);
                                ConsoleLog += string.Format("Field 2: {0}", field2);
                                ConsoleLog += string.Format("Field 3: {0}", field3);
                                // ...
                            }
                        }

                        ConsoleLog += "Done.";
                    }
                    catch (PDFNetException e)
                    {
                        ConsoleLog += e.Message;
                    }
                }
            }
            finally
            {
                // Runs on the early "Page not found." return as well.
                PDFNet.Terminate();
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Entry point of the text-extraction sample: opens
        /// <c>../../TestFiles/newsletter.pdf</c>, runs the examples enabled by the
        /// local <c>example*</c> flags on page 1 and prints the result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <remarks>
        /// <c>PDFNetException</c>s are caught and their message is printed. When
        /// page 1 cannot be retrieved the method prints "Page not found." and returns.
        /// </remarks>
        static void Main(string[] args)
        {
            PDFNet.Initialize();

            // Relative path to the folder containing test files.
            string input_path = "../../TestFiles/";

            bool example1_basic     = false;
            bool example2_xml       = false;
            bool example3_wordlist  = false;
            bool example4_advanced  = true;
            bool example5_low_level = false;

            // Sample code showing how to use high-level text extraction APIs.
            try
            {
                using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                {
                    doc.InitSecurityHandler();

                    Page page = doc.GetPage(1);
                    if (page == null)
                    {
                        Console.WriteLine("Page not found.");
                        return;
                    }

                    using (TextExtractor txt = new TextExtractor())
                    {
                        txt.Begin(page);                          // Read the page.
                        // Other options you may want to consider...
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                        // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                        // ...

                        // Example 1. Get all text on the page in a single string.
                        // Words will be separated with space or new line characters.
                        if (example1_basic)
                        {
                            // Get the word count.
                            Console.WriteLine("Word Count: {0}", txt.GetWordCount());

                            Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 2. Get XML logical structure for the page.
                        if (example2_xml)
                        {
                            String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info);
                            Console.WriteLine("\n\n- GetAsXML  --------------------------\n{0}", text);
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 3. Extract words one by one.
                        if (example3_wordlist)
                        {
                            TextExtractor.Word word;
                            for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    Console.WriteLine(word.GetString());
                                }
                            }
                            Console.WriteLine("-----------------------------------------------------------");
                        }

                        // Example 4. A more advanced text extraction example.
                        // The output is XML structure containing paragraphs, lines, words,
                        // as well as style and positioning information.
                        if (example4_advanced)
                        {
                            Rect bbox;
                            int  cur_flow_id = -1, cur_para_id = -1;

                            TextExtractor.Line  line;
                            TextExtractor.Word  word;
                            TextExtractor.Style s, line_style;

                            Console.WriteLine("<PDFText>");
                            // For each line on the page...
                            for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                            {
                                if (line.GetNumWords() == 0)
                                {
                                    continue;
                                }

                                // A new flow closes any open paragraph and the previous flow.
                                if (cur_flow_id != line.GetFlowID())
                                {
                                    if (cur_flow_id != -1)
                                    {
                                        if (cur_para_id != -1)
                                        {
                                            cur_para_id = -1;
                                            Console.WriteLine("</Para>");
                                        }
                                        Console.WriteLine("</Flow>");
                                    }
                                    cur_flow_id = line.GetFlowID();
                                    Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
                                }

                                if (cur_para_id != line.GetParagraphID())
                                {
                                    if (cur_para_id != -1)
                                    {
                                        Console.WriteLine("</Para>");
                                    }
                                    cur_para_id = line.GetParagraphID();
                                    Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
                                }

                                bbox       = line.GetBBox();
                                line_style = line.GetStyle();
                                Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                PrintStyle(line_style);
                                Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

                                // For each word in the line...
                                for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                                {
                                    // Skip empty words before writing anything, so that no
                                    // unterminated <Word ...> tag is left in the output.
                                    int sz = word.GetStringLen();
                                    if (sz == 0)
                                    {
                                        continue;
                                    }

                                    // Output the bounding box for the word.
                                    bbox = word.GetBBox();
                                    Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                    Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                                    // If the word style is different from the parent style, output the new style.
                                    s = word.GetStyle();
                                    if (s != line_style)
                                    {
                                        PrintStyle(s);
                                    }

                                    Console.Write(">{0}", word.GetString());
                                    Console.WriteLine("</Word>");
                                }
                                Console.WriteLine("</Line>");
                            }

                            // Close whatever is still open after the last line.
                            if (cur_flow_id != -1)
                            {
                                if (cur_para_id != -1)
                                {
                                    Console.WriteLine("</Para>");
                                }
                                Console.WriteLine("</Flow>");
                            }

                            // The root element is closed in the same branch that opened it,
                            // so no stray </PDFText> is printed when this example is disabled.
                            Console.WriteLine("</PDFText>");
                        }
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }

            // Sample code showing how to use low-level text extraction APIs.
            if (example5_low_level)
            {
                try
                {
                    LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                    using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                    {
                        doc.InitSecurityHandler();

                        // Example 1. Extract all text content from the first page.
                        // Looping with itr.HasNext()/itr.Next() would read every page.
                        using (ElementReader reader = new ElementReader())
                        {
                            PageIterator itr = doc.GetPageIterator();
                            reader.Begin(itr.Current());
                            LowLevelTextExtractUtils.DumpAllText(reader);
                            reader.End();

                            // Example 2. Extract text based on the selection rectangle.
                            Console.WriteLine("----------------------------------------------------");
                            Console.WriteLine("Extract text based on the selection rectangle.");
                            Console.WriteLine("----------------------------------------------------");

                            Page   first_page = doc.GetPage(1);
                            string field1     = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                            string field2     = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                            string field3     = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                            Console.WriteLine("Field 1: {0}", field1);
                            Console.WriteLine("Field 2: {0}", field2);
                            Console.WriteLine("Field 3: {0}", field3);
                            // ...

                            Console.WriteLine("Done.");
                        }
                    }
                }
                catch (PDFNetException e)
                {
                    Console.WriteLine(e.Message);
                }
            }
        }