/// <summary>
/// Runs the text-extraction examples selected by the example flags against the PDF at
/// <paramref name="input_path"/> and appends everything they produce to <c>ConsoleLog</c>.
/// </summary>
/// <param name="input_path">Path of the PDF document to open.</param>
/// <remarks>
/// The flags (<c>example1_basic</c> ... <c>example5_low_level</c>) and <c>ConsoleLog</c> are
/// declared outside this method. Only <c>PDFNetException</c> is caught; its message is
/// appended to the log. <c>PDFNet.Terminate()</c> is called on every exit path, including the
/// early return taken when page 1 cannot be found.
/// </remarks>
public void ReadAdvanced(string input_path)
{
    PDFNet.Initialize();
    try
    {
        // Sample code showing how to use high-level text extraction APIs.
        try
        {
            using (PDFDoc doc = new PDFDoc(input_path))
            {
                doc.InitSecurityHandler();

                Page page = doc.GetPage(1);
                if (page == null)
                {
                    ConsoleLog += "Page not found.\n";
                    return;
                }

                // Note: disposing TextExtractor as soon as it is no longer in use can result in
                // increased performance and lower memory consumption.
                using (TextExtractor txt = new TextExtractor())
                {
                    txt.Begin(page); // Read the page.
                    // Other options you may want to consider...
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                    // ...

                    // Example 1. Get all text on the page in a single string.
                    // Words will be separated with space or new line characters.
                    if (example1_basic)
                    {
                        // Get the word count.
                        ConsoleLog += "Word Count: " + txt.GetWordCount() + "\n";
                        ConsoleLog += "\n\n- GetAsText --------------------------\n" + txt.GetAsText() + "\n";
                        ConsoleLog += "-----------------------------------------------------------\n";
                    }

                    // Example 2. Get XML logical structure for the page.
                    if (example2_xml)
                    {
                        String text = txt.GetAsXML(
                            TextExtractor.XMLOutputFlags.e_words_as_elements
                            | TextExtractor.XMLOutputFlags.e_output_bbox
                            | TextExtractor.XMLOutputFlags.e_output_style_info);
                        ConsoleLog += "\n\n- GetAsXML --------------------------\n" + text + "\n";
                        ConsoleLog += "-----------------------------------------------------------\n";
                    }

                    // Example 3. Extract words one by one.
                    if (example3_wordlist)
                    {
                        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                // One word per log line; without the separator the words run together.
                                ConsoleLog += word.GetString() + "\n";
                            }
                        }
                        ConsoleLog += "-----------------------------------------------------------\n";
                    }

                    // Example 4. A more advanced text extraction example.
                    // The output is XML structure containing paragraphs, lines, words,
                    // as well as style and positioning information.
                    if (example4_advanced)
                    {
                        int cur_flow_id = -1, cur_para_id = -1;

                        // For each line on the page...
                        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            if (line.GetNumWords() == 0)
                            {
                                continue;
                            }

                            // A new flow closes any open paragraph and flow before opening itself.
                            if (cur_flow_id != line.GetFlowID())
                            {
                                if (cur_flow_id != -1)
                                {
                                    if (cur_para_id != -1)
                                    {
                                        cur_para_id = -1;
                                        ConsoleLog += "</Para>\n";
                                    }
                                    ConsoleLog += "</Flow>\n";
                                }
                                cur_flow_id = line.GetFlowID();
                                ConsoleLog += "<Flow id=\"" + cur_flow_id + "\">\n";
                            }

                            if (cur_para_id != line.GetParagraphID())
                            {
                                if (cur_para_id != -1)
                                {
                                    ConsoleLog += "</Para>\n";
                                }
                                cur_para_id = line.GetParagraphID();
                                ConsoleLog += "<Para id=\"" + cur_para_id + "\">\n";
                            }

                            Rect bbox = line.GetBBox();
                            TextExtractor.Style line_style = line.GetStyle();
                            ConsoleLog += string.Format(
                                "<Line box=\"{0}, {1}, {2}, {3}\"",
                                bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"),
                                bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                            // NOTE(review): PrintStyle is defined outside this method; confirm it
                            // writes to the same destination as ConsoleLog so the style attributes
                            // land inside the tag opened above.
                            PrintStyle(line_style);
                            ConsoleLog += ">\n";

                            // For each word in the line...
                            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                // Skip empty words before emitting anything, so no unterminated
                                // <Word ...> fragment is left in the output.
                                if (word.GetStringLen() == 0)
                                {
                                    continue;
                                }

                                // Output the bounding box for the word.
                                bbox = word.GetBBox();
                                ConsoleLog += string.Format(
                                    "<Word box=\"{0}, {1}, {2}, {3}\"",
                                    bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"),
                                    bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));

                                // If the word style is different from the parent style, output the new style.
                                TextExtractor.Style s = word.GetStyle();
                                if (s != line_style)
                                {
                                    PrintStyle(s);
                                }

                                ConsoleLog += ">" + word.GetString() + "</Word>\n";
                            }

                            ConsoleLog += "</Line>\n";
                        }

                        // Close whatever is still open after the last line.
                        if (cur_flow_id != -1)
                        {
                            if (cur_para_id != -1)
                            {
                                cur_para_id = -1;
                                ConsoleLog += "</Para>\n";
                            }
                            ConsoleLog += "</Flow>\n";
                        }
                    }
                }
            }

            ConsoleLog += "Done.\n";
        }
        catch (PDFNetException e)
        {
            ConsoleLog += e.Message + "\n";
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level)
        {
            try
            {
                LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                using (PDFDoc doc = new PDFDoc(input_path))
                {
                    doc.InitSecurityHandler();

                    // Example 1. Extract all text content from the document
                    using (ElementReader reader = new ElementReader())
                    {
                        PageIterator itr = doc.GetPageIterator();
                        // Only the page the iterator starts on is dumped. To read every page,
                        // wrap the block below in: for (; itr.HasNext(); itr.Next())
                        {
                            reader.Begin(itr.Current());
                            LowLevelTextExtractUtils u = new LowLevelTextExtractUtils();
                            u.DumpAllText(reader);
                            ConsoleLog += u.ConsoleLog;
                            reader.End();
                        }

                        // Example 2. Extract text based on the selection rectangle.
                        ConsoleLog += "----------------------------------------------------\n";
                        ConsoleLog += "Extract text based on the selection rectangle.\n";
                        ConsoleLog += "----------------------------------------------------\n";

                        Page first_page = doc.GetPage(1);
                        string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                        string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                        string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                        ConsoleLog += "Field 1: " + field1 + "\n";
                        ConsoleLog += "Field 2: " + field2 + "\n";
                        ConsoleLog += "Field 3: " + field3 + "\n";
                        // ...
                    }
                }

                ConsoleLog += "Done.\n";
            }
            catch (PDFNetException e)
            {
                ConsoleLog += e.Message + "\n";
            }
        }
    }
    finally
    {
        // Balance PDFNet.Initialize() on every exit path, including the early return above.
        PDFNet.Terminate();
    }
}
/// <summary>
/// Console entry point for the text-extraction sample. Opens <c>newsletter.pdf</c> from the
/// test-file folder and runs whichever examples are switched on by the local flags below,
/// writing all output to the console.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
/// <remarks>
/// Only <c>PDFNetException</c> is caught; its message is written to the console.
/// <c>PDFNet.Terminate()</c> is called on every exit path to balance <c>PDFNet.Initialize()</c>.
/// </remarks>
static void Main(string[] args)
{
    PDFNet.Initialize();
    try
    {
        // Relative path to the folder containing test files.
        string input_path = "../../TestFiles/";

        bool example1_basic = false;
        bool example2_xml = false;
        bool example3_wordlist = false;
        bool example4_advanced = true;
        bool example5_low_level = false;

        // Sample code showing how to use high-level text extraction APIs.
        try
        {
            using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
            {
                doc.InitSecurityHandler();

                Page page = doc.GetPage(1);
                if (page == null)
                {
                    Console.WriteLine("Page not found.");
                    return;
                }

                using (TextExtractor txt = new TextExtractor())
                {
                    txt.Begin(page); // Read the page.
                    // Other options you may want to consider...
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove);
                    // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text);
                    // ...

                    // Example 1. Get all text on the page in a single string.
                    // Words will be separated with space or new line characters.
                    if (example1_basic)
                    {
                        // Get the word count.
                        Console.WriteLine("Word Count: {0}", txt.GetWordCount());
                        Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText());
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 2. Get XML logical structure for the page.
                    if (example2_xml)
                    {
                        String text = txt.GetAsXML(
                            TextExtractor.XMLOutputFlags.e_words_as_elements
                            | TextExtractor.XMLOutputFlags.e_output_bbox
                            | TextExtractor.XMLOutputFlags.e_output_style_info);
                        Console.WriteLine("\n\n- GetAsXML --------------------------\n{0}", text);
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 3. Extract words one by one.
                    if (example3_wordlist)
                    {
                        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                Console.WriteLine(word.GetString());
                            }
                        }
                        Console.WriteLine("-----------------------------------------------------------");
                    }

                    // Example 4. A more advanced text extraction example.
                    // The output is XML structure containing paragraphs, lines, words,
                    // as well as style and positioning information.
                    if (example4_advanced)
                    {
                        int cur_flow_id = -1, cur_para_id = -1;

                        Console.WriteLine("<PDFText>");

                        // For each line on the page...
                        for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine())
                        {
                            if (line.GetNumWords() == 0)
                            {
                                continue;
                            }

                            // A new flow closes any open paragraph and flow before opening itself.
                            if (cur_flow_id != line.GetFlowID())
                            {
                                if (cur_flow_id != -1)
                                {
                                    if (cur_para_id != -1)
                                    {
                                        cur_para_id = -1;
                                        Console.WriteLine("</Para>");
                                    }
                                    Console.WriteLine("</Flow>");
                                }
                                cur_flow_id = line.GetFlowID();
                                Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id);
                            }

                            if (cur_para_id != line.GetParagraphID())
                            {
                                if (cur_para_id != -1)
                                {
                                    Console.WriteLine("</Para>");
                                }
                                cur_para_id = line.GetParagraphID();
                                Console.WriteLine("<Para id=\"{0}\">", cur_para_id);
                            }

                            Rect bbox = line.GetBBox();
                            TextExtractor.Style line_style = line.GetStyle();
                            Console.Write(
                                "<Line box=\"{0}, {1}, {2}, {3}\"",
                                bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"),
                                bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                            PrintStyle(line_style);
                            Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n");

                            // For each word in the line...
                            for (TextExtractor.Word word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord())
                            {
                                // Skip empty words before emitting anything, so no unterminated
                                // <Word ...> fragment is left in the output.
                                if (word.GetStringLen() == 0)
                                {
                                    continue;
                                }

                                // Output the bounding box for the word.
                                bbox = word.GetBBox();
                                Console.Write(
                                    "<Word box=\"{0}, {1}, {2}, {3}\"",
                                    bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"),
                                    bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00"));
                                Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\"");

                                // If the word style is different from the parent style, output the new style.
                                TextExtractor.Style s = word.GetStyle();
                                if (s != line_style)
                                {
                                    PrintStyle(s);
                                }

                                Console.Write(">{0}", word.GetString());
                                Console.WriteLine("</Word>");
                            }

                            Console.WriteLine("</Line>");
                        }

                        // Close whatever is still open after the last line.
                        if (cur_flow_id != -1)
                        {
                            if (cur_para_id != -1)
                            {
                                cur_para_id = -1;
                                Console.WriteLine("</Para>");
                            }
                            Console.WriteLine("</Flow>");
                        }

                        // The root element is closed in the same branch that opened it, so the
                        // other examples never emit a stray closing tag.
                        Console.WriteLine("</PDFText>");
                    }
                }
            }
        }
        catch (PDFNetException e)
        {
            Console.WriteLine(e.Message);
        }

        // Sample code showing how to use low-level text extraction APIs.
        if (example5_low_level)
        {
            try
            {
                LowLevelTextExtractUtils util = new LowLevelTextExtractUtils();
                using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf"))
                {
                    doc.InitSecurityHandler();

                    // Example 1. Extract all text content from the document
                    using (ElementReader reader = new ElementReader())
                    {
                        PageIterator itr = doc.GetPageIterator();
                        // Only the page the iterator starts on is dumped. To read every page,
                        // wrap the block below in: for (; itr.HasNext(); itr.Next())
                        {
                            reader.Begin(itr.Current());
                            LowLevelTextExtractUtils.DumpAllText(reader);
                            reader.End();
                        }

                        // Example 2. Extract text based on the selection rectangle.
                        Console.WriteLine("----------------------------------------------------");
                        Console.WriteLine("Extract text based on the selection rectangle.");
                        Console.WriteLine("----------------------------------------------------");

                        Page first_page = doc.GetPage(1);
                        string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader);
                        string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader);
                        string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader);

                        Console.WriteLine("Field 1: {0}", field1);
                        Console.WriteLine("Field 2: {0}", field2);
                        Console.WriteLine("Field 3: {0}", field3);
                        // ...

                        Console.WriteLine("Done.");
                    }
                }
            }
            catch (PDFNetException e)
            {
                Console.WriteLine(e.Message);
            }
        }
    }
    finally
    {
        // Balance PDFNet.Initialize() on every exit path, including the early return above.
        PDFNet.Terminate();
    }
}