public void RunPdfTron(string input_path) { PDFNet.Initialize(); // string output_path = "../../../../TestFiles/Output/"; try { // Open the test file PDFDoc doc = new PDFDoc(input_path); doc.InitSecurityHandler(); PageIterator itr; ElementReader page_reader = new ElementReader(); for (itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) // Read every page { int pageno = itr.GetPageNumber(); page_reader.Begin(itr.Current()); ProcessElements(page_reader); page_reader.End(); } page_reader.Dispose(); // Calling Dispose() on ElementReader/Writer/Builder can result in increased performance and lower memory consumption. doc.Close(); } catch (PDFNetException e) { ConsoleLog += e.Message; } PDFNet.Terminate(); }
static void Main(string[] args) { PDFNet.Initialize(); // Relative path to the folder containing test files. string input_path = "../../TestFiles/"; try { Console.WriteLine("-------------------------------------------------"); Console.WriteLine("Sample 1 - Extract text data from all pages in the document."); // Open the test file Console.WriteLine("Opening the input pdf..."); using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) using (ElementReader page_reader = new ElementReader()) { doc.InitSecurityHandler(); PageIterator itr; for (itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) // Read every page { page_reader.Begin(itr.Current()); ProcessElements(page_reader); page_reader.End(); } Console.WriteLine("Done."); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } }
/// <summary> /// extract text in the pdf page /// </summary> void ExtractText() { _charCode = -1; _pageMatrix = PdfPage.GetDefaultMatrix(); _pageHeight = PdfPage.GetPageHeight(); _textLines = new PdfLineOfText[((int)_pageHeight >> 2) + 8]; using (ElementReader page_reader = new ElementReader()) { page_reader.Begin(PdfPage); ProcessElements(page_reader); } PdfLineOfText preLt = null; for (int i = 0; i < _textLines.Length; i++) { var lt = _textLines[i]; if (preLt != null && lt != null && preLt.FirstChar.Top - lt.FirstChar.Top < 3) { preLt.AddPdfChars(lt.Chars); _textLines[i] = lt = null; } preLt = lt; } }
// A utility method used to extract all text content from // a given selection rectangle. The rectangle coordinates are // expressed in PDF user/page coordinate system. public string ReadTextFromRect(Page page, Rect pos, ElementReader reader) { _srch_str = ""; reader.Begin(page); RectTextSearch(reader, pos); reader.End(); return(_srch_str); }
static void Main(string[] args) { PDFNet.Initialize(); // Relative path to the folder containing test files. string input_path = "../../TestFiles/"; string output_path = "../../TestFiles/Output/"; string input_filename = "newsletter.pdf"; string output_filename = "newsletter_edited.pdf"; try { Console.WriteLine("Opening the input file..."); using (PDFDoc doc = new PDFDoc(input_path + input_filename)) { doc.InitSecurityHandler(); ElementWriter writer = new ElementWriter(); ElementReader reader = new ElementReader(); XSet visited = new XSet(); PageIterator itr = doc.GetPageIterator(); while (itr.HasNext()) { try { Page page = itr.Current(); visited.Add(page.GetSDFObj().GetObjNum()); reader.Begin(page); writer.Begin(page, ElementWriter.WriteMode.e_replacement, false, true, page.GetResourceDict()); ProcessElements(reader, writer, visited); writer.End(); reader.End(); itr.Next(); } catch (PDFNetException e) { Console.WriteLine(e.Message); } } doc.Save(output_path + output_filename, SDFDoc.SaveOptions.e_remove_unused); Console.WriteLine("Done. Result saved in {0}...", output_filename); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } }
static void Main(string[] args) { try { PDFNet.Initialize(); Console.WriteLine("-------------------------------------------------"); Console.WriteLine("Extract page element information from all "); Console.WriteLine("pages in the document."); // Open the test file using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) { doc.InitSecurityHandler(); int pgnum = doc.GetPageCount(); PageIterator itr; using (ElementReader page_reader = new ElementReader()) { for (itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) // Read every page { Console.WriteLine("Page {0:d}----------------------------------------", itr.GetPageNumber()); Rect crop_box = itr.Current().GetCropBox(); crop_box.Normalize(); // Console.WriteLine(" Page Rectangle: x={0:f} y={1:f} x2={2:f} y2={3:f}", crop_box.x1, crop_box.y1, crop_box.x2, crop_box.y2); // Console.WriteLine(" Page Size: width={0:f} height={1:f}", crop_box.Width(), crop_box.Height()); page_reader.Begin(itr.Current()); ProcessElements(page_reader); page_reader.End(); } } Console.WriteLine("Done."); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } }
public void ReadTextFromCoordinates(string input_path, int pagenumber, int urx, int ury, int llx, int lly) { PDFDoc doc = new PDFDoc(input_path); doc.InitSecurityHandler(); Page page = doc.GetPage(pagenumber); ElementReader reader = new ElementReader(); PageIterator itr = doc.GetPageIterator(); reader.Begin(itr.Current()); LowLevelTextExtractUtils u = new LowLevelTextExtractUtils(); //u.DumpAllText(reader); //ConsoleLog += u.ConsoleLog; //reader.End(); string field3 = u.ReadTextFromRect(page, new Rect(urx, ury, llx, lly), reader); ConsoleLog = field3; reader.Dispose(); doc.Close(); }
double GetBottomTextYValue(int pageNum, double bottomAxisValue, double topAxisValue, bool isMatchNum) { Page curPage = _pdfDoc.GetPage(pageNum); List <TextAndAxisValue> values = new List <TextAndAxisValue>(); using (ElementReader pageReader = new ElementReader()) { pageReader.Begin(curPage); GetTextElement(pageNum, pageReader, values, bottomAxisValue, topAxisValue, isMatchNum); } Merge(values); Filtrate(values); double bottomValue = bottomAxisValue; if (values.Count > 0) { double value = values.Min(x => x.yValue) - 15; if (value > 0) { bottomValue = value; } } return(bottomValue); }
public SortedDictionary <double, FormLineList>[] GetFormLines(int pageNum, LinePos startPos, LinePos endPos , bool isSubsequentPage, SortedDictionary <double, FormLineList> lastPageVerticalLines) { //Get the information of lines and extreme points by travelsaling the page horizontalLines = new SortedDictionary <double, FormLineList>(); verticalLines = new SortedDictionary <double, FormLineList>(); _pageNum = pageNum; _startPos = startPos; _endPos = endPos; Page page = _pdfDoc.GetPage(pageNum); pageSize = PdfTronHelper.GetPageSize(page); topBound.AxisValue = startPos == null ? pageSize[1] : startPos.AxisValue; bottomBound.AxisValue = endPos == null ? 0 : endPos.AxisValue; ii = 0; using (ElementReader page_reader = new ElementReader()) { page_reader.Begin(page); ProcessElements(page_reader); } RemoveLittleLines(horizontalLines); RemoveLittleLines(verticalLines); //Remove short lines. Rect posRect = GetTablePosRect(_pageNum, _startPos, _endPos, isSubsequentPage); RemoveTooShortAndTooLongLines(page, horizontalLines, true, posRect); bool isNotNeedGenerateVerLines = isSubsequentPage && lastPageVerticalLines != null && verticalLines.Count == lastPageVerticalLines.Count; if (!isNotNeedGenerateVerLines) { RemoveTooShortAndTooLongLines(page, verticalLines, false, posRect); } //Generate drawed lines. Rect areaRect; bool existRealRect = horizontalLines.Count > 1 && verticalLines.Count > 1 && IsRect(posRect, horizontalLines, verticalLines); areaRect = existRealRect ? GenerateRectByLines() : posRect; if (existRealRect || isSubsequentPage) { RemoveLinesNotInRect(areaRect, horizontalLines, verticalLines); isNotNeedGenerateVerLines = isSubsequentPage && lastPageVerticalLines != null && verticalLines.Count == lastPageVerticalLines.Count; } else { posRect = GetTablePosRect(_pageNum, _startPos, _endPos, true); } FormLineGenerator.RemoveSpareNearLines(verticalLines, false); if (!isNotNeedGenerateVerLines) { isNotNeedGenerateVerLines = isSubsequentPage && lastPageVerticalLines != null && verticalLines.Count == lastPageVerticalLines.Count; } FormLineGenerator lineGenerator = new FormLineGenerator(page, areaRect); SortedDictionary <double, FormLineList>[] lines = lineGenerator.GetFormLines(existRealRect, horizontalLines, verticalLines, isNotNeedGenerateVerLines); return(lines); }
static void Main(string[] args) { PDFNet.Initialize(); // Relative path to the folder containing test files. string input_path = "../../TestFiles/"; string output_path = "../../TestFiles/Output/"; try { using (PDFDoc doc = new PDFDoc()) using (ElementBuilder eb = new ElementBuilder()) // ElementBuilder is used to build new Element objects using (ElementWriter writer = new ElementWriter()) // ElementWriter is used to write Elements to the page { // Start a new page ------------------------------------ // Position an image stream on several places on the page Page page = doc.PageCreate(new Rect(0, 0, 612, 794)); writer.Begin(page); // begin writing to this page // Create an Image that can be reused multiple times in the document or // multiple on the same page. MappedFile img_file = new MappedFile(input_path + "peppers.jpg"); FilterReader img_data = new FilterReader(img_file); Image img = Image.Create(doc, img_data, 400, 600, 8, ColorSpace.CreateDeviceRGB(), Image.InputFilter.e_jpeg); Element element = eb.CreateImage(img, new Matrix2D(200, -145, 20, 300, 200, 150)); writer.WritePlacedElement(element); GState gstate = element.GetGState(); // use the same image (just change its matrix) gstate.SetTransform(200, 0, 0, 300, 50, 450); writer.WritePlacedElement(element); // use the same image again (just change its matrix). writer.WritePlacedElement(eb.CreateImage(img, 300, 600, 200, -150)); writer.End(); // save changes to the current page doc.PagePushBack(page); // Start a new page ------------------------------------ // Construct and draw a path object using different styles page = doc.PageCreate(new Rect(0, 0, 612, 794)); writer.Begin(page); // begin writing to this page eb.Reset(); // Reset GState to default eb.PathBegin(); // start constructing the path eb.MoveTo(306, 396); eb.CurveTo(681, 771, 399.75, 864.75, 306, 771); eb.CurveTo(212.25, 864.75, -69, 771, 306, 396); eb.ClosePath(); element = eb.PathEnd(); // the path is now finished element.SetPathFill(true); // the path should be filled // Set the path color space and color gstate = element.GetGState(); gstate.SetFillColorSpace(ColorSpace.CreateDeviceCMYK()); gstate.SetFillColor(new ColorPt(1, 0, 0, 0)); // cyan gstate.SetTransform(0.5, 0, 0, 0.5, -20, 300); writer.WritePlacedElement(element); // Draw the same path using a different stroke color element.SetPathStroke(true); // this path is should be filled and stroked gstate.SetFillColor(new ColorPt(0, 0, 1, 0)); // yellow gstate.SetStrokeColorSpace(ColorSpace.CreateDeviceRGB()); gstate.SetStrokeColor(new ColorPt(1, 0, 0)); // red gstate.SetTransform(0.5, 0, 0, 0.5, 280, 300); gstate.SetLineWidth(20); writer.WritePlacedElement(element); // Draw the same path with with a given dash pattern element.SetPathFill(false); // this path is should be only stroked gstate.SetStrokeColor(new ColorPt(0, 0, 1)); // blue gstate.SetTransform(0.5, 0, 0, 0.5, 280, 0); double[] dash_pattern = { 30 }; gstate.SetDashPattern(dash_pattern, 0); writer.WritePlacedElement(element); // Use the path as a clipping path writer.WriteElement(eb.CreateGroupBegin()); // Save the graphics state // Start constructing a new path (the old path was lost when we created // a new Element using CreateGroupBegin()). eb.PathBegin(); eb.MoveTo(306, 396); eb.CurveTo(681, 771, 399.75, 864.75, 306, 771); eb.CurveTo(212.25, 864.75, -69, 771, 306, 396); eb.ClosePath(); element = eb.PathEnd(); // path is now built element.SetPathClip(true); // this path is a clipping path element.SetPathStroke(true); // this path is should be filled and stroked gstate = element.GetGState(); gstate.SetTransform(0.5, 0, 0, 0.5, -20, 0); writer.WriteElement(element); writer.WriteElement(eb.CreateImage(img, 100, 300, 400, 600)); writer.WriteElement(eb.CreateGroupEnd()); // Restore the graphics state writer.End(); // save changes to the current page doc.PagePushBack(page); // Start a new page ------------------------------------ page = doc.PageCreate(new Rect(0, 0, 612, 794)); writer.Begin(page); // begin writing to this page eb.Reset(); // Reset GState to default // Begin writing a block of text element = eb.CreateTextBegin(Font.Create(doc, Font.StandardType1Font.e_times_roman), 12); writer.WriteElement(element); string data = "Hello World!"; element = eb.CreateTextRun(data); element.SetTextMatrix(10, 0, 0, 10, 0, 600); element.GetGState().SetLeading(15); // Set the spacing between lines writer.WriteElement(element); writer.WriteElement(eb.CreateTextNewLine()); // New line element = eb.CreateTextRun(data); gstate = element.GetGState(); gstate.SetTextRenderMode(GState.TextRenderingMode.e_stroke_text); gstate.SetCharSpacing(-1.25); gstate.SetWordSpacing(-1.25); writer.WriteElement(element); writer.WriteElement(eb.CreateTextNewLine()); // New line element = eb.CreateTextRun(data); gstate = element.GetGState(); gstate.SetCharSpacing(0); gstate.SetWordSpacing(0); gstate.SetLineWidth(3); gstate.SetTextRenderMode(GState.TextRenderingMode.e_fill_stroke_text); gstate.SetStrokeColorSpace(ColorSpace.CreateDeviceRGB()); gstate.SetStrokeColor(new ColorPt(1, 0, 0)); // red gstate.SetFillColorSpace(ColorSpace.CreateDeviceCMYK()); gstate.SetFillColor(new ColorPt(1, 0, 0, 0)); // cyan writer.WriteElement(element); writer.WriteElement(eb.CreateTextNewLine()); // New line // Set text as a clipping path to the image. element = eb.CreateTextRun(data); gstate = element.GetGState(); gstate.SetTextRenderMode(GState.TextRenderingMode.e_clip_text); writer.WriteElement(element); // Finish the block of text writer.WriteElement(eb.CreateTextEnd()); // Draw an image that will be clipped by the above text writer.WriteElement(eb.CreateImage(img, 10, 100, 1300, 720)); writer.End(); // save changes to the current page doc.PagePushBack(page); // Start a new page ------------------------------------ // // The example illustrates how to embed the external font in a PDF document. // The example also shows how ElementReader can be used to copy and modify // Elements between pages. using (ElementReader reader = new ElementReader()) { // Start reading Elements from the last page. We will copy all Elements to // a new page but will modify the font associated with text. reader.Begin(doc.GetPage(doc.GetPageCount())); page = doc.PageCreate(new Rect(0, 0, 1300, 794)); writer.Begin(page); // begin writing to this page eb.Reset(); // Reset GState to default // Embed an external font in the document. Font font = Font.CreateTrueTypeFont(doc, input_path + "font.ttf"); while ((element = reader.Next()) != null) // Read page contents { if (element.GetType() == Element.Type.e_text) { element.GetGState().SetFont(font, 12); } writer.WriteElement(element); } reader.End(); writer.End(); // save changes to the current page doc.PagePushBack(page); // Start a new page ------------------------------------ // // The example illustrates how to embed the external font in a PDF document. // The example also shows how ElementReader can be used to copy and modify // Elements between pages. // Start reading Elements from the last page. We will copy all Elements to // a new page but will modify the font associated with text. reader.Begin(doc.GetPage(doc.GetPageCount())); page = doc.PageCreate(new Rect(0, 0, 1300, 794)); writer.Begin(page); // begin writing to this page eb.Reset(); // Reset GState to default // Embed an external font in the document. Font font2 = Font.CreateType1Font(doc, input_path + "Misc-Fixed.pfa"); while ((element = reader.Next()) != null) // Read page contents { if (element.GetType() == Element.Type.e_text) { element.GetGState().SetFont(font2, 12); } writer.WriteElement(element); } reader.End(); writer.End(); // save changes to the current page doc.PagePushBack(page); // Start a new page ------------------------------------ page = doc.PageCreate(); writer.Begin(page); // begin writing to this page eb.Reset(); // Reset GState to default // Begin writing a block of text element = eb.CreateTextBegin(Font.Create(doc, Font.StandardType1Font.e_times_roman), 12); element.SetTextMatrix(1.5, 0, 0, 1.5, 50, 600); element.GetGState().SetLeading(15); // Set the spacing between lines writer.WriteElement(element); string para = "A PDF text object consists of operators that can show " + "text strings, move the text position, and set text state and certain " + "other parameters. In addition, there are three parameters that are " + "defined only within a text object and do not persist from one text " + "object to the next: Tm, the text matrix, Tlm, the text line matrix, " + "Trm, the text rendering matrix, actually just an intermediate result " + "that combines the effects of text state parameters, the text matrix " + "(Tm), and the current transformation matrix"; int para_end = para.Length; int text_run = 0; int text_run_end; double para_width = 300; // paragraph width is 300 units double cur_width = 0; while (text_run < para_end) { text_run_end = para.IndexOf(' ', text_run); if (text_run_end < 0) { text_run_end = para_end - 1; } string text = para.Substring(text_run, text_run_end - text_run + 1); element = eb.CreateTextRun(text); if (cur_width + element.GetTextLength() < para_width) { writer.WriteElement(element); cur_width += element.GetTextLength(); } else { writer.WriteElement(eb.CreateTextNewLine()); // New line text = para.Substring(text_run, text_run_end - text_run + 1); element = eb.CreateTextRun(text); cur_width = element.GetTextLength(); writer.WriteElement(element); } text_run = text_run_end + 1; } // ----------------------------------------------------------------------- // The following code snippet illustrates how to adjust spacing between // characters (text runs). element = eb.CreateTextNewLine(); writer.WriteElement(element); // Skip 2 lines writer.WriteElement(element); writer.WriteElement(eb.CreateTextRun("An example of space adjustments between inter-characters:")); writer.WriteElement(eb.CreateTextNewLine()); // Write string "AWAY" without space adjustments between characters. element = eb.CreateTextRun("AWAY"); writer.WriteElement(element); writer.WriteElement(eb.CreateTextNewLine()); // Write string "AWAY" with space adjustments between characters. element = eb.CreateTextRun("A"); writer.WriteElement(element); element = eb.CreateTextRun("W"); element.SetPosAdjustment(140); writer.WriteElement(element); element = eb.CreateTextRun("A"); element.SetPosAdjustment(140); writer.WriteElement(element); element = eb.CreateTextRun("Y again"); element.SetPosAdjustment(115); writer.WriteElement(element); // Draw the same strings using direct content output... writer.Flush(); // flush pending Element writing operations. // You can also write page content directly to the content stream using // ElementWriter.WriteString(...) and ElementWriter.WriteBuffer(...) methods. // Note that if you are planning to use these functions you need to be familiar // with PDF page content operators (see Appendix A in PDF Reference Manual). // Because it is easy to make mistakes during direct output we recommend that // you use ElementBuilder and Element interface instead. writer.WriteString("T* T* "); // New Lines // writer.WriteElement(eb.CreateTextNewLine()); writer.WriteString("(Direct output to PDF page content stream:) Tj T* "); writer.WriteString("(AWAY) Tj T* "); writer.WriteString("[(A)140(W)140(A)115(Y again)] TJ "); // Finish the block of text writer.WriteElement(eb.CreateTextEnd()); writer.End(); // save changes to the current page doc.PagePushBack(page); // Start a new page ------------------------------------ // Image Masks // // In the opaque imaging model, images mark all areas they occupy on the page as // if with opaque paint. All portions of the image, whether black, white, gray, // or color, completely obscure any marks that may previously have existed in the // same place on the page. // In the graphic arts industry and page layout applications, however, it is common // to crop or 'mask out' the background of an image and then place the masked image // on a different background, allowing the existing background to show through the // masked areas. This sample illustrates how to use image masks. page = doc.PageCreate(); writer.Begin(page); // begin writing to the page // Create the Image Mask MappedFile imgf = new MappedFile(input_path + "imagemask.dat"); FilterReader mask_read = new FilterReader(imgf); ColorSpace device_gray = ColorSpace.CreateDeviceGray(); Image mask = Image.Create(doc, mask_read, 64, 64, 1, device_gray, Image.InputFilter.e_ascii_hex); mask.GetSDFObj().PutBool("ImageMask", true); element = eb.CreateRect(0, 0, 612, 794); element.SetPathStroke(false); element.SetPathFill(true); element.GetGState().SetFillColorSpace(device_gray); element.GetGState().SetFillColor(new ColorPt(0.8)); writer.WritePlacedElement(element); element = eb.CreateImage(mask, new Matrix2D(200, 0, 0, -200, 40, 680)); element.GetGState().SetFillColor(new ColorPt(0.1)); writer.WritePlacedElement(element); element.GetGState().SetFillColorSpace(ColorSpace.CreateDeviceRGB()); element.GetGState().SetFillColor(new ColorPt(1, 0, 0)); element = eb.CreateImage(mask, new Matrix2D(200, 0, 0, -200, 320, 680)); writer.WritePlacedElement(element); element.GetGState().SetFillColor(new ColorPt(0, 1, 0)); element = eb.CreateImage(mask, new Matrix2D(200, 0, 0, -200, 40, 380)); writer.WritePlacedElement(element); { // This sample illustrates Explicit Masking. img = Image.Create(doc, input_path + "peppers.jpg"); // mask is the explicit mask for the primary (base) image img.SetMask(mask); element = eb.CreateImage(img, new Matrix2D(200, 0, 0, -200, 320, 380)); writer.WritePlacedElement(element); } writer.End(); // save changes to the current page doc.PagePushBack(page); // Transparency sample ---------------------------------- // Start a new page ------------------------------------- page = doc.PageCreate(); writer.Begin(page); // begin writing to this page eb.Reset(); // Reset the GState to default // Write some transparent text at the bottom of the page. element = eb.CreateTextBegin(Font.Create(doc, Font.StandardType1Font.e_times_roman), 100); // Set the text knockout attribute. Text knockout must be set outside of // the text group. gstate = element.GetGState(); gstate.SetTextKnockout(false); gstate.SetBlendMode(GState.BlendMode.e_bl_difference); writer.WriteElement(element); element = eb.CreateTextRun("Transparency"); element.SetTextMatrix(1, 0, 0, 1, 30, 30); gstate = element.GetGState(); gstate.SetFillColorSpace(ColorSpace.CreateDeviceCMYK()); gstate.SetFillColor(new ColorPt(1, 0, 0, 0)); gstate.SetFillOpacity(0.5); writer.WriteElement(element); // Write the same text on top the old; shifted by 3 points element.SetTextMatrix(1, 0, 0, 1, 33, 33); gstate.SetFillColor(new ColorPt(0, 1, 0, 0)); gstate.SetFillOpacity(0.5); writer.WriteElement(element); writer.WriteElement(eb.CreateTextEnd()); // Draw three overlapping transparent circles. eb.PathBegin(); // start constructing the path eb.MoveTo(459.223, 505.646); eb.CurveTo(459.223, 415.841, 389.85, 343.04, 304.273, 343.04); eb.CurveTo(218.697, 343.04, 149.324, 415.841, 149.324, 505.646); eb.CurveTo(149.324, 595.45, 218.697, 668.25, 304.273, 668.25); eb.CurveTo(389.85, 668.25, 459.223, 595.45, 459.223, 505.646); element = eb.PathEnd(); element.SetPathFill(true); gstate = element.GetGState(); gstate.SetFillColorSpace(ColorSpace.CreateDeviceRGB()); gstate.SetFillColor(new ColorPt(0, 0, 1)); // Blue Circle gstate.SetBlendMode(GState.BlendMode.e_bl_normal); gstate.SetFillOpacity(0.5); writer.WriteElement(element); // Translate relative to the Blue Circle gstate.SetTransform(1, 0, 0, 1, 113, -185); gstate.SetFillColor(new ColorPt(0, 1, 0)); // Green Circle gstate.SetFillOpacity(0.5); writer.WriteElement(element); // Translate relative to the Green Circle gstate.SetTransform(1, 0, 0, 1, -220, 0); gstate.SetFillColor(new ColorPt(1, 0, 0)); // Red Circle gstate.SetFillOpacity(0.5); writer.WriteElement(element); writer.End(); // save changes to the current page doc.PagePushBack(page); // End page ------------------------------------ } doc.Save(output_path + "element_builder.pdf", SDFDoc.SaveOptions.e_remove_unused); Console.WriteLine("Done. Result saved in element_builder.pdf..."); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } }
public void ReadAdvanced(string input_path) { PDFNet.Initialize(); try { PDFDoc doc = new PDFDoc(input_path); doc.InitSecurityHandler(); Page page = doc.GetPage(1); if (page == null) { ConsoleLog += "Page not found."; return; } TextExtractor txt = new TextExtractor(); txt.Begin(page); // Read the page. // Other options you may want to consider... // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove); // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text); // ... // Example 1. Get all text on the page in a single string. // Words will be separated with space or new line characters. if (example1_basic) { // Get the word count. ConsoleLog += "Word Count: {0}" + txt.GetWordCount(); ConsoleLog += "\n\n- GetAsText --------------------------\n{0}" + txt.GetAsText(); ConsoleLog += "-----------------------------------------------------------"; } // Example 2. Get XML logical structure for the page. if (example2_xml) { String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info); ConsoleLog += "\n\n- GetAsXML --------------------------\n{0}" + text; ConsoleLog += "-----------------------------------------------------------"; } // Example 3. Extract words one by one. if (example3_wordlist) { TextExtractor.Word word; for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine()) { for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord()) { ConsoleLog += word.GetString(); } } ConsoleLog += "-----------------------------------------------------------"; } // Example 3. A more advanced text extraction example. // The output is XML structure containing paragraphs, lines, words, // as well as style and positioning information. if (example4_advanced) { Rect bbox; int cur_flow_id = -1, cur_para_id = -1; TextExtractor.Line line; TextExtractor.Word word; TextExtractor.Style s, line_style; // For each line on the page... for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine()) { if (line.GetNumWords() == 0) { continue; } if (cur_flow_id != line.GetFlowID()) { if (cur_flow_id != -1) { if (cur_para_id != -1) { cur_para_id = -1; ConsoleLog += "</Para>"; } ConsoleLog += "</Flow>"; } cur_flow_id = line.GetFlowID(); ConsoleLog += "<Flow id=\"{0}\">" + cur_flow_id; } if (cur_para_id != line.GetParagraphID()) { if (cur_para_id != -1) { ConsoleLog += "</Para>"; } cur_para_id = line.GetParagraphID(); ConsoleLog += "<Para id=\"{0}\">" + cur_para_id; } bbox = line.GetBBox(); line_style = line.GetStyle(); Console.Write("<Line box=\"" + bbox.y1 + "," + bbox.y2 + "," + bbox.x1 + "," + bbox.x2 + ">"); PrintStyle(line_style); ConsoleLog += ""; // For each word in the line... for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord()) { // Output the bounding box for the word. bbox = word.GetBBox(); ConsoleLog += "<Word box=\"{0}, {1}, {2}, {3}\"" + bbox.x1 + bbox.y1 + bbox.x2 + bbox.y2; int sz = word.GetStringLen(); if (sz == 0) { continue; } // If the word style is different from the parent style, output the new style. s = word.GetStyle(); if (s != line_style) { PrintStyle(s); } ConsoleLog += ">\n" + word.GetString(); ConsoleLog += "</Word>"; } ConsoleLog += "</Line>"; } if (cur_flow_id != -1) { if (cur_para_id != -1) { cur_para_id = -1; ConsoleLog += "</Para>"; } ConsoleLog += "</Flow>"; } } // Note: Calling Dispose() on TextExtractor when it is not anymore in use can result in increased performance and lower memory consumption. txt.Dispose(); doc.Close(); ConsoleLog += "Done."; } catch (PDFNetException e) { ConsoleLog += e.Message; } // Sample code showing how to use low-level text extraction APIs. if (example5_low_level) { try { LowLevelTextExtractUtils util = new LowLevelTextExtractUtils(); PDFDoc doc = new PDFDoc(input_path); doc.InitSecurityHandler(); // Example 1. Extract all text content from the document ElementReader reader = new ElementReader(); PageIterator itr = doc.GetPageIterator(); //for (; itr.HasNext(); itr.Next()) // Read every page { reader.Begin(itr.Current()); LowLevelTextExtractUtils u = new LowLevelTextExtractUtils(); u.DumpAllText(reader); ConsoleLog += u.ConsoleLog; reader.End(); } // Example 2. Extract text based on the selection rectangle. ConsoleLog += "----------------------------------------------------"; ConsoleLog += "Extract text based on the selection rectangle."; ConsoleLog += "----------------------------------------------------"; Page first_page = doc.GetPage(1); string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader); string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader); string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader); ConsoleLog += "Field 1: {0}" + field1; ConsoleLog += "Field 2: {0}" + field2; ConsoleLog += "Field 3: {0}" + field3; // ... reader.Dispose(); doc.Close(); ConsoleLog += "Done."; } catch (PDFNetException e) { ConsoleLog += e.Message; } } PDFNet.Terminate(); }
static void Main(string[] args) { PDFNet.Initialize(); // Relative path to the folder containing test files. string input_path = "../../TestFiles/"; bool example1_basic = false; bool example2_xml = false; bool example3_wordlist = false; bool example4_advanced = true; bool example5_low_level = false; // Sample code showing how to use high-level text extraction APIs. try { using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) { doc.InitSecurityHandler(); Page page = doc.GetPage(1); if (page == null) { Console.WriteLine("Page not found."); return; } using (TextExtractor txt = new TextExtractor()) { txt.Begin(page); // Read the page. // Other options you may want to consider... // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_no_dup_remove); // txt.Begin(page, null, TextExtractor.ProcessingFlags.e_remove_hidden_text); // ... // Example 1. Get all text on the page in a single string. // Words will be separated with space or new line characters. if (example1_basic) { // Get the word count. Console.WriteLine("Word Count: {0}", txt.GetWordCount()); Console.WriteLine("\n\n- GetAsText --------------------------\n{0}", txt.GetAsText()); Console.WriteLine("-----------------------------------------------------------"); } // Example 2. Get XML logical structure for the page. if (example2_xml) { String text = txt.GetAsXML(TextExtractor.XMLOutputFlags.e_words_as_elements | TextExtractor.XMLOutputFlags.e_output_bbox | TextExtractor.XMLOutputFlags.e_output_style_info); Console.WriteLine("\n\n- GetAsXML --------------------------\n{0}", text); Console.WriteLine("-----------------------------------------------------------"); } // Example 3. Extract words one by one. if (example3_wordlist) { TextExtractor.Word word; for (TextExtractor.Line line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine()) { for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord()) { Console.WriteLine(word.GetString()); } } Console.WriteLine("-----------------------------------------------------------"); } // Example 3. A more advanced text extraction example. // The output is XML structure containing paragraphs, lines, words, // as well as style and positioning information. if (example4_advanced) { Rect bbox; int cur_flow_id = -1, cur_para_id = -1; TextExtractor.Line line; TextExtractor.Word word; TextExtractor.Style s, line_style; Console.WriteLine("<PDFText>"); // For each line on the page... for (line = txt.GetFirstLine(); line.IsValid(); line = line.GetNextLine()) { if (line.GetNumWords() == 0) { continue; } if (cur_flow_id != line.GetFlowID()) { if (cur_flow_id != -1) { if (cur_para_id != -1) { cur_para_id = -1; Console.WriteLine("</Para>"); } Console.WriteLine("</Flow>"); } cur_flow_id = line.GetFlowID(); Console.WriteLine("<Flow id=\"{0}\">", cur_flow_id); } if (cur_para_id != line.GetParagraphID()) { if (cur_para_id != -1) { Console.WriteLine("</Para>"); } cur_para_id = line.GetParagraphID(); Console.WriteLine("<Para id=\"{0}\">", cur_para_id); } bbox = line.GetBBox(); line_style = line.GetStyle(); Console.Write("<Line box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00")); PrintStyle(line_style); Console.Write(" cur_num=\"" + line.GetCurrentNum() + "\"" + ">\n"); // For each word in the line... for (word = line.GetFirstWord(); word.IsValid(); word = word.GetNextWord()) { // Output the bounding box for the word. bbox = word.GetBBox(); Console.Write("<Word box=\"{0}, {1}, {2}, {3}\"", bbox.x1.ToString("0.00"), bbox.y1.ToString("0.00"), bbox.x2.ToString("0.00"), bbox.y2.ToString("0.00")); Console.Write(" cur_num=\"" + word.GetCurrentNum() + "\""); int sz = word.GetStringLen(); if (sz == 0) { continue; } // If the word style is different from the parent style, output the new style. s = word.GetStyle(); if (s != line_style) { PrintStyle(s); } Console.Write(">{0}", word.GetString()); Console.WriteLine("</Word>"); } Console.WriteLine("</Line>"); } if (cur_flow_id != -1) { if (cur_para_id != -1) { cur_para_id = -1; Console.WriteLine("</Para>"); } Console.WriteLine("</Flow>"); } } } Console.WriteLine("</PDFText>"); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } // Sample code showing how to use low-level text extraction APIs. if (example5_low_level) { try { LowLevelTextExtractUtils util = new LowLevelTextExtractUtils(); using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) { doc.InitSecurityHandler(); // Example 1. Extract all text content from the document using (ElementReader reader = new ElementReader()) { PageIterator itr = doc.GetPageIterator(); //for (; itr.HasNext(); itr.Next()) // Read every page { reader.Begin(itr.Current()); LowLevelTextExtractUtils.DumpAllText(reader); reader.End(); } // Example 2. Extract text based on the selection rectangle. Console.WriteLine("----------------------------------------------------"); Console.WriteLine("Extract text based on the selection rectangle."); Console.WriteLine("----------------------------------------------------"); Page first_page = doc.GetPage(1); string field1 = util.ReadTextFromRect(first_page, new Rect(27, 392, 563, 534), reader); string field2 = util.ReadTextFromRect(first_page, new Rect(28, 551, 106, 623), reader); string field3 = util.ReadTextFromRect(first_page, new Rect(208, 550, 387, 621), reader); Console.WriteLine("Field 1: {0}", field1); Console.WriteLine("Field 2: {0}", field2); Console.WriteLine("Field 3: {0}", field3); // ... Console.WriteLine("Done."); } } } catch (PDFNetException e) { Console.WriteLine(e.Message); } } }
/// <summary> /// The main entry point for the application. /// </summary> static void Main(string[] args) { PDFNet.Initialize(); // Relative path to the folder containing test files. string input_path = "../../TestFiles/"; string output_path = "../../TestFiles/Output/"; try // Extract logical structure from a PDF document { using (PDFDoc doc = new PDFDoc(input_path + "tagged.pdf")) { doc.InitSecurityHandler(); bool example1 = true; bool example2 = true; bool example3 = true; if (example1) { Console.WriteLine("____________________________________________________________"); Console.WriteLine("Sample 1 - Traverse logical structure tree..."); STree tree = doc.GetStructTree(); if (tree.IsValid()) { Console.WriteLine("Document has a StructTree root."); for (int i = 0; i < tree.GetNumKids(); ++i) { // Recursively get structure info for all all child elements. ProcessStructElement(tree.GetKid(i), 0); } } else { Console.WriteLine("This document does not contain any logical structure."); } Console.WriteLine(); Console.WriteLine("Done 1."); } if (example2) { Console.WriteLine("____________________________________________________________"); Console.WriteLine("Sample 2 - Get parent logical structure elements from"); Console.WriteLine("layout elements."); ElementReader reader = new ElementReader(); for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) { reader.Begin(itr.Current()); ProcessElements(reader); reader.End(); } Console.WriteLine(); Console.WriteLine("Done 2."); } if (example3) { Console.WriteLine("____________________________________________________________"); Console.WriteLine("Sample 3 - 'XML style' extraction of PDF logical structure and page content."); //A map which maps page numbers(as Integers) //to page Maps(which map from struct mcid(as Integers) to //text Strings) Hashtable mcid_doc_map = new Hashtable(); ElementReader reader = new ElementReader(); for (PageIterator itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) { Page pg = itr.Current(); reader.Begin(pg); Hashtable page_mcid_map = new Hashtable(); mcid_doc_map.Add(pg.GetIndex(), page_mcid_map); ProcessElements2(reader, page_mcid_map); reader.End(); } STree tree = doc.GetStructTree(); if (tree.IsValid()) { for (int i = 0; i < tree.GetNumKids(); ++i) { ProcessStructElement2(tree.GetKid(i), mcid_doc_map, 0); } } Console.WriteLine(); Console.WriteLine("Done 3."); } doc.Save(output_path + "bookmark.pdf", 0); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } }
static void Main(string[] args) { PDFNet.Initialize(); // Relative path to the folder containing test files. string input_path = "../../TestFiles/"; string output_path = "../../TestFiles/Output/"; try { // Read a PDF document from a stream or pass-in a memory buffer... FileStream istm = new FileStream(input_path + "tiger.pdf", FileMode.Open, FileAccess.Read); using (PDFDoc doc = new PDFDoc(istm)) using (ElementWriter writer = new ElementWriter()) using (ElementReader reader = new ElementReader()) { doc.InitSecurityHandler(); int num_pages = doc.GetPageCount(); Element element; // Perform some document editing ... // Here we simply copy all elements from one page to another. for (int i = 1; i <= num_pages; ++i) { Page pg = doc.GetPage(2 * i - 1); reader.Begin(pg); Page new_page = doc.PageCreate(pg.GetMediaBox()); doc.PageInsert(doc.GetPageIterator(2 * i), new_page); writer.Begin(new_page); while ((element = reader.Next()) != null) // Read page contents { writer.WriteElement(element); } writer.End(); reader.End(); } doc.Save(output_path + "doc_memory_edit.pdf", SDFDoc.SaveOptions.e_remove_unused); // Save the document to a stream or a memory buffer... using (FileStream ostm = new FileStream(output_path + "doc_memory_edit.txt", FileMode.Create, FileAccess.Write)) { doc.Save(ostm, SDFDoc.SaveOptions.e_remove_unused); } // Read some data from the file stored in memory reader.Begin(doc.GetPage(1)); while ((element = reader.Next()) != null) { if (element.GetType() == Element.Type.e_path) { Console.Write("Path, "); } } reader.End(); Console.WriteLine(""); Console.WriteLine(""); Console.WriteLine("Done. Result saved in doc_memory_edit.pdf and doc_memory_edit.txt ..."); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } }
static void Main(string[] args) { PDFNet.Initialize(); // Example 1: // Extract images by traversing the display list for // every page. With this approach it is possible to obtain // image positioning information and DPI. try { using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) using (ElementReader reader = new ElementReader()) { doc.InitSecurityHandler(); PageIterator itr; for (itr = doc.GetPageIterator(); itr.HasNext(); itr.Next()) { reader.Begin(itr.Current()); ImageExtract(doc, reader); reader.End(); } Console.WriteLine("Done."); } } catch (PDFNetException e) { Console.WriteLine(e.Message); } Console.WriteLine("----------------------------------------------------------------"); // Example 2: // Extract images by scanning the low-level document. try { using (PDFDoc doc = new PDFDoc(input_path + "newsletter.pdf")) { doc.InitSecurityHandler(); image_counter = 0; SDFDoc cos_doc = doc.GetSDFDoc(); int num_objs = cos_doc.XRefSize(); for (int i = 1; i < num_objs; ++i) { Obj obj = cos_doc.GetObj(i); if (obj != null && !obj.IsFree() && obj.IsStream()) { // Process only images DictIterator itr = obj.Find("Subtype"); if (!itr.HasNext() || itr.Value().GetName() != "Image") { continue; } itr = obj.Find("Type"); if (!itr.HasNext() || itr.Value().GetName() != "XObject") { continue; } pdftron.PDF.Image image = new pdftron.PDF.Image(obj); Console.WriteLine("--> Image: {0}", ++image_counter); Console.WriteLine(" Width: {0}", image.GetImageWidth()); Console.WriteLine(" Height: {0}", image.GetImageHeight()); Console.WriteLine(" BPC: {0}", image.GetBitsPerComponent()); string fname = output_path + "image_extract2_" + image_counter.ToString(); image.Export(fname); // or ExporAsPng() or ExporAsTiff() ... // Convert PDF bitmap to GDI+ Bitmap... //Bitmap bmp = image.GetBitmap(); //bmp.Save(fname, ImageFormat.Png); //bmp.Dispose(); // Instead of converting PDF images to a Bitmap, you can also extract // uncompressed/compressed image data directly using element.GetImageData() // as illustrated in ElementReaderAdv sample project. } } } } catch (PDFNetException e) { Console.WriteLine(e.Message); } }