//////////////////////////////////////////////////////////////////////////////////////////////////
// MoveParagraphToParent
// Searches the subtree below struct_elem depth-first for the first "P" struct element and moves
// it to the last child position of its own parent.
// Returns true when a paragraph was found and moved, false when the subtree contains none.
// Throws when a child struct element cannot be resolved or the move fails.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static bool MoveParagraphToParent(PdsStructElement struct_elem) {
    for (int i = 0; i < struct_elem.GetNumChildren(); i++) {
        // only struct-element children can be (or contain) a paragraph
        if (struct_elem.GetChildType(i) != PdfStructElementType.kPdsStructChildElement) {
            continue;
        }
        PdsObject kid_obj = struct_elem.GetChildObject(i);
        PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
        if (kid_elem == null) {
            PdfixEngine.ThrowException();
        }
        if (kid_elem.GetType_(true) == "P") {
            // re-insert the paragraph as the last child of the same parent
            if (!struct_elem.MoveChild(i, struct_elem, struct_elem.GetNumChildren() - 1)) {
                throw new Exception("Failed to move the paragraph to the end of its parent");
            }
            return true;
        }
        // Only stop when the nested search actually moved a paragraph; otherwise keep
        // scanning the remaining siblings instead of giving up after the first child.
        if (MoveParagraphToParent(kid_elem)) {
            return true;
        }
    }
    return false;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetParagraphTextState
// Returns the text state of the first page object that yields a non-zero font size for one of
// the element's page-content kids (matched by MCID through GetPageObjectTextState).
// Returns a default-constructed PdfTextState when no such object is found.
//////////////////////////////////////////////////////////////////////////////////////////////////
static PdfTextState GetParagraphTextState(PdsStructElement struct_elem) {
    for (int i = 0; i < struct_elem.GetNumKids(); i++) {
        if (struct_elem.GetKidType(i) != PdfStructElementType.kPdsStructKidPageContent) {
            continue;
        }
        // acquire the page on which this content kid is present
        PdfDoc doc = struct_elem.GetStructTree().GetDoc();
        PdfPage page = doc.AcquirePage(struct_elem.GetKidPageNumber(i));
        try {
            int mcid = struct_elem.GetKidMcid(i);
            for (int j = 0; j < page.GetNumPageObjects(); j++) {
                var ts = GetPageObjectTextState(page.GetPageObject(j), mcid);
                // a zero font size is treated as "no text state for this object"; keep looking
                if (ts.font_size != 0) {
                    return ts;
                }
            }
        }
        finally {
            // release the page on every exit path, including exceptions
            if (page != null) {
                page.Release();
            }
        }
    }
    return new PdfTextState();
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetFirstParagraph
// Depth-first search below struct_elem for the first struct element whose type is "P".
// The matching element is returned without being released; intermediate elements acquired
// during the search are released before returning. Returns null when nothing matches.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem) {
    for (int idx = 0; idx < struct_elem.GetNumKids(); idx++) {
        // skip everything that is not a nested struct element
        if (struct_elem.GetKidType(idx) != PdfStructElementType.kPdsStructKidElement) {
            continue;
        }
        PdsObject child_obj = struct_elem.GetKidObject(idx);
        PdsStructElement child = struct_elem.GetStructTree().AcquireStructElement(child_obj);
        if (child == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }
        if (child.GetType_(true) == "P") {
            return child;
        }
        // not a paragraph itself: look inside it, then let go of the intermediate element
        PdsStructElement found = GetFirstParagraph(child);
        child.Release();
        if (found != null) {
            return found;
        }
    }
    return null;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetFirstTable
// Depth-first search below struct_elem for the first struct element whose type is "Table".
// Returns the element, or null when the subtree contains no table.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static PdsStructElement GetFirstTable(PdsStructElement struct_elem) {
    for (int idx = 0; idx < struct_elem.GetNumChildren(); idx++) {
        // only nested struct elements are candidates
        if (struct_elem.GetChildType(idx) != PdfStructElementType.kPdsStructChildElement) {
            continue;
        }
        PdsObject child_obj = struct_elem.GetChildObject(idx);
        PdsStructElement child = struct_elem.GetStructTree().GetStructElementFromObject(child_obj);
        if (child == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }
        if (child.GetType_(true) == "Table") {
            return child;
        }
        // descend before moving on to the next sibling
        PdsStructElement nested = GetFirstTable(child);
        if (nested != null) {
            return nested;
        }
    }
    return null;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetFirstParagraph
// Depth-first search below struct_elem for the first struct element whose type is "P".
// Returns the element, or null when the subtree contains no paragraph.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static PdsStructElement GetFirstParagraph(PdsStructElement struct_elem) {
    for (int idx = 0; idx < struct_elem.GetNumChildren(); idx++) {
        // only nested struct elements are candidates
        if (struct_elem.GetChildType(idx) != PdfStructElementType.kPdsStructChildElement) {
            continue;
        }
        PdsObject child_obj = struct_elem.GetChildObject(idx);
        PdsStructElement child = struct_elem.GetStructTree().GetStructElementFromObject(child_obj);
        if (child == null) {
            PdfixEngine.ThrowException();
        }
        if (child.GetType_(true) == "P") {
            return child;
        }
        // descend before moving on to the next sibling
        PdsStructElement nested = GetFirstParagraph(child);
        if (nested != null) {
            return nested;
        }
    }
    return null;
}
// Authorizes the SDK, re-tags the document at openPath and prints its structure tree to the
// console via ProcessStructElement. savePath is accepted but not used by this method.
// Throws Exception when authorization, opening or tagging fails.
public static void Run(
    String email,      // authorization email
    String licenseKey, // authorization license key
    String openPath,   // source PDF document
    String savePath    // dest PDF document
    ) {
    // "new" either yields an instance or throws, so no null check is needed
    pdfix = new Pdfix();
    PdfDoc doc = null;
    try {
        if (!pdfix.Authorize(email, licenseKey)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        doc = pdfix.OpenDoc(openPath, "");
        if (doc == null) {
            throw new Exception(pdfix.GetError());
        }

        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // read document structure tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            Console.WriteLine("No Tags available");
        }
        else {
            for (var i = 0; i < struct_tree.GetNumKids(); i++) {
                PdsObject kid_object = struct_tree.GetKidObject(i);
                PdsStructElement struct_elem = struct_tree.AcquireStructElement(kid_object);
                try {
                    ProcessStructElement(doc, struct_elem, "");
                }
                finally {
                    if (struct_elem != null) {
                        struct_elem.Release();
                    }
                }
            }
        }
    }
    finally {
        // close the document and tear down the SDK on failure as well as success
        if (doc != null) {
            doc.Close();
        }
        pdfix.Destroy();
    }
}
// Re-tags the document at openPath, strips elements via RemoveParagraph, calls
// MarkUntaggedObjectsAsArtifact on every page and saves the result to savePath.
// Throws Exception when opening, tagging or saving fails.
public static void Run(
    String openPath, // source PDF document
    String savePath  // dest PDF document
    ) {
    Pdfix pdfix = PdfixEngine.Instance;

    PdfDoc doc = pdfix.OpenDoc(openPath, "");
    if (doc == null) {
        throw new Exception(pdfix.GetError());
    }

    try {
        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // remove paragraph/figure elements below every root child
        for (int i = 0; i < struct_tree.GetNumChildren(); i++) {
            PdsObject kid_obj = struct_tree.GetChildObject(i);
            PdsStructElement kid_elem = struct_tree.GetStructElementFromObject(kid_obj);
            RemoveParagraph(kid_elem);
        }

        // the struct tree was updated; process each page so untagged content becomes artifact
        for (int i = 0; i < doc.GetNumPages(); i++) {
            PdfPage page = doc.AcquirePage(i);
            try {
                MarkUntaggedObjectsAsArtifact(page);
            }
            finally {
                if (page != null) {
                    page.Release();
                }
            }
        }

        if (!doc.Save(savePath, Pdfix.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // close the document on failure as well as success
        doc.Close();
    }
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// RemoveParagraph
// Walks the kids of struct_elem from last to first. For struct-element kids:
//   "P"       - all of its kids are removed
//   "Figure"  - all of its kids are removed when its alt text is empty
//   otherwise - the element is processed recursively
// An element left without kids is removed from struct_elem.
// The shared "count" counter advances once per visited kid (of any kind) and the walk stops
// as soon as it reaches 2.
//////////////////////////////////////////////////////////////////////////////////////////////////
internal static void RemoveParagraph(PdsStructElement struct_elem) {
    for (int idx = struct_elem.GetNumKids() - 1; idx >= 0; idx--) {
        if (struct_elem.GetKidType(idx) == PdfStructElementType.kPdsStructKidElement) {
            PdsObject child_obj = struct_elem.GetKidObject(idx);
            PdsStructElement child = struct_elem.GetStructTree().AcquireStructElement(child_obj);

            // decide whether the child gets emptied or searched
            bool strip_kids = false;
            switch (child.GetType_(true)) {
                case "P":
                    strip_kids = true;
                    break;
                case "Figure":
                    // figures are emptied only when they carry no alt text
                    strip_kids = child.GetAlt().Length == 0;
                    break;
                default:
                    RemoveParagraph(child);
                    break;
            }

            if (strip_kids) {
                int last = child.GetNumKids() - 1;
                while (last >= 0) {
                    if (!child.RemoveKid(last)) {
                        throw new Exception(pdfix.GetErrorType().ToString());
                    }
                    last--;
                }
            }

            // drop the child itself once nothing is left below it
            if (child.GetNumKids() == 0) {
                struct_elem.RemoveKid(idx);
            }
            child.Release();
        }

        // this sample stops after the counter reaches 2
        count++;
        if (count >= 2) {
            break;
        }
    }
}
// Authorizes the SDK, opens the document at openPath, sets a new alternate text on the element
// returned by GetFirstFigure and saves the result to savePath.
// Throws Exception when any step fails or no figure is found.
public static void Run(
    String email,      // authorization email
    String licenseKey, // authorization license key
    String openPath,   // source PDF document
    String savePath    // dest PDF document
    ) {
    // "new" either yields an instance or throws, so no null check is needed
    pdfix = new Pdfix();
    PdfDoc doc = null;
    try {
        if (!pdfix.Authorize(email, licenseKey)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        doc = pdfix.OpenDoc(openPath, "");
        if (doc == null) {
            throw new Exception(pdfix.GetError());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdsStructElement figure = GetFirstFigure(struct_tree);
        if (figure == null) {
            throw new Exception("No figure found.");
        }

        if (!figure.SetAlt("This is a new alternate text")) {
            throw new Exception(pdfix.GetError());
        }

        if (!doc.Save(savePath, PdfSaveFlags.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // close the document and tear down the SDK on failure as well as success
        if (doc != null) {
            doc.Close();
        }
        pdfix.Destroy();
    }
}
// Runs MoveParagraphToParent on each root child of the structure tree, in order, until one of
// them reports that a paragraph was moved.
// Returns true when a paragraph was moved, false when no root subtree contains one.
private static bool MoveParagraphToParent(PdsStructTree struct_tree) {
    // advance through every root child; stop only once a paragraph has actually been moved
    for (int i = 0; i < struct_tree.GetNumChildren(); i++) {
        PdsObject kid_obj = struct_tree.GetChildObject(i);
        PdsStructElement kid_elem = struct_tree.GetStructElementFromObject(kid_obj);
        // an unresolved root child cannot contain a paragraph; skip it
        if (kid_elem != null && MoveParagraphToParent(kid_elem)) {
            return true;
        }
    }
    return false;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// RemoveParagraph
// Walks the children of struct_elem from last to first. For struct-element children:
//   "P"       - all of its children are removed
//   "Figure"  - all of its children are removed when its alt text is empty
//   otherwise - the element is processed recursively
// An element left without children is removed from struct_elem.
// The shared "count" counter advances once per visited child (of any kind) and the walk stops
// as soon as it reaches 2.
//////////////////////////////////////////////////////////////////////////////////////////////////
internal static void RemoveParagraph(PdsStructElement struct_elem) {
    for (int idx = struct_elem.GetNumChildren() - 1; idx >= 0; idx--) {
        if (struct_elem.GetChildType(idx) == PdfStructElementType.kPdsStructChildElement) {
            PdsObject child_obj = struct_elem.GetChildObject(idx);
            PdsStructElement child = struct_elem.GetStructTree().GetStructElementFromObject(child_obj);

            // decide whether the child gets emptied or searched
            bool strip_children = false;
            switch (child.GetType_(true)) {
                case "P":
                    strip_children = true;
                    break;
                case "Figure":
                    // figures are emptied only when they carry no alt text
                    strip_children = child.GetAlt().Length == 0;
                    break;
                default:
                    RemoveParagraph(child);
                    break;
            }

            if (strip_children) {
                int last = child.GetNumChildren() - 1;
                while (last >= 0) {
                    if (!child.RemoveChild(last)) {
                        PdfixEngine.ThrowException();
                    }
                    last--;
                }
            }

            // drop the child itself once nothing is left below it
            if (child.GetNumChildren() == 0) {
                struct_elem.RemoveChild(idx);
            }
        }

        // this sample stops after the counter reaches 2
        count++;
        if (count >= 2) {
            break;
        }
    }
}
// Searches each root child of the structure tree, in order, with the struct-element overload
// of GetFirstTable and returns the first table it reports, or null when there is none.
private static PdsStructElement GetFirstTable(PdsStructTree struct_tree) {
    int idx = 0;
    while (idx < struct_tree.GetNumChildren()) {
        PdsObject root_obj = struct_tree.GetChildObject(idx);
        PdsStructElement root_elem = struct_tree.GetStructElementFromObject(root_obj);
        PdsStructElement table = GetFirstTable(root_elem);
        if (table != null) {
            return table;
        }
        idx++;
    }
    return null;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetStructElementBBox
// Accumulates into bbox the union of the bounding boxes of all page objects whose content mark
// MCID matches a page-content child of struct_elem, recursing into nested struct elements.
// A bbox with zero width or zero height is treated as "empty" and replaced by the first hit.
// Returns true when at least one page object (direct or nested) contributed to bbox.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static bool GetStructElementBBox(PdsStructElement struct_elem, ref PdfRect bbox) {
    bool result = false;
    for (int i = 0; i < struct_elem.GetNumChildren(); i++) {
        var child_type = struct_elem.GetChildType(i);
        if (child_type == PdfStructElementType.kPdsStructChildPageContent) {
            // acquire page on which the element is present
            PdfDoc doc = struct_elem.GetStructTree().GetDoc();
            PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));
            try {
                // find page objects tagged with this mcid
                int mcid = struct_elem.GetChildMcid(i);
                var content = page.GetContent();
                for (int j = 0; j < content.GetNumObjects(); j++) {
                    PdsPageObject page_object = content.GetObject(j);
                    PdsContentMark content_mark = page_object.GetContentMark();
                    if (content_mark == null || content_mark.GetTagMcid() != mcid) {
                        continue;
                    }
                    PdfRect elem_bbox = page_object.GetBBox();
                    if ((bbox.left - bbox.right == 0) || (bbox.top - bbox.bottom == 0)) {
                        // nothing accumulated yet: start from this object's box
                        bbox = elem_bbox;
                    }
                    else {
                        bbox.left = Math.Min(bbox.left, elem_bbox.left);
                        bbox.right = Math.Max(bbox.right, elem_bbox.right);
                        bbox.top = Math.Max(bbox.top, elem_bbox.top);
                        bbox.bottom = Math.Min(bbox.bottom, elem_bbox.bottom);
                    }
                    result = true;
                }
            }
            finally {
                // every acquired page must be released again
                if (page != null) {
                    page.Release();
                }
            }
        }
        else if (child_type == PdfStructElementType.kPdsStructChildElement) {
            PdsObject kid_obj = struct_elem.GetChildObject(i);
            PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
            // propagate hits from nested elements so the return value reflects the whole subtree
            if (GetStructElementBBox(kid_elem, ref bbox)) {
                result = true;
            }
        }
    }
    return result;
}
// Searches below each root kid of the structure tree, in order, with the struct-element
// overload of GetFirstParagraph. Each acquired root kid is released after it was searched.
// Returns the first paragraph reported, or null when there is none.
private static PdsStructElement GetFirstParagraph(PdsStructTree struct_tree) {
    int idx = 0;
    while (idx < struct_tree.GetNumKids()) {
        PdsObject root_obj = struct_tree.GetKidObject(idx);
        PdsStructElement root_elem = struct_tree.AcquireStructElement(root_obj);
        PdsStructElement found = GetFirstParagraph(root_elem);
        // the root kid is released whether or not the search succeeded
        root_elem.Release();
        if (found != null) {
            return found;
        }
        idx++;
    }
    return null;
}
// Re-tags the document at openPath and prints its structure tree to the console via
// ProcessStructElement. Throws Exception when opening or tagging fails.
public static void Run(
    String openPath // source PDF document
    ) {
    Pdfix pdfix = PdfixEngine.Instance;

    PdfDoc doc = pdfix.OpenDoc(openPath, "");
    if (doc == null) {
        throw new Exception(pdfix.GetError());
    }

    try {
        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // read document structure tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            Console.WriteLine("No Tags available");
        }
        else {
            for (var i = 0; i < struct_tree.GetNumChildren(); i++) {
                PdsObject kid_object = struct_tree.GetChildObject(i);
                PdsStructElement struct_elem = struct_tree.GetStructElementFromObject(kid_object);
                ProcessStructElement(doc, struct_elem, "");
            }
        }
    }
    finally {
        // close the document on failure as well as success
        doc.Close();
    }
}
// Opens the document at openPath, sets a new alternate text on the element returned by
// GetFirstFigure and saves the result to savePath.
// Throws Exception when any step fails or no figure is found.
public static void Run(
    String openPath, // source PDF document
    String savePath  // dest PDF document
    ) {
    Pdfix pdfix = PdfixEngine.Instance;

    PdfDoc doc = pdfix.OpenDoc(openPath, "");
    if (doc == null) {
        throw new Exception(pdfix.GetError());
    }

    try {
        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdsStructElement figure = GetFirstFigure(struct_tree);
        if (figure == null) {
            throw new Exception("No figure found.");
        }

        if (!figure.SetAlt("This is a new alternate text")) {
            throw new Exception(pdfix.GetError());
        }

        if (!doc.Save(savePath, Pdfix.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // close the document on failure as well as success
        doc.Close();
    }
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// TagParagraphAsHeading
// Re-tags "P" struct elements as headings based on the text state from GetParagraphTextState:
//   font name contains "Black" and size >= 25 -> "H1"
//   font name contains "Bold"  and size >= 16 -> "H2"
// Elements of any other type are not changed; their struct-element kids are visited instead.
//////////////////////////////////////////////////////////////////////////////////////////////////
static void TagParagraphAsHeading(PdsStructElement struct_elem) {
    if (struct_elem.GetType_(true) != "P") {
        // not a paragraph: descend into nested struct elements
        for (int idx = 0; idx < struct_elem.GetNumKids(); idx++) {
            if (struct_elem.GetKidType(idx) != PdfStructElementType.kPdsStructKidElement) {
                continue;
            }
            PdsObject child_obj = struct_elem.GetKidObject(idx);
            PdsStructElement child = struct_elem.GetStructTree().AcquireStructElement(child_obj);
            TagParagraphAsHeading(child);
            child.Release();
        }
        return;
    }

    // paragraph: inspect its text state; kids of a paragraph are not visited
    PdfTextState text_state = GetParagraphTextState(struct_elem);

    string font_name = "";
    if (text_state.font != null) {
        font_name = text_state.font.GetFontName();
    }

    string heading = "";
    if (font_name.Contains("Black") && text_state.font_size >= 25) {
        heading = "H1";
    }
    else if (font_name.Contains("Bold") && text_state.font_size >= 16) {
        heading = "H2";
    }

    // update the tag type only when a heading level was chosen
    if (heading.Length != 0) {
        struct_elem.SetType(heading);
    }
}
// Collects the bounding boxes of all content below struct_elem.
// Struct-element kids are processed recursively; stream-content and page-content kids are
// resolved through GetMcidBBoxes using the kid's page number and MCID. Object kids contribute
// nothing. The boxes may belong to different pages.
static List<PdfRect> GetStructElementBboxes(PdfDoc doc, PdsStructElement struct_elem) {
    var collected = new List<PdfRect>();
    int total = struct_elem.GetNumKids();
    for (int idx = 0; idx < total; idx++) {
        var child_obj = struct_elem.GetKidObject(idx);
        var child_type = struct_elem.GetKidType(idx);

        if (child_type == PdfStructElementType.kPdsStructKidElement) {
            var child_elem = struct_elem.GetStructTree().AcquireStructElement(child_obj);
            if (child_elem == null) {
                throw new Exception(pdfix.GetErrorType().ToString());
            }
            collected.AddRange(GetStructElementBboxes(doc, child_elem));
            child_elem.Release();
        }
        else if (child_type == PdfStructElementType.kPdsStructKidStreamContent ||
                 child_type == PdfStructElementType.kPdsStructKidPageContent) {
            var page_num = struct_elem.GetKidPageNumber(idx);
            var mcid = struct_elem.GetKidMcid(idx);
            collected.AddRange(GetMcidBBoxes(doc, page_num, mcid));
        }
        // kPdsStructKidObject and any other kid type: nothing to collect
    }
    return collected;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetParagraphTextState
// Returns the text state of the first page object that yields a non-zero font size for one of
// the element's page-content children (matched by MCID through GetPageObjectTextState).
// Returns a default-constructed PdfTextState when no such object is found.
//////////////////////////////////////////////////////////////////////////////////////////////////
static PdfTextState GetParagraphTextState(PdsStructElement struct_elem) {
    for (int i = 0; i < struct_elem.GetNumChildren(); i++) {
        if (struct_elem.GetChildType(i) != PdfStructElementType.kPdsStructChildPageContent) {
            continue;
        }
        // acquire page on which the element is present
        PdfDoc doc = struct_elem.GetStructTree().GetDoc();
        PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));
        try {
            int mcid = struct_elem.GetChildMcid(i);
            var content = page.GetContent();
            // Inspect every page object, not just the first one.
            // NOTE(review): assumes GetPageObjectTextState reports a zero font_size for
            // objects that do not match the mcid - confirm against its implementation.
            for (int j = 0; j < content.GetNumObjects(); j++) {
                var ts = GetPageObjectTextState(content.GetObject(j), mcid);
                if (ts.font_size != 0) {
                    return ts;
                }
            }
        }
        finally {
            // release the page on every exit path, including exceptions
            if (page != null) {
                page.Release();
            }
        }
    }
    return new PdfTextState();
}
// Collects the bounding boxes of all content below struct_elem.
// Struct-element children are processed recursively; stream-content and page-content children
// are resolved through GetMcidBBoxes using the child's page number and MCID. Object children
// contribute nothing. The boxes may belong to different pages.
static List<PdfRect> GetStructElementBboxes(PdfDoc doc, PdsStructElement struct_elem) {
    var collected = new List<PdfRect>();
    int total = struct_elem.GetNumChildren();
    for (int idx = 0; idx < total; idx++) {
        var child_obj = struct_elem.GetChildObject(idx);
        var child_type = struct_elem.GetChildType(idx);

        if (child_type == PdfStructElementType.kPdsStructChildElement) {
            var child_elem = struct_elem.GetStructTree().GetStructElementFromObject(child_obj);
            if (child_elem == null) {
                PdfixEngine.ThrowException();
            }
            collected.AddRange(GetStructElementBboxes(doc, child_elem));
        }
        else if (child_type == PdfStructElementType.kPdsStructChildStreamContent ||
                 child_type == PdfStructElementType.kPdsStructChildPageContent) {
            var page_num = struct_elem.GetChildPageNumber(idx);
            var mcid = struct_elem.GetChildMcid(idx);
            collected.AddRange(GetMcidBBoxes(doc, page_num, mcid));
        }
        // kPdsStructChildObject and any other child type: nothing to collect
    }
    return collected;
}
// Prints struct_elem and, recursively, all of its struct-element kids to the console.
// For each element it writes the type, its bounding boxes (skipped for "Document"), title,
// actual text, alt text and page number when present, followed by the MCIDs of its content
// kids. indent is the prefix of the parent level; one space is appended per nesting level.
// Throws Exception when struct_elem or an acquired kid element is null.
static void ProcessStructElement(PdfDoc doc, PdsStructElement struct_elem, string indent) {
    indent += " ";
    if (struct_elem == null) {
        throw new Exception(pdfix.GetErrorType().ToString());
    }

    // get the element type
    string type_str = struct_elem.GetType_(true);
    if (type_str.Length != 0) {
        Console.WriteLine(indent + "Struct Element: " + type_str);
    }

    // get struct element bounding boxes (can be on multiple pages)
    // skip document struct element - can take long time to collect all bboxes
    if (type_str != "Document") {
        List<PdfRect> bboxes = GetStructElementBboxes(doc, struct_elem);
        foreach (PdfRect bbox in bboxes) {
            Console.WriteLine(indent + "[" + bbox.left + ", " + bbox.bottom + ", " +
                bbox.right + ", " + bbox.top + "]");
        }
    }

    string tile_str = struct_elem.GetTitle();
    if (tile_str.Length != 0) {
        Console.WriteLine(indent + "title: " + tile_str);
    }

    string actual_text_str = struct_elem.GetActualText();
    if (actual_text_str.Length != 0) {
        Console.WriteLine(indent + "actual text: " + actual_text_str);
    }

    // Concatenate instead of passing two arguments: WriteLine(string, object) would treat
    // indent as a format string and silently drop the text.
    string alt_str = struct_elem.GetAlt();
    if (alt_str.Length != 0) {
        Console.WriteLine(indent + "alt: " + alt_str);
    }

    var page_num = struct_elem.GetPageNumber();
    if (page_num != -1) {
        Console.WriteLine(indent + "Page number: " + page_num);
    }

    int num_kids = struct_elem.GetNumKids();
    for (int i = 0; i < num_kids; i++) {
        var kid_obj = struct_elem.GetKidObject(i);
        // based on structure element you can obtain different data
        switch (struct_elem.GetKidType(i)) {
            case PdfStructElementType.kPdsStructKidElement: {
                var kid_struct_elem = struct_elem.GetStructTree().AcquireStructElement(kid_obj);
                if (kid_struct_elem == null) {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }
                ProcessStructElement(doc, kid_struct_elem, indent);
                kid_struct_elem.Release();
            }
            break;
            case PdfStructElementType.kPdsStructKidObject:
                break;
            case PdfStructElementType.kPdsStructKidStreamContent: {
                var kid_page_num = struct_elem.GetKidPageNumber(i);
                Console.WriteLine(indent + "Kid Page number: " + kid_page_num);
                var mcid = struct_elem.GetKidMcid(i);
                Console.WriteLine(indent + "MCID: " + mcid);
            }
            break;
            case PdfStructElementType.kPdsStructKidPageContent: {
                var mcid = struct_elem.GetKidMcid(i);
                Console.WriteLine(indent + "MCID: " + mcid);
            }
            break;
        }
    }
    Console.WriteLine("");
}
// Authorizes the SDK, re-tags the document at openPath, moves the first paragraph to the last
// kid position of its parent and saves the result to savePath.
// Throws Exception when any step fails or no paragraph is found.
public static void Run(
    String email,      // authorization email
    String licenseKey, // authorization license key
    String openPath,   // source PDF document
    String savePath    // dest PDF document
    ) {
    // "new" either yields an instance or throws, so no null check is needed
    pdfix = new Pdfix();
    PdfDoc doc = null;
    PdsStructElement paragraph = null;
    PdsStructElement parent = null;
    try {
        if (!pdfix.Authorize(email, licenseKey)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        doc = pdfix.OpenDoc(openPath, "");
        if (doc == null) {
            throw new Exception(pdfix.GetError());
        }

        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        paragraph = GetFirstParagraph(struct_tree);
        if (paragraph == null) {
            throw new Exception("No paragraph found.");
        }

        // move paragraph to the back of its parent
        parent = struct_tree.AcquireStructElement(paragraph.GetParentObject());
        if (parent == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }
        if (!paragraph.SetParent(parent, parent.GetNumKids() - 1)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        if (!doc.Save(savePath, PdfSaveFlags.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // release acquired struct elements, then close the document and tear down the SDK
        if (parent != null) {
            parent.Release();
        }
        if (paragraph != null) {
            paragraph.Release();
        }
        if (doc != null) {
            doc.Close();
        }
        pdfix.Destroy();
    }
}
// Authorizes the SDK, re-tags the document at openPath, runs TagParagraphAsHeading on every
// root kid of the structure tree and saves the result to savePath.
// Throws Exception when any step fails.
public static void Run(
    String email,      // authorization email
    String licenseKey, // authorization license key
    String openPath,   // source PDF document
    String savePath    // dest PDF document
    ) {
    // "new" either yields an instance or throws, so no null check is needed
    pdfix = new Pdfix();
    PdfDoc doc = null;
    try {
        if (!pdfix.Authorize(email, licenseKey)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        doc = pdfix.OpenDoc(openPath, "");
        if (doc == null) {
            throw new Exception(pdfix.GetError());
        }

        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // re-tag paragraphs as headings below every root kid
        for (int i = 0; i < struct_tree.GetNumKids(); i++) {
            PdsObject kid_obj = struct_tree.GetKidObject(i);
            PdsStructElement kid_elem = struct_tree.AcquireStructElement(kid_obj);
            try {
                TagParagraphAsHeading(kid_elem);
            }
            finally {
                if (kid_elem != null) {
                    kid_elem.Release();
                }
            }
        }

        if (!doc.Save(savePath, PdfSaveFlags.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // close the document and tear down the SDK on failure as well as success
        if (doc != null) {
            doc.Close();
        }
        pdfix.Destroy();
    }
}
// Re-tags the document at openPath, empties the first table element, creates an image element
// with the table's bounding box on page 0, re-tags the untagged content below the former table
// element, changes that element's type to "Sect" and saves the result to savePath.
// Throws Exception when a checked step fails or no table is found.
public static void Run(
    String openPath, // source PDF document
    String savePath  // dest PDF document
    ) {
    Pdfix pdfix = PdfixEngine.Instance;

    PdfDoc doc = pdfix.OpenDoc(openPath, "");
    if (doc == null) {
        throw new Exception(pdfix.GetError());
    }

    PdfPage page = null;
    try {
        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdsStructElement table = GetFirstTable(struct_tree);
        if (table == null) {
            throw new Exception("No table found.");
        }

        PdfRect bbox = new PdfRect();
        GetStructElementBBox(table, ref bbox);

        // remove all items from the table to make it untagged content
        for (int i = table.GetNumChildren() - 1; i >= 0; i--) {
            table.RemoveChild(i);
        }

        // tag page
        page = doc.AcquirePage(0);
        PdePageMap page_map = page.AcquirePageMap();
        PdeElement elem = page_map.CreateElement(PdfElementType.kPdeImage, null);
        elem.SetBBox(bbox);
        elem.SetAlt("This is image caption");

        // prepare document template to ignore already tagged content
        var doc_prelight = doc.GetTemplate();
        doc_prelight.SetProperty("ignore_tags", 1);

        // re-tag non-tagged page content
        // NOTE(review): the page map is acquired a second time here; presumably this refers
        // to the same map as page_map - confirm against the SDK documentation.
        PdePageMap pageMap = page.AcquirePageMap();
        if (pageMap == null) {
            throw new Exception(pdfix.GetError());
        }
        if (!pageMap.CreateElements(null, null)) {
            throw new Exception(pdfix.GetError());
        }
        if (!page_map.AddTags(table, null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // update the table element type
        if (!table.SetType("Sect")) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        if (!doc.Save(savePath, Pdfix.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // release the acquired page and close the document on every exit path
        if (page != null) {
            page.Release();
        }
        doc.Close();
    }
}
// Authorizes the SDK, re-tags the document at openPath, adds a link annotation on page 0 that
// covers the bounding box of the first paragraph, re-tags the document again so the annotation
// is included, and saves the result to savePath.
// Throws Exception when any checked step fails or no paragraph is found.
public static void Run(
    String email,      // authorization email
    String licenseKey, // authorization license key
    String openPath,   // source PDF document
    String savePath    // dest PDF document
    ) {
    // "new" either yields an instance or throws, so no null check is needed
    pdfix = new Pdfix();
    PdfDoc doc = null;
    PdfPage page = null;
    try {
        if (!pdfix.Authorize(email, licenseKey)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        doc = pdfix.OpenDoc(openPath, "");
        if (doc == null) {
            throw new Exception(pdfix.GetError());
        }

        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdsStructElement paragraph = GetFirstParagraph(struct_tree);
        if (paragraph == null) {
            throw new Exception("No paragraph found.");
        }

        PdfRect annot_bbox = new PdfRect();
        try {
            GetStructElementBBox(paragraph, ref annot_bbox);
        }
        finally {
            // the paragraph is only needed for its bbox; release it before the tags it
            // belongs to are removed below
            paragraph.Release();
        }

        // add new link annotation to the page
        page = doc.AcquirePage(0);
        PdfLinkAnnot annot = page.AddLinkAnnot(0, annot_bbox);
        if (annot == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // re-tag the document to include the link annotation
        if (!doc.RemoveTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }
        if (!doc.AddTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        if (!doc.Save(savePath, PdfSaveFlags.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // release the page, close the document and tear down the SDK on every exit path
        if (page != null) {
            page.Release();
        }
        if (doc != null) {
            doc.Close();
        }
        pdfix.Destroy();
    }
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// TagParagraphAsHeading
// Re-tags struct_elem, whatever its current type, based on the font size reported by
// GetParagraphTextState:
//   size >= 14 -> "H1",  size >= 12 -> "H2",  size >= 6 -> "H3",  otherwise unchanged.
// Afterwards every struct-element kid is processed the same way, recursively.
//////////////////////////////////////////////////////////////////////////////////////////////////
static void TagParagraphAsHeading(PdsStructElement struct_elem) {
    // get the text state of the element's content
    PdfTextState ts = GetParagraphTextState(struct_elem);

    string tag_type = "";
    Console.WriteLine("Before replacing..");
    if (ts.font_size >= 14) {
        tag_type = "H1";
        Console.WriteLine("Replacing paragraph with Heading 1");
    }
    else if (ts.font_size >= 12) {
        tag_type = "H2";
        Console.WriteLine("Replacing paragraph with Heading 2");
    }
    else if (ts.font_size >= 6) {
        tag_type = "H3";
        Console.WriteLine("Replacing paragraph with Heading 3");
    }

    // update tag type
    if (tag_type.Length != 0) {
        struct_elem.SetType(tag_type);
    }

    // search kid struct elements; content kids need no processing here
    for (int i = 0; i < struct_elem.GetNumKids(); i++) {
        if (struct_elem.GetKidType(i) == PdfStructElementType.kPdsStructKidElement) {
            PdsObject kid_obj = struct_elem.GetKidObject(i);
            PdsStructElement kid_elem = struct_elem.GetStructTree().AcquireStructElement(kid_obj);
            TagParagraphAsHeading(kid_elem);
            kid_elem.Release();
        }
    }
}
// Re-tags the document at openPath, adds a link annotation on page 0 that covers the bounding
// box of the first paragraph, re-tags the document again so the annotation is included, and
// saves the result to savePath.
// Throws Exception when any checked step fails or no paragraph is found.
public static void Run(
    String openPath, // source PDF document
    String savePath  // dest PDF document
    ) {
    Pdfix pdfix = PdfixEngine.Instance;

    PdfDoc doc = pdfix.OpenDoc(openPath, "");
    if (doc == null) {
        throw new Exception(pdfix.GetError());
    }

    PdfPage page = null;
    try {
        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdsStructElement paragraph = GetFirstParagraph(struct_tree);
        if (paragraph == null) {
            throw new Exception("No paragraph found.");
        }

        PdfRect annot_bbox = new PdfRect();
        GetStructElementBBox(paragraph, ref annot_bbox);

        // add new link annotation to the page
        page = doc.AcquirePage(0);
        PdfLinkAnnot annot = (PdfLinkAnnot)page.CreateAnnot(PdfAnnotSubtype.kAnnotLink, annot_bbox);
        // verify the annotation was created before handing it to the page
        if (annot == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }
        page.AddAnnot(0, annot);

        // re-tag the document to include the link annotation
        if (!doc.RemoveTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }
        if (!doc.AddTags(null, null)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        if (!doc.Save(savePath, Pdfix.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // release the acquired page and close the document on every exit path
        if (page != null) {
            page.Release();
        }
        doc.Close();
    }
}
// Prints struct_elem and, recursively, all of its struct-element kids to the console.
// For each element it writes the type, title, actual text, alt text and page number (one line
// each, even when empty), followed by the MCIDs of its content kids. indent is the prefix of
// the parent level; one space is appended per nesting level.
// Throws Exception when struct_elem or an acquired kid element is null.
static void ProcessStructElement(PdsStructElement struct_elem, string indent) {
    indent += " ";
    if (struct_elem == null) {
        throw new Exception(pdfix.GetErrorType().ToString());
    }

    string type_str = struct_elem.GetType_(true);
    Console.WriteLine(indent + type_str);

    string tile_str = struct_elem.GetTitle();
    Console.WriteLine(indent + tile_str);

    string actual_text_str = struct_elem.GetActualText();
    Console.WriteLine(indent + actual_text_str);

    // Concatenate instead of passing two arguments: WriteLine(string, object) would treat
    // indent as a format string and silently drop the text.
    string alt_str = struct_elem.GetAlt();
    Console.WriteLine(indent + alt_str);

    var page_num = struct_elem.GetPageNumber();
    Console.WriteLine(indent + "Page number: " + page_num);

    int num_kids = struct_elem.GetNumKids();
    for (int i = 0; i < num_kids; i++) {
        var kid_obj = struct_elem.GetKidObject(i);
        // based on structure element you can obtain different data
        switch (struct_elem.GetKidType(i)) {
            case PdfStructElementType.kPdsStructKidElement: {
                var kid_struct_elem = struct_elem.GetStructTree().AcquireStructElement(kid_obj);
                if (kid_struct_elem == null) {
                    throw new Exception(pdfix.GetErrorType().ToString());
                }
                ProcessStructElement(kid_struct_elem, indent);
                kid_struct_elem.Release();
            }
            break;
            case PdfStructElementType.kPdsStructKidObject:
                break;
            case PdfStructElementType.kPdsStructKidStreamContent: {
                var kid_page_num = struct_elem.GetKidPageNumber(i);
                Console.WriteLine(indent + "Kid Page number: " + kid_page_num);
                var mcid = struct_elem.GetKidMcid(i);
                Console.WriteLine(indent + "MCID: " + mcid);
            }
            break;
            case PdfStructElementType.kPdsStructKidPageContent: {
                var mcid = struct_elem.GetKidMcid(i);
                Console.WriteLine(indent + "MCID: " + mcid);
            }
            break;
        }
    }
}
// Authorizes the SDK, re-tags the document at openPath, empties the first table element,
// creates an image element with the table's bounding box on page 0, re-tags the untagged
// content below the former table element, changes that element's type to "Sect" and saves the
// result to savePath.
// Throws Exception when a checked step fails or no table is found.
public static void Run(
    String email,      // authorization email
    String licenseKey, // authorization license key
    String openPath,   // source PDF document
    String savePath    // dest PDF document
    ) {
    // "new" either yields an instance or throws, so no null check is needed
    pdfix = new Pdfix();
    PdfDoc doc = null;
    PdfPage page = null;
    try {
        if (!pdfix.Authorize(email, licenseKey)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        doc = pdfix.OpenDoc(openPath, "");
        if (doc == null) {
            throw new Exception(pdfix.GetError());
        }

        // cleanup any previous structure tree
        if (!doc.RemoveTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // autotag document first
        if (!doc.AddTags(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // get the struct tree
        PdsStructTree struct_tree = doc.GetStructTree();
        if (struct_tree == null) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        PdsStructElement table = GetFirstTable(struct_tree);
        if (table == null) {
            throw new Exception("No table found.");
        }

        PdfRect bbox = new PdfRect();
        GetStructElementBBox(table, ref bbox);

        // remove all items from the table to make it untagged content
        for (int i = table.GetNumKids() - 1; i >= 0; i--) {
            table.RemoveKid(i);
        }

        // tag page
        page = doc.AcquirePage(0);
        PdePageMap page_map = page.CreatePageMap();
        PdeElement elem = page_map.CreateElement(PdfElementType.kPdeImage, null);
        elem.SetBBox(bbox);
        elem.SetAlt("This is image caption");

        // prepare document template to ignore already tagged content
        PdfDocTemplate doc_tmpl = doc.GetDocTemplate();
        doc_tmpl.SetProperty("ignore_tags", 1);

        // re-tag non-tagged page content
        if (!page_map.AcquireElements(null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }
        if (!page_map.AddTags(table, null, IntPtr.Zero)) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        // update the table element type
        if (!table.SetType("Sect")) {
            throw new Exception(pdfix.GetErrorType().ToString());
        }

        if (!doc.Save(savePath, PdfSaveFlags.kSaveFull)) {
            throw new Exception(pdfix.GetError());
        }
    }
    finally {
        // release the page, close the document and tear down the SDK on every exit path
        if (page != null) {
            page.Release();
        }
        if (doc != null) {
            doc.Close();
        }
        pdfix.Destroy();
    }
}