// Handles one page object: images are handed to ExtractImage, form XObjects are walked
// recursively through their content, and every other object type is ignored.
private static void ProcessPageObject(PdfPage page, PdsPageObject obj, string savePath)
{
    if (obj == null)
    {
        // presumably raises the pending PDFix error - confirm against PdfixEngine
        PdfixEngine.ThrowException();
    }

    var objectType = obj.GetObjectType();
    if (objectType == PdfPageObjectType.kPdsPageImage)
    {
        ExtractImage(page, (PdsImage)obj, savePath);
    }
    else if (objectType == PdfPageObjectType.kPdsPageForm)
    {
        // a form XObject carries its own content; process each nested object the same way
        var formContent = ((PdsForm)obj).GetContent();
        for (int index = 0; index < formContent.GetNumObjects(); index++)
        {
            ProcessPageObject(page, formContent.GetObject(index), savePath);
        }
    }
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetPageObjectTextState
// Returns the text state of the text object tagged with the given mcid.
// Form XObjects are searched recursively; the first text state that has a font wins.
// When nothing matches, a freshly constructed PdfTextState is returned.
//////////////////////////////////////////////////////////////////////////////////////////////////
static PdfTextState GetPageObjectTextState(PdsPageObject page_object, int mcid)
{
    switch (page_object.GetObjectType())
    {
        case PdfPageObjectType.kPdsPageText:
        {
            var text_object = (PdsText)page_object;
            // only a text object marked with the requested mcid is of interest
            var mark = page_object.GetContentMark();
            if (mark == null || mark.GetTagMcid() != mcid)
            {
                break;
            }
            var state = text_object.GetTextState();
            if (state.font != null)
            {
                return state;
            }
            break;
        }
        case PdfPageObjectType.kPdsPageForm:
        {
            // descend into the form XObject and look for the text object there
            var form_content = ((PdsForm)page_object).GetContent();
            for (int index = 0; index < form_content.GetNumObjects(); index++)
            {
                var nested_state = GetPageObjectTextState(form_content.GetObject(index), mcid);
                if (nested_state.font != null)
                {
                    return nested_state;
                }
            }
            break;
        }
    }

    // no matching text object with a font was found
    return new PdfTextState();
}
// Collects the optional content groups (OCG layers) referenced by the "OC" tags of the
// page object's content mark. Each layer is reported as a (Name text, object id) pair.
// Returns an empty list when the object has no content mark or no "OC" tag.
internal static List<KeyValuePair<string, int>> GetPageObjectLayers(PdsPageObject page_object)
{
    var layers = new List<KeyValuePair<string, int>>();

    var content_mark = page_object.GetContentMark();
    if (content_mark == null)
    {
        return layers;
    }

    // records one OCG dictionary; null entries are skipped so that an "OCGs" array
    // containing a null or non-dictionary element does not abort the whole scan
    void push_ocg(PdsDictionary ocg)
    {
        if (ocg == null)
        {
            return;
        }
        var ocg_name = ocg.GetText("Name");
        var id = ocg.GetId();
        layers.Add(new KeyValuePair<string, int>(ocg_name, id));
    }

    for (var i = 0; i < content_mark.GetNumTags(); i++)
    {
        if (content_mark.GetTagName(i) != "OC")
        {
            continue;
        }

        var content_mark_obj = content_mark.GetTagObject(i);
        if (content_mark_obj == null)
        {
            continue;
        }

        var type = content_mark_obj.GetText("Type");
        if (type == "OCMD")
        {
            // a membership dictionary: "OCGs" is looked up both as a single dictionary
            // and as an array of dictionaries
            push_ocg(content_mark_obj.GetDictionary("OCGs"));

            var ocgs_arr = content_mark_obj.GetArray("OCGs");
            if (ocgs_arr != null)
            {
                for (var j = 0; j < ocgs_arr.GetNumObjects(); j++)
                {
                    push_ocg(ocgs_arr.GetDictionary(j));
                }
            }
        }
        else if (type == "OCG")
        {
            // the tag object is the optional content group itself
            push_ocg(content_mark_obj);
        }
    }

    return layers;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetStructElementBBox
// Computes the union of the bounding boxes of all page objects that belong to the
// structure element, descending into child structure elements.
//   struct_elem - structure element whose children are scanned
//   bbox        - in/out rectangle; while it has zero width or height it is replaced by the
//                 first matching object's box, afterwards it is extended to include each match
// Returns true when at least one page object (directly or in a nested element) contributed
// to bbox.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static bool GetStructElementBBox(PdsStructElement struct_elem, ref PdfRect bbox)
{
    bool result = false;
    for (int i = 0; i < struct_elem.GetNumChildren(); i++)
    {
        var child_type = struct_elem.GetChildType(i);
        if (child_type == PdfStructElementType.kPdsStructChildPageContent)
        {
            // acquire the page on which the marked content is present
            PdfDoc doc = struct_elem.GetStructTree().GetDoc();
            PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));
            try
            {
                // find the page objects carrying this child's mcid
                int mcid = struct_elem.GetChildMcid(i);
                var content = page.GetContent();
                for (int j = 0; j < content.GetNumObjects(); j++)
                {
                    PdsPageObject page_object = content.GetObject(j);
                    PdsContentMark content_mark = page_object.GetContentMark();
                    if (content_mark == null || content_mark.GetTagMcid() != mcid)
                    {
                        continue;
                    }

                    PdfRect elem_bbox = page_object.GetBBox();
                    if ((bbox.left - bbox.right == 0) || (bbox.top - bbox.bottom == 0))
                    {
                        // bbox is still degenerate: start from this object's box
                        bbox = elem_bbox;
                    }
                    else
                    {
                        // grow bbox so that it also covers this object
                        bbox.left = Math.Min(bbox.left, elem_bbox.left);
                        bbox.right = Math.Max(bbox.right, elem_bbox.right);
                        bbox.top = Math.Max(bbox.top, elem_bbox.top);
                        bbox.bottom = Math.Min(bbox.bottom, elem_bbox.bottom);
                    }
                    result = true;
                }
            }
            finally
            {
                // every AcquirePage must be paired with a Release, otherwise the page leaks
                page.Release();
            }
        }
        else if (child_type == PdfStructElementType.kPdsStructChildElement)
        {
            PdsObject kid_obj = struct_elem.GetChildObject(i);
            PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);
            // a nested element may have extended bbox; report that to the caller as well
            if (GetStructElementBBox(kid_elem, ref bbox))
            {
                result = true;
            }
        }
    }
    return result;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// MarkUntaggedObjectsAsArtifact
// Finds page objects that are neither an artifact nor tagged with an mcid and adds an
// "Artifact" tag (Type = Pagination, Subtype = Footer) to their content mark, then writes
// the modified content back to the page with SetContent().
// Objects without a content mark are skipped.
//////////////////////////////////////////////////////////////////////////////////////////////////
internal static void MarkUntaggedObjectsAsArtifact(PdfPage page)
{
    PdfDoc doc = page.GetDoc();
    for (int i = 0; i < page.GetNumPageObjects(); i++)
    {
        PdsPageObject page_obj = page.GetPageObject(i);
        PdsContentMark content_mark = page_obj.GetContentMark();
        if (content_mark == null)
        {
            // nothing to tag on; the other helpers in this file treat a null mark the same way
            continue;
        }

        // already an artifact, or already tagged with an mcid: leave it alone
        if (content_mark.GetTagArtifact() || content_mark.GetTagMcid() != -1)
        {
            continue;
        }

        PdsDictionary artifact_dict = doc.CreateDictObject(false);
        artifact_dict.Put("Type", doc.CreateNameObject(false, "Pagination"));
        artifact_dict.Put("Subtype", doc.CreateNameObject(false, "Footer"));
        content_mark.AddTag("Artifact", artifact_dict, false);
    }
    page.SetContent();
}
// Writes the text of a text page object to the console when the object is assigned to the
// given layer (matched by both layer name and id). Non-text objects are ignored.
internal static void CheckPageObject(PdsPageObject page_object, KeyValuePair<string, int> layer)
{
    if (page_object.GetObjectType() != PdfPageObjectType.kPdsPageText)
    {
        return;
    }

    string content = ((PdsText)page_object).GetText();
    List<KeyValuePair<string, int>> object_layers = GetPageObjectLayers(page_object);
    for (int index = 0; index < object_layers.Count; index++)
    {
        KeyValuePair<string, int> candidate = object_layers[index];
        if (layer.Key != candidate.Key || layer.Value != candidate.Value)
        {
            continue;
        }

        // print once, on the first matching layer
        Console.WriteLine(content);
        return;
    }
}
// Gathers the bounding boxes of every page object tagged with the given mcid.
// An object whose own content mark matches contributes its box and is not searched further;
// otherwise, if it is a form XObject, its nested page objects are searched recursively.
static List<PdfRect> GetMcidBBoxes(PdsPageObject obj, int mcid)
{
    var found = new List<PdfRect>();

    var mark = obj.GetContentMark();
    bool is_match = mark != null && mark.GetTagMcid() == mcid;
    if (is_match)
    {
        found.Add(obj.GetBBox());
        return found;
    }

    if (obj.GetObjectType() != PdfPageObjectType.kPdsPageForm)
    {
        return found;
    }

    var form = (PdsForm)obj;
    int index = 0;
    while (index < form.GetNumPageObjects())
    {
        List<PdfRect> nested = GetMcidBBoxes(form.GetPageObject(index), mcid);
        found.AddRange(nested);
        index++;
    }
    return found;
}
// Handles one page object: images are handed to ExtractImage, form XObjects are walked
// recursively over their page objects, and every other object type is ignored.
// Throws an Exception carrying _pdfix.GetError() when obj is null.
private static void ProcessPageObject(PdfPage page, PdsPageObject obj, string savePath)
{
    if (obj == null)
    {
        throw new Exception(_pdfix.GetError());
    }

    var kind = obj.GetObjectType();
    if (kind == PdfPageObjectType.kPdsPageForm)
    {
        // process each object nested inside the form XObject the same way
        var nestedOwner = (PdsForm)obj;
        int index = 0;
        while (index < nestedOwner.GetNumPageObjects())
        {
            ProcessPageObject(page, nestedOwner.GetPageObject(index), savePath);
            index++;
        }
        return;
    }

    if (kind == PdfPageObjectType.kPdsPageImage)
    {
        ExtractImage(page, (PdsImage)obj, savePath);
    }
}