//////////////////////////////////////////////////////////////////////////////////////////////////
// GetPageObjectTextState
// Returns the text state of the text object that is tagged with the given mcid.
//
//  page_object - page object to inspect; text objects are checked directly, form XObjects are
//                searched recursively through their own content
//  mcid        - marked-content id the text object's content mark must carry
//
// Returns the first text state found whose font is set; otherwise a default-constructed
// PdfTextState, which callers can tell apart by testing its font for null.
//////////////////////////////////////////////////////////////////////////////////////////////////
static PdfTextState GetPageObjectTextState(PdsPageObject page_object, int mcid)
{
    var objectKind = page_object.GetObjectType();

    if (objectKind == PdfPageObjectType.kPdsPageText)
    {
        PdsText textObject = (PdsText)page_object;

        // only a text object whose content mark carries the requested mcid qualifies
        PdsContentMark mark = page_object.GetContentMark();
        if (mark == null || mark.GetTagMcid() != mcid)
        {
            return new PdfTextState();
        }

        PdfTextState state = textObject.GetTextState();
        return state.font != null ? state : new PdfTextState();
    }

    if (objectKind == PdfPageObjectType.kPdsPageForm)
    {
        // the tagged text may be nested inside the form XObject, so walk its content
        var formContent = ((PdsForm)page_object).GetContent();
        for (int index = 0; index < formContent.GetNumObjects(); index++)
        {
            var nestedState = GetPageObjectTextState(formContent.GetObject(index), mcid);
            if (nestedState.font != null)
            {
                return nestedState;
            }
        }
    }

    // nothing matched (or the object is neither text nor a form)
    return new PdfTextState();
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// GetStructElementBBox
// Computes the bounding box of a structure element by uniting the bounding boxes of every page
// object whose content mark carries one of the element's MCIDs. Child structure elements are
// processed recursively and contribute to the same box.
//
//  struct_elem - structure element whose kids are iterated
//  bbox        - in/out accumulator; a box with zero width or zero height is treated as "empty"
//                and is replaced by the first matching object's box, otherwise it is extended
//
// Returns true when at least one matching page object was found, either directly under
// struct_elem or anywhere below it in nested structure elements.
//////////////////////////////////////////////////////////////////////////////////////////////////
private static bool GetStructElementBBox(PdsStructElement struct_elem, ref PdfRect bbox)
{
    bool result = false;
    for (int i = 0; i < struct_elem.GetNumChildren(); i++)
    {
        var child_type = struct_elem.GetChildType(i);
        if (child_type == PdfStructElementType.kPdsStructChildPageContent)
        {
            // acquire page on which the element is present
            // NOTE(review): the acquired page is never released in this function - confirm
            // whether the SDK requires an explicit release call once the page is no longer needed
            PdfDoc doc = struct_elem.GetStructTree().GetDoc();
            PdfPage page = doc.AcquirePage(struct_elem.GetChildPageNumber(i));
            if (page == null)
            {
                // page could not be acquired; nothing to measure for this kid
                continue;
            }

            // find page objects with this mcid on the page and merge their boxes
            int mcid = struct_elem.GetChildMcid(i);
            var content = page.GetContent();
            for (int j = 0; j < content.GetNumObjects(); j++)
            {
                PdsPageObject page_object = content.GetObject(j);

                // check if this page object has the same mcid
                PdsContentMark content_mark = page_object.GetContentMark();
                if (content_mark == null || content_mark.GetTagMcid() != mcid)
                {
                    continue;
                }

                PdfRect elem_bbox = page_object.GetBBox();
                if ((bbox.left - bbox.right == 0) || (bbox.top - bbox.bottom == 0))
                {
                    // accumulator is still empty: start from this object's box
                    bbox = elem_bbox;
                }
                else
                {
                    // top is the larger coordinate and bottom the smaller one here
                    bbox.left = Math.Min(bbox.left, elem_bbox.left);
                    bbox.right = Math.Max(bbox.right, elem_bbox.right);
                    bbox.top = Math.Max(bbox.top, elem_bbox.top);
                    bbox.bottom = Math.Min(bbox.bottom, elem_bbox.bottom);
                }
                result = true;
            }
        }
        else if (child_type == PdfStructElementType.kPdsStructChildElement)
        {
            PdsObject kid_obj = struct_elem.GetChildObject(i);
            PdsStructElement kid_elem = struct_elem.GetStructTree().GetStructElementFromObject(kid_obj);

            // propagate the child's outcome: content found only in nested elements must still
            // make this call report success, because bbox has been updated
            if (kid_elem != null && GetStructElementBBox(kid_elem, ref bbox))
            {
                result = true;
            }
        }
    }
    return result;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// MarkUntaggedObjectsAsArtifact
// Finds page objects that are neither tagged as an artifact nor carry an mcid (GetTagMcid()
// returns -1) and adds an "Artifact" tag to their content mark.
//
//  page - page whose objects are scanned; its content is written back with SetContent() once
//         the scan is finished, whether or not any object was changed
//
// Every newly marked object receives the same properties: Type = Pagination, Subtype = Footer.
// Objects without a content mark are skipped, because there is nothing to attach the tag to.
//////////////////////////////////////////////////////////////////////////////////////////////////
internal static void MarkUntaggedObjectsAsArtifact(PdfPage page)
{
    PdfDoc doc = page.GetDoc();
    for (int i = 0; i < page.GetNumPageObjects(); i++)
    {
        PdsPageObject page_obj = page.GetPageObject(i);
        PdsContentMark content_mark = page_obj.GetContentMark();
        if (content_mark == null)
        {
            // guard against a missing content mark, as the other helpers in this file do
            continue;
        }

        if (content_mark.GetTagArtifact() || content_mark.GetTagMcid() != -1)
        {
            // already an artifact, or already tagged content
            continue;
        }

        // NOTE(review): the 'false' arguments are presumably "indirect object" flags - confirm
        PdsDictionary artifact_dict = doc.CreateDictObject(false);
        artifact_dict.Put("Type", doc.CreateNameObject(false, "Pagination"));
        artifact_dict.Put("Subtype", doc.CreateNameObject(false, "Footer"));
        content_mark.AddTag("Artifact", artifact_dict, false);
    }
    page.SetContent();
}