/// <summary>Makes sure that the already existing root tag ends up with the Document role.</summary>
/// <remarks>
/// The role of the current root tag is inspected twice: as it is, and after resolving its role mapping.
/// <br/>
/// - If the resolved role is the standard Document role, nothing is changed.
/// <br/>
/// - If the role is the standard Document role only before resolving, the mapping issue is logged and nothing is changed.
/// <br/>
/// - Otherwise all kids of the root tag are wrapped in a tag carrying the root's current role and namespace,
/// and the root tag itself is switched to the Document role (and, for tag structure version 2 targets,
/// to the document default namespace, which is also registered in the context).
/// </remarks>
private void EnsureExistingRootTagIsDocument() {
    IRoleMappingResolver unresolved = context.GetRoleMappingResolver(rootTagElement.GetRole().GetValue(),
        rootTagElement.GetNamespace());
    bool isDocBeforeResolving = unresolved.CurrentRoleIsStandard()
        && StandardRoles.DOCUMENT.Equals(unresolved.GetRole());

    // the resolved mapping may be null, hence the explicit null check below
    IRoleMappingResolver resolved = context.ResolveMappingToStandardOrDomainSpecificRole(
        rootTagElement.GetRole().GetValue(), rootTagElement.GetNamespace());
    bool isDocAfterResolving = resolved != null && resolved.CurrentRoleIsStandard()
        && StandardRoles.DOCUMENT.Equals(resolved.GetRole());

    if (isDocAfterResolving) {
        // the root tag already resolves to the standard Document role
        return;
    }
    if (isDocBeforeResolving) {
        // the role looks like Document, but its mapping leads somewhere else: only report it
        LogCreatedRootTagHasMappingIssue(rootTagElement.GetNamespace(), resolved);
        return;
    }

    // push the current role one level down and turn the root itself into a Document tag
    WrapAllKidsInTag(rootTagElement, rootTagElement.GetRole(), rootTagElement.GetNamespace());
    rootTagElement.SetRole(PdfName.Document);
    if (context.TargetTagStructureVersionIs2()) {
        rootTagElement.SetNamespace(context.GetDocumentDefaultNamespace());
        context.EnsureNamespaceRegistered(context.GetDocumentDefaultNamespace());
    }
}
/// <summary>Transforms root tags in a way that complies with the PDF References.</summary>
/// <remarks>
/// Transforms root tags in a way that complies with the PDF References.
/// <br/><br/>
/// PDF Reference
/// 10.7.3 Grouping Elements:
/// <br/><br/>
/// For most content extraction formats, the document must be a tree with a single top-level element;
/// the structure tree root (identified by the StructTreeRoot entry in the document catalog) must have
/// only one child in its K (kids) array. If the PDF file contains a complete document, the structure
/// type Document is recommended for this top-level element in the logical structure hierarchy. If the
/// file contains a well-formed document fragment, one of the structure types Part, Art, Sect, or Div
/// may be used instead.
/// <br/><br/>
/// While this method runs, <c>forbidUnknownRoles</c> is switched off; its previous value is restored
/// when the method exits, including the case when normalization fails with an exception.
/// </remarks>
public virtual void NormalizeDocumentRootTag() {
    // in this method we could deal with an existing document, so we don't want to throw exceptions here
    bool forbid = forbidUnknownRoles;
    forbidUnknownRoles = false;
    try {
        IList<IPdfStructElem> rootKids = document.GetStructTreeRoot().GetKids();
        if (rootKids.Count == 1 && allowedRootTagRoles.Contains(rootKids[0].GetRole())) {
            // a single kid with an allowed role is already a valid root tag
            rootTagElement = (PdfStructElem)rootKids[0];
        }
        else {
            PdfStructElem prevRootTag = rootTagElement;
            // the kids array of the structure tree root is rebuilt from scratch with a single root tag
            document.GetStructTreeRoot().GetPdfObject().Remove(PdfName.K);
            if (prevRootTag == null) {
                rootTagElement = document.GetStructTreeRoot().AddKid(new PdfStructElem(document, PdfName.Document));
            }
            else {
                document.GetStructTreeRoot().AddKid(rootTagElement);
                if (!PdfName.Document.Equals(rootTagElement.GetRole())) {
                    // keep the former role one level below and make the root itself a Document tag
                    WrapAllKidsInTag(rootTagElement, rootTagElement.GetRole());
                    rootTagElement.SetRole(PdfName.Document);
                }
            }
            int originalRootKidsIndex = 0;
            bool isBeforeOriginalRoot = true;
            foreach (IPdfStructElem elem in rootKids) {
                // StructTreeRoot kids are always PdfStructElem, so we are safe here to cast it
                PdfStructElem kid = (PdfStructElem)elem;
                if (kid.GetPdfObject() == rootTagElement.GetPdfObject()) {
                    // the root tag itself: everything after it is appended instead of inserted
                    isBeforeOriginalRoot = false;
                    continue;
                }
                bool kidIsDocument = PdfName.Document.Equals(kid.GetRole());
                if (isBeforeOriginalRoot) {
                    // kids that preceded the original root are inserted at the front, preserving their order
                    rootTagElement.AddKid(originalRootKidsIndex, kid);
                    originalRootKidsIndex += kidIsDocument ? kid.GetKids().Count : 1;
                }
                else {
                    rootTagElement.AddKid(kid);
                }
                if (kidIsDocument) {
                    RemoveOldRoot(kid);
                }
            }
        }
    }
    finally {
        // restore the flag even if normalization failed, so that the temporary
        // permissive mode never outlives this call
        forbidUnknownRoles = forbid;
    }
}
/// <summary>Transforms root tags in a way that complies with the tagged PDF specification.</summary>
/// <remarks>
/// Transforms root tags in a way that complies with the tagged PDF specification.
/// Depending on PDF version behaviour may differ.
/// <br />
/// ISO 32000-1 (PDF 1.7 and lower)
/// 14.8.4.2 Grouping Elements
/// <br />
/// "In a tagged PDF document, the structure tree shall contain a single top-level element; that is,
/// the structure tree root (identified by the StructTreeRoot entry in the document catalogue) shall
/// have only one child in its K (kids) array. If the PDF file contains a complete document, the structure
/// type Document should be used for this top-level element in the logical structure hierarchy. If the file
/// contains a well-formed document fragment, one of the structure types Part, Art, Sect, or Div may be used instead."
/// <br />
/// For PDF 2.0 and higher root tag is allowed to have only the Document role.
/// <br />
/// While this method runs, <c>forbidUnknownRoles</c> is switched off; its previous value is restored
/// when the method exits, including the case when normalization fails with an exception.
/// </remarks>
public virtual void NormalizeDocumentRootTag() {
    // in this method we could deal with an existing document, so we don't want to throw exceptions here
    bool forbid = forbidUnknownRoles;
    forbidUnknownRoles = false;
    try {
        IList<IStructureNode> rootKids = document.GetStructTreeRoot().GetKids();
        IRoleMappingResolver mapping = null;
        if (rootKids.Count > 0) {
            // only the first kid's role mapping is resolved; it is used solely by the single-kid check below
            PdfStructElem firstKid = (PdfStructElem)rootKids[0];
            mapping = ResolveMappingToStandardOrDomainSpecificRole(firstKid.GetRole().GetValue(), firstKid.GetNamespace());
        }
        if (rootKids.Count == 1 && mapping != null && mapping.CurrentRoleIsStandard()
            && IsRoleAllowedToBeRoot(mapping.GetRole())) {
            // a single kid whose resolved role is standard and allowed as root is kept as the root tag
            rootTagElement = (PdfStructElem)rootKids[0];
        }
        else {
            // drop the kids array of the structure tree root and let the normalizer build a single root tag
            document.GetStructTreeRoot().GetPdfObject().Remove(PdfName.K);
            rootTagElement = new RootTagNormalizer(this, rootTagElement, document).MakeSingleStandardRootTag(rootKids);
        }
    }
    finally {
        // restore the flag even if normalization failed, so that the temporary
        // permissive mode never outlives this call
        forbidUnknownRoles = forbid;
    }
}
/// <summary>Creates a reference to the given tag.</summary>
/// <remarks>
/// The role of the referenced tag is read once, at construction time, and stored alongside the other values.
/// </remarks>
/// <param name="referencedTag">the structure element this reference points to; its role is read immediately</param>
/// <param name="tagPointer">the tag tree pointer stored with this reference</param>
/// <param name="insertIndex">the insertion index stored with this reference</param>
protected internal TagReference(PdfStructElem referencedTag, TagTreePointer tagPointer, int insertIndex) {
    // the role is captured first: this is the only statement that dereferences an argument
    role = referencedTag.GetRole();
    this.insertIndex = insertIndex;
    this.tagPointer = tagPointer;
    this.referencedTag = referencedTag;
}
/// <summary>Writes the markup for a single kid of the structure tree to the output.</summary>
/// <remarks>
/// - A <c>PdfStructElem</c> is written as an element named after its (fixed) role, followed by its attributes,
/// an optional <c>alt</c> CDATA section and, recursively, its own kids.
/// <br/>
/// - A <c>PdfMcr</c> is handed over to <c>ParseTag</c>.
/// <br/>
/// - Anything else is written as a <c>flushedKid</c> placeholder.
/// </remarks>
/// <param name="kid">the structure node to inspect</param>
/// <exception cref="iText.IO.IOException">wraps any <c>System.IO.IOException</c> raised while writing</exception>
protected internal virtual void InspectKid(IStructureNode kid) {
    try {
        if (!(kid is PdfStructElem)) {
            if (kid is PdfMcr) {
                ParseTag((PdfMcr)kid);
            }
            else {
                @out.Write(" <flushedKid/> ");
            }
            return;
        }

        PdfStructElem element = (PdfStructElem)kid;
        String tagName = FixTagName(element.GetRole().GetValue());

        // opening tag together with its attributes
        @out.Write("<");
        @out.Write(tagName);
        InspectAttributes(element);
        @out.Write(">" + Environment.NewLine);

        // optional alternate description; matches of the "[\000]*" pattern are removed from it before writing
        PdfString altText = element.GetAlt();
        if (altText != null) {
            @out.Write("<alt><![CDATA[");
            @out.Write(iText.IO.Util.StringUtil.ReplaceAll(altText.GetValue(), "[\\000]*", ""));
            @out.Write("]]></alt>" + Environment.NewLine);
        }

        // nested kids, then the closing tag
        InspectKids(element.GetKids());
        @out.Write("</");
        @out.Write(tagName);
        @out.Write(">" + Environment.NewLine);
    }
    catch (System.IO.IOException e) {
        throw new iText.IO.IOException(iText.IO.IOException.UnknownIOException, e);
    }
}
/// <summary>Moves the former kids of the structure tree root under the single root tag.</summary>
/// <remarks>
/// Kids located before the root tag in <paramref name="rootKids"/> are inserted at the beginning of the root tag
/// (keeping their relative order), kids located after it are appended. The root tag itself is skipped.
/// Kids with the Document role are additionally passed to <c>RemoveOldRoot</c>; when the target tag structure
/// version is 2 and such a kid has a namespace, this is done only for the standard PDF 1.7 / PDF 2.0 namespaces.
/// </remarks>
/// <param name="rootKids">the kids the structure tree root had before normalization</param>
private void AddStructTreeRootKidsToTheRootTag(IList<IStructureNode> rootKids) {
    int insertPosition = 0;
    bool originalRootPassed = false;
    foreach (IStructureNode node in rootKids) {
        // StructTreeRoot kids are always PdfStructElem, so the cast is safe here
        PdfStructElem child = (PdfStructElem)node;
        if (child.GetPdfObject() == rootTagElement.GetPdfObject()) {
            originalRootPassed = true;
            continue;
        }

        // This flag is used to "flatten" possible deep "stacking" of the tag structure in case of multiple
        // page copying operations. Such stacking could happen due to the wrapping of all the kids in the
        // createNewRootTag or ensureExistingRootTagIsDocument methods. Therefore there is no need to resolve
        // mappings here, because we know exactly which role was set.
        bool flatten = PdfName.Document.Equals(child.GetRole());
        if (flatten && child.GetNamespace() != null && context.TargetTagStructureVersionIs2()) {
            // only tags with the Document role in a standard structure namespace are flattened
            String childNamespaceName = child.GetNamespace().GetNamespaceName();
            flatten = StandardNamespaces.PDF_1_7.Equals(childNamespaceName)
                || StandardNamespaces.PDF_2_0.Equals(childNamespaceName);
        }

        if (originalRootPassed) {
            rootTagElement.AddKid(child);
        }
        else {
            rootTagElement.AddKid(insertPosition, child);
            // a flattened kid advances the insert position by the number of its own kids, any other kid by one
            if (flatten) {
                insertPosition += child.GetKids().Count;
            }
            else {
                insertPosition += 1;
            }
        }

        if (flatten) {
            RemoveOldRoot(child);
        }
    }
}
/// <summary>Chooses the default namespace for new tags by looking at the existing structure tree root kid.</summary>
/// <remarks>
/// - If the structure tree root has no kids, the PDF 2.0 standard namespace is used.
/// <br/>
/// - Otherwise the role of the first kid is resolved. If it does not resolve to a standard role, a warning is logged.
/// <br/>
/// - If the resolution failed, or the resolved namespace is not the PDF 1.7 standard namespace,
/// the PDF 2.0 standard namespace is used; otherwise <c>documentDefaultNamespace</c> is left untouched.
/// </remarks>
private void SetNamespaceForNewTagsBasedOnExistingRoot() {
    IList<IStructureNode> rootKids = document.GetStructTreeRoot().GetKids();
    if (rootKids.Count == 0) {
        // nothing to derive the namespace from
        documentDefaultNamespace = FetchNamespace(StandardNamespaces.PDF_2_0);
        return;
    }

    PdfStructElem existingRoot = (PdfStructElem)rootKids[0];
    IRoleMappingResolver mapping = ResolveMappingToStandardOrDomainSpecificRole(existingRoot.GetRole().GetValue(),
        existingRoot.GetNamespace());

    bool rootIsStandard = mapping != null && mapping.CurrentRoleIsStandard();
    if (!rootIsStandard) {
        ILog logger = LogManager.GetLogger(typeof(iText.Kernel.Pdf.Tagutils.TagStructureContext));
        // report the namespace of the existing root, falling back to the default standard namespace name
        String namespaceName = existingRoot.GetNamespace() != null
            ? existingRoot.GetNamespace().GetNamespaceName()
            : StandardNamespaces.GetDefault();
        logger.Warn(String.Format(iText.IO.LogMessageConstant.EXISTING_TAG_STRUCTURE_ROOT_IS_NOT_STANDARD,
            existingRoot.GetRole().GetValue(), namespaceName));
    }

    bool resolvedToPdf17 = mapping != null
        && StandardNamespaces.PDF_1_7.Equals(mapping.GetNamespace().GetNamespaceName());
    if (!resolvedToPdf17) {
        documentDefaultNamespace = FetchNamespace(StandardNamespaces.PDF_2_0);
    }
}