Пример #1
0
        /// <summary>
        /// Checks whether the existing root tag resolves to the standard Document role and, if it does not,
        /// either reports a mapping issue or converts the root tag into a Document tag.
        /// </summary>
        /// <remarks>
        /// The root tag's role is examined twice: once through the resolver returned by
        /// <c>context.GetRoleMappingResolver</c> (before any role mapping is followed) and once through
        /// <c>context.ResolveMappingToStandardOrDomainSpecificRole</c> (after resolving).
        /// <br/>
        /// If the resolved role is the standard Document role, nothing is changed.
        /// <br/>
        /// If the role was Document before resolving but is not after, the issue is only logged.
        /// <br/>
        /// Otherwise all kids of the root tag are wrapped in a tag carrying the root's current role and
        /// namespace, and the root tag itself gets the Document role (plus the document default namespace
        /// when the target tag structure version is 2).
        /// </remarks>
        private void EnsureExistingRootTagIsDocument()
        {
            IRoleMappingResolver unresolved = context.GetRoleMappingResolver(
                rootTagElement.GetRole().GetValue(), rootTagElement.GetNamespace());
            bool documentBeforeResolving = unresolved.CurrentRoleIsStandard()
                                           && StandardRoles.DOCUMENT.Equals(unresolved.GetRole());

            IRoleMappingResolver resolved = context.ResolveMappingToStandardOrDomainSpecificRole(
                rootTagElement.GetRole().GetValue(), rootTagElement.GetNamespace());
            bool documentAfterResolving = resolved != null
                                          && resolved.CurrentRoleIsStandard()
                                          && StandardRoles.DOCUMENT.Equals(resolved.GetRole());

            if (documentAfterResolving)
            {
                // The root tag already resolves to the standard Document role.
                return;
            }

            if (documentBeforeResolving)
            {
                // The role looked like Document at first, but the mapping leads somewhere else: report it only.
                LogCreatedRootTagHasMappingIssue(rootTagElement.GetNamespace(), resolved);
                return;
            }

            // Keep the old role on a wrapper tag around the kids, then turn the root itself into a Document tag.
            WrapAllKidsInTag(rootTagElement, rootTagElement.GetRole(), rootTagElement.GetNamespace());
            rootTagElement.SetRole(PdfName.Document);
            if (context.TargetTagStructureVersionIs2())
            {
                rootTagElement.SetNamespace(context.GetDocumentDefaultNamespace());
                context.EnsureNamespaceRegistered(context.GetDocumentDefaultNamespace());
            }
        }
Пример #2
0
        /// <summary>Transforms root tags in a way that complies with the PDF References.</summary>
        /// <remarks>
        /// Transforms root tags in a way that complies with the PDF References.
        /// <br/><br/>
        /// PDF Reference
        /// 10.7.3 Grouping Elements:
        /// <br/><br/>
        /// For most content extraction formats, the document must be a tree with a single top-level element;
        /// the structure tree root (identified by the StructTreeRoot entry in the document catalog) must have
        /// only one child in its K (kids) array. If the PDF file contains a complete document, the structure
        /// type Document is recommended for this top-level element in the logical structure hierarchy. If the
        /// file contains a well-formed document fragment, one of the structure types Part, Art, Sect, or Div
        /// may be used instead.
        /// <br/><br/>
        /// While the tree is being rearranged the <c>forbidUnknownRoles</c> flag is switched off; it is restored
        /// to its previous value when the method exits, including when it exits with an exception.
        /// </remarks>
        public virtual void NormalizeDocumentRootTag()
        {
            // In this method we could be dealing with an existing document, so we don't want to throw
            // exceptions because of unknown roles here.
            bool forbid = forbidUnknownRoles;

            forbidUnknownRoles = false;
            try
            {
                IList <IPdfStructElem> rootKids = document.GetStructTreeRoot().GetKids();

                if (rootKids.Count == 1 && allowedRootTagRoles.Contains(rootKids[0].GetRole()))
                {
                    // A single kid with an allowed root role: adopt it as the root tag as is.
                    rootTagElement = (PdfStructElem)rootKids[0];
                }
                else
                {
                    PdfStructElem prevRootTag = rootTagElement;
                    // Detach all current kids; they are re-attached under the single root tag below.
                    document.GetStructTreeRoot().GetPdfObject().Remove(PdfName.K);
                    if (prevRootTag == null)
                    {
                        rootTagElement = document.GetStructTreeRoot().AddKid(new PdfStructElem(document, PdfName.Document));
                    }
                    else
                    {
                        document.GetStructTreeRoot().AddKid(rootTagElement);
                        if (!PdfName.Document.Equals(rootTagElement.GetRole()))
                        {
                            // Preserve the previous role on a wrapper around the kids before re-tagging the root.
                            WrapAllKidsInTag(rootTagElement, rootTagElement.GetRole());
                            rootTagElement.SetRole(PdfName.Document);
                        }
                    }
                    int  originalRootKidsIndex = 0;
                    bool isBeforeOriginalRoot  = true;
                    foreach (IPdfStructElem elem in rootKids)
                    {
                        // StructTreeRoot kids are always PdfStructElem, so it is safe to cast here
                        PdfStructElem kid = (PdfStructElem)elem;
                        if (kid.GetPdfObject() == rootTagElement.GetPdfObject())
                        {
                            // From here on the remaining kids followed the original root, so they get appended.
                            isBeforeOriginalRoot = false;
                            continue;
                        }
                        bool kidIsDocument = PdfName.Document.Equals(kid.GetRole());
                        if (isBeforeOriginalRoot)
                        {
                            // Kids that preceded the original root are inserted at the front, in their original order.
                            rootTagElement.AddKid(originalRootKidsIndex, kid);
                            // NOTE(review): advancing by the kid's own kid count presumably accounts for
                            // RemoveOldRoot replacing a Document kid with its children - confirm against RemoveOldRoot.
                            originalRootKidsIndex += kidIsDocument ? kid.GetKids().Count : 1;
                        }
                        else
                        {
                            rootTagElement.AddKid(kid);
                        }
                        if (kidIsDocument)
                        {
                            RemoveOldRoot(kid);
                        }
                    }
                }
            }
            finally
            {
                // Always restore the caller's setting; otherwise a failure above would silently leave
                // unknown-role checking disabled for the rest of this object's lifetime.
                forbidUnknownRoles = forbid;
            }
        }
Пример #3
0
        /// <summary>Transforms root tags in a way that complies with the tagged PDF specification.</summary>
        /// <remarks>
        /// Transforms root tags in a way that complies with the tagged PDF specification.
        /// Depending on PDF version behaviour may differ.
        /// <br />
        /// ISO 32000-1 (PDF 1.7 and lower)
        /// 14.8.4.2 Grouping Elements
        /// <br />
        /// "In a tagged PDF document, the structure tree shall contain a single top-level element; that is,
        /// the structure tree root (identified by the StructTreeRoot entry in the document catalogue) shall
        /// have only one child in its K (kids) array. If the PDF file contains a complete document, the structure
        /// type Document should be used for this top-level element in the logical structure hierarchy. If the file
        /// contains a well-formed document fragment, one of the structure types Part, Art, Sect, or Div may be used instead."
        /// <br />
        /// For PDF 2.0 and higher root tag is allowed to have only the Document role.
        /// <br />
        /// While the tree is being rearranged the <c>forbidUnknownRoles</c> flag is switched off; it is restored
        /// to its previous value when the method exits, including when it exits with an exception.
        /// </remarks>
        public virtual void NormalizeDocumentRootTag()
        {
            // In this method we could be dealing with an existing document, so we don't want to throw
            // exceptions because of unknown roles here.
            bool forbid = forbidUnknownRoles;

            forbidUnknownRoles = false;
            try
            {
                IList <IStructureNode> rootKids = document.GetStructTreeRoot().GetKids();
                IRoleMappingResolver   mapping  = null;

                if (rootKids.Count > 0)
                {
                    // Only the first kid's role is resolved; it decides whether the existing root can be kept.
                    PdfStructElem firstKid = (PdfStructElem)rootKids[0];
                    mapping = ResolveMappingToStandardOrDomainSpecificRole(firstKid.GetRole().GetValue(),
                                                                           firstKid.GetNamespace());
                }
                if (rootKids.Count == 1 && mapping != null && mapping.CurrentRoleIsStandard()
                    && IsRoleAllowedToBeRoot(mapping.GetRole()))
                {
                    // A single kid whose resolved standard role is allowed as root: adopt it as is.
                    rootTagElement = (PdfStructElem)rootKids[0];
                }
                else
                {
                    // Detach the current kids and let the normalizer build a single root tag from them.
                    document.GetStructTreeRoot().GetPdfObject().Remove(PdfName.K);
                    rootTagElement = new RootTagNormalizer(this, rootTagElement, document).MakeSingleStandardRootTag(rootKids);
                }
            }
            finally
            {
                // Always restore the caller's setting; otherwise a failure above would silently leave
                // unknown-role checking disabled for the rest of this object's lifetime.
                forbidUnknownRoles = forbid;
            }
        }
Пример #4
0
 /// <summary>Creates a reference to the given tag together with the pointer and index it is associated with.</summary>
 /// <param name="referencedTag">the structure element being referenced; its role is read once, at construction time</param>
 /// <param name="tagPointer">the tag tree pointer stored alongside the referenced tag</param>
 /// <param name="insertIndex">the insertion index stored alongside the referenced tag</param>
 protected internal TagReference(PdfStructElem referencedTag, TagTreePointer tagPointer, int insertIndex)
 {
     // The role is taken from the referenced tag itself and cached in the 'role' field.
     role = referencedTag.GetRole();

     this.insertIndex   = insertIndex;
     this.tagPointer    = tagPointer;
     this.referencedTag = referencedTag;
 }
Пример #5
0
 /// <summary>Writes a markup representation of a single structure node to the <c>@out</c> writer.</summary>
 /// <remarks>
 /// A <c>PdfStructElem</c> is written as an element named after its (fixed-up) role, followed by its
 /// attributes, an optional <c>alt</c> CDATA section and, recursively, its kids.
 /// A <c>PdfMcr</c> is handed to <c>ParseTag</c>. Any other node is written as a <c>flushedKid</c> placeholder.
 /// </remarks>
 /// <param name="kid">the structure node to inspect</param>
 /// <exception cref="iText.IO.IOException">wraps any <c>System.IO.IOException</c> raised while writing</exception>
 protected internal virtual void InspectKid(IStructureNode kid)
 {
     try {
         PdfStructElem element = kid as PdfStructElem;
         if (element == null)
         {
             // Not a structure element: either marked content, or something we can only emit a placeholder for.
             PdfMcr markedContent = kid as PdfMcr;
             if (markedContent != null)
             {
                 ParseTag(markedContent);
             }
             else
             {
                 @out.Write(" <flushedKid/> ");
             }
             return;
         }

         String elementName = FixTagName(element.GetRole().GetValue());

         // Opening tag with attributes.
         @out.Write("<");
         @out.Write(elementName);
         InspectAttributes(element);
         @out.Write(">" + Environment.NewLine);

         // Optional alternate description, with NUL characters removed.
         PdfString alternateText = element.GetAlt();
         if (alternateText != null)
         {
             @out.Write("<alt><![CDATA[");
             @out.Write(iText.IO.Util.StringUtil.ReplaceAll(alternateText.GetValue(), "[\\000]*", ""));
             @out.Write("]]></alt>" + Environment.NewLine);
         }

         // Nested kids, then the closing tag.
         InspectKids(element.GetKids());
         @out.Write("</");
         @out.Write(elementName);
         @out.Write(">" + Environment.NewLine);
     }
     catch (System.IO.IOException e) {
         throw new iText.IO.IOException(iText.IO.IOException.UnknownIOException, e);
     }
 }
Пример #6
0
        /// <summary>Moves the former kids of the structure tree root under the current root tag.</summary>
        /// <remarks>
        /// Kids that came before the root tag in <paramref name="rootKids"/> are inserted at the front of the
        /// root tag's kids, keeping their order; kids that came after it are appended. The root tag itself is skipped.
        /// Kids with the Document role are additionally passed to <c>RemoveOldRoot</c>.
        /// </remarks>
        /// <param name="rootKids">the kids the structure tree root had before normalization</param>
        private void AddStructTreeRootKidsToTheRootTag(IList <IStructureNode> rootKids)
        {
            int  frontInsertIndex   = 0;
            bool passedOriginalRoot = false;

            foreach (IStructureNode node in rootKids)
            {
                // StructTreeRoot kids are always PdfStructElem, so it is safe to cast here
                PdfStructElem child = (PdfStructElem)node;
                if (child.GetPdfObject() == rootTagElement.GetPdfObject())
                {
                    passedOriginalRoot = true;
                    continue;
                }

                // This flag is used to "flatten" possible deep "stacking" of the tag structure in case of
                // multiple page copying operations. Such stacking could result from wrapping all the kids in
                // the createNewRootTag or ensureExistingRootTagIsDocument methods. Because we know exactly
                // which role was set there, no role mapping needs to be resolved here.
                bool flattenChild = PdfName.Document.Equals(child.GetRole());
                if (flattenChild && child.GetNamespace() != null && context.TargetTagStructureVersionIs2())
                {
                    // Only Document tags from a standard structure namespace are flattened.
                    String childNamespaceName = child.GetNamespace().GetNamespaceName();
                    flattenChild = StandardNamespaces.PDF_1_7.Equals(childNamespaceName)
                                   || StandardNamespaces.PDF_2_0.Equals(childNamespaceName);
                }

                if (passedOriginalRoot)
                {
                    rootTagElement.AddKid(child);
                }
                else
                {
                    rootTagElement.AddKid(frontInsertIndex, child);
                    // NOTE(review): a flattened child presumably ends up replaced by its own kids in
                    // RemoveOldRoot, hence the index advances by their count - confirm against RemoveOldRoot.
                    if (flattenChild)
                    {
                        frontInsertIndex += child.GetKids().Count;
                    }
                    else
                    {
                        frontInsertIndex += 1;
                    }
                }

                if (flattenChild)
                {
                    RemoveOldRoot(child);
                }
            }
        }
Пример #7
0
        /// <summary>Chooses the default namespace for new tags by looking at the first kid of the structure tree root.</summary>
        /// <remarks>
        /// The PDF 2.0 standard namespace becomes <c>documentDefaultNamespace</c> when the structure tree root
        /// has no kids, when the first kid's role cannot be resolved, or when it resolves to a namespace other
        /// than the PDF 1.7 standard namespace. Otherwise <c>documentDefaultNamespace</c> is left untouched.
        /// <br/>
        /// A warning is logged when the first kid's role cannot be resolved to a standard role.
        /// </remarks>
        private void SetNamespaceForNewTagsBasedOnExistingRoot()
        {
            IList <IStructureNode> structTreeRootKids = document.GetStructTreeRoot().GetKids();

            if (structTreeRootKids.Count <= 0)
            {
                // No existing tag structure to align with.
                documentDefaultNamespace = FetchNamespace(StandardNamespaces.PDF_2_0);
                return;
            }

            PdfStructElem        existingRoot = (PdfStructElem)structTreeRootKids[0];
            IRoleMappingResolver mapping      = ResolveMappingToStandardOrDomainSpecificRole(
                existingRoot.GetRole().GetValue(), existingRoot.GetNamespace());

            bool resolvesToStandardRole = mapping != null && mapping.CurrentRoleIsStandard();
            if (!resolvesToStandardRole)
            {
                ILog   logger        = LogManager.GetLogger(typeof(iText.Kernel.Pdf.Tagutils.TagStructureContext));
                // Report the kid's own namespace when it has one, the default standard namespace otherwise.
                String namespaceName = existingRoot.GetNamespace() != null
                                       ? existingRoot.GetNamespace().GetNamespaceName()
                                       : StandardNamespaces.GetDefault();
                logger.Warn(String.Format(iText.IO.LogMessageConstant.EXISTING_TAG_STRUCTURE_ROOT_IS_NOT_STANDARD,
                                          existingRoot.GetRole().GetValue(), namespaceName));
            }

            bool resolvesToPdf17Namespace = mapping != null
                                            && StandardNamespaces.PDF_1_7.Equals(mapping.GetNamespace().GetNamespaceName());
            if (!resolvesToPdf17Namespace)
            {
                documentDefaultNamespace = FetchNamespace(StandardNamespaces.PDF_2_0);
            }
        }