/// <summary>Transforms root tags in a way that complies with the PDF References.</summary>
/// <remarks>
/// Transforms root tags in a way that complies with the PDF References.
/// <br/><br/>
/// PDF Reference
/// 10.7.3 Grouping Elements:
/// <br/><br/>
/// For most content extraction formats, the document must be a tree with a single top-level element;
/// the structure tree root (identified by the StructTreeRoot entry in the document catalog) must have
/// only one child in its K (kids) array. If the PDF file contains a complete document, the structure
/// type Document is recommended for this top-level element in the logical structure hierarchy. If the
/// file contains a well-formed document fragment, one of the structure types Part, Art, Sect, or Div
/// may be used instead.
/// <br/><br/>
/// While this method runs, the <c>forbidUnknownRoles</c> flag is set to <c>false</c>; its previous value
/// is restored when the method exits, including when it exits because of an exception.
/// </remarks>
public virtual void NormalizeDocumentRootTag() {
    // In this method we could deal with an existing document, so we don't want to throw exceptions here.
    bool forbid = forbidUnknownRoles;
    forbidUnknownRoles = false;
    try {
        IList<IPdfStructElem> rootKids = document.GetStructTreeRoot().GetKids();
        if (rootKids.Count == 1 && allowedRootTagRoles.Contains(rootKids[0].GetRole())) {
            // The single existing kid already has an allowed root role: use it as the root tag.
            rootTagElement = (PdfStructElem)rootKids[0];
        }
        else {
            PdfStructElem prevRootTag = rootTagElement;
            // Drop the current kids of the structure tree root; they are re-attached under the root tag below.
            document.GetStructTreeRoot().GetPdfObject().Remove(PdfName.K);
            if (prevRootTag == null) {
                rootTagElement = document.GetStructTreeRoot().AddKid(new PdfStructElem(document, PdfName.Document));
            }
            else {
                document.GetStructTreeRoot().AddKid(rootTagElement);
                if (!PdfName.Document.Equals(rootTagElement.GetRole())) {
                    // Keep the previous role as a wrapper around the kids before the root becomes Document.
                    WrapAllKidsInTag(rootTagElement, rootTagElement.GetRole());
                    rootTagElement.SetRole(PdfName.Document);
                }
            }
            int originalRootKidsIndex = 0;
            bool isBeforeOriginalRoot = true;
            foreach (IPdfStructElem elem in rootKids) {
                // StructTreeRoot kids are always PdfStructElem, so we are safe here to cast it
                PdfStructElem kid = (PdfStructElem)elem;
                if (kid.GetPdfObject() == rootTagElement.GetPdfObject()) {
                    isBeforeOriginalRoot = false;
                    continue;
                }
                bool kidIsDocument = PdfName.Document.Equals(kid.GetRole());
                if (isBeforeOriginalRoot) {
                    // Kids that preceded the original root tag are inserted at the front, in their original order.
                    rootTagElement.AddKid(originalRootKidsIndex, kid);
                    // NOTE(review): advancing by the kid count presumably accounts for RemoveOldRoot
                    // replacing a Document kid with its own kids - confirm against RemoveOldRoot.
                    originalRootKidsIndex += kidIsDocument ? kid.GetKids().Count : 1;
                }
                else {
                    rootTagElement.AddKid(kid);
                }
                if (kidIsDocument) {
                    RemoveOldRoot(kid);
                }
            }
        }
    }
    finally {
        // Restore the caller's setting even if normalization failed part-way.
        forbidUnknownRoles = forbid;
    }
}
/// <summary>Appends the PDF object of a structure element to the array returned by <c>GetPdfObject()</c>.</summary>
/// <param name="elem">structure element to add; its PDF object must have an indirect reference</param>
/// <returns>this instance</returns>
/// <exception cref="PdfException">if the PDF object of <paramref name="elem"/> has no indirect reference</exception>
private iText.Kernel.Pdf.Navigation.PdfStructureDestination Add(PdfStructElem elem) {
    bool isIndirect = elem.GetPdfObject().GetIndirectReference() != null;
    if (!isIndirect) {
        throw new PdfException(PdfException.StructureElementInStructureDestinationShallBeAnIndirectObject);
    }
    PdfArray destinationArray = (PdfArray)GetPdfObject();
    destinationArray.Add(elem.GetPdfObject());
    return this;
}
/// <summary>
/// Checks that the /Pg entry of a structure element matches the given page, writing the page
/// into the element first when the entry is absent.
/// </summary>
/// <param name="elem">structure element whose /Pg entry is inspected (and possibly set)</param>
/// <param name="kidPage">page dictionary of the kid</param>
/// <returns>result of <c>kidPage.Equals(...)</c> against the element's (possibly just assigned) page</returns>
private bool EnsureElementPageEqualsKidPage(PdfStructElem elem, PdfDictionary kidPage) {
    PdfObject elemPage = elem.GetPdfObject().Get(PdfName.Pg);
    bool pageMissing = elemPage == null;
    if (pageMissing) {
        // No page recorded on the element yet: adopt the kid's page.
        elem.GetPdfObject().Put(PdfName.Pg, kidPage);
        elemPage = kidPage;
    }
    return kidPage.Equals(elemPage);
}
/// <summary>
/// Creates the tag for this element up front so that an /ID entry can be written into its
/// structure element dictionary, then delegates to the base drawing logic.
/// </summary>
/// <param name="drawContext">draw context forwarded to <c>base.Draw</c></param>
public override void Draw(DrawContext drawContext) {
    LayoutTaggingHelper taggingHelper = GetProperty<LayoutTaggingHelper>(Property.TAGGING_HELPER);
    // iText offers no convenient API for custom structure element properties at the moment, so the
    // actual tag in the logical structure tree is reached directly. The goal is to set the /ID entry
    // in the structure element dictionary that corresponds to the table header cell. The tag for the
    // current element is therefore created right at the start of Draw.
    // If this instance of the header cell is a paging artifact, the layout engine marks it as such and
    // no tag is created (CreateTag returns 'false'). If it is the header that is to be tagged, the tag
    // is created here; the layout engine later picks it up and places it at the correct position in
    // the logical structure tree, so creating it now is safe.
    TagTreePointer tagPointer = new TagTreePointer(pdfDocument);
    bool tagCreated = taggingHelper.CreateTag(this, tagPointer);
    if (tagCreated) {
        // With the tag in place, fetch the low-level structure element and edit its dictionary
        // directly; these changes are reflected in the inner structure of the PDF file.
        PdfDictionary structElemDict = tagContext.GetPointerStructElem(tagPointer).GetPdfObject();
        structElemDict.Put(PdfName.ID, headerId);
        idTree.AddEntry(headerId.GetValue(), structElemDict);
    }
    base.Draw(drawContext);
}
/// <summary>
/// Removes a page tag from its parent and, when the parent is left without kids, removes the parent
/// from its own parent as well and frees its indirect reference.
/// </summary>
/// <remarks>
/// A parent is only pruned when it is not present in <c>connectedStructToModel</c>, has no kids left
/// and is not the root tag. Nothing happens when <paramref name="parent"/> is not a
/// <c>PdfStructElem</c>.
/// </remarks>
/// <param name="pageTag">the kid to remove</param>
/// <param name="parent">the parent the kid is removed from</param>
/// <exception cref="PdfException">if the parent is flushed and <paramref name="pageTag"/> is a <c>PdfMcr</c></exception>
private void RemovePageTagFromParent(IPdfStructElem pageTag, IPdfStructElem parent) {
    if (parent is PdfStructElem) {
        PdfStructElem structParent = (PdfStructElem)parent;
        if (!structParent.IsFlushed()) {
            structParent.RemoveKid(pageTag);
            PdfDictionary parentObject = structParent.GetPdfObject();
            if (!connectedStructToModel.ContainsKey(parentObject) && parent.GetKids().Count == 0
                && parentObject != rootTagElement.GetPdfObject()) {
                // The parent became empty: prune it recursively, bottom-up.
                RemovePageTagFromParent(structParent, parent.GetParent());
                // A structure element dictionary is not guaranteed to be indirect, so only free the
                // reference when there is one instead of failing with a NullReferenceException.
                var indRef = parentObject.GetIndirectReference();
                if (indRef != null) {
                    indRef.SetFree();
                }
            }
        }
        else {
            if (pageTag is PdfMcr) {
                throw new PdfException(PdfException.CannotRemoveTagBecauseItsParentIsFlushed);
            }
        }
    }
}
/// <summary>
/// Removes a page tag from its parent; a parent that ends up without kids is removed from its own
/// parent in turn and its indirect reference is freed.
/// </summary>
/// <remarks>
/// The parent is pruned only if no object is registered for its dictionary in the waiting tags manager,
/// it has no kids left, and its own parent is not the <c>PdfStructTreeRoot</c>. The call is a no-op
/// when <paramref name="parent"/> is not a <c>PdfStructElem</c>.
/// </remarks>
/// <param name="pageTag">the kid to remove</param>
/// <param name="parent">the node the kid is removed from</param>
/// <exception cref="PdfException">if the parent is flushed and <paramref name="pageTag"/> is a <c>PdfMcr</c></exception>
private void RemovePageTagFromParent(IStructureNode pageTag, IStructureNode parent) {
    if (!(parent is PdfStructElem)) {
        return;
    }
    PdfStructElem structParent = (PdfStructElem)parent;
    if (structParent.IsFlushed()) {
        if (pageTag is PdfMcr) {
            throw new PdfException(PdfException.CannotRemoveTagBecauseItsParentIsFlushed);
        }
        return;
    }
    structParent.RemoveKid(pageTag);
    PdfDictionary parentStructDict = structParent.GetPdfObject();
    // Keep the parent if it is a waiting tag, still has kids, or sits directly under the tree root.
    if (waitingTagsManager.GetObjForStructDict(parentStructDict) != null
        || parent.GetKids().Count != 0
        || structParent.GetParent() is PdfStructTreeRoot) {
        return;
    }
    RemovePageTagFromParent(structParent, parent.GetParent());
    PdfIndirectReference indRef = parentStructDict.GetIndirectReference();
    if (indRef != null) {
        // TODO how about possible references to structure element from refs or structure destination for instance?
        indRef.SetFree();
    }
}
/// <summary>Removes the current tag.</summary>
/// <remarks>
/// Removes the current tag. If it has kids, they become kids of the current tag's parent, inserted
/// at the position the removed tag occupied.
/// This method call moves this
/// <c>TagTreePointer</c>
/// to the current tag's parent.
/// <br /><br />
/// You cannot remove the root tag, and you cannot remove a tag whose parent is already flushed;
/// in these two cases an exception is thrown.
/// </remarks>
/// <returns>
/// this
/// <see cref="TagTreePointer"/>
/// instance.
/// </returns>
public virtual iText.Kernel.Pdf.Tagutils.TagTreePointer RemoveTag() {
    PdfStructElem tagToRemove = GetCurrentStructElem();
    IStructureNode parentNode = tagToRemove.GetParent();
    if (parentNode is PdfStructTreeRoot) {
        throw new PdfException(PdfException.CannotRemoveDocumentRootTag);
    }
    // Capture the kids before the tag is detached and its dictionary is cleared.
    IList<IStructureNode> orphanedKids = tagToRemove.GetKids();
    PdfStructElem newParent = (PdfStructElem)parentNode;
    if (newParent.IsFlushed()) {
        throw new PdfException(PdfException.CannotRemoveTagBecauseItsParentIsFlushed);
    }
    // remove waiting tag state if tag is removed
    Object waitingObj = tagStructureContext.GetWaitingTagsManager().GetObjForStructDict(tagToRemove.GetPdfObject());
    tagStructureContext.GetWaitingTagsManager().RemoveWaitingState(waitingObj);
    int insertIndex = newParent.RemoveKid(tagToRemove);
    PdfIndirectReference indRef = tagToRemove.GetPdfObject().GetIndirectReference();
    if (indRef != null) {
        // TODO how about possible references to structure element from refs or structure destination for instance?
        indRef.SetFree();
    }
    // Re-attach the kids to the parent, starting at the slot the removed tag occupied.
    foreach (IStructureNode kid in orphanedKids) {
        if (kid is PdfStructElem) {
            newParent.AddKid(insertIndex, (PdfStructElem)kid);
        }
        else {
            PdfMcr movedMcr = PrepareMcrForMovingToNewParent((PdfMcr)kid, newParent);
            newParent.AddKid(insertIndex, movedMcr);
        }
        insertIndex++;
    }
    tagToRemove.GetPdfObject().Clear();
    SetCurrentStructElem(newParent);
    return this;
}
/// <summary>Combines the given attributes dictionary with the attributes already set on the backing element.</summary>
/// <param name="attributes">attributes dictionary to add</param>
/// <returns>this instance</returns>
public override AccessibilityProperties AddAttributes(PdfDictionary attributes) {
    // 'false' is passed through to GetAttributes as in the original call.
    PdfObject existingAttributes = backingElem.GetAttributes(false);
    PdfObject mergedAttributes = CombineAttributesList(
        existingAttributes,
        JavaCollectionsUtil.SingletonList(attributes),
        backingElem.GetPdfObject().GetAsNumber(PdfName.R));
    backingElem.SetAttributes(mergedAttributes);
    return this;
}
/// <summary>Returns the current structure element, making it indirect first if it has no indirect reference.</summary>
/// <returns>the current structure element</returns>
private PdfStructElem GetCurrentElemEnsureIndirect() {
    PdfStructElem current = GetCurrentStructElem();
    bool isDirect = current.GetPdfObject().GetIndirectReference() == null;
    if (isDirect) {
        current.MakeIndirect(GetDocument());
    }
    return current;
}
/// <summary>
/// Checks that the /Pg entry of a structure element matches the given page, storing the page's
/// indirect reference in the element first when the entry is absent.
/// </summary>
/// <param name="elem">structure element whose /Pg entry is inspected (and possibly set)</param>
/// <param name="kidPage">page dictionary of the kid</param>
/// <returns>result of <c>kidPage.Equals(...)</c> against the element's page, or against <paramref name="kidPage"/> itself when the entry was absent</returns>
private bool EnsureElementPageEqualsKidPage(PdfStructElem elem, PdfDictionary kidPage) {
    PdfObject elemPage = elem.GetPdfObject().Get(PdfName.Pg);
    bool pageMissing = elemPage == null;
    if (pageMissing) {
        // Explicitly using object indirect reference here in order to correctly process released objects.
        elem.Put(PdfName.Pg, kidPage.GetIndirectReference());
        elemPage = kidPage;
    }
    return kidPage.Equals(elemPage);
}
/// <summary>
/// Removes the element's dictionary from <c>waitingTagToAssociatedObj</c> and flushes the element
/// together with its kids when its parent is an already flushed <c>PdfStructElem</c>.
/// </summary>
/// <param name="structElem">structure element to process; <c>null</c> is ignored</param>
private void RemoveWaitingStateAndFlushIfParentFlushed(PdfStructElem structElem) {
    if (structElem == null) {
        return;
    }
    waitingTagToAssociatedObj.JRemove(structElem.GetPdfObject());
    IStructureNode parentNode = structElem.GetParent();
    if (!(parentNode is PdfStructElem)) {
        return;
    }
    if (((PdfStructElem)parentNode).IsFlushed()) {
        FlushStructElementAndItKids(structElem);
    }
}
/// <summary>
/// Drops the waiting-tag bookkeeping for the given structure element and flushes it together with its kids.
/// </summary>
/// <param name="tagStruct">structure element to flush</param>
/// <returns>parent of the flushed tag</returns>
internal virtual IStructureNode FlushTag(PdfStructElem tagStruct) {
    // Remove both directions of the mapping; the reverse entry only exists if the forward one did.
    Object linkedObj = waitingTagToAssociatedObj.JRemove(tagStruct.GetPdfObject());
    if (linkedObj != null) {
        associatedObjToWaitingTag.JRemove(linkedObj);
    }
    // The parent is read before flushing the element.
    IStructureNode parentNode = tagStruct.GetParent();
    FlushStructElementAndItKids(tagStruct);
    return parentNode;
}
/// <summary>
/// Drops the struct-to-model connection of the given structure element and flushes it together with its kids.
/// </summary>
/// <param name="tagStruct">structure element to flush</param>
/// <returns>parent of the flushed tag</returns>
internal virtual IPdfStructElem FlushTag(PdfStructElem tagStruct) {
    // Remove both directions of the connection; the reverse entry only exists if the forward one did.
    IAccessibleElement connectedModel = connectedStructToModel.JRemove(tagStruct.GetPdfObject());
    if (connectedModel != null) {
        connectedModelToStruct.JRemove(connectedModel);
    }
    // The parent is read before flushing the element.
    IPdfStructElem parentElem = tagStruct.GetParent();
    FlushStructElementAndItKids(tagStruct);
    return parentElem;
}
/// <summary>Returns the structure element this pointer currently points at, after validating its state.</summary>
/// <remarks>
/// A structure element whose PDF object has no indirect reference is treated as not removed,
/// instead of failing with a <c>NullReferenceException</c>.
/// </remarks>
/// <returns>the current structure element</returns>
/// <exception cref="PdfException">if the current element is flushed, or if its indirect reference is free (the element was removed)</exception>
internal virtual PdfStructElem GetCurrentStructElem() {
    if (currentStructElem.IsFlushed()) {
        throw new PdfException(PdfException.TagTreePointerIsInInvalidStateItPointsAtFlushedElementUseMoveToRoot);
    }
    // The indirect reference may be absent for a direct structure element; only an existing,
    // freed reference marks a removed element.
    var indRef = currentStructElem.GetPdfObject().GetIndirectReference();
    if (indRef != null && indRef.IsFree()) {
        // is removed
        throw new PdfException(PdfException.TagTreePointerIsInInvalidStateItPointsAtRemovedElementUseMoveToRoot);
    }
    return currentStructElem;
}
/// <summary>
/// Flushes a structure element after recursively flushing its structure element kids.
/// Elements whose dictionary is still a key in <c>waitingTagToAssociatedObj</c> are left untouched.
/// </summary>
/// <param name="elem">structure element to flush</param>
private void FlushStructElementAndItKids(PdfStructElem elem) {
    bool isWaiting = waitingTagToAssociatedObj.ContainsKey(elem.GetPdfObject());
    if (isWaiting) {
        return;
    }
    foreach (IStructureNode child in elem.GetKids()) {
        // Only structure element kids are descended into; other kinds of kids are skipped.
        if (!(child is PdfStructElem)) {
            continue;
        }
        FlushStructElementAndItKids((PdfStructElem)child);
    }
    elem.Flush();
}
/// <summary>
/// Flushes a structure element after recursively flushing its structure element kids.
/// Elements whose dictionary is still a key in <c>connectedStructToModel</c> are left untouched.
/// </summary>
/// <param name="elem">structure element to flush</param>
private void FlushStructElementAndItKids(PdfStructElem elem) {
    bool isConnected = connectedStructToModel.ContainsKey(elem.GetPdfObject());
    if (isConnected) {
        return;
    }
    foreach (IPdfStructElem child in elem.GetKids()) {
        // Only structure element kids are descended into; other kinds of kids are skipped.
        if (!(child is PdfStructElem)) {
            continue;
        }
        FlushStructElementAndItKids((PdfStructElem)child);
    }
    elem.Flush();
}
/// <summary>
/// Removes the connection between a structure element and its model element, copying the model's
/// role and accessibility properties onto the structure element first.
/// </summary>
/// <remarks>
/// If no model element is connected to the structure element, nothing is copied (previously this
/// case failed with a <c>NullReferenceException</c>). In both cases the element and its kids are
/// flushed when <c>structElem.GetParent()</c> returns <c>null</c>.
/// </remarks>
/// <param name="structElem">structure element to disconnect; <c>null</c> is ignored</param>
private void RemoveStructToModelConnection(PdfStructElem structElem) {
    if (structElem == null) {
        return;
    }
    IAccessibleElement element = connectedStructToModel.JRemove(structElem.GetPdfObject());
    // JRemove yields null when there was no connection for this dictionary.
    if (element != null) {
        structElem.SetRole(element.GetRole());
        if (element.GetAccessibilityProperties() != null) {
            element.GetAccessibilityProperties().SetToStructElem(structElem);
        }
    }
    if (structElem.GetParent() == null) {
        // is flushed
        FlushStructElementAndItKids(structElem);
    }
}
// it is StructTreeRoot
// should never happen as we always should have only one root tag and we don't remove it

/// <summary>
/// Flushes a parent structure element when all of its kids belong to the given page (or to already
/// flushed pages), then repeats the check for the parent's own parent.
/// </summary>
/// <remarks>
/// The parent is left alone when it is already flushed, is a key in <c>connectedStructToModel</c>,
/// is the root tag, or still has a non-flushed structure element kid.
/// </remarks>
/// <param name="parent">structure element that is a candidate for flushing</param>
/// <param name="currentPage">page whose content is being flushed</param>
private void FlushParentIfBelongsToPage(PdfStructElem parent, PdfPage currentPage) {
    if (parent.IsFlushed()) {
        return;
    }
    if (connectedStructToModel.ContainsKey(parent.GetPdfObject())) {
        return;
    }
    if (parent.GetPdfObject() == rootTagElement.GetPdfObject()) {
        return;
    }
    IList<IPdfStructElem> parentKids = parent.GetKids();
    foreach (IPdfStructElem child in parentKids) {
        if (child is PdfMcr) {
            PdfDictionary childPage = ((PdfMcr)child).GetPageObject();
            // Marked content on another, not yet flushed page keeps the parent alive.
            if (!childPage.IsFlushed() && !childPage.Equals(currentPage.GetPdfObject())) {
                return;
            }
        }
        else {
            if (child is PdfStructElem) {
                // If kid is structElem and was already flushed then in kids list there will be null for it instead of
                // PdfStructElem. And therefore if we get into this if clause it means that some StructElem wasn't flushed.
                return;
            }
        }
    }
    // Every kid belongs to the page: flush the parent and continue upwards.
    IPdfStructElem grandParent = parent.GetParent();
    parent.Flush();
    if (grandParent is PdfStructElem) {
        FlushParentIfBelongsToPage((PdfStructElem)grandParent, currentPage);
    }
}
/// <summary>
/// Flushes a parent structure element when none of its kids prevents it, then repeats the check
/// for the parent's own parent.
/// </summary>
/// <remarks>
/// The parent is left alone when it is already flushed, has an object registered in the waiting tags
/// manager, sits directly under the <c>PdfStructTreeRoot</c>, still has a non-flushed structure element
/// kid, or has a marked content kid on a non-flushed page that is not <paramref name="currentPage"/>.
/// When <paramref name="currentPage"/> is <c>null</c>, any marked content kid on a non-flushed page
/// prevents flushing.
/// </remarks>
/// <param name="parent">structure element that is a candidate for flushing</param>
/// <param name="currentPage">page whose content is being flushed; may be <c>null</c></param>
internal virtual void FlushParentIfBelongsToPage(PdfStructElem parent, PdfPage currentPage) {
    if (parent.IsFlushed()) {
        return;
    }
    if (waitingTagsManager.GetObjForStructDict(parent.GetPdfObject()) != null) {
        return;
    }
    if (parent.GetParent() is PdfStructTreeRoot) {
        return;
    }
    IList<IStructureNode> parentKids = parent.GetKids();
    foreach (IStructureNode child in parentKids) {
        if (child is PdfMcr) {
            PdfDictionary childPage = ((PdfMcr)child).GetPageObject();
            // Marked content on a not yet flushed page other than the current one keeps the parent alive.
            if (!childPage.IsFlushed() && (currentPage == null || !childPage.Equals(currentPage.GetPdfObject()))) {
                return;
            }
        }
        else {
            if (child is PdfStructElem) {
                // If kid is structElem and was already flushed then in kids list there will be null for it instead of
                // PdfStructElement. And therefore if we get into this if-clause it means that some StructElem wasn't flushed.
                return;
            }
        }
    }
    // Nothing prevents flushing: flush the parent and continue upwards.
    IStructureNode grandParent = parent.GetParent();
    parent.Flush();
    if (grandParent is PdfStructElem) {
        FlushParentIfBelongsToPage((PdfStructElem)grandParent, currentPage);
    }
}
/// <summary>
/// Appends a textual dump of the page contents referenced by a structure element's /Pg entry.
/// </summary>
/// <remarks>
/// Nothing is appended when the element has no /Pg dictionary. When /Contents is a stream, its bytes
/// are decoded as UTF-8; otherwise /Contents is read as an array and its string form is appended.
/// </remarks>
/// <param name="elem">structure element whose page is inspected</param>
/// <param name="builder">builder receiving the output</param>
private static void ProcessStructElem(PdfStructElem elem, StringBuilder builder) {
    PdfDictionary pageDict = elem.GetPdfObject().GetAsDictionary(PdfName.Pg);
    if (pageDict == null) {
        return;
    }
    PdfStream contentStream = pageDict.GetAsStream(PdfName.Contents);
    if (contentStream == null) {
        PdfArray contentArray = pageDict.GetAsArray(PdfName.Contents);
        string arrayDescription = "Contents array: " + contentArray + "\n";
        builder.Append(arrayDescription);
        return;
    }
    // Build the full text first so the builder is only touched once decoding succeeded.
    string streamDescription = "Content: \n" + Encoding.UTF8.GetString(contentStream.GetBytes()) + "\n";
    builder.Append(streamDescription);
}
/// <summary>
/// Moves the former kids of the structure tree root under the root tag, preserving their order
/// relative to the root tag's own position in the list.
/// </summary>
/// <remarks>
/// Kids listed before the root tag are inserted at the front of the root tag's kids; kids listed after
/// it are appended. Kids with the Document role are additionally passed to <c>RemoveOldRoot</c>.
/// </remarks>
/// <param name="rootKids">kids of the structure tree root; each one is cast to <c>PdfStructElem</c></param>
private void AddStructTreeRootKidsToTheRootTag(IList<IStructureNode> rootKids) {
    int insertIndex = 0;
    bool originalRootSeen = false;
    foreach (IStructureNode node in rootKids) {
        // StructTreeRoot kids are always PdfStructElement, so we are safe here to cast it
        PdfStructElem kid = (PdfStructElem)node;
        if (kid.GetPdfObject() == rootTagElement.GetPdfObject()) {
            originalRootSeen = true;
            continue;
        }
        // This boolean is used to "flatten" possible deep "stacking" of the tag structure in case of the multiple pages copying operations.
        // This could happen due to the wrapping of all the kids in the createNewRootTag or ensureExistingRootTagIsDocument methods.
        // And therefore, we don't need here to resolve mappings, because we exactly know which role we set.
        bool flattenKid = PdfName.Document.Equals(kid.GetRole());
        if (flattenKid && kid.GetNamespace() != null && context.TargetTagStructureVersionIs2()) {
            // we flatten only tags of document role in standard structure namespace
            String kidNamespaceName = kid.GetNamespace().GetNamespaceName();
            flattenKid = StandardNamespaces.PDF_1_7.Equals(kidNamespaceName)
                || StandardNamespaces.PDF_2_0.Equals(kidNamespaceName);
        }
        if (originalRootSeen) {
            rootTagElement.AddKid(kid);
        }
        else {
            rootTagElement.AddKid(insertIndex, kid);
            // A kid that is flattened advances the insertion point by the number of its own kids.
            insertIndex += flattenKid ? kid.GetKids().Count : 1;
        }
        if (flattenKid) {
            RemoveOldRoot(kid);
        }
    }
}
/// <summary>Records the association between an object and a waiting tag in both lookup maps.</summary>
/// <param name="associatedObj">object to associate with the structure element</param>
/// <param name="structElem">structure element that is the waiting tag</param>
/// <returns>
/// the value returned by <c>waitingTagToAssociatedObj.Put</c>
/// (presumably the object previously associated with this tag, if any - confirm against the map type)
/// </returns>
internal virtual Object SaveAssociatedObjectForWaitingTag(Object associatedObj, PdfStructElem structElem) {
    // object -> tag
    associatedObjToWaitingTag.Put(associatedObj, structElem);
    // tag dictionary -> object
    Object putResult = waitingTagToAssociatedObj.Put(structElem.GetPdfObject(), associatedObj);
    return putResult;
}
/// <summary>Looks up the model element connected to the given structure element.</summary>
/// <param name="struct">structure element whose PDF object is used as the lookup key</param>
/// <returns>the value stored in <c>connectedStructToModel</c> for the element's PDF object</returns>
internal virtual IAccessibleElement GetModelConnectedToStruct(PdfStructElem @struct) {
    IAccessibleElement connectedModel = connectedStructToModel.Get(@struct.GetPdfObject());
    return connectedModel;
}
/// <summary>Copies the accessibility properties held by this instance onto a structure element.</summary>
/// <remarks>
/// Actual text, alternate description, expansion and language are set only when they are not
/// <c>null</c>. Attributes are combined with the element's existing attributes when the attributes
/// list is not empty.
/// </remarks>
/// <param name="elem">structure element that receives the properties</param>
internal virtual void SetToStructElem(PdfStructElem elem) {
    String actualText = GetActualText();
    if (actualText != null) {
        elem.SetActualText(new PdfString(actualText));
    }
    String alternateDescription = GetAlternateDescription();
    if (alternateDescription != null) {
        elem.SetAlt(new PdfString(alternateDescription));
    }
    String expansion = GetExpansion();
    if (expansion != null) {
        elem.SetE(new PdfString(expansion));
    }
    String language = GetLanguage();
    if (language != null) {
        elem.SetLang(new PdfString(language));
    }
    IList<PdfDictionary> attributesToAdd = GetAttributesList();
    if (attributesToAdd.Count == 0) {
        return;
    }
    PdfObject existingAttributes = elem.GetAttributes(false);
    PdfObject mergedAttributes = CombineAttributesList(
        existingAttributes,
        attributesToAdd,
        elem.GetPdfObject().GetAsNumber(PdfName.R));
    elem.SetAttributes(mergedAttributes);
}
/// <summary>Copies accessibility properties onto a structure element.</summary>
/// <remarks>
/// Each property is applied only when it is set: text properties are written as
/// <c>PdfString</c> values using <c>PdfEncodings.UNICODE_BIG</c>, the phonetic alphabet as a
/// <c>PdfName</c>, attributes are combined with the element's existing ones when the list is not
/// empty, and every pointer in the refs list contributes its current structure element as a ref.
/// </remarks>
/// <param name="properties">source of the accessibility properties</param>
/// <param name="elem">structure element that receives the properties</param>
internal static void Apply(AccessibilityProperties properties, PdfStructElem elem) {
    String actualText = properties.GetActualText();
    if (actualText != null) {
        elem.SetActualText(new PdfString(actualText, PdfEncodings.UNICODE_BIG));
    }
    String alternateDescription = properties.GetAlternateDescription();
    if (alternateDescription != null) {
        elem.SetAlt(new PdfString(alternateDescription, PdfEncodings.UNICODE_BIG));
    }
    String expansion = properties.GetExpansion();
    if (expansion != null) {
        elem.SetE(new PdfString(expansion, PdfEncodings.UNICODE_BIG));
    }
    String language = properties.GetLanguage();
    if (language != null) {
        elem.SetLang(new PdfString(language, PdfEncodings.UNICODE_BIG));
    }
    IList<PdfStructureAttributes> attributesToAdd = properties.GetAttributesList();
    if (attributesToAdd.Count > 0) {
        PdfObject existingAttributes = elem.GetAttributes(false);
        PdfObject mergedAttributes = CombineAttributesList(
            existingAttributes,
            -1,
            attributesToAdd,
            elem.GetPdfObject().GetAsNumber(PdfName.R));
        elem.SetAttributes(mergedAttributes);
    }
    String phoneme = properties.GetPhoneme();
    if (phoneme != null) {
        elem.SetPhoneme(new PdfString(phoneme, PdfEncodings.UNICODE_BIG));
    }
    String phoneticAlphabet = properties.GetPhoneticAlphabet();
    if (phoneticAlphabet != null) {
        elem.SetPhoneticAlphabet(new PdfName(phoneticAlphabet));
    }
    if (properties.GetNamespace() != null) {
        elem.SetNamespace(properties.GetNamespace());
    }
    foreach (TagTreePointer refPointer in properties.GetRefsList()) {
        elem.AddRef(refPointer.GetCurrentStructElem());
    }
}
/// <summary>Records the connection between a model element and a structure element in both lookup maps.</summary>
/// <param name="element">model element to connect</param>
/// <param name="structElem">structure element to connect; its PDF object is the key of the reverse map</param>
internal virtual void SaveConnectionBetweenStructAndModel(IAccessibleElement element, PdfStructElem structElem) {
    // model -> structure element
    connectedModelToStruct[element] = structElem;
    // structure element dictionary -> model
    connectedStructToModel[structElem.GetPdfObject()] = element;
}