/**
 * Processes PDF syntax.
 * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
 * <p>
 * The supplied resources are pushed for the duration of the call and are popped
 * again on the way out, including when the parser or an operator handler throws,
 * so a failed call does not leave an extra entry behind.
 * @param contentBytes  the bytes of a content stream
 * @param resources     the resources that come with the content stream (may be null)
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources){
    this.resources.Push(resources);
    try {
        PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes)));
        PdfContentParser ps = new PdfContentParser(tokeniser);
        List<PdfObject> operands = new List<PdfObject>();
        while (ps.Parse(operands).Count > 0){
            // the operator is always the last token of the parsed operand list
            PdfLiteral oper = (PdfLiteral)operands[operands.Count-1];
            if ("BI".Equals(oper.ToString())){
                // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
                PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
                HandleInlineImage(InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
            } else {
                InvokeOperator(oper, operands);
            }
        }
    } finally {
        // always balance the Push above
        this.resources.Pop();
    }
}
/// <summary>
/// Strips the given OCG layers from the Properties dictionary of a page's resources.
/// Entries for which <c>IsToBeRemoved</c> answers true are deleted; every other entry
/// is handed to <c>RemoveOCGsFromArray</c> for its OCGS key.
/// </summary>
/// <param name="page"> a page dictionary </param>
/// <param name="ocgs"> a set of names of OCG layers </param>
private void RemoveProperties(PdfDictionary page, ICollection<string> ocgs) {
    PdfDictionary pageResources = page.GetAsDict(PdfName.RESOURCES);
    if (pageResources == null) {
        return;
    }
    PdfDictionary propertyDict = pageResources.GetAsDict(PdfName.PROPERTIES);
    if (propertyDict == null) {
        return;
    }

    // Deletions are deferred so the key collection is not modified while it is enumerated.
    IList<PdfName> doomed = new List<PdfName>();
    foreach (PdfName key in propertyDict.Keys) {
        PdfDictionary entry = propertyDict.GetAsDict(key);
        if (!IsToBeRemoved(entry, ocgs)) {
            RemoveOCGsFromArray(entry, PdfName.OCGS, ocgs);
            continue;
        }
        doomed.Add(key);
    }

    for (int idx = 0; idx < doomed.Count; idx++) {
        propertyDict.Remove(doomed[idx]);
    }
}
/// <summary>
/// Runs the OCGParser over a page: the page's CONTENTS entry (as a stream) and its
/// RESOURCES dictionary are looked up and passed to the parser.
/// </summary>
/// <param name="parser"> the OCGParser </param>
/// <param name="page"> the page dictionary of the page that needs to be parsed. </param>
/// <exception cref="IOException"> </exception>
private void Parse(OCGParser parser, PdfDictionary page) {
    parser.Parse(
        (PRStream) page.GetAsStream(PdfName.CONTENTS),
        page.GetAsDict(PdfName.RESOURCES));
}
/**
 * If the child of a structured element is a dictionary, we inspect the
 * child; we may also draw a tag.
 * <p>
 * When the dictionary has an S name, an XML element named after it is written
 * around the output produced for its K entry. Without an S name only the K entry
 * is inspected.
 *
 * @param k
 *            the child dictionary to inspect; nothing happens when it is null
 * @param inspectAttributes
 *            when true, the entries of the dictionary stored under PdfName.A are
 *            written as XML attributes of the element
 */
virtual public void InspectChildDictionary(PdfDictionary k, bool inspectAttributes) {
    if (k == null)
        return;
    PdfName s = k.GetAsName(PdfName.S);
    if (s == null) {
        InspectChild(k.GetDirectObject(PdfName.K));
        return;
    }

    String tagN = PdfName.DecodeName(s.ToString());
    String tag = FixTagName(tagN);
    outp.Write("<");
    outp.Write(tag);
    if (inspectAttributes) {
        PdfDictionary a = k.GetAsDict(PdfName.A);
        if (a != null) {
            foreach (PdfName key in a.Keys) {
                outp.Write(' ');
                PdfObject value = PdfReader.GetPdfObject(a.Get(key));
                outp.Write(XmlName(key));
                outp.Write("=\"");
                // Escape the value the same way ParseTag escapes extracted text, so
                // quotes or markup characters cannot break the attribute. An
                // unresolved value is written as an empty attribute.
                outp.Write(XMLUtil.EscapeXML(value == null ? "" : value.ToString(), true));
                outp.Write("\"");
            }
        }
    }
    outp.Write(">");
    // the marked content is only extracted when the element names its page
    PdfDictionary dict = k.GetAsDict(PdfName.PG);
    if (dict != null)
        ParseTag(tagN, k.GetDirectObject(PdfName.K), dict);
    InspectChild(k.GetDirectObject(PdfName.K));
    outp.Write("</");
    outp.Write(tag);
    outp.WriteLine(">");
}
/**
 * Searches for a tag in a page.
 * <p>
 * A number is treated as an MCID and the matching text is extracted from the
 * page content and written (XML-escaped) to the output. An array is processed
 * element by element, with a line break between elements. A dictionary is
 * treated as a marked-content reference: its MCID entry is looked up on the
 * page named by its PG entry, or on the current page when it has none.
 * Any other kind of object (including null) is ignored.
 *
 * @param tag
 *            the name of the tag
 * @param obj
 *            an identifier to find the marked content
 * @param page
 *            a page dictionary
 * @throws IOException
 */
public virtual void ParseTag(String tag, PdfObject obj, PdfDictionary page) {
    // if the identifier is a number, we can extract the content right away
    if (obj is PdfNumber) {
        PdfNumber mcid = (PdfNumber) obj;
        RenderFilter filter = new MarkedContentRenderFilter(mcid.IntValue);
        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, new RenderFilter[]{filter});
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
        processor.ProcessContent(PdfReader.GetPageContent(page), page.GetAsDict(PdfName.RESOURCES));
        outp.Write(XMLUtil.EscapeXML(listener.GetResultantText(), true));
    }
    // if the identifier is an array, we call the parseTag method
    // recursively
    else if (obj is PdfArray) {
        PdfArray arr = (PdfArray) obj;
        int n = arr.Size;
        for (int i = 0; i < n; i++) {
            ParseTag(tag, arr[i], page);
            if (i < n - 1)
                outp.WriteLine();
        }
    }
    // if the identifier is a dictionary, we get the page from the
    // dictionary
    else if (obj is PdfDictionary) {
        PdfDictionary mcr = (PdfDictionary) obj;
        // The PG entry of a marked-content reference is optional; without it the
        // content lives on the page of the enclosing element, i.e. the page we
        // were given. Passing null on would crash in the number branch above.
        PdfDictionary mcrPage = mcr.GetAsDict(PdfName.PG);
        ParseTag(tag, mcr.GetDirectObject(PdfName.MCID), mcrPage ?? page);
    }
}
/**
 * Shows the detail of a dictionary.
 * <p>
 * The result starts with "(key=value, key=value)" for the dictionary itself
 * ("()" when it has no entries), followed by one indented "Subdictionary" line
 * per entry whose value is a dictionary, rendered recursively.
 * @param dic   the dictionary of which you want the detail
 * @param depth the depth of the current dictionary (for nested dictionaries)
 * @return a String representation of the dictionary
 */
public static String GetDictionaryDetail(PdfDictionary dic, int depth){
    StringBuilder builder = new StringBuilder();
    builder.Append('(');
    IList<PdfName> subDictionaries = new List<PdfName>();
    bool firstEntry = true;
    foreach (PdfName key in dic.Keys) {
        PdfObject val = dic.GetDirectObject(key);
        // a value that cannot be resolved is printed as empty instead of crashing
        if (val != null && val.IsDictionary())
            subDictionaries.Add(key);
        // Write the separator before every entry but the first. Trimming a
        // trailing ", " afterwards would set a negative length on an empty dictionary.
        if (!firstEntry)
            builder.Append(", ");
        firstEntry = false;
        builder.Append(key);
        builder.Append('=');
        builder.Append(val);
    }
    builder.Append(')');
    foreach (PdfName pdfSubDictionaryName in subDictionaries) {
        builder.Append('\n');
        // indent nested dictionaries one tab deeper than their parent
        for (int i = 0; i < depth+1; i++){
            builder.Append('\t');
        }
        builder.Append("Subdictionary ");
        builder.Append(pdfSubDictionaryName);
        builder.Append(" = ");
        builder.Append(GetDictionaryDetail(dic.GetAsDict(pdfSubDictionaryName), depth+1));
    }
    return builder.ToString();
}
/**
 * Parses the content of a page, replacing appearances of annotations
 * with Form XObjects.
 * <p>
 * The page's content stream is tokenised and every operator is passed to
 * ProcessOperator; the bytes collected in the output buffer then replace the
 * data of the content stream.
 * @param page    a page dictionary
 * @param pageref the indirect reference of that page
 * @throws IOException
 * @throws DocumentException when the page has no STRUCTPARENTS number, or when
 *         an annotation left on the page has no STRUCTPARENT number
 */
virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref) {
    LOGGER.Info("Parsing page with reference " + pageref);
    // initializing member variables
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;
    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if(structParents == null)
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    annots = page.GetAsArray(PdfName.ANNOTS);
    if(annots == null)
        annots = new PdfArray();
    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null) {
        // make sure there is an XObject dictionary to register new Form XObjects in
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }
    // parsing the content stream of the page
    PRStream stream = (PRStream) page.GetAsStream(PdfName.CONTENTS);
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0) {
        // the operator is the last token of the parsed operand list
        PdfLiteral opr = (PdfLiteral) operands[operands.Count - 1];
        ProcessOperator(opr, operands);
    }
    // dealing with orphans
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number) {
        StructureObject orphan = items[0] as StructureObject;
        if (orphan == null) {
            // Only StructureObject heads are consumed here. Any other head item
            // stays in place, so looping again would never terminate.
            break;
        }
        ConvertToXObject(orphan);
        items.RemoveAt(0);
    }
    // Size is the element count, as used by the loop below.
    if(annots.Size == 0) {
        page.Remove(PdfName.ANNOTS);
    }
    else {
        PdfDictionary annot;
        for(int i = 0; i < annots.Size; i++) {
            annot = annots.GetAsDict(i);
            if(annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
        }
    }
    // replacing the content stream
    baos.Flush();
    baos.Close();
    stream.SetData(baos.ToArray());
    // showing how many items are left
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}
/**
 * Displays a summary of the entries in the XObject dictionary for the stream
 * <p>
 * Each stream entry gets a header line with its subtype and LENGTH. For every
 * entry whose subtype is not IMAGE the content bytes are appended as characters,
 * followed by an end marker. Entries that are not streams are skipped.
 * @param resourceDic the resource dictionary for the stream
 * @return a string with the summary of the entries, or "No XObjects" when the
 *         resource dictionary has no XOBJECT dictionary
 * @throws IOException
 * @since 5.0.2
 */
public static String GetXObjectDetail(PdfDictionary resourceDic) {
    StringBuilder sb = new StringBuilder();
    PdfDictionary xobjects = resourceDic.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null)
        return "No XObjects";
    foreach (PdfName entryName in xobjects.Keys) {
        PdfStream xobjectStream = xobjects.GetAsStream(entryName);
        if (xobjectStream == null) {
            // not a stream, so there is nothing to summarise
            continue;
        }
        PdfObject subtype = xobjectStream.Get(PdfName.SUBTYPE);
        sb.Append("------ " + entryName + " - subtype = " + subtype + " = " + xobjectStream.GetAsNumber(PdfName.LENGTH) + " bytes ------\n");
        // compare from the constant's side so a missing subtype does not throw
        if (!PdfName.IMAGE.Equals(subtype)){
            byte[] contentBytes = ContentByteUtils.GetContentBytesFromContentObject(xobjectStream);
            foreach (byte b in contentBytes) {
                sb.Append((char)b);
            }
            sb.Append("------ " + entryName + " - subtype = " + subtype + "End of Content" + "------\n");
        }
    }
    return sb.ToString();
}
/**
 * Parses the content of a page, replacing appearances of annotations
 * with Form XObjects.
 * <p>
 * The page's content stream is tokenised and every operator is passed to
 * ProcessOperator; the bytes collected in the output buffer then replace the
 * data of the content stream.
 * @param page    a page dictionary
 * @param pageref the indirect reference of that page
 * @throws IOException
 * @throws DocumentException when the page has no STRUCTPARENTS number, or when
 *         an annotation left on the page has no STRUCTPARENT number
 */
virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref)
{
    LOGGER.Info("Parsing page with reference " + pageref);

    // initializing member variables
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;
    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if (structParents == null)
    {
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    }
    annots = page.GetAsArray(PdfName.ANNOTS);
    if (annots == null)
    {
        annots = new PdfArray();
    }
    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null)
    {
        // make sure there is an XObject dictionary to register new Form XObjects in
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }

    // parsing the content stream of the page
    PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        // the operator is the last token of the parsed operand list
        PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(opr, operands);
    }

    // dealing with orphans
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
    {
        StructureObject orphan = items[0] as StructureObject;
        if (orphan == null)
        {
            // Only StructureObject heads are consumed here. Any other head item
            // stays in place, so looping again would never terminate.
            break;
        }
        ConvertToXObject(orphan);
        items.RemoveAt(0);
    }

    // Size is the element count, as used by the loop below.
    if (annots.Size == 0)
    {
        page.Remove(PdfName.ANNOTS);
    }
    else
    {
        for (int i = 0; i < annots.Size; i++)
        {
            PdfDictionary annot = annots.GetAsDict(i);
            if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
            }
        }
    }

    // replacing the content stream
    baos.Flush();
    baos.Close();
    stream.SetData(baos.ToArray());

    // showing how many items are left
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}
/**
 * Converts an annotation structure item to a Form XObject annotation.
 * <p>
 * Nothing is converted when the item has no structure element, is not an
 * annotation dictionary, or lacks an AP dictionary, a STRUCTPARENT number, a
 * normal (N) appearance stream or an indirect reference to that stream.
 * Otherwise the annotation is removed from the page's annotation array, the
 * structure element is rewritten with a PrintField attribute and a new MCID,
 * the appearance stream is registered as an XObject of the page, and a
 * marked-content sequence invoking it is appended to the output buffer.
 * @param item the structure item
 * @throws IOException
 */
virtual protected void ConvertToXObject(StructureObject item) {
    PdfDictionary structElem = item.GetStructElem();
    if (structElem == null) {
        return;
    }
    PdfDictionary dict = item.GetObjAsDict();
    if (dict == null || !dict.CheckType(PdfName.ANNOT)) {
        return;
    }
    PdfDictionary ap = dict.GetAsDict(PdfName.AP);
    if (ap == null) {
        return;
    }
    PdfNumber structParent = dict.GetAsNumber(PdfName.STRUCTPARENT);
    if (structParent == null) {
        return;
    }
    PdfStream stream = ap.GetAsStream(PdfName.N);
    if (stream == null) {
        return;
    }
    stream.Put(PdfName.STRUCTPARENT, structParent);
    PdfIndirectReference xobjr = ap.GetAsIndirectObject(PdfName.N);
    if (xobjr == null) {
        return;
    }

    // remove the annotation from the page
    // (Size is the element count; entries that are not indirect references are skipped)
    for (int i = 0; i < annots.Size; i++) {
        PdfIndirectReference annotref = annots.GetAsIndirectObject(i);
        if (annotref != null && item.GetObjRef().Number == annotref.Number) {
            annots.Remove(i);
            break;
        }
    }

    // replace the existing attributes by a PrintField attribute
    PdfDictionary attribute = new PdfDictionary();
    attribute.Put(PdfName.O, PdfName.PRINTFIELD);
    PdfString description = dict.GetAsString(PdfName.TU);
    if (description == null) {
        description = dict.GetAsString(PdfName.T);
    }
    if (PdfName.BTN.Equals(dict.Get(PdfName.FT))) {
        PdfNumber fflags = dict.GetAsNumber(PdfName.FF);
        if (fflags != null) {
            int ff = fflags.IntValue;
            // The three roles are mutually exclusive. Testing them as one chain keeps
            // the pushbutton role from being overwritten by the radio/checkbox choice.
            if ((ff & PdfFormField.FF_PUSHBUTTON) != 0) {
                attribute.Put(PdfName.ROLE, PdfName.PB);
            }
            else if ((ff & PdfFormField.FF_RADIO) != 0) {
                attribute.Put(PdfName.ROLE, PdfName.rb);
            }
            else {
                attribute.Put(PdfName.ROLE, PdfName.CB);
            }
        }
    }
    else {
        attribute.Put(PdfName.ROLE, PdfName.TV);
    }
    attribute.Put(PdfName.DESC, description);

    // Updating the values of the StructElem dictionary
    PdfString t = structElem.GetAsString(PdfName.T);
    if (t == null || t.ToString().Trim().Length == 0) {
        structElem.Put(PdfName.T, dict.GetAsString(PdfName.T));
    }
    structElem.Put(PdfName.A, attribute);
    structElem.Put(PdfName.S, PdfName.P);
    structElem.Put(PdfName.PG, pageref);

    // Defining a new MCID
    int mcid = items.ProcessMCID(structParents, item.GetRef());
    LOGGER.Info("Using MCID " + mcid);
    structElem.Put(PdfName.K, new PdfNumber(mcid));

    // removing the annotation from the parent tree
    items.RemoveFromParentTree(structParent);

    // Adding the XObject to the page
    PdfName xobj = new PdfName("XObj" + structParent.IntValue);
    LOGGER.Info("Creating XObject with name " + xobj);
    xobjects.Put(xobj, xobjr);

    // Getting the position of the annotation
    PdfArray array = dict.GetAsArray(PdfName.RECT);
    Rectangle rect = new Rectangle(
        array.GetAsNumber(0).FloatValue, array.GetAsNumber(1).FloatValue,
        array.GetAsNumber(2).FloatValue, array.GetAsNumber(3).FloatValue);
    rect.Normalize();

    // A Do operator is forbidden inside a text block
    if (inText && !btWrite) {
        LOGGER.Debug("Introducing extra ET");
        byte[] bytes = Encoding.ASCII.GetBytes("ET\n");
        baos.Write(bytes, 0, bytes.Length);
        etExtra = true;
    }

    // Writing the marked-content sequence with the Do operator
    // Note that the position assumes that the CTM wasn't changed in the graphics state
    // TODO: do the math if the CTM did change!
    ByteBuffer buf = new ByteBuffer();
    buf.Append("/P <</MCID ");
    buf.Append(mcid);
    buf.Append(">> BDC\n");
    buf.Append("q 1 0 0 1 ");
    buf.Append(rect.Left.ToString(CultureInfo.InvariantCulture));
    buf.Append(" ");
    buf.Append(rect.Bottom.ToString(CultureInfo.InvariantCulture));
    buf.Append(" cm ");
    buf.Append(xobj.GetBytes());
    buf.Append(" Do Q\n");
    buf.Append("EMC\n");
    buf.Flush();
    buf.WriteTo(baos);

    // if we were inside a text block, we've introduced an ET, so we'll need to write a BT
    if (inText) {
        btWrite = true;
    }
}
/// <summary>
/// Parses a stream object and removes OCGs.
/// XObjects whose OC dictionary carries a NAME contained in <c>ocgs</c> are removed
/// from the resources first. The content stream is then tokenised, every operator is
/// passed to <c>ProcessOperator</c>, and the bytes collected in the output buffer
/// replace the data of the stream.
/// </summary>
/// <param name="stream"> a stream object </param>
/// <param name="resources"> the resources dictionary of that object (containing info about the OCGs) </param>
public virtual void Parse(PRStream stream, PdfDictionary resources) {
    baos = new MemoryStream();
    properties = resources.GetAsDict(PdfName.PROPERTIES);
    xobj = new HashSet2<PdfName>();
    PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects != null) {
        // remove XObject (form or image) that belong to an OCG that needs to be removed
        foreach (PdfName name in xobjects.Keys) {
            PRStream xobject = (PRStream)xobjects.GetAsStream(name);
            PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
            if (oc != null) {
                PdfString ocname = oc.GetAsString(PdfName.NAME);
                if (ocname != null && ocgs.Contains(ocname.ToString())) {
                    xobj.Add(name);
                }
            }
        }
        // removal is deferred so the key collection is not modified while enumerated
        foreach (PdfName name in xobj) {
            xobjects.Remove(name);
        }
    }

    // parse the content stream
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0) {
        PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(this, @operator, operands);
        if ("BI".Equals(@operator.ToString())) {
            // Inline image: copy the raw bytes through to the output until the
            // sequence <whitespace> 'E' 'I' <whitespace> has been seen.
            // 'found' counts how many characters of that sequence have matched so far.
            int found = 0;
            int ch;
            bool immediateAfterBI = true;
            while ((ch = tokeniser.Read()) != -1) {
                // a whitespace character directly after BI is not copied
                if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch)) {
                    baos.WriteByte((byte)ch);
                }
                immediateAfterBI = false;
                if (found == 0 && PRTokeniser.IsWhitespace(ch)) {
                    found++;
                }
                else if (found == 1 && ch == 'E') {
                    found++;
                }
                else if (found == 1 && PRTokeniser.IsWhitespace(ch)) {
                    // this clause is needed if we have a white space character that is part of the image data
                    // followed by a whitespace character that precedes the EI operator.  In this case, we need
                    // to flush the first whitespace, then treat the current whitespace as the first potential
                    // character for the end of stream check. Note that we don't increment 'found' here.
                }
                else if (found == 2 && ch == 'I') {
                    found++;
                }
                else if (found == 3 && PRTokeniser.IsWhitespace(ch)) {
                    break;
                }
                else {
                    found = 0;
                }
            }
        }
    }
    baos.Flush();
    baos.Close();
    // ToArray copies only the bytes actually written. GetBuffer would hand over the
    // whole backing array, including unused capacity, and append padding to the stream.
    stream.SetData(baos.ToArray());
}
/// <summary>
/// Collects the embedded files listed in the NAMES array of the catalog's
/// NAMES / EMBEDDEDFILES dictionary.
/// </summary>
/// <param name="stream">The PDF document to read.</param>
/// <param name="password">
/// Optional password; when it is not null, empty or whitespace its ASCII bytes are passed to the reader.
/// </param>
/// <returns>
/// One <c>Attachment</c> per entry of each file specification's EF dictionary. The list is
/// empty when the document has no such structure. Entries without a resolvable stream or
/// without a name string under the same key are skipped.
/// </returns>
public static List<Attachment> GetAttachments(Stream stream, string password = null)
{
    PdfReader reader = string.IsNullOrWhiteSpace(password)
        ? new PdfReader(stream)
        : new PdfReader(stream, Encoding.ASCII.GetBytes(password));
    try
    {
        List<Attachment> attachments = new List<Attachment>();

        PdfDictionary catalog = reader.Catalog;
        PdfDictionary documentNames = (PdfDictionary)PdfReader.GetPdfObject(catalog.Get(PdfName.NAMES));
        if (documentNames == null)
        {
            return attachments;
        }

        PdfDictionary embeddedFiles = (PdfDictionary)PdfReader.GetPdfObject(documentNames.Get(PdfName.EMBEDDEDFILES));
        if (embeddedFiles == null)
        {
            return attachments;
        }

        // NOTE(review): only a NAMES array directly on this dictionary is read;
        // if the entries are organised differently nothing is returned.
        PdfArray filespecs = embeddedFiles.GetAsArray(PdfName.NAMES);
        if (filespecs == null)
        {
            return attachments;
        }

        // The array alternates name, file specification. Visit the odd slots only and
        // never step past the end when the array has an odd number of elements.
        for (int i = 1; i < filespecs.Size; i += 2)
        {
            PdfDictionary fileSpec = filespecs.GetAsDict(i);
            if (fileSpec == null)
            {
                continue;
            }
            PdfDictionary file = fileSpec.GetAsDict(PdfName.EF);
            if (file == null)
            {
                continue;
            }

            foreach (PdfName key in file.Keys)
            {
                PRStream prstream = (PRStream)PdfReader.GetPdfObject(file.GetAsIndirectObject(key));
                PdfString name = fileSpec.GetAsString(key);
                if (prstream == null || name == null)
                {
                    continue;
                }

                Attachment attachment = new Attachment();
                attachment.Name = name.ToString();
                attachment.Content = PdfReader.GetStreamBytes(prstream);
                attachments.Add(attachment);
            }
        }

        return attachments;
    }
    finally
    {
        // release the reader once every stream has been read
        reader.Close();
    }
}