/// <summary>
/// Processes PDF syntax.
/// Note: if you re-use a given PdfContentStreamProcessor, you must call its Reset() method.
/// </summary>
/// <param name="contentBytes">the bytes of a content stream</param>
/// <param name="resources">the resources that come with the content stream (may be null)</param>
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    try
    {
        PRTokeniser tokeniser = new PRTokeniser(contentBytes);
        PdfContentParser ps = new PdfContentParser(tokeniser);
        List<PdfObject> operands = new List<PdfObject>();

        // Each successful Parse fills 'operands' with one instruction; its last entry is the operator.
        while (ps.Parse(operands).Count > 0)
        {
            PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
            if ("BI".Equals(oper.ToString()))
            {
                // we don't call InvokeOperator for embedded images - this is one area of the
                // PDF spec that is particularly nasty and inconsistent
                PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
                ImageRenderInfo renderInfo = ImageRenderInfo.CreateForEmbeddedImage(
                    Gs().ctm,
                    InlineImageUtils.ParseInlineImage(ps, colorSpaceDic),
                    colorSpaceDic);
                renderListener.RenderImage(renderInfo);
            }
            else
            {
                InvokeOperator(oper, operands);
            }
        }
    }
    finally
    {
        // Keep the resource stack balanced even when parsing or an operator handler throws.
        this.resources.Pop();
    }
}
/// <summary>
/// Re-emits a content stream, giving the handler registered for each operator the chance
/// to replace that instruction's operand list before it is appended to the output.
/// </summary>
/// <param name="contentBytes">the bytes of the content stream to rewrite</param>
/// <param name="resourcesDictionary">the resources dictionary pushed for the duration of the call</param>
/// <returns>the bytes produced by the content stream builder that was active for this call</returns>
public byte[] Modify(byte[] contentBytes, PdfDictionary resourcesDictionary)
{
    _contentStreamBuilderStack.Push(new PdfContentStreamBuilder());
    _resourceDictionaryStack.Push(resourcesDictionary);

    PdfContentStreamBuilder finishedBuilder;
    try
    {
        PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
        PdfContentParser ps = new PdfContentParser(tokeniser);
        List<PdfObject> operands = new List<PdfObject>();

        while (ps.Parse(operands).Count > 0)
        {
            // The operator is the last entry of the parsed instruction.
            PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

            PdfContentOperatorHandler operHandler;
            if (_operators.TryGetValue(oper.ToString(), out operHandler))
            {
                // The handler's return value replaces the operand list, both for the output
                // and as the list handed to the next Parse call.
                operands = operHandler(oper, operands);
            }

            _contentStreamBuilderStack.Peek().Push(operands);
        }
    }
    finally
    {
        // Unwind both stacks in the original order even if parsing or a handler throws,
        // so a failed call does not leave stale entries behind.
        _resourceDictionaryStack.Pop();
        finishedBuilder = _contentStreamBuilderStack.Pop();
    }

    return finishedBuilder.GetBytes();
}
/// <summary>
/// Processes PDF syntax.
/// Note: if you re-use a given PdfContentStreamProcessor, you must call its Reset() method.
/// </summary>
/// <param name="contentBytes">the bytes of a content stream</param>
/// <param name="resources">the resources that come with the content stream (may be null)</param>
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    try
    {
        PRTokeniser tokeniser = new PRTokeniser(contentBytes);
        PdfContentParser ps = new PdfContentParser(tokeniser);
        List<iTextSharp.text.pdf.PdfObject> operands = new List<iTextSharp.text.pdf.PdfObject>();

        // Each successful Parse fills 'operands' with one instruction; its last entry is the operator.
        while (ps.Parse(operands).Count > 0)
        {
            PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
            if ("BI".Equals(oper.ToString()))
            {
                // we don't call InvokeOperator for embedded images - this is one area of the
                // PDF spec that is particularly nasty and inconsistent
                PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
                ImageRenderInfo renderInfo = ImageRenderInfo.CreateForEmbeddedImage(
                    Gs().ctm,
                    InlineImageUtils.ParseInlineImage(ps, colorSpaceDic),
                    colorSpaceDic);
                renderListener.RenderImage(renderInfo);
            }
            else
            {
                InvokeOperator(oper, operands);
            }
        }
    }
    finally
    {
        // Keep the resource stack balanced even when parsing or an operator handler throws.
        this.resources.Pop();
    }
}
/// <summary>
/// Parses a stream object and removes OCGs.
/// </summary>
/// <param name="stream">a stream object; its data is replaced with the rewritten content</param>
/// <param name="resources">the resources dictionary of that object (containing info about the OCGs)</param>
public virtual void Parse(PRStream stream, PdfDictionary resources)
{
    baos = new MemoryStream();
    properties = resources.GetAsDict(PdfName.PROPERTIES);
    xobj = new HashSet2<PdfName>();

    PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects != null)
    {
        // remove XObjects (form or image) that belong to an OCG that needs to be removed
        foreach (PdfName name in xobjects.Keys)
        {
            PRStream xobject = (PRStream)xobjects.GetAsStream(name);
            if (xobject == null)
            {
                // Entry does not resolve to a stream; nothing to inspect.
                continue;
            }

            PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
            if (oc != null)
            {
                PdfString ocname = oc.GetAsString(PdfName.NAME);
                if (ocname != null && ocgs.Contains(ocname.ToString()))
                {
                    xobj.Add(name);
                }
            }
        }

        // Removal is done in a second pass so the dictionary is not modified while its keys are enumerated.
        foreach (PdfName name in xobj)
        {
            xobjects.Remove(name);
        }
    }

    // parse the content stream
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(this, @operator, operands);
    }

    baos.Flush();
    baos.Close();

    // ToArray returns exactly the bytes written. GetBuffer would expose the whole backing
    // array, including unused capacity, which would end up as trailing garbage in the stream.
    stream.SetData(baos.ToArray());
}
/// <summary>
/// Tokenises the given file, reads a dictionary followed by an array from it, parses the
/// array's items and logs them. The returned <c>PdfData</c> is newly created and is not
/// populated by this method.
/// </summary>
/// <param name="fileName">path handed to the tokeniser; null, empty or whitespace yields <c>null</c></param>
/// <param name="pageNum">page number; currently only logged</param>
/// <returns>a new <c>PdfData</c> instance, or <c>null</c> when <paramref name="fileName"/> is blank</returns>
public static PdfData ConvertToPdfData(string fileName, int pageNum)
{
    // A blank file name can never be tokenised, so reject it regardless of pageNum.
    // (The previous guard only rejected it when pageNum was also non-positive.)
    if (string.IsNullOrWhiteSpace(fileName))
    {
        return null;
    }

    Helpers.D.Log("PdfConvertIText.ConvertToPdfData({0}, {1})", fileName, pageNum);

    PdfData data = new PdfData();
    PRTokeniser tokeniser = new PRTokeniser(fileName);
    try
    {
        PdfContentParser parser = new PdfContentParser(tokeniser);

        // The dictionary itself is not used, but it must be read so the parser
        // is positioned at the array that follows it.
        parser.ReadDictionary();

        ArrayList items = parser.Parse(parser.ReadArray().ArrayList);
        Helpers.D.Log("PdfConvertIText.ConvertToPdfData: {0} | {1}", items.Count, string.Join(", ", items.ToArray()));
    }
    finally
    {
        // Release the underlying file even when parsing fails.
        tokeniser.Close();
    }

    return data;
}
/// <summary>
/// Parses the content of a page, replacing appearances of annotations
/// with Form XObjects.
/// </summary>
/// <param name="page">a page dictionary</param>
/// <param name="pageref">the indirect reference of that page</param>
/// <exception cref="DocumentException">
/// thrown when the page has no StructParents entry, or when an annotation without a
/// StructParent entry is found
/// </exception>
public void Parse(PdfDictionary page, PdfIndirectReference pageref)
{
    LOGGER.Info("Parsing page with reference " + pageref);

    // initializing member variables
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;

    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if (structParents == null)
    {
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    }

    annots = page.GetAsArray(PdfName.ANNOTS);
    if (annots == null)
    {
        annots = new PdfArray();
    }

    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null)
    {
        // Make sure there is an XObject dictionary to add to.
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }

    // parsing the content stream of the page
    PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(opr, operands);
    }

    // dealing with orphans
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
    {
        StructureItem item = items[0];
        if (!(item is StructureObject))
        {
            // Only StructureObjects are consumed here. Any other head item would never be
            // removed, so the loop condition would stay true and the loop would never end.
            break;
        }

        ConvertToXObject((StructureObject)item);
        items.RemoveAt(0);
    }

    // NOTE(review): this check uses annots.Length while the loop below uses annots.Size;
    // confirm that Length really reports the element count for a PdfArray.
    if (annots.Length == 0)
    {
        page.Remove(PdfName.ANNOTS);
    }
    else
    {
        for (int i = 0; i < annots.Size; i++)
        {
            PdfDictionary annot = annots.GetAsDict(i);
            if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
            }
        }
    }

    // replacing the content stream
    baos.Flush();
    baos.Close();
    stream.SetData(baos.ToArray());

    // showing how many items are left
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}
/// <summary>
/// Reads the CMap named <paramref name="cmapName"/> from <paramref name="location"/> and feeds
/// its header entries, character mappings and range mappings into <paramref name="cmap"/>.
/// A "usecmap" instruction is followed recursively, up to MAXLEVEL nesting levels.
/// </summary>
/// <param name="cmapName">name of the CMap to load</param>
/// <param name="cmap">the CMap object that receives the parsed data</param>
/// <param name="location">provides the tokeniser for a given CMap name</param>
/// <param name="level">current nesting depth; header entries are only read at depth 0</param>
private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
{
    if (level >= MAXLEVEL)
    {
        return;
    }

    PRTokeniser inp = location.GetLocation(cmapName);
    try
    {
        List<PdfObject> tokens = new List<PdfObject>();
        PdfContentParser parser = new PdfContentParser(inp);

        // Parse failures are tolerated, but only a bounded number of times.
        int failureBudget = 50;

        for (;;)
        {
            try
            {
                parser.Parse(tokens);
            }
            catch
            {
                failureBudget -= 1;
                if (failureBudget < 0)
                {
                    break;
                }
                continue;
            }

            if (tokens.Count == 0)
            {
                break;
            }

            String terminator = tokens[tokens.Count - 1].ToString();

            // "<key> <value> def" header entries, honoured only for the outermost CMap.
            if (level == 0 && tokens.Count == 3 && terminator.Equals(DEF))
            {
                PdfObject key = tokens[0];
                if (PdfName.REGISTRY.Equals(key))
                {
                    cmap.Registry = tokens[1].ToString();
                }
                else if (PdfName.ORDERING.Equals(key))
                {
                    cmap.Ordering = tokens[1].ToString();
                }
                else if (CMAPNAME.Equals(key))
                {
                    cmap.Name = tokens[1].ToString();
                }
                else if (PdfName.SUPPLEMENT.Equals(key))
                {
                    try
                    {
                        cmap.Supplement = ((PdfNumber)tokens[1]).IntValue;
                    }
                    catch
                    {
                        // Best effort: a failure here leaves the supplement as it was.
                    }
                }
                continue;
            }

            // Character mappings: (source, target) pairs followed by the end operator.
            if ((terminator.Equals(ENDCIDCHAR) || terminator.Equals(ENDBFCHAR)) && tokens.Count >= 3)
            {
                int pairLimit = tokens.Count - 2;
                int i = 0;
                while (i < pairLimit)
                {
                    if (tokens[i] is PdfString)
                    {
                        cmap.AddChar((PdfString)tokens[i], tokens[i + 1]);
                    }
                    i += 2;
                }
                continue;
            }

            // Range mappings: (from, to, target) triples followed by the end operator.
            if ((terminator.Equals(ENDCIDRANGE) || terminator.Equals(ENDBFRANGE)) && tokens.Count >= 4)
            {
                int tripleLimit = tokens.Count - 3;
                int i = 0;
                while (i < tripleLimit)
                {
                    if (tokens[i] is PdfString && tokens[i + 1] is PdfString)
                    {
                        cmap.AddRange((PdfString)tokens[i], (PdfString)tokens[i + 1], tokens[i + 2]);
                    }
                    i += 3;
                }
                continue;
            }

            // "/Name usecmap": load the referenced CMap one nesting level deeper.
            if (terminator.Equals(USECMAP) && tokens.Count == 2 && tokens[0] is PdfName)
            {
                ParseCid(PdfName.DecodeName(tokens[0].ToString()), cmap, location, level + 1);
            }
        }
    }
    finally
    {
        inp.Close();
    }
}
/// <summary>
/// Parses a stream object and removes OCGs. Inline images (BI ... EI) are copied to the
/// output byte-for-byte instead of being tokenised.
/// </summary>
/// <param name="stream">a stream object; its data is replaced with the rewritten content</param>
/// <param name="resources">the resources dictionary of that object (containing info about the OCGs)</param>
public virtual void Parse(PRStream stream, PdfDictionary resources)
{
    baos = new MemoryStream();
    properties = resources.GetAsDict(PdfName.PROPERTIES);
    xobj = new HashSet2<PdfName>();

    PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects != null)
    {
        // remove XObjects (form or image) that belong to an OCG that needs to be removed
        foreach (PdfName name in xobjects.Keys)
        {
            PRStream xobject = (PRStream)xobjects.GetAsStream(name);
            if (xobject == null)
            {
                // Entry does not resolve to a stream; nothing to inspect.
                continue;
            }

            PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
            if (oc != null)
            {
                PdfString ocname = oc.GetAsString(PdfName.NAME);
                if (ocname != null && ocgs.Contains(ocname.ToString()))
                {
                    xobj.Add(name);
                }
            }
        }

        // Removal is done in a second pass so the dictionary is not modified while its keys are enumerated.
        foreach (PdfName name in xobj)
        {
            xobjects.Remove(name);
        }
    }

    // parse the content stream
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(this, @operator, operands);

        if ("BI".Equals(@operator.ToString()))
        {
            // Copy raw bytes until <whitespace>EI<whitespace> has been seen.
            // 'found' counts how many characters of that terminator have matched so far.
            int found = 0;
            int ch;
            bool immediateAfterBI = true;
            while ((ch = tokeniser.Read()) != -1)
            {
                // The very first byte after BI is dropped when it is whitespace; everything else is copied.
                if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch))
                {
                    baos.WriteByte((byte)ch);
                }
                immediateAfterBI = false;

                if (found == 0 && PRTokeniser.IsWhitespace(ch))
                {
                    found++;
                }
                else if (found == 1 && ch == 'E')
                {
                    found++;
                }
                else if (found == 1 && PRTokeniser.IsWhitespace(ch))
                {
                    // this clause is needed if we have a white space character that is part of the image data
                    // followed by a whitespace character that precedes the EI operator. In this case, we need
                    // to flush the first whitespace, then treat the current whitespace as the first potential
                    // character for the end of stream check. Note that we don't increment 'found' here.
                }
                else if (found == 2 && ch == 'I')
                {
                    found++;
                }
                else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                {
                    break;
                }
                else
                {
                    // Mismatch. A whitespace here (only possible after "<ws>E") may itself be the
                    // start of the real terminator, so restart the match at 1 instead of 0.
                    found = PRTokeniser.IsWhitespace(ch) ? 1 : 0;
                }
            }
        }
    }

    baos.Flush();
    baos.Close();

    // ToArray returns exactly the bytes written. GetBuffer would expose the whole backing
    // array, including unused capacity, which would end up as trailing garbage in the stream.
    stream.SetData(baos.ToArray());
}
/// <summary>
/// Processes PDF syntax.
/// Note: if you re-use a given PdfContentStreamProcessor, you must call its Reset() method.
/// </summary>
/// <param name="contentBytes">the bytes of a content stream</param>
/// <param name="resources">the resources that come with the content stream (may be null)</param>
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    try
    {
        PRTokeniser tokeniser = new PRTokeniser(
            new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes)));
        PdfContentParser ps = new PdfContentParser(tokeniser);
        List<PdfObject> operands = new List<PdfObject>();

        // Each successful Parse fills 'operands' with one instruction; its last entry is the operator.
        while (ps.Parse(operands).Count > 0)
        {
            PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
            if ("BI".Equals(oper.ToString()))
            {
                // we don't call InvokeOperator for embedded images - this is one area of the
                // PDF spec that is particularly nasty and inconsistent
                PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
                HandleInlineImage(InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
            }
            else
            {
                InvokeOperator(oper, operands);
            }
        }
    }
    finally
    {
        // Keep the resource stack balanced even when parsing or an operator handler throws.
        this.resources.Pop();
    }
}
/// <summary>
/// Parses the content of a page, replacing appearances of annotations
/// with Form XObjects.
/// </summary>
/// <param name="page">a page dictionary</param>
/// <param name="pageref">the indirect reference of that page</param>
/// <exception cref="DocumentException">
/// thrown when the page has no StructParents entry, or when an annotation without a
/// StructParent entry is found
/// </exception>
public virtual void Parse(PdfDictionary page, PdfIndirectReference pageref)
{
    LOGGER.Info("Parsing page with reference " + pageref);

    // initializing member variables
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;

    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if (structParents == null)
    {
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    }

    annots = page.GetAsArray(PdfName.ANNOTS);
    if (annots == null)
    {
        annots = new PdfArray();
    }

    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null)
    {
        // Make sure there is an XObject dictionary to add to.
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }

    // parsing the content stream of the page
    PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(opr, operands);
    }

    // dealing with orphans
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
    {
        StructureObject orphan = items[0] as StructureObject;
        if (orphan == null)
        {
            // Only StructureObjects are consumed here. Any other head item would never be
            // removed, so the loop condition would stay true and the loop would never end.
            break;
        }

        ConvertToXObject(orphan);
        items.RemoveAt(0);
    }

    // NOTE(review): this check uses annots.Length while the loop below uses annots.Size;
    // confirm that Length really reports the element count for a PdfArray.
    if (annots.Length == 0)
    {
        page.Remove(PdfName.ANNOTS);
    }
    else
    {
        for (int i = 0; i < annots.Size; i++)
        {
            PdfDictionary annot = annots.GetAsDict(i);
            if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
            }
        }
    }

    // replacing the content stream
    baos.Flush();
    baos.Close();
    stream.SetData(baos.ToArray());

    // showing how many items are left
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}
/// <summary>
/// Processes PDF syntax.
/// </summary>
/// <param name="contentBytes">the bytes of a content stream</param>
/// <param name="resources">the resources that come with the content stream</param>
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);
    try
    {
        PRTokeniser tokeniser = new PRTokeniser(contentBytes);
        PdfContentParser ps = new PdfContentParser(tokeniser);
        List<PdfObject> operands = new List<PdfObject>();
        while (ps.Parse(operands).Count > 0)
        {
            PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

            // special handling for embedded images. If we hit an ID oper, we need
            // to skip all content until we reach an EI oper surrounded by whitespace.
            // The following algorithm has one potential issue: what if the image stream
            // contains <ws>EI<ws> ?
            // it sounds like we would have to actually decode the content stream, which
            // I'd rather avoid right now.
            if ("ID".Equals(oper.ToString()))
            {
                // 'baos' collects confirmed image bytes; 'accumulated' holds the bytes of a
                // terminator match that is still in progress. Neither is consumed yet.
                MemoryStream baos = new MemoryStream();
                MemoryStream accumulated = new MemoryStream();
                int ch;
                int found = 0; // number of characters of "<ws>EI<ws>" matched so far
                while ((ch = tokeniser.Read()) != -1)
                {
                    if (found == 0 && PRTokeniser.IsWhitespace(ch))
                    {
                        found++;
                        accumulated.WriteByte((byte)ch);
                    }
                    else if (found == 1 && ch == 'E')
                    {
                        found++;
                        accumulated.WriteByte((byte)ch);
                    }
                    else if (found == 2 && ch == 'I')
                    {
                        found++;
                        accumulated.WriteByte((byte)ch);
                    }
                    else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                    {
                        operands = new List<PdfObject>();
                        operands.Add(new PdfLiteral("ID"));
                        InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);

                        // we should probably eventually do something to make the accumulated image content stream available
                        operands = new List<PdfObject>();
                        operands.Add(new PdfLiteral("EI"));
                        InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);
                        break;
                    }
                    else
                    {
                        // Mismatch: the partial match turned out to be image data.
                        accumulated.WriteTo(baos);
                        accumulated.SetLength(0);

                        if (PRTokeniser.IsWhitespace(ch))
                        {
                            // This whitespace may itself start the real terminator (for example
                            // "<ws><ws>EI<ws>"), so restart the match at 1 instead of dropping to 0.
                            accumulated.WriteByte((byte)ch);
                            found = 1;
                        }
                        else
                        {
                            baos.WriteByte((byte)ch);
                            found = 0;
                        }
                    }
                }
            }

            // NOTE(review): after a terminated inline image this still dispatches the original
            // ID operator, now paired with the replacement [EI] operand list. Kept as in the
            // original; confirm whether the second dispatch is intended.
            InvokeOperator(oper, operands);
        }
    }
    finally
    {
        // Keep the resource stack balanced even when parsing or an operator handler throws.
        this.resources.Pop();
    }
}