/**
 * Reads one inline image from a content stream.
 * The parser must be positioned immediately after the BI operator. The image dictionary is
 * read first, then the sample data; on return the parser sits right after the terminating EI operator.
 * @param ps the content parser to read the image from
 * @param colorSpaceDic colour space dictionary that is forwarded to the sample parser
 * @return the parsed image, built from the inline image dictionary and its samples
 * @throws IOException if anything goes wrong with the parsing
 * @throws InlineImageParseException if parsing failed due to issues specific to inline image processing
 */
public static PdfImageObject ParseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic)
{
    PdfDictionary imageDictionary = ParseInlineImageDictionary(ps);
    byte[] imageSamples = ParseInlineImageSamples(imageDictionary, colorSpaceDic, ps);

    PdfImageObject image = new PdfImageObject(imageDictionary, imageSamples);
    return image;
}
/**
 * Re-writes a content stream one operator at a time.
 * Every parsed operator (with its operands) is offered to the handler registered for it in
 * _operators; the operand list returned by the handler - or the untouched list when no handler
 * is registered - is pushed onto the content stream builder that was created for this call.
 * @param contentBytes the bytes of the content stream to process
 * @param resourcesDictionary the resources dictionary made current while the stream is processed
 * @return the bytes produced by the content stream builder
 */
public byte[] Modify(byte[] contentBytes, PdfDictionary resourcesDictionary)
{
    _contentStreamBuilderStack.Push(new PdfContentStreamBuilder());
    _resourceDictionaryStack.Push(resourcesDictionary);

    PdfContentParser parser = new PdfContentParser(new PRTokeniser(new RandomAccessFileOrArray(contentBytes)));
    List<PdfObject> operands = new List<PdfObject>();
    while (parser.Parse(operands).Count > 0)
    {
        // the operator is always the last entry of the parsed list
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

        PdfContentOperatorHandler handler;
        if (_operators.TryGetValue(oper.ToString(), out handler))
        {
            operands = handler(oper, operands);
        }

        _contentStreamBuilderStack.Peek().Push(operands);
    }

    _resourceDictionaryStack.Pop();
    PdfContentStreamBuilder finishedBuilder = _contentStreamBuilderStack.Pop();
    return finishedBuilder.GetBytes();
}
/**
 * Processes PDF syntax.
 * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);

    PdfContentParser parser = new PdfContentParser(new PRTokeniser(contentBytes));
    List<iTextSharp.text.pdf.PdfObject> operands = new List<iTextSharp.text.pdf.PdfObject>();
    while (parser.Parse(operands).Count > 0)
    {
        // the operator is the last element of the parsed list
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

        if (!"BI".Equals(oper.ToString()))
        {
            InvokeOperator(oper, operands);
            continue;
        }

        // Embedded images bypass InvokeOperator - this is one area of the PDF spec
        // that is particularly nasty and inconsistent.
        PdfDictionary colorSpaceDic = null;
        if (resources != null)
        {
            colorSpaceDic = resources.GetAsDict(PdfName.COLORSPACE);
        }

        // NOTE: an earlier remark at this call recorded that ImageRenderInfo.CreateForEmbeddedImage
        // was reported as inaccessible due to its protection level.
        ImageRenderInfo imageInfo = ImageRenderInfo.CreateForEmbeddedImage(
            Gs().ctm,
            InlineImageUtils.ParseInlineImage(parser, colorSpaceDic),
            colorSpaceDic);
        renderListener.RenderImage(imageInfo);
    }

    this.resources.Pop();
}
/**
 * Processes PDF syntax.
 * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);

    PRTokeniser contentTokeniser = new PRTokeniser(contentBytes);
    PdfContentParser contentParser = new PdfContentParser(contentTokeniser);
    List<PdfObject> operands = new List<PdfObject>();

    while (contentParser.Parse(operands).Count > 0)
    {
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
        bool startsInlineImage = "BI".Equals(oper.ToString());

        if (startsInlineImage)
        {
            // Inline images are not dispatched through InvokeOperator - this is one area
            // of the PDF spec that is particularly nasty and inconsistent.
            PdfDictionary colorSpaceDic;
            if (resources == null)
            {
                colorSpaceDic = null;
            }
            else
            {
                colorSpaceDic = resources.GetAsDict(PdfName.COLORSPACE);
            }

            ImageRenderInfo imageInfo = ImageRenderInfo.CreateForEmbeddedImage(
                Gs().ctm,
                InlineImageUtils.ParseInlineImage(contentParser, colorSpaceDic),
                colorSpaceDic);
            renderListener.RenderImage(imageInfo);
        }
        else
        {
            InvokeOperator(oper, operands);
        }
    }

    this.resources.Pop();
}
/**
 * Reads one inline image from a content stream.
 * The parser must be positioned immediately after the BI operator. The image dictionary is
 * read first, then the sample data; on return the parser sits right after the terminating EI operator.
 * @param ps the content parser to read the image from
 * @param colorSpaceDic colour space dictionary that is forwarded to the sample parser
 * @return the samples and dictionary of the parsed image
 * @throws IOException if anything goes wrong with the parsing
 * @throws InlineImageParseException if parsing failed due to issues specific to inline image processing
 */
public static InlineImageInfo ParseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic)
{
    PdfDictionary imageDictionary = ParseInlineImageDictionary(ps);
    byte[] imageSamples = ParseInlineImageSamples(imageDictionary, colorSpaceDic, ps);

    InlineImageInfo info = new InlineImageInfo(imageSamples, imageDictionary);
    return info;
}
/**
 * Parses the next inline image dictionary from the parser.
 * By the time this method runs the BI operator has already been consumed. Key/value pairs are
 * read until the ID operator (or the end of the stream) is reached, and exactly one further
 * character is then read from the tokeniser, which has to be whitespace.
 * @param ps the parser to extract the embedded image information from
 * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
 * @throws IOException if the character that follows ID is not whitespace
 */
private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
{
    PdfDictionary dictionary = new PdfDictionary();

    PdfObject key = ps.ReadPRObject();
    while (key != null && !"ID".Equals(key.ToString()))
    {
        PdfObject value = ps.ReadPRObject();

        // expand an abbreviated key; keys without a mapping are used unchanged
        PdfName fullKey;
        bool mapped = inlineImageEntryAbbreviationMap.TryGetValue((PdfName)key, out fullKey);
        if (!mapped || fullKey == null)
        {
            fullKey = (PdfName)key;
        }

        dictionary.Put(fullKey, GetAlternateValue(fullKey, value));
        key = ps.ReadPRObject();
    }

    int separator = ps.GetTokeniser().Read();
    if (PRTokeniser.IsWhitespace(separator))
    {
        return dictionary;
    }

    throw new IOException("Unexpected character " + separator + " found after ID in inline image");
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param ps the content parser
 * @return the samples of the image
 * @throws ArgumentException if the dictionary declares a filter
 * @throws InlineImageParseException if the stream ends before all samples were read, or if the
 *         samples are not followed by the EI operator
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
{
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }

    PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;
    byte[] bytes = new byte[bytesToRead];
    PRTokeniser tokeniser = ps.GetTokeniser();

    // From the PDF spec: unless the image uses ASCIIHexDecode or ASCII85Decode as one of its
    // filters, the ID operator shall be followed by a single white-space character, and the
    // next character shall be interpreted as the first byte of image data.
    // Unfortunately, some PDFs have no space following the ID, so that case is handled here.
    int shouldBeWhiteSpace = tokeniser.Read();
    int startIndex = 0;
    // The tokeniser treats 0 as whitespace, but for image data it must be kept as a sample byte
    // (same rule as the three-argument revision of this method).
    if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)
    {
        bytes[0] = (byte)shouldBeWhiteSpace;
        startIndex++;
    }

    for (int i = startIndex; i < bytesToRead; i++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        bytes[i] = (byte)ch;
    }

    // ReadPRObject yields null once the stream is exhausted; report that as a missing EI
    // instead of failing with a NullReferenceException.
    PdfObject ei = ps.ReadPRObject();
    if (ei == null || !ei.ToString().Equals("EI"))
    {
        throw new InlineImageParseException("EI not found after end of image data");
    }

    return bytes;
}
/// <summary>
/// Parses a stream object and removes OCGs.
/// XObjects whose /OC dictionary names one of the layers in <c>ocgs</c> are dropped from the
/// resources first; afterwards every operator of the content stream is passed to
/// <c>ProcessOperator</c> and the stream data is replaced with what was written to <c>baos</c>.
/// </summary>
/// <param name="stream"> a stream object </param>
/// <param name="resources"> the resources dictionary of that object (containing info about the OCGs) </param>
public virtual void Parse(PRStream stream, PdfDictionary resources)
{
    baos = new MemoryStream();
    properties = resources.GetAsDict(PdfName.PROPERTIES);
    xobj = new HashSet2<PdfName>();
    PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects != null)
    {
        // remove XObject (form or image) that belong to an OCG that needs to be removed;
        // names are collected first so the dictionary is not modified while it is enumerated
        foreach (PdfName name in xobjects.Keys)
        {
            PRStream xobject = (PRStream)xobjects.GetAsStream(name);
            if (xobject == null)
            {
                // presumably the entry is not a stream - nothing to inspect
                continue;
            }
            PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
            if (oc == null)
            {
                continue;
            }
            PdfString ocname = oc.GetAsString(PdfName.NAME);
            if (ocname != null && ocgs.Contains(ocname.ToString()))
            {
                xobj.Add(name);
            }
        }
        foreach (PdfName name in xobj)
        {
            xobjects.Remove(name);
        }
    }

    // parse the content stream
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(this, @operator, operands);
    }

    baos.Flush();
    baos.Close();
    // ToArray returns exactly the bytes written (and works on a closed MemoryStream);
    // GetBuffer would also hand over the unused tail of the internal buffer.
    stream.SetData(baos.ToArray());
}
// Prints the concatenation of all string tokens found in the content of page 2 of a sample PDF.
static void Main(string[] args)
{
    PdfReader pdf = new PdfReader(@"D:\Tmp\sample.pdf");
    try
    {
        PdfContentParser contentParser = new PdfContentParser(new PRTokeniser(pdf.GetPageContent(2)));
        StringBuilder text = new StringBuilder();

        while (contentParser.Tokeniser.NextToken())
        {
            // only string tokens contribute to the output
            if (contentParser.Tokeniser.TokenType != PRTokeniser.TK_STRING)
            {
                continue;
            }
            text.Append(contentParser.Tokeniser.StringValue);
        }

        Console.WriteLine(text.ToString());
    }
    finally
    {
        // the reader is closed even when tokenising fails
        pdf.Close();
    }
}
/// <summary>
/// Opens the given file with a tokeniser, reads a dictionary and an array from it and logs the
/// items obtained from parsing that array. The returned <c>PdfData</c> is newly created and is
/// not populated by this method.
/// </summary>
/// <param name="fileName">Path of the file to read; a null, empty or whitespace-only name yields null.</param>
/// <param name="pageNum">Page number; it is only written to the log here.</param>
/// <returns>A new <c>PdfData</c> instance, or null when no usable file name was supplied.</returns>
public static PdfData ConvertToPdfData(string fileName, int pageNum)
{
    // A blank file name can never be opened, whatever the page number is. The earlier guard
    // only returned when the page number was non-positive as well, so a blank name combined
    // with a positive page number went on to the tokeniser constructor.
    if (string.IsNullOrWhiteSpace(fileName))
    {
        return null;
    }

    Helpers.D.Log("PdfConvertIText.ConvertToPdfData({0}, {1})", fileName, pageNum);
    PdfData data = new PdfData();

    PRTokeniser tokeniser = new PRTokeniser(fileName);
    try
    {
        PdfContentParser parser = new PdfContentParser(tokeniser);

        // the dictionary itself is not used, but reading it advances the parser to the array
        parser.ReadDictionary();
        ArrayList items = parser.Parse(parser.ReadArray().ArrayList);

        Helpers.D.Log("PdfConvertIText.ConvertToPdfData: {0} | {1}", items.Count, string.Join(", ", items.ToArray()));
    }
    finally
    {
        // release the file opened by the tokeniser, also when parsing fails
        tokeniser.Close();
    }

    return data;
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param ps the content parser
 * @return the samples of the image
 * @throws ArgumentException if the dictionary declares a filter
 * @throws InlineImageParseException if the stream ends before all samples were read, or if the
 *         samples are not followed by the EI operator
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
{
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }

    PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;
    byte[] bytes = new byte[bytesToRead];
    PRTokeniser tokeniser = ps.GetTokeniser();

    // The next character should be the separator after ID, but some PDFs omit it. Instead of
    // discarding the character blindly, keep it as the first sample when it is not a separator.
    // A 0 byte is kept as well: the tokeniser treats it as whitespace, but here it is image data.
    int shouldBeWhiteSpace = tokeniser.Read();
    int startIndex = 0;
    if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)
    {
        bytes[0] = (byte)shouldBeWhiteSpace;
        startIndex++;
    }

    for (int i = startIndex; i < bytesToRead; i++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        bytes[i] = (byte)ch;
    }

    // ReadPRObject yields null once the stream is exhausted; report that as a missing EI
    // instead of failing with a NullReferenceException.
    PdfObject ei = ps.ReadPRObject();
    if (ei == null || !ei.ToString().Equals("EI"))
    {
        throw new InlineImageParseException("EI not found after end of image data");
    }

    return bytes;
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic colour space dictionary, forwarded to the bytes-per-row computation
 * @param ps the content parser
 * @return the samples of the image
 * @throws ArgumentException if the dictionary declares a filter
 * @throws InlineImageParseException if the stream ends before all samples were read, or if the
 *         samples are not followed by the EI operator
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }

    PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
    byte[] bytes = new byte[bytesToRead];
    PRTokeniser tokeniser = ps.GetTokeniser();

    // From the PDF spec: unless the image uses ASCIIHexDecode or ASCII85Decode as one of its
    // filters, the ID operator shall be followed by a single white-space character, and the
    // next character shall be interpreted as the first byte of image data.
    // Unfortunately, some PDFs have no space following the ID, so that case is handled here.
    int shouldBeWhiteSpace = tokeniser.Read();
    int startIndex = 0;
    // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
    if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)
    {
        bytes[0] = (byte)shouldBeWhiteSpace;
        startIndex++;
    }

    for (int i = startIndex; i < bytesToRead; i++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        bytes[i] = (byte)ch;
    }

    // ReadPRObject yields null once the stream is exhausted; report that as a missing EI
    // instead of failing with a NullReferenceException.
    PdfObject ei = ps.ReadPRObject();
    if (ei == null || !ei.ToString().Equals("EI"))
    {
        throw new InlineImageParseException("EI not found after end of image data");
    }

    return bytes;
}
/**
 * Processes PDF syntax
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);

    PRTokeniser tokeniser = new PRTokeniser(contentBytes);
    PdfContentParser parser = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();

    while (parser.Parse(operands).Count > 0)
    {
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

        // Special handling for embedded images: once an ID operator is seen, all content is
        // skipped until an EI operator surrounded by whitespace is reached.
        // Known weakness: image data that itself contains <ws>EI<ws> ends the scan too early;
        // avoiding that would require actually decoding the image stream.
        if ("ID".Equals(oper.ToString()))
        {
            MemoryStream imageData = new MemoryStream();   // bytes known to belong to the image
            MemoryStream pending = new MemoryStream();     // bytes that may turn out to be <ws>EI
            int matched = 0;                               // number of terminator characters matched so far

            for (int b = tokeniser.Read(); b != -1; b = tokeniser.Read())
            {
                bool advances;
                switch (matched)
                {
                    case 0:
                    case 3:
                        advances = PRTokeniser.IsWhitespace(b);
                        break;
                    case 1:
                        advances = b == 'E';
                        break;
                    case 2:
                        advances = b == 'I';
                        break;
                    default:
                        advances = false;
                        break;
                }

                if (!advances)
                {
                    // not (part of) the terminator: everything held back is image data after all
                    pending.WriteTo(imageData);
                    pending.SetLength(0);
                    imageData.WriteByte((byte)b);
                    matched = 0;
                    continue;
                }

                if (matched == 3)
                {
                    // complete <ws>EI<ws>: dispatch ID and then EI, each as its own operand list
                    operands = new List<PdfObject>();
                    operands.Add(new PdfLiteral("ID"));
                    InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);
                    // we should probably eventually do something to make the accumulated image content stream available
                    operands = new List<PdfObject>();
                    operands.Add(new PdfLiteral("EI"));
                    InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);
                    break;
                }

                matched++;
                pending.WriteByte((byte)b);
            }
        }

        // NOTE(review): after an inline image this dispatches the parsed ID literal once more,
        // now together with the EI operand list - confirm that this is intended.
        InvokeOperator(oper, operands);
    }

    this.resources.Pop();
}
/// <summary>
/// Parses a stream object and removes OCGs.
/// XObjects whose /OC dictionary names one of the layers in <c>ocgs</c> are dropped from the
/// resources first; afterwards every operator of the content stream is passed to
/// <c>ProcessOperator</c>. The bytes that follow a BI operator are copied to <c>baos</c> verbatim
/// up to and including the whitespace-delimited EI. Finally the stream data is replaced with
/// what was written to <c>baos</c>.
/// </summary>
/// <param name="stream"> a stream object </param>
/// <param name="resources"> the resources dictionary of that object (containing info about the OCGs) </param>
public virtual void Parse(PRStream stream, PdfDictionary resources)
{
    baos = new MemoryStream();
    properties = resources.GetAsDict(PdfName.PROPERTIES);
    xobj = new HashSet2<PdfName>();
    PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects != null)
    {
        // remove XObject (form or image) that belong to an OCG that needs to be removed;
        // names are collected first so the dictionary is not modified while it is enumerated
        foreach (PdfName name in xobjects.Keys)
        {
            PRStream xobject = (PRStream)xobjects.GetAsStream(name);
            if (xobject == null)
            {
                // presumably the entry is not a stream - nothing to inspect
                continue;
            }
            PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
            if (oc == null)
            {
                continue;
            }
            PdfString ocname = oc.GetAsString(PdfName.NAME);
            if (ocname != null && ocgs.Contains(ocname.ToString()))
            {
                xobj.Add(name);
            }
        }
        foreach (PdfName name in xobj)
        {
            xobjects.Remove(name);
        }
    }

    // parse the content stream
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(this, @operator, operands);
        if ("BI".Equals(@operator.ToString()))
        {
            int found = 0;
            int ch;
            bool immediateAfterBI = true;
            while ((ch = tokeniser.Read()) != -1)
            {
                // every byte is copied, except a whitespace that directly follows BI
                if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch))
                {
                    baos.WriteByte((byte)ch);
                }
                immediateAfterBI = false;

                if (found == 0 && PRTokeniser.IsWhitespace(ch))
                {
                    found++;
                }
                else if (found == 1 && ch == 'E')
                {
                    found++;
                }
                else if (found == 1 && PRTokeniser.IsWhitespace(ch))
                {
                    // this clause is needed if we have a white space character that is part of the image data
                    // followed by a whitespace character that precedes the EI operator. In this case the
                    // current whitespace becomes the first potential character for the end of stream check.
                    // Note that we don't increment 'found' here.
                }
                else if (found == 2 && ch == 'I')
                {
                    found++;
                }
                else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                {
                    break;
                }
                else
                {
                    found = 0;
                }
            }
        }
    }

    baos.Flush();
    baos.Close();
    // ToArray returns exactly the bytes written (and works on a closed MemoryStream);
    // GetBuffer would also hand over the unused tail of the internal buffer.
    stream.SetData(baos.ToArray());
}
// Reads the CMap called cmapName from the given location and feeds its contents into cmap:
// registry/ordering/name/supplement definitions (top level only), character mappings and range
// mappings. A "usecmap" reference is followed recursively until MAXLEVEL is reached.
// Parse errors are tolerated: a failing Parse call is retried, up to 50 failures in total.
private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
{
    if (level >= MAXLEVEL)
    {
        return;
    }

    PRTokeniser inp = location.GetLocation(cmapName);
    try
    {
        PdfContentParser parser = new PdfContentParser(inp);
        List<PdfObject> tokens = new List<PdfObject>();
        int failuresLeft = 50;

        while (true)
        {
            try
            {
                parser.Parse(tokens);
            }
            catch
            {
                // best effort: skip what could not be parsed, but give up eventually
                failuresLeft--;
                if (failuresLeft < 0)
                {
                    break;
                }
                continue;
            }

            int count = tokens.Count;
            if (count == 0)
            {
                break;
            }

            String op = tokens[count - 1].ToString();
            bool isCharBlock = op.Equals(ENDCIDCHAR) || op.Equals(ENDBFCHAR);
            bool isRangeBlock = op.Equals(ENDCIDRANGE) || op.Equals(ENDBFRANGE);

            if (level == 0 && count == 3 && op.Equals(DEF))
            {
                PdfObject defKey = tokens[0];
                PdfObject defValue = tokens[1];
                if (PdfName.REGISTRY.Equals(defKey))
                {
                    cmap.Registry = defValue.ToString();
                }
                else if (PdfName.ORDERING.Equals(defKey))
                {
                    cmap.Ordering = defValue.ToString();
                }
                else if (CMAPNAME.Equals(defKey))
                {
                    cmap.Name = defValue.ToString();
                }
                else if (PdfName.SUPPLEMENT.Equals(defKey))
                {
                    try
                    {
                        cmap.Supplement = ((PdfNumber)defValue).IntValue;
                    }
                    catch
                    {
                        // a supplement that is not a number is ignored
                    }
                }
            }
            else if (isCharBlock && count >= 3)
            {
                // pairs of (source, target); the trailing operator is not part of a pair
                for (int k = 0; k + 2 < count; k += 2)
                {
                    if (tokens[k] is PdfString)
                    {
                        cmap.AddChar((PdfString)tokens[k], tokens[k + 1]);
                    }
                }
            }
            else if (isRangeBlock && count >= 4)
            {
                // triples of (from, to, target)
                for (int k = 0; k + 3 < count; k += 3)
                {
                    if (tokens[k] is PdfString && tokens[k + 1] is PdfString)
                    {
                        cmap.AddRange((PdfString)tokens[k], (PdfString)tokens[k + 1], tokens[k + 2]);
                    }
                }
            }
            else if (op.Equals(USECMAP) && count == 2 && tokens[0] is PdfName)
            {
                ParseCid(PdfName.DecodeName(tokens[0].ToString()), cmap, location, level + 1);
            }
        }
    }
    finally
    {
        inp.Close();
    }
}
/**
 * Processes PDF syntax.
 * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
 * @param contentBytes the bytes of a content stream
 * @param resources the resources that come with the content stream
 */
public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
{
    this.resources.Push(resources);

    RandomAccessFileOrArray contentSource =
        new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes));
    PdfContentParser parser = new PdfContentParser(new PRTokeniser(contentSource));
    List<PdfObject> operands = new List<PdfObject>();

    while (parser.Parse(operands).Count > 0)
    {
        // the operator is the last element of the parsed list
        PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

        if (!"BI".Equals(oper.ToString()))
        {
            InvokeOperator(oper, operands);
            continue;
        }

        // Embedded images bypass InvokeOperator - this is one area of the PDF spec
        // that is particularly nasty and inconsistent.
        PdfDictionary colorSpaceDic = null;
        if (resources != null)
        {
            colorSpaceDic = resources.GetAsDict(PdfName.COLORSPACE);
        }
        HandleInlineImage(InlineImageUtils.ParseInlineImage(parser, colorSpaceDic), colorSpaceDic);
    }

    this.resources.Pop();
}
/**
 * Parses the content of a page, replacing appearances of annotations
 * with Form XObjects.
 * The page's content stream is run through ProcessOperator operator by operator, and whatever
 * was written to baos in the process replaces the data of the content stream.
 * @param page a page dictionary
 * @param pageref the indirect reference of that page
 * @throws IOException
 * @throws DocumentException if the page has no StructParents entry, or if an annotation
 *         without a StructParent entry is left on the page
 */
virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref)
{
    LOGGER.Info("Parsing page with reference " + pageref);

    // initializing member variables
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;
    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if (structParents == null)
    {
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    }
    annots = page.GetAsArray(PdfName.ANNOTS);
    if (annots == null)
    {
        annots = new PdfArray();
    }
    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null)
    {
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }

    // parsing the content stream of the page
    PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(opr, operands);
    }

    // dealing with orphans
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
    {
        StructureItem item = items[0];
        if (!(item is StructureObject))
        {
            // Only structure objects are converted and removed here. Any other kind of item
            // would stay at the head of the list forever, so stop instead of spinning.
            break;
        }
        ConvertToXObject((StructureObject)item);
        items.RemoveAt(0);
    }

    // NOTE(review): this test reads annots.Length while the loop below iterates annots.Size -
    // confirm that Length really is the element count of the array.
    if (annots.Length == 0)
    {
        page.Remove(PdfName.ANNOTS);
    }
    else
    {
        PdfDictionary annot;
        for (int i = 0; i < annots.Size; i++)
        {
            annot = annots.GetAsDict(i);
            if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
            }
        }
    }

    // replacing the content stream
    baos.Flush();
    baos.Close();
    stream.SetData(baos.ToArray());

    // showing how many items are left
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}
/**
 * Reads one inline image from a content stream.
 * The parser must be positioned immediately after the BI operator; on return it sits right
 * after the EI operator that terminates the inline image.
 * @param ps the content parser to use for reading the image
 * @param colorSpaceDic colour space dictionary that is forwarded to the sample parser
 * @return the samples and dictionary of the parsed image
 * @throws IOException if anything goes wrong with the parsing
 * @throws InlineImageParseException if parsing of the inline image failed due to issues specific to inline image processing
 */
public static InlineImageInfo ParseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic)
{
    // the dictionary has to be read first: the sample parser depends on its entries
    PdfDictionary dictionary = ParseInlineImageDictionary(ps);
    byte[] sampleBytes = ParseInlineImageSamples(dictionary, colorSpaceDic, ps);

    InlineImageInfo result = new InlineImageInfo(sampleBytes, dictionary);
    return result;
}
/**
 * Parses the next inline image dictionary from the parser.
 * By the time this method runs the BI operator has already been consumed. Key/value pairs are
 * read until the ID operator (or the end of the stream) is reached, and exactly one further
 * character is then read from the tokeniser, which has to be whitespace.
 * @param ps the parser to extract the embedded image information from
 * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
 * @throws IOException if the character that follows ID is not whitespace
 */
private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
{
    PdfDictionary result = new PdfDictionary();

    while (true)
    {
        PdfObject key = ps.ReadPRObject();
        if (key == null || "ID".Equals(key.ToString()))
        {
            break;
        }

        PdfObject value = ps.ReadPRObject();

        // expand an abbreviated key; keys without a mapping are used unchanged
        PdfName expandedKey;
        inlineImageEntryAbbreviationMap.TryGetValue((PdfName)key, out expandedKey);
        PdfName effectiveKey = expandedKey != null ? expandedKey : (PdfName)key;

        result.Put(effectiveKey, GetAlternateValue(effectiveKey, value));
    }

    int afterId = ps.GetTokeniser().Read();
    bool separatorPresent = PRTokeniser.IsWhitespace(afterId);
    if (!separatorPresent)
    {
        throw new IOException("Unexpected character " + afterId + " found after ID in inline image");
    }

    return result;
}
/**
 * Parses the samples of the image from the underlying content parser, accounting for filters
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * <b>Note:</b>This implementation does not actually apply the filters at this time
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic colour space dictionary, forwarded to the unfiltered sample parser
 * @param ps the content parser
 * @return the samples of the image
 * @throws InlineImageParseException if the stream ends before a usable EI was found
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // by the time we get to here, we have already parsed the ID operator
    if (!imageDictionary.Contains(PdfName.FILTER))
    {
        return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
    }

    // Filtered data: read everything up to an EI operator surrounded by whitespace.
    // Two known weaknesses: the image stream itself may contain <ws>EI<ws>, and some streams
    // have no <ws> in front of EI. Doing better would mean actually decoding the stream.
    MemoryStream imageData = new MemoryStream();   // bytes known to belong to the image
    MemoryStream pending = new MemoryStream();     // bytes that may turn out to be <ws>EI
    PRTokeniser tokeniser = ps.GetTokeniser();
    int matched = 0;                               // number of terminator characters matched so far

    for (int b = tokeniser.Read(); b != -1; b = tokeniser.Read())
    {
        bool held = false;   // set when the byte was parked in 'pending'
        switch (matched)
        {
            case 0:
                if (PRTokeniser.IsWhitespace(b))
                {
                    pending.WriteByte((byte)b);
                    matched = 1;
                    held = true;
                }
                break;

            case 1:
                if (b == 'E')
                {
                    pending.WriteByte((byte)b);
                    matched = 2;
                    held = true;
                }
                else if (PRTokeniser.IsWhitespace(b))
                {
                    // A whitespace that is part of the image data, followed by the whitespace
                    // that may precede EI: release the first one and let the current one take
                    // its place as the candidate start. 'matched' deliberately stays at 1.
                    pending.WriteTo(imageData);
                    pending.SetLength(0);
                    pending.WriteByte((byte)b);
                    held = true;
                }
                break;

            case 2:
                if (b == 'I')
                {
                    pending.WriteByte((byte)b);
                    matched = 3;
                    held = true;
                }
                break;

            case 3:
                if (PRTokeniser.IsWhitespace(b))
                {
                    byte[] candidate = imageData.ToArray();
                    if (InlineImageStreamBytesAreComplete(candidate, imageDictionary))
                    {
                        return candidate;
                    }
                    // not complete yet: the <ws>EI<ws> just seen is treated as image data below
                }
                break;
        }

        if (held)
        {
            continue;
        }

        // the byte does not continue the terminator: everything held back is image data
        pending.WriteTo(imageData);
        pending.SetLength(0);
        imageData.WriteByte((byte)b);
        matched = 0;
    }

    throw new InlineImageParseException("Could not find image data or EI");
}
/**
 * Parses the content of a page, replacing appearances of annotations
 * with Form XObjects.
 * The page's content stream is run through ProcessOperator operator by operator, and whatever
 * was written to baos in the process replaces the data of the content stream.
 * @param page a page dictionary
 * @param pageref the indirect reference of that page
 * @throws IOException
 * @throws DocumentException if the page has no StructParents entry, or if an annotation
 *         without a StructParent entry is left on the page
 */
public void Parse(PdfDictionary page, PdfIndirectReference pageref)
{
    LOGGER.Info("Parsing page with reference " + pageref);

    // initializing member variables
    baos = new MemoryStream();
    this.page = page;
    this.pageref = pageref;
    structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
    if (structParents == null)
    {
        throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
    }
    annots = page.GetAsArray(PdfName.ANNOTS);
    if (annots == null)
    {
        annots = new PdfArray();
    }
    PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
    xobjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xobjects == null)
    {
        xobjects = new PdfDictionary();
        resources.Put(PdfName.XOBJECT, xobjects);
    }

    // parsing the content stream of the page
    PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);
    byte[] contentBytes = PdfReader.GetStreamBytes(stream);
    PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
    PdfContentParser ps = new PdfContentParser(tokeniser);
    List<PdfObject> operands = new List<PdfObject>();
    while (ps.Parse(operands).Count > 0)
    {
        PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
        ProcessOperator(opr, operands);
    }

    // dealing with orphans
    while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
    {
        StructureItem item = items[0];
        if (!(item is StructureObject))
        {
            // Only structure objects are converted and removed here. Any other kind of item
            // would stay at the head of the list forever, so stop instead of spinning.
            break;
        }
        ConvertToXObject((StructureObject)item);
        items.RemoveAt(0);
    }

    // NOTE(review): this test reads annots.Length while the loop below iterates annots.Size -
    // confirm that Length really is the element count of the array.
    if (annots.Length == 0)
    {
        page.Remove(PdfName.ANNOTS);
    }
    else
    {
        PdfDictionary annot;
        for (int i = 0; i < annots.Size; i++)
        {
            annot = annots.GetAsDict(i);
            if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
            }
        }
    }

    // replacing the content stream
    baos.Flush();
    baos.Close();
    stream.SetData(baos.ToArray());

    // showing how many items are left
    LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
}
/**
 * Parses the samples of the image from the underlying content parser, accounting for filters
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * <b>Note:</b>This implementation does not actually apply the filters at this time
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic colour space dictionary, used by the unfiltered parser and by the trial image construction
 * @param ps the content parser
 * @return the samples of the image
 * @throws InlineImageParseException if the stream ends before a usable EI was found
 * @throws IOException if anything bad happens during parsing
 */
private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // by the time we get to here, we have already parsed the ID operator
    if (!imageDictionary.Contains(PdfName.FILTER))
    {
        return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
    }

    // Filtered data: read everything up to an EI operator surrounded by whitespace.
    // Two known weaknesses: the image stream itself may contain <ws>EI<ws>, and some streams
    // have no <ws> in front of EI. Doing better would mean actually decoding the stream.
    MemoryStream imageData = new MemoryStream();   // bytes known to belong to the image
    MemoryStream pending = new MemoryStream();     // bytes that may turn out to be <ws>EI
    PRTokeniser tokeniser = ps.GetTokeniser();
    int matched = 0;                               // number of terminator characters matched so far

    for (int b = tokeniser.Read(); b != -1; b = tokeniser.Read())
    {
        bool held = false;   // set when the byte was parked in 'pending'
        switch (matched)
        {
            case 0:
                if (PRTokeniser.IsWhitespace(b))
                {
                    pending.WriteByte((byte)b);
                    matched = 1;
                    held = true;
                }
                break;

            case 1:
                if (b == 'E')
                {
                    pending.WriteByte((byte)b);
                    matched = 2;
                    held = true;
                }
                else if (PRTokeniser.IsWhitespace(b))
                {
                    // A whitespace that is part of the image data, followed by the whitespace
                    // that may precede EI: release the first one and let the current one take
                    // its place as the candidate start. 'matched' deliberately stays at 1.
                    pending.WriteTo(imageData);
                    pending.SetLength(0);
                    pending.WriteByte((byte)b);
                    held = true;
                }
                break;

            case 2:
                if (b == 'I')
                {
                    pending.WriteByte((byte)b);
                    matched = 3;
                    held = true;
                }
                break;

            case 3:
                if (PRTokeniser.IsWhitespace(b))
                {
                    try
                    {
                        // the bytes collected so far are accepted when an image object can be built from them
                        byte[] candidate = imageData.ToArray();
                        new PdfImageObject(imageDictionary, candidate, colorSpaceDic);
                        return candidate;
                    }
                    catch (Exception)
                    {
                        // construction failed: the <ws>EI<ws> just seen is treated as image data below
                    }
                }
                break;
        }

        if (held)
        {
            continue;
        }

        // the byte does not continue the terminator: everything held back is image data
        pending.WriteTo(imageData);
        pending.SetLength(0);
        imageData.WriteByte((byte)b);
        matched = 0;
    }

    throw new InlineImageParseException("Could not find image data or EI");
}