/**
 * Parses the next inline image dictionary from the parser. The parser must be positioned immediately following the BI operator.
 * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
 * @param ps the parser to extract the embedded image information from
 * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
 * @throws IOException if the parse fails (a key that is not a name, or a non-whitespace character directly after ID)
 */
private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
{
    // by the time we get to here, we have already parsed the BI operator
    PdfDictionary dictionary = new PdfDictionary();

    for (PdfObject key = ps.ReadPRObject(); key != null && !"ID".Equals(key.ToString()); key = ps.ReadPRObject())
    {
        // A malformed stream can put any object where a key is expected. Report that as a
        // parse failure instead of letting a hard cast throw InvalidCastException.
        PdfName name = key as PdfName;
        if (name == null)
        {
            throw new IOException("Unexpected object " + key + " found where a key was expected in inline image dictionary");
        }

        PdfObject value = ps.ReadPRObject();

        // Expand an abbreviated key to its full name; keys without an abbreviation entry are used as they are.
        PdfName resolvedKey;
        if (!inlineImageEntryAbbreviationMap.TryGetValue(name, out resolvedKey) || resolvedKey == null)
        {
            resolvedKey = name;
        }

        dictionary.Put(resolvedKey, GetAlternateValue(resolvedKey, value));
    }

    // The character right after ID must be whitespace; it is consumed here.
    int ch = ps.GetTokeniser().Read();
    if (!PRTokeniser.IsWhitespace(ch))
    {
        throw new IOException("Unexpected character " + ch + " found after ID in inline image");
    }

    return dictionary;
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic the color space dictionary, passed on to ComputeBytesPerRow
 * @param ps the content parser
 * @return the samples of the image
 * @throws ArgumentException if the image dictionary contains a filter entry
 * @throws InlineImageParseException if the content stream ends before the image data is complete, or EI does not follow the data
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }

    PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
    byte[] bytes = new byte[bytesToRead];
    PRTokeniser tokeniser = ps.GetTokeniser();

    // from the PDF spec: Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters,
    // the ID operator shall be followed by a single white-space character, and the next character
    // shall be interpreted as the first byte of image data.
    int shouldBeWhiteSpace = tokeniser.Read();
    if (shouldBeWhiteSpace == -1)
    {
        // End of stream is not image data; without this check -1 would be stored as 0xFF.
        throw new InlineImageParseException("End of content stream reached before end of image data");
    }

    // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to
    // capture this case and handle it. The tokeniser treats 0 as whitespace, but for our purposes
    // we shouldn't. The length check keeps an empty buffer from being indexed.
    int startIndex = 0;
    if (bytesToRead > 0 && (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0))
    {
        bytes[0] = (byte)shouldBeWhiteSpace;
        startIndex++;
    }

    for (int i = startIndex; i < bytesToRead; i++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        bytes[i] = (byte)ch;
    }

    // ReadPRObject can return null when nothing is left to read, so guard before calling ToString.
    PdfObject ei = ps.ReadPRObject();
    if (ei == null || !"EI".Equals(ei.ToString()))
    {
        // Some PDF producers seem to add another non-whitespace character after the image data.
        // Let's try to handle that case here by accepting EI as the following object.
        PdfObject ei2 = ei == null ? null : ps.ReadPRObject();
        if (ei2 == null || !"EI".Equals(ei2.ToString()))
        {
            throw new InlineImageParseException("EI not found after end of image data");
        }
    }

    return bytes;
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param ps the content parser
 * @return the samples of the image
 * @throws ArgumentException if the image dictionary contains a filter entry
 * @throws InlineImageParseException if the content stream ends before the image data is complete, or EI does not follow the data
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
{
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }

    PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;
    byte[] bytes = new byte[bytesToRead];
    PRTokeniser tokeniser = ps.GetTokeniser();

    // The character after ID should be a single whitespace character that is not part of the data.
    int afterId = tokeniser.Read();
    if (afterId == -1)
    {
        throw new InlineImageParseException("End of content stream reached before end of image data");
    }

    // Some producers omit that whitespace. Unconditionally skipping the character would then
    // lose the first data byte and shift the rest, so keep it as data instead. A 0 byte is
    // kept as data as well, and an empty buffer is never indexed.
    int startIndex = 0;
    if (bytesToRead > 0 && (!PRTokeniser.IsWhitespace(afterId) || afterId == 0))
    {
        bytes[0] = (byte)afterId;
        startIndex = 1;
    }

    for (int i = startIndex; i < bytesToRead; i++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        bytes[i] = (byte)ch;
    }

    // ReadPRObject can return null when nothing is left to read, so guard before calling ToString.
    PdfObject ei = ps.ReadPRObject();
    if (ei == null || !"EI".Equals(ei.ToString()))
    {
        throw new InlineImageParseException("EI not found after end of image data");
    }

    return bytes;
}
/**
 * Parses the samples of the image from the underlying content parser, accounting for filters
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * On the filtered path the parser is left immediately after the whitespace character that follows the EI operator.
 * <b>Note:</b>This implementation does not actually apply the filters at this time
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic the color space dictionary; only forwarded to ParseUnfilteredSamples when no filter is present
 * @param ps the content parser
 * @return the samples of the image (still encoded when a filter is present)
 * @throws InlineImageParseException if the content stream ends before an acceptable EI operator is found
 */
private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // by the time we get to here, we have already parsed the ID operator
    if (!imageDictionary.Contains(PdfName.FILTER))
    {
        return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
    }

    // read all content until we reach an EI operator surrounded by whitespace.
    // The following algorithm has two potential issues: what if the image stream
    // contains <ws>EI<ws> ?
    // Plus, there are some streams that don't have the <ws> before the EI operator
    // it sounds like we would have to actually decode the content stream, which
    // I'd rather avoid right now.
    MemoryStream baos = new MemoryStream();        // bytes known to be image data
    MemoryStream accumulated = new MemoryStream(); // bytes of the <ws>EI candidate currently being matched
    int found = 0;                                 // how many characters of <ws> E I have been matched so far
    PRTokeniser tokeniser = ps.GetTokeniser();

    int ch;
    while ((ch = tokeniser.Read()) != -1)
    {
        bool isWhitespace = PRTokeniser.IsWhitespace(ch);

        if (found == 0 && isWhitespace)
        {
            found = 1;
            accumulated.WriteByte((byte)ch);
            continue;
        }
        if (found == 1 && ch == 'E')
        {
            found = 2;
            accumulated.WriteByte((byte)ch);
            continue;
        }
        if (found == 2 && ch == 'I')
        {
            found = 3;
            accumulated.WriteByte((byte)ch);
            continue;
        }
        if (found == 3 && isWhitespace)
        {
            // <ws>EI<ws> seen: accept it only if the bytes collected so far form a complete image.
            byte[] candidate = baos.ToArray();
            if (InlineImageStreamBytesAreComplete(candidate, imageDictionary))
            {
                return candidate;
            }
        }

        // The candidate did not pan out, so everything matched so far is image data after all.
        accumulated.WriteTo(baos);
        accumulated.SetLength(0);

        if (isWhitespace)
        {
            // The current whitespace may itself be the one preceding the real EI operator
            // (e.g. <ws>E<ws>EI<ws> or <ws>EI<ws>EI<ws>), so restart the match on it
            // rather than writing it out and missing the operator.
            accumulated.WriteByte((byte)ch);
            found = 1;
        }
        else
        {
            baos.WriteByte((byte)ch);
            found = 0;
        }
    }

    throw new InlineImageParseException("Could not find image data or EI");
}
/**
 * Parses the samples of the image from the underlying content parser, accounting for filters
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * On the filtered path the parser is left immediately after the whitespace character that follows the EI operator.
 * <b>Note:</b>This implementation does not actually apply the filters at this time
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic the color space dictionary, used when probing candidate data and when no filter is present
 * @param ps the content parser
 * @return the samples of the image (still encoded when a filter is present)
 * @throws InlineImageParseException if the content stream ends before an acceptable EI operator is found
 */
private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // by the time we get to here, we have already parsed the ID operator
    if (!imageDictionary.Contains(PdfName.FILTER))
    {
        return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
    }

    // read all content until we reach an EI operator surrounded by whitespace.
    // The following algorithm has two potential issues: what if the image stream
    // contains <ws>EI<ws> ?
    // Plus, there are some streams that don't have the <ws> before the EI operator
    // it sounds like we would have to actually decode the content stream, which
    // I'd rather avoid right now.
    MemoryStream baos = new MemoryStream();        // bytes known to be image data
    MemoryStream accumulated = new MemoryStream(); // bytes of the <ws>EI candidate currently being matched
    int found = 0;                                 // how many characters of <ws> E I have been matched so far
    PRTokeniser tokeniser = ps.GetTokeniser();

    int ch;
    while ((ch = tokeniser.Read()) != -1)
    {
        bool isWhitespace = PRTokeniser.IsWhitespace(ch);

        if (found == 0 && isWhitespace)
        {
            found = 1;
            accumulated.WriteByte((byte)ch);
            continue;
        }
        if (found == 1 && ch == 'E')
        {
            found = 2;
            accumulated.WriteByte((byte)ch);
            continue;
        }
        if (found == 2 && ch == 'I')
        {
            found = 3;
            accumulated.WriteByte((byte)ch);
            continue;
        }
        if (found == 3 && isWhitespace)
        {
            // <ws>EI<ws> seen: accept it only if an image object can be built from the bytes so far.
            byte[] candidate = baos.ToArray();
            try
            {
                new PdfImageObject(imageDictionary, candidate, colorSpaceDic);
                return candidate;
            }
            catch (Exception)
            {
                // Deliberate probe: any failure means this EI was part of the image data.
                // Fall through and keep scanning.
            }
        }

        // The candidate did not pan out, so everything matched so far is image data after all.
        accumulated.WriteTo(baos);
        accumulated.SetLength(0);

        if (isWhitespace)
        {
            // The current whitespace may itself be the one preceding the real EI operator
            // (e.g. <ws>E<ws>EI<ws> or <ws>EI<ws>EI<ws>), so restart the match on it
            // rather than writing it out and missing the operator.
            accumulated.WriteByte((byte)ch);
            found = 1;
        }
        else
        {
            baos.WriteByte((byte)ch);
            found = 0;
        }
    }

    throw new InlineImageParseException("Could not find image data or EI");
}
/**
 * Parses the samples of the image from the underlying content parser, ignoring all filters.
 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
 * The parser will be left positioned immediately following the EI operator.
 * This is primarily useful if no filters have been applied.
 * @param imageDictionary the dictionary of the inline image
 * @param colorSpaceDic the color space dictionary, passed on to ComputeBytesPerRow
 * @param ps the content parser
 * @return the samples of the image
 * @throws ArgumentException if the image dictionary contains a filter entry
 * @throws InlineImageParseException if the content stream ends before the image data is complete, or EI does not follow the data
 */
private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
{
    // special case: when no filter is specified, we just read the number of bits
    // per component, multiplied by the width and height.
    if (imageDictionary.Contains(PdfName.FILTER))
    {
        throw new ArgumentException("Dictionary contains filters");
    }

    PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
    int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
    byte[] bytes = new byte[bytesToRead];
    PRTokeniser tokeniser = ps.GetTokeniser();

    // from the PDF spec: Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters,
    // the ID operator shall be followed by a single white-space character, and the next character
    // shall be interpreted as the first byte of image data.
    int shouldBeWhiteSpace = tokeniser.Read();
    if (shouldBeWhiteSpace == -1)
    {
        // End of stream is not image data; without this check -1 would be stored as 0xFF.
        throw new InlineImageParseException("End of content stream reached before end of image data");
    }

    // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to
    // capture this case and handle it. The tokeniser treats 0 as whitespace, but for our purposes
    // we shouldn't. The length check keeps an empty buffer from being indexed.
    int startIndex = 0;
    bool firstCharIsData = !PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0;
    if (firstCharIsData && bytesToRead > 0)
    {
        bytes[0] = (byte)shouldBeWhiteSpace;
        startIndex = 1;
    }

    for (int i = startIndex; i < bytesToRead; i++)
    {
        int ch = tokeniser.Read();
        if (ch == -1)
        {
            throw new InlineImageParseException("End of content stream reached before end of image data");
        }
        bytes[i] = (byte)ch;
    }

    // ReadPRObject can return null when nothing is left to read, so guard before calling ToString.
    PdfObject ei = ps.ReadPRObject();
    if (ei == null || !"EI".Equals(ei.ToString()))
    {
        throw new InlineImageParseException("EI not found after end of image data");
    }

    return bytes;
}
/**
 * Parses the next inline image dictionary from the parser. The parser must be positioned immediately following the BI operator.
 * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
 * @param ps the parser to extract the embedded image information from
 * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
 * @throws IOException if the parse fails (a key that is not a name, or a non-whitespace character directly after ID)
 */
private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
{
    // by the time we get to here, we have already parsed the BI operator
    PdfDictionary dictionary = new PdfDictionary();

    PdfObject key = ps.ReadPRObject();
    while (key != null && !"ID".Equals(key.ToString()))
    {
        // Keys must be names. Anything else means the stream is malformed, which is reported
        // as a parse failure rather than surfacing as an InvalidCastException from a hard cast.
        PdfName name = key as PdfName;
        if (name == null)
        {
            throw new IOException("Unexpected object " + key + " found where a key was expected in inline image dictionary");
        }

        PdfObject value = ps.ReadPRObject();

        // Look up the full name for an abbreviated key; fall back to the key itself when there is none.
        PdfName resolvedKey;
        bool hasFullName = inlineImageEntryAbbreviationMap.TryGetValue(name, out resolvedKey);
        if (!hasFullName || resolvedKey == null)
        {
            resolvedKey = name;
        }

        dictionary.Put(resolvedKey, GetAlternateValue(resolvedKey, value));
        key = ps.ReadPRObject();
    }

    // Consume the character that follows ID; it has to be whitespace.
    int ch = ps.GetTokeniser().Read();
    if (!PRTokeniser.IsWhitespace(ch))
    {
        throw new IOException("Unexpected character " + ch + " found after ID in inline image");
    }

    return dictionary;
}