Exemplo n.º 1
0
        /**
         * Reads the entries of an inline image dictionary from the content parser.
         * The BI operator has already been consumed by the caller. Objects are read in key/value pairs
         * until the ID operator is seen (or ReadPRObject returns null). Each key is swapped for its entry
         * in inlineImageEntryAbbreviationMap when one exists, and each value is passed through
         * GetAlternateValue before it is stored.
         * After the loop, one more character is read from the tokeniser and must be whitespace.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, keyed by the resolved entry names
         * @throws IOException if the character read after the ID operator is not whitespace
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
        {
            PdfDictionary result = new PdfDictionary();

            PdfObject token = ps.ReadPRObject();
            while (token != null && !"ID".Equals(token.ToString()))
            {
                // The value is read before the key is inspected, so the parser always advances by a full pair.
                PdfObject rawValue = ps.ReadPRObject();
                PdfName   rawName  = (PdfName)token;

                // Expand an abbreviated key; keep the key unchanged when the map has no entry for it.
                PdfName mapped;
                inlineImageEntryAbbreviationMap.TryGetValue(rawName, out mapped);
                PdfName fullName = mapped == null ? rawName : mapped;

                result.Put(fullName, GetAlternateValue(fullName, rawValue));

                token = ps.ReadPRObject();
            }

            // Consume the single character that separates ID from the image data.
            int separator = ps.GetTokeniser().Read();
            if (PRTokeniser.IsWhitespace(separator))
            {
                return result;
            }

            throw new IOException("Unexpected character " + separator + " found after ID in inline image");
        }
Exemplo n.º 2
0
        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image; must not contain a FILTER entry
         * @param colorSpaceDic handed to ComputeBytesPerRow together with the image dictionary
         * @param ps the content parser
         * @return the samples of the image
         * @throws ArgumentException if the image dictionary contains a FILTER entry
         * @throws InlineImageParseException if the content ends before the image data is complete,
         *         or if the image data is not followed by the EI operator
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // special case:  when no filter is specified, the data length is known up front:
            // the number of bytes per row multiplied by the height.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);

            int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
            // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
            int shouldBeWhiteSpace = tokeniser.Read();
            if (shouldBeWhiteSpace == -1)
            {
                // Nothing follows ID at all. Without this check the -1 would be stored as the data byte 0xFF.
                throw new InlineImageParseException("End of content stream reached before end of image data");
            }

            int startIndex = 0;

            // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
            bool isImageData = !PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0;
            if (isImageData && bytesToRead > 0) // an empty image has no slot for the byte; writing it would overrun the array
            {
                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }

            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }

            // ReadPRObject can yield null when no further object is available (presumably at end of content),
            // so guard before dereferencing and report it as a parse failure rather than a NullReferenceException.
            PdfObject ei = ps.ReadPRObject();

            if (ei == null || !"EI".Equals(ei.ToString()))
            {
                // Some PDF producers seem to add another non-whitespace character after the image data.
                // Let's try to handle that case here.
                PdfObject ei2 = ps.ReadPRObject();
                if (ei2 == null || !"EI".Equals(ei2.ToString()))
                {
                    throw new InlineImageParseException("EI not found after end of image data");
                }
            }

            return bytes;
        }
Exemplo n.º 3
0
        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image; must not contain a FILTER entry
         * @param ps the content parser
         * @return the samples of the image
         * @throws ArgumentException if the image dictionary contains a FILTER entry
         * @throws InlineImageParseException if the content ends before the image data is complete,
         *         or if the image data is not followed by the EI operator
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
        {
            // special case:  when no filter is specified, the data length is known up front:
            // the number of bytes per row multiplied by the height.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);

            int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            // The PDF spec says ID is followed by a single white-space character and the image data starts
            // right after it. Some producers omit that separator, in which case the character read here is
            // already the first sample. Discarding it unconditionally would drop that sample and shift the
            // rest of the data by one byte.
            int shouldBeWhiteSpace = tokeniser.Read();
            if (shouldBeWhiteSpace == -1)
            {
                throw new InlineImageParseException("End of content stream reached before end of image data");
            }

            int startIndex = 0;

            // The tokeniser classifies 0 as whitespace, but a 0 byte here is far more likely to be sample data.
            bool isImageData = !PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0;
            if (isImageData && bytesToRead > 0) // an empty image has no slot for the byte
            {
                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }

            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }

            // ReadPRObject can yield null when no further object is available (presumably at end of content),
            // so guard before dereferencing and report it as a parse failure rather than a NullReferenceException.
            PdfObject ei = ps.ReadPRObject();

            if (ei == null || !"EI".Equals(ei.ToString()))
            {
                throw new InlineImageParseException("EI not found after end of image data");
            }

            return bytes;
        }
Exemplo n.º 4
0
 /**
  * Parses the samples of the image from the underlying content parser, ignoring all filters.
  * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
  * The parser will be left positioned immediately following the EI operator.
  * This is primarily useful if no filters have been applied.
  * @param imageDictionary the dictionary of the inline image; must not contain a FILTER entry
  * @param colorSpaceDic handed to ComputeBytesPerRow together with the image dictionary
  * @param ps the content parser
  * @return the samples of the image
  * @throws ArgumentException if the image dictionary contains a FILTER entry
  * @throws InlineImageParseException if the content ends before the image data is complete,
  *         or if the image data is not followed by the EI operator
  */
 private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) {
     // special case:  when no filter is specified, the data length is known up front:
     // the number of bytes per row multiplied by the height.
     if (imageDictionary.Contains(PdfName.FILTER)) {
         throw new ArgumentException("Dictionary contains filters");
     }

     PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);

     int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
     byte[] bytes = new byte[bytesToRead];
     PRTokeniser tokeniser = ps.GetTokeniser();

     // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
     // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
     int shouldBeWhiteSpace = tokeniser.Read();
     if (shouldBeWhiteSpace == -1) {
         // Nothing follows ID at all. Without this check the -1 would be stored as the data byte 0xFF.
         throw new InlineImageParseException("End of content stream reached before end of image data");
     }

     int startIndex = 0;
     // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
     bool isImageData = !PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0;
     if (isImageData && bytesToRead > 0) { // an empty image has no slot for the byte; writing it would overrun the array
         bytes[0] = (byte)shouldBeWhiteSpace;
         startIndex++;
     }
     for (int i = startIndex; i < bytesToRead; i++) {
         int ch = tokeniser.Read();
         if (ch == -1) {
             throw new InlineImageParseException("End of content stream reached before end of image data");
         }

         bytes[i] = (byte)ch;
     }

     // ReadPRObject can yield null when no further object is available (presumably at end of content),
     // so guard before dereferencing and report it as a parse failure rather than a NullReferenceException.
     PdfObject ei = ps.ReadPRObject();
     if (ei == null || !"EI".Equals(ei.ToString())) {
         throw new InlineImageParseException("EI not found after end of image data");
     }

     return bytes;
 }
Exemplo n.º 5
0
        /**
         * Builds the dictionary of an inline image from the objects that follow the BI operator.
         * The BI operator has already been consumed by the caller. Key/value pairs are read until the
         * ID operator is encountered (or ReadPRObject returns null). Keys found in
         * inlineImageEntryAbbreviationMap are replaced by the mapped name, and every value is
         * converted with GetAlternateValue before being stored.
         * One further character is then read from the tokeniser; it must be whitespace.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, keyed by the resolved entry names
         * @throws IOException if the character read after the ID operator is not whitespace
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps) {
            PdfDictionary entries = new PdfDictionary();

            for (;;) {
                PdfObject keyObject = ps.ReadPRObject();
                if (keyObject == null || "ID".Equals(keyObject.ToString())) {
                    break;
                }

                // Read the value first so that a full pair is always consumed before the key is examined.
                PdfObject valueObject = ps.ReadPRObject();
                PdfName abbreviated = (PdfName)keyObject;

                // Fall back to the key as written when the abbreviation map has nothing for it.
                PdfName expanded;
                inlineImageEntryAbbreviationMap.TryGetValue(abbreviated, out expanded);
                PdfName entryName = expanded != null ? expanded : abbreviated;

                entries.Put(entryName, GetAlternateValue(entryName, valueObject));
            }

            // The character after ID separates the dictionary from the image data.
            int afterId = ps.GetTokeniser().Read();
            if (!PRTokeniser.IsWhitespace(afterId)) {
                throw new IOException("Unexpected character " + afterId + " found after ID in inline image");
            }

            return entries;
        }