/// <summary>Parses an inline image from the provided content parser.</summary>
        /// <remarks>
        /// Parses an inline image from the provided content parser.  The parser must be positioned immediately following the BI operator in the content stream.
        /// The parser will be left with current position immediately following the EI operator that terminates the inline image
        /// </remarks>
        /// <param name="ps">the content parser to use for reading the image.</param>
        /// <param name="colorSpaceDic">a color space dictionary</param>
        /// <returns>the parsed image</returns>
        /// <exception cref="System.IO.IOException">if anything goes wrong with the parsing</exception>
        /// <exception cref="InlineImageParseException">if parsing of the inline image failed due to issues specific to inline image processing
        ///     </exception>
        public static PdfStream Parse(PdfCanvasParser ps, PdfDictionary colorSpaceDic)
        {
            // Step 1: read the inline image dictionary (the entries between BI and ID).
            PdfDictionary imageDict = ParseDictionary(ps);

            // Step 2: read the raw sample bytes that follow the ID operator.
            byte[] imageBytes = ParseSamples(imageDict, colorSpaceDic, ps);

            // Step 3: wrap the samples in a stream object and copy the dictionary entries onto it.
            PdfStream result = new PdfStream(imageBytes);
            result.PutAll(imageDict);
            return result;
        }
        /// <summary>Parses the next inline image dictionary from the parser.</summary>
        /// <remarks>
        /// Parses the next inline image dictionary from the parser.  The parser must be positioned immediately following the BI operator.
        /// The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
        /// </remarks>
        /// <param name="ps">the parser to extract the embedded image information from</param>
        /// <returns>the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
        ///     </returns>
        /// <exception cref="System.IO.IOException">if the parse fails</exception>
        private static PdfDictionary ParseDictionary(PdfCanvasParser ps)
        {
            // The caller has already consumed the BI operator, so what follows is a sequence of
            // key/value objects that ends at the ID operator (or when the parser runs out of objects).
            PdfDictionary imageDict = new PdfDictionary();
            PdfObject     entryKey  = ps.ReadObject();

            while (entryKey != null && !"ID".Equals(entryKey.ToString()))
            {
                PdfObject entryValue = ps.ReadObject();

                // Expand an abbreviated key to its full name; keys without a mapping are used as-is.
                PdfName fullKey = inlineImageEntryAbbreviationMap.Get((PdfName)entryKey);
                if (fullKey == null)
                {
                    fullKey = (PdfName)entryKey;
                }
                imageDict.Put(fullKey, GetAlternateValue(fullKey, entryValue));

                entryKey = ps.ReadObject();
            }

            // Consume the single character that follows the dictionary; it has to be whitespace.
            int afterId = ps.GetTokeniser().Read();
            if (PdfTokenizer.IsWhitespace(afterId))
            {
                return imageDict;
            }
            throw new InlineImageParsingUtils.InlineImageParseException(PdfException.UnexpectedCharacter1FoundAfterIDInInlineImage
                                                                        ).SetMessageParams(afterId);
        }
Exemplo n.º 3
0
        public virtual void InnerArraysInContentStreamTest()
        {
            // Verifies that nested arrays inside a content-stream dictionary are parsed correctly:
            // the ColorantsDef array of the second parsed operand must match the expected
            // name/array pairs.
            String      inputFileName = sourceFolder + "innerArraysInContentStream.pdf";
            PdfDocument pdfDocument   = new PdfDocument(new PdfReader(inputFileName));

            try
            {
                // Parse the raw content bytes of the first page with that page's resources.
                byte[] docInBytes = pdfDocument.GetFirstPage().GetContentBytes();
                RandomAccessSourceFactory factory = new RandomAccessSourceFactory();
                PdfTokenizer      tokeniser       = new PdfTokenizer(new RandomAccessFileOrArray(factory.CreateSource(docInBytes)));
                PdfResources      resources       = pdfDocument.GetFirstPage().GetResources();
                PdfCanvasParser   ps       = new PdfCanvasParser(tokeniser, resources);
                IList <PdfObject> actual   = ps.Parse(null);
                IList <PdfObject> expected = new List <PdfObject>();

                expected.Add(new PdfString("Cyan"));
                expected.Add(new PdfArray(new int[] { 1, 0, 0, 0 }));
                expected.Add(new PdfString("Magenta"));
                expected.Add(new PdfArray(new int[] { 0, 1, 0, 0 }));
                expected.Add(new PdfString("Yellow"));
                expected.Add(new PdfArray(new int[] { 0, 0, 1, 0 }));
                PdfArray cmpArray = new PdfArray(expected);

                NUnit.Framework.Assert.IsTrue(new CompareTool().CompareArrays(cmpArray, (((PdfDictionary)actual[1]).GetAsArray
                                                                                             (new PdfName("ColorantsDef")))));
            }
            finally
            {
                // Always release the document (and its underlying reader), even when an
                // assertion or the parser throws, so the input file handle is not leaked.
                pdfDocument.Close();
            }
        }
        /// <summary>
        /// Parses the samples of the image from the underlying content parser, accounting for filters
        /// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
        /// </summary>
        /// <remarks>
        /// Parses the samples of the image from the underlying content parser, accounting for filters
        /// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
        /// The parser will be left positioned immediately following the EI operator.
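        /// <b>Note:</b> This implementation does not actually apply the filters at this time.
        /// </remarks>
        /// <param name="imageDictionary">the dictionary of the inline image</param>
        /// <param name="colorSpaceDic">a color space dictionary, used when checking whether the image's color space is known</param>
        /// <param name="ps">the content parser</param>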
        /// <returns>the samples of the image</returns>
        /// <exception cref="System.IO.IOException">if anything bad happens during parsing</exception>
        private static byte[] ParseSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser
                                           ps)
        {
            // The ID operator has already been consumed when this method is entered.
            // Fast path: no filter and a known color space, so the sample count can be computed.
            bool hasFilter = imageDictionary.ContainsKey(PdfName.Filter);
            if (!hasFilter && ImageColorSpaceIsKnown(imageDictionary, colorSpaceDic))
            {
                return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
            }

            // Slow path: scan forward for the sequence <ws> 'E' 'I' <ws>.
            // Known limitations of this scan: the image data itself may contain <ws>EI<ws>,
            // and some streams omit the <ws> in front of EI. Handling those properly would
            // require decoding the content, which is deliberately not done here.
            //
            //   matched == 0 : nothing of the terminator seen yet
            //   matched == 1 : leading whitespace seen
            //   matched == 2 : whitespace + 'E' seen
            //   matched == 3 : whitespace + 'E' + 'I' seen
            MemoryStream imageBytes = new MemoryStream();
            MemoryStream pending    = new MemoryStream();
            PdfTokenizer tokeniser  = ps.GetTokeniser();
            int          matched    = 0;
            int          ch;

            while ((ch = tokeniser.Read()) != -1)
            {
                if (matched == 0 && PdfTokenizer.IsWhitespace(ch))
                {
                    matched = 1;
                    pending.Write(ch);
                }
                else if (matched == 1 && ch == 'E')
                {
                    matched = 2;
                    pending.Write(ch);
                }
                else if (matched == 1 && PdfTokenizer.IsWhitespace(ch))
                {
                    // Two whitespace characters in a row: the held one belongs to the image
                    // data, so flush it and let the current one become the new candidate for
                    // the whitespace preceding EI. 'matched' intentionally stays at 1.
                    imageBytes.Write(pending.ToArray());
                    pending.JReset();
                    pending.Write(ch);
                }
                else if (matched == 2 && ch == 'I')
                {
                    matched = 3;
                    pending.Write(ch);
                }
                else
                {
                    if (matched == 3 && PdfTokenizer.IsWhitespace(ch))
                    {
                        // Full terminator candidate seen; accept it only if the bytes collected
                        // so far pass the completeness check.
                        byte[] candidate = imageBytes.ToArray();
                        if (InlineImageStreamBytesAreComplete(candidate, imageDictionary))
                        {
                            return candidate;
                        }
                    }
                    // Not a terminator (or a rejected one): everything held back is image data.
                    imageBytes.Write(pending.ToArray());
                    pending.JReset();
                    imageBytes.Write(ch);
                    matched = 0;
                }
            }
            throw new InlineImageParsingUtils.InlineImageParseException(PdfException.CannotFindImageDataOrEI);
        }
        /// <summary>Parses the samples of the image from the underlying content parser, ignoring all filters.</summary>
        /// <remarks>
        /// Parses the samples of the image from the underlying content parser, ignoring all filters.
        /// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
        /// The parser will be left positioned immediately following the EI operator.
        /// This is primarily useful if no filters have been applied.
        /// </remarks>
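        /// <param name="imageDictionary">the dictionary of the inline image</param>
        /// <param name="colorSpaceDic">a color space dictionary, passed on when computing the number of bytes per row</param>
        /// <param name="ps">the content parser</param>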
        /// <returns>the samples of the image</returns>
        /// <exception cref="System.IO.IOException">if anything bad happens during parsing</exception>
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser
                                                     ps)
        {
            // special case:  when no filter is specified, we just read the number of bytes
            // per row multiplied by the height.
            if (imageDictionary.ContainsKey(PdfName.Filter))
            {
                throw new ArgumentException("Dictionary contains filters");
            }
            PdfNumber h           = imageDictionary.GetAsNumber(PdfName.Height);
            int       bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue();

            byte[]       bytes              = new byte[bytesToRead];
            PdfTokenizer tokeniser          = ps.GetTokeniser();
            int          shouldBeWhiteSpace = tokeniser.Read();

            if (shouldBeWhiteSpace == -1)
            {
                // The content ended right after the ID operator: there is neither image data nor
                // an EI operator. Report it as a parse failure instead of storing the EOF marker
                // as a sample byte.
                throw new InlineImageParsingUtils.InlineImageParseException(PdfException.EndOfContentStreamReachedBeforeEndOfImageData
                                                                            );
            }
            // skip next character (which better be a whitespace character)
            // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
            // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
            int startIndex = 0;

            if (!PdfTokenizer.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)
            {
                // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }
            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParsingUtils.InlineImageParseException(PdfException.EndOfContentStreamReachedBeforeEndOfImageData
                                                                                );
                }
                bytes[i] = (byte)ch;
            }
            // ReadObject() yields null once the content is exhausted, so every lookup of the
            // EI operator has to be null-safe; a missing operator is a parse failure.
            PdfObject ei = ps.ReadObject();

            if (ei == null || !"EI".Equals(ei.ToString()))
            {
                // Some PDF producers seem to add another non-whitespace character after the image data.
                // Let's try to handle that case here.
                PdfObject ei2 = ps.ReadObject();
                if (ei2 == null || !"EI".Equals(ei2.ToString()))
                {
                    throw new InlineImageParsingUtils.InlineImageParseException(PdfException.OperatorEINotFoundAfterEndOfImageData
                                                                                );
                }
            }
            return(bytes);
        }
        /// <summary>
        /// Parses the samples of the image from the underlying content parser, accounting for filters
        /// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
        /// </summary>
        /// <remarks>
        /// Parses the samples of the image from the underlying content parser, accounting for filters
        /// The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
        /// The parser will be left positioned immediately following the EI operator.
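        /// <b>Note:</b> This implementation does not actually apply the filters at this time.
        /// </remarks>
        /// <param name="imageDictionary">the dictionary of the inline image</param>
        /// <param name="colorSpaceDic">a color space dictionary, used when checking whether the image's color space is known</param>
        /// <param name="ps">the content parser</param>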
        /// <returns>the samples of the image</returns>
        private static byte[] ParseSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfCanvasParser
                                           ps)
        {
            // The ID operator has already been consumed when this method is entered.
            // Fast path: no filter and a known color space, so the sample count can be computed.
            bool hasFilter = imageDictionary.ContainsKey(PdfName.Filter);
            if (!hasFilter && ImageColorSpaceIsKnown(imageDictionary, colorSpaceDic))
            {
                return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
            }

            // Slow path: collect bytes until an 'E' 'I' <ws> sequence is found for which the
            // collected bytes pass the completeness check.
            //
            // 'held' counts how many characters of a potential "EI" have been seen but not yet
            // written out (0, 1 or 2). When the candidate turns out not to be the terminator,
            // the first 'held' bytes of the EI array are written back as image data
            // (EI is declared outside this view; presumably it holds the bytes 'E', 'I').
            MemoryStream imageBytes = new MemoryStream();
            PdfTokenizer tokeniser  = ps.GetTokeniser();
            int          held       = 0;
            int          ch;

            while ((ch = tokeniser.Read()) != -1)
            {
                if (ch == 'E')
                {
                    // Flush whatever was held back, then hold this 'E' as a new candidate start.
                    imageBytes.Write(EI, 0, held);
                    held = 1;
                    continue;
                }
                if (held == 1 && ch == 'I')
                {
                    // "EI" seen; keep holding both characters until the next one is inspected.
                    held = 2;
                    continue;
                }
                if (held == 2 && PdfTokenizer.IsWhitespace(ch))
                {
                    byte[] candidate = imageBytes.ToArray();
                    if (InlineImageStreamBytesAreComplete(candidate, imageDictionary))
                    {
                        return candidate;
                    }
                }
                // Not a terminator (or a rejected one): held characters and the current one are data.
                imageBytes.Write(EI, 0, held);
                imageBytes.Write(ch);
                held = 0;
            }
            throw new InlineImageParsingUtils.InlineImageParseException(PdfException.CannotFindImageDataOrEI);
        }