/**
         * Parses the next inline image dictionary from the parser.  The parser must be positioned immediately following the BI operator.
         * The parser will be left positioned immediately following the single character that follows the ID operator ending the
         * inline image dictionary; that character must be whitespace.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, with abbreviated keys replaced through inlineImageEntryAbbreviationMap
         *         and values passed through GetAlternateValue
         * @throws IOException if a dictionary key is not a name, or if the character following ID is not whitespace
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
        {
            // by the time we get to here, we have already parsed the BI operator
            PdfDictionary dictionary = new PdfDictionary();

            // Entries are read as key/value pairs until the ID operator (or the end of the stream, signalled by null).
            for (PdfObject key = ps.ReadPRObject(); key != null && !"ID".Equals(key.ToString()); key = ps.ReadPRObject())
            {
                // A malformed dictionary can put any token in key position; report that as a parse
                // failure instead of letting a cast raise InvalidCastException.
                PdfName name = key as PdfName;
                if (name == null)
                {
                    throw new IOException("Unexpected token " + key + " found where a key was expected in inline image dictionary");
                }

                PdfObject value = ps.ReadPRObject();

                // Replace an abbreviated key by its full form when the map knows it; otherwise keep the key as written.
                PdfName resolvedKey;
                if (!inlineImageEntryAbbreviationMap.TryGetValue(name, out resolvedKey) || resolvedKey == null)
                {
                    resolvedKey = name;
                }

                dictionary.Put(resolvedKey, GetAlternateValue(resolvedKey, value));
            }

            // Consume the character right after ID; anything but whitespace (including end of stream, -1) is an error.
            int ch = ps.GetTokeniser().Read();
            if (!PRTokeniser.IsWhitespace(ch))
            {
                throw new IOException("Unexpected character " + ch + " found after ID in inline image");
            }

            return dictionary;
        }
        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image
         * @param colorSpaceDic the color space dictionary, handed to ComputeBytesPerRow together with the image dictionary
         * @param ps the content parser
         * @return the samples of the image
         * @throws ArgumentException if the image dictionary contains a filter entry
         * @throws InlineImageParseException if the content stream ends before all image data has been read,
         *         or if EI is not found after the image data
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // special case:  when no filter is specified, we just read the number of bits
            // per component, multiplied by the width and height.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
            int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;

            byte[] bytes = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator
            // shall be followed by a single white-space character, and the next character shall be interpreted as the first
            // byte of image data.
            // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
            int shouldBeWhiteSpace = tokeniser.Read();
            if (shouldBeWhiteSpace == -1)
            {
                // End of stream right after ID: report it rather than storing (byte)-1 as a sample.
                throw new InlineImageParseException("End of content stream reached before end of image data");
            }

            int startIndex = 0;
            // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
            if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)
            {
                // The character was not a separator after all, so it is the first byte of image data.
                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }

            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }

            // ReadPRObject returns null at the end of the stream, so both lookups are null-checked.
            PdfObject ei = ps.ReadPRObject();
            if (ei == null || !"EI".Equals(ei.ToString()))
            {
                // Some PDF producers seem to add another non-whitespace character after the image data.
                // Let's try to handle that case here.
                PdfObject ei2 = ps.ReadPRObject();
                if (ei2 == null || !"EI".Equals(ei2.ToString()))
                {
                    throw new InlineImageParseException("EI not found after end of image data");
                }
            }

            return bytes;
        }
        /**
         * Parses the samples of the image from the underlying content parser, accounting for filters
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * <b>Note:</b>This implementation does not actually apply the filters at this time
         * @param imageDictionary the dictionary of the inline image
         * @param colorSpaceDic the color space dictionary; only used when the image has no filter and
         *        the work is delegated to ParseUnfilteredSamples
         * @param ps the content parser
         * @return the (still encoded) bytes of the image found between ID and the terminating EI
         * @throws InlineImageParseException if the content stream ends before a terminating EI is accepted
         */
        private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // by the time we get to here, we have already parsed the ID operator

            if (!imageDictionary.Contains(PdfName.FILTER))
            {
                return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
            }

            // read all content until we reach an EI operator surrounded by whitespace.
            // The following algorithm has two potential issues: what if the image stream
            // contains <ws>EI<ws> ?
            // Plus, there are some streams that don't have the <ws> before the EI operator
            // it sounds like we would have to actually decode the content stream, which
            // I'd rather avoid right now.
            MemoryStream baos = new MemoryStream();        // bytes accepted as image data
            MemoryStream accumulated = new MemoryStream(); // bytes of the current "<ws>EI" candidate, not yet accepted
            PRTokeniser tokeniser = ps.GetTokeniser();

            // found counts how many characters of "<ws>EI" have been matched so far (0..3).
            int found = 0;
            int ch;

            while ((ch = tokeniser.Read()) != -1)
            {
                bool isWhitespace = PRTokeniser.IsWhitespace(ch);

                // The candidate grows by its next expected letter.
                if ((found == 1 && ch == 'E') || (found == 2 && ch == 'I'))
                {
                    found++;
                    accumulated.WriteByte((byte)ch);
                    continue;
                }

                // "<ws>EI<ws>" seen: accept it as the end of the image if the helper agrees
                // that the bytes collected so far are complete.
                if (found == 3 && isWhitespace)
                {
                    byte[] tmp = baos.ToArray();
                    if (InlineImageStreamBytesAreComplete(tmp, imageDictionary))
                    {
                        return tmp;
                    }
                }

                // Anything else means the pending candidate was image data after all: flush it.
                accumulated.WriteTo(baos);
                accumulated.SetLength(0);

                if (isWhitespace)
                {
                    // A whitespace may itself be the one preceding the real EI, so it always starts a
                    // new candidate - also after a partial match or a rejected "<ws>EI<ws>".
                    accumulated.WriteByte((byte)ch);
                    found = 1;
                }
                else
                {
                    baos.WriteByte((byte)ch);
                    found = 0;
                }
            }

            throw new InlineImageParseException("Could not find image data or EI");
        }
Beispiel #4
0
        /// <summary>
        /// Parses a stream object and removes OCGs. </summary>
        /// <remarks>
        /// XObjects whose /OC dictionary names one of the entries in <c>ocgs</c> are removed from the
        /// resources, every operator of the content stream is passed to <c>ProcessOperator</c>, inline
        /// image data is copied through byte by byte, and the rewritten bytes replace the stream data.
        /// </remarks>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2 <PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream xobject = (PRStream)xobjects.GetAsStream(name);
                    if (xobject == null)
                    {
                        // entry is not a stream; nothing to inspect
                        continue;
                    }

                    PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
                    if (oc == null)
                    {
                        continue;
                    }

                    PdfString ocname = oc.GetAsString(PdfName.NAME);
                    if (ocname != null && ocgs.Contains(ocname.ToString()))
                    {
                        xobj.Add(name);
                    }
                }
                // removal happens in a second pass so the dictionary is not modified while it is enumerated
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }

            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);

                if (!"BI".Equals(@operator.ToString()))
                {
                    continue;
                }

                // Inline image: copy everything up to and including "<ws>EI<ws>" unchanged.
                // found counts how many characters of "<ws>EI" have been matched so far (0..3).
                int  found = 0;
                int  ch;
                bool immediateAfterBI = true;
                while ((ch = tokeniser.Read()) != -1)
                {
                    bool isWhitespace = PRTokeniser.IsWhitespace(ch);

                    // a whitespace directly after BI is the only byte that is not copied
                    if (!immediateAfterBI || !isWhitespace)
                    {
                        baos.WriteByte((byte)ch);
                    }
                    immediateAfterBI = false;

                    if (found == 3 && isWhitespace)
                    {
                        break;
                    }

                    if ((found == 1 && ch == 'E') || (found == 2 && ch == 'I'))
                    {
                        found++;
                    }
                    else
                    {
                        // On a mismatch a whitespace may still be the one preceding the real EI
                        // (also after a partial "<ws>E" match), so it restarts the match at 1.
                        found = isWhitespace ? 1 : 0;
                    }
                }
            }

            // ToArray returns exactly the bytes written; GetBuffer would also expose the unused
            // capacity of the backing array as trailing zero bytes.
            byte[] rewritten = baos.ToArray();
            baos.Flush();
            baos.Close();
            stream.SetData(rewritten);
        }
Beispiel #5
0
        /**
         * Processes PDF syntax
         * The resources are pushed on the resources stack for the duration of the call and popped again
         * afterwards, also when processing fails with an exception.
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            try
            {
                PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
                PdfContentParser ps        = new PdfContentParser(tokeniser);
                List <PdfObject> operands  = new List <PdfObject>();

                while (ps.Parse(operands).Count > 0)
                {
                    PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                    // special handling for embedded images.  If we hit an ID oper, we need
                    // to skip all content until we reach an EI oper surrounded by whitespace.
                    // The following algorithm has one potential issue: what if the image stream
                    // contains <ws>EI<ws> ?
                    // it sounds like we would have to actually decode the content stream, which
                    // I'd rather avoid right now.
                    if ("ID".Equals(oper.ToString()))
                    {
                        // found counts how many characters of "<ws>EI" have been matched so far (0..3).
                        int  found = 0;
                        int  ch;
                        bool endOfImageFound = false;
                        while ((ch = tokeniser.Read()) != -1)
                        {
                            bool isWhitespace = PRTokeniser.IsWhitespace(ch);

                            if (found == 3 && isWhitespace)
                            {
                                endOfImageFound = true;
                                break;
                            }

                            if ((found == 1 && ch == 'E') || (found == 2 && ch == 'I'))
                            {
                                found++;
                            }
                            else
                            {
                                // On a mismatch a whitespace may still be the one preceding the real EI
                                // (e.g. two whitespaces in a row), so it restarts the match at 1.
                                found = isWhitespace ? 1 : 0;
                            }
                        }

                        if (endOfImageFound)
                        {
                            List <PdfObject> idOperands = new List <PdfObject>();
                            idOperands.Add(new PdfLiteral("ID"));
                            InvokeOperator((PdfLiteral)idOperands[idOperands.Count - 1], idOperands);

                            // we should probably eventually do something to make the image content stream available

                            List <PdfObject> eiOperands = new List <PdfObject>();
                            eiOperands.Add(new PdfLiteral("EI"));
                            InvokeOperator((PdfLiteral)eiOperands[eiOperands.Count - 1], eiOperands);

                            // ID and EI have both been dispatched; skip the generic dispatch below so
                            // ID is not invoked a second time.
                            continue;
                        }
                    }

                    InvokeOperator(oper, operands);
                }
            }
            finally
            {
                this.resources.Pop();
            }
        }