/**
         * Parses an inline image from the provided content parser.  The parser must be positioned immediately following the BI operator in the content stream.
         * The parser will be left with current position immediately following the EI operator that terminates the inline image
         * @param ps the content parser to use for reading the image.
         * @return the parsed image
         * @throws IOException if anything goes wring with the parsing
         * @throws InlineImageParseException if parsing of the inline image failed due to issues specific to inline image processing
         */
        public static PdfImageObject ParseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic)
        {
            PdfDictionary inlineImageDictionary = ParseInlineImageDictionary(ps);

            byte[] samples = ParseInlineImageSamples(inlineImageDictionary, colorSpaceDic, ps);
            return(new PdfImageObject(inlineImageDictionary, samples));
        }
        public byte[] Modify(byte[] contentBytes, PdfDictionary resourcesDictionary)
        {
            _contentStreamBuilderStack.Push(new PdfContentStreamBuilder());
            _resourceDictionaryStack.Push(resourcesDictionary);
            PRTokeniser      tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps        = new PdfContentParser(tokeniser);

            List <PdfObject> operands = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                //System.Diagnostics.Debug.WriteLine("[Debug] Opr: " + oper.ToString());

                PdfContentOperatorHandler operHandler = null;

                if (_operators.TryGetValue(oper.ToString(), out operHandler))
                {
                    operands = operHandler(oper, operands);
                }

                _contentStreamBuilderStack.Peek().Push(operands);
            }

            _resourceDictionaryStack.Pop();
            return(_contentStreamBuilderStack.Pop().GetBytes());
        }
        /**
         * Processes PDF syntax.
         * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
            PdfContentParser ps        = new PdfContentParser(tokeniser);
            List <iTextSharp.text.pdf.PdfObject> operands = new List <iTextSharp.text.pdf.PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                // w.GetOperatorInfo(oper)
                //w.wr.Print("operator info {0} type {1} string {2}", oper.GetType().ToString(), oper.Type, oper.ToString());

                if ("BI".Equals(oper.ToString()))
                {
                    // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
                    PdfDictionary colorSpaceDic = resources != null?resources.GetAsDict(PdfName.COLORSPACE) : null;

                    // 'iTextSharp.text.pdf.parser.ImageRenderInfo.CreateForEmbeddedImage(iTextSharp.text.pdf.parser.Matrix, iTextSharp.text.pdf.parser.InlineImageInfo, iTextSharp.text.pdf.PdfDictionary)' is inaccessible due to its protection level
                    ImageRenderInfo renderInfo = ImageRenderInfo.CreateForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
                    renderListener.RenderImage(renderInfo);
                }
                else
                {
                    InvokeOperator(oper, operands);
                }
            }
            this.resources.Pop();
        }
        /**
         * Processes PDF syntax.
         * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
            PdfContentParser ps        = new PdfContentParser(tokeniser);
            List <PdfObject> operands  = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
                if ("BI".Equals(oper.ToString()))
                {
                    // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
                    PdfDictionary colorSpaceDic = resources != null?resources.GetAsDict(PdfName.COLORSPACE) : null;

                    ImageRenderInfo renderInfo = ImageRenderInfo.CreateForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
                    renderListener.RenderImage(renderInfo);
                }
                else
                {
                    InvokeOperator(oper, operands);
                }
            }
            this.resources.Pop();
        }
        /**
         * Parses an inline image from the provided content parser.  The parser must be positioned immediately following the BI operator in the content stream.
         * The parser will be left with current position immediately following the EI operator that terminates the inline image
         * @param ps the content parser to use for reading the image.
         * @return the parsed image
         * @throws IOException if anything goes wring with the parsing
         * @throws InlineImageParseException if parsing of the inline image failed due to issues specific to inline image processing
         */
        public static InlineImageInfo ParseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic)
        {
            PdfDictionary inlineImageDictionary = ParseInlineImageDictionary(ps);

            byte[] samples = ParseInlineImageSamples(inlineImageDictionary, colorSpaceDic, ps);
            return(new InlineImageInfo(samples, inlineImageDictionary));
        }
        /**
         * Parses the next inline image dictionary from the parser.  The parser must be positioned immediately following the EI operator.
         * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
         * @throws IOException if the parse fails
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
        {
            // by the time we get to here, we have already parsed the BI operator
            PdfDictionary dictionary = new PdfDictionary();

            for (PdfObject key = ps.ReadPRObject(); key != null && !"ID".Equals(key.ToString()); key = ps.ReadPRObject())
            {
                PdfObject value = ps.ReadPRObject();

                PdfName resolvedKey;
                inlineImageEntryAbbreviationMap.TryGetValue((PdfName)key, out resolvedKey);
                if (resolvedKey == null)
                {
                    resolvedKey = (PdfName)key;
                }

                dictionary.Put(resolvedKey, GetAlternateValue(resolvedKey, value));
            }

            int ch = ps.GetTokeniser().Read();

            if (!PRTokeniser.IsWhitespace(ch))
            {
                throw new IOException("Unexpected character " + ch + " found after ID in inline image");
            }

            return(dictionary);
        }
Beispiel #7
0
        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
        {
            // special case:  when no filter is specified, we just read the number of bits
            // per component, multiplied by the width and height.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);

            int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            int shouldBeWhiteSpace = tokeniser.Read(); // skip next character (which better be a whitespace character - I suppose we could check for this)
            // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
            // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
            int startIndex = 0;

            if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace))
            {
                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }
            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }
            PdfObject ei = ps.ReadPRObject();

            if (!ei.ToString().Equals("EI"))
            {
                throw new InlineImageParseException("EI not found after end of image data");
            }

            return(bytes);
        }
Beispiel #8
0
        /// <summary>
        /// Parses a stream object and removes OCGs. </summary>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2 <PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream      xobject = (PRStream)xobjects.GetAsStream(name);
                    PdfDictionary oc      = xobject.GetAsDict(PdfName.OC);
                    if (oc != null)
                    {
                        PdfString ocname = oc.GetAsString(PdfName.NAME);
                        if (ocname != null && ocgs.Contains(ocname.ToString()))
                        {
                            xobj.Add(name);
                        }
                    }
                }
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }
            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);
            }
            baos.Flush();
            baos.Close();
            stream.SetData(baos.GetBuffer());
        }
        static void Main(string[] args)
        {
            var reader = new PdfReader(@"D:\Tmp\sample.pdf");

            try
            {
                var parser = new PdfContentParser(new PRTokeniser(reader.GetPageContent(2)));
                var sb     = new StringBuilder();
                while (parser.Tokeniser.NextToken())
                {
                    if (parser.Tokeniser.TokenType == PRTokeniser.TK_STRING)
                    {
                        string str = parser.Tokeniser.StringValue;
                        sb.Append(str);
                    }
                }
                Console.WriteLine(sb.ToString());
            }
            finally {
                reader.Close();
            }
        }
Beispiel #10
0
        public static PdfData ConvertToPdfData(string fileName, int pageNum)
        {
            if ((string.IsNullOrEmpty(fileName) || string.IsNullOrWhiteSpace(fileName)) && pageNum <= 0)
            {
                return(null);
            }
            Helpers.D.Log("PdfConvertIText.ConvertToPdfData({0}, {1})", fileName, pageNum);
            PdfData          data = new PdfData();
            PdfContentParser parser;// = new PdfContentParser();
            PRTokeniser      tokeniser = new PRTokeniser(fileName);
            PdfDictionary    dict;
            ArrayList        items;

            parser = new PdfContentParser(tokeniser);

            dict = parser.ReadDictionary();
            //dict.Contains(PdfName.IMAGE)
            items = parser.Parse(parser.ReadArray().ArrayList);
            Helpers.D.Log("PdfConvertIText.ConvertToPdfData: {0} | {1}", items.Count, string.Join(", ", items.ToArray()));

            return(data);
        }
Beispiel #11
0
        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
        {
            // special case:  when no filter is specified, we just read the number of bits
            // per component, multiplied by the width and height.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);

            int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            tokeniser.Read(); // skip next character (which better be a whitespace character - I suppose we could check for this)
            for (int i = 0; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }
            PdfObject ei = ps.ReadPRObject();

            if (!ei.ToString().Equals("EI"))
            {
                throw new InlineImageParseException("EI not found after end of image data");
            }

            return(bytes);
        }
Beispiel #12
0
 /**
  * Parses the samples of the image from the underlying content parser, ignoring all filters.
  * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
  * The parser will be left positioned immediately following the EI operator.
  * This is primarily useful if no filters have been applied. 
  * @param imageDictionary the dictionary of the inline image
  * @param ps the content parser
  * @return the samples of the image
  * @throws IOException if anything bad happens during parsing
  */
 private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) {
     // special case:  when no filter is specified, we just read the number of bits
     // per component, multiplied by the width and height.
     if (imageDictionary.Contains(PdfName.FILTER))
         throw new ArgumentException("Dictionary contains filters");
     
     PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
     
     int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
     byte[] bytes = new byte[bytesToRead];
     PRTokeniser tokeniser = ps.GetTokeniser();
     
     int shouldBeWhiteSpace = tokeniser.Read(); // skip next character (which better be a whitespace character - I suppose we could check for this)
     // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
     // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
     int startIndex = 0;
     if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0){ // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't)
         bytes[0] = (byte)shouldBeWhiteSpace;
         startIndex++;
     }
     for (int i = startIndex; i < bytesToRead; i++){
         int ch = tokeniser.Read();
         if (ch == -1)
             throw new InlineImageParseException("End of content stream reached before end of image data");
         
         bytes[i] = (byte)ch;
     }
     PdfObject ei = ps.ReadPRObject();
     if (!ei.ToString().Equals("EI"))
         throw new InlineImageParseException("EI not found after end of image data");
     
     return bytes;
 }
Beispiel #13
0
        /**
         * Processes PDF syntax
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
            PdfContentParser ps        = new PdfContentParser(tokeniser);
            List <PdfObject> operands  = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                // special handling for embedded images.  If we hit an ID oper, we need
                // to skip all content until we reach an EI oper surrounded by whitespace.
                // The following algorithm has one potential issue: what if the image stream
                // contains <ws>EI<ws> ?
                // it sounds like we would have to actually decode the content stream, which
                // I'd rather avoid right now.
                if ("ID".Equals(oper.ToString()))
                {
                    MemoryStream baos        = new MemoryStream();
                    MemoryStream accumulated = new MemoryStream();
                    int          ch;
                    int          found = 0;
                    while ((ch = tokeniser.Read()) != -1)
                    {
                        if (found == 0 && PRTokeniser.IsWhitespace(ch))
                        {
                            found++;
                            accumulated.WriteByte((byte)ch);
                        }
                        else if (found == 1 && ch == 'E')
                        {
                            found++;
                            accumulated.WriteByte((byte)ch);
                        }
                        else if (found == 2 && ch == 'I')
                        {
                            found++;
                            accumulated.WriteByte((byte)ch);
                        }
                        else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                        {
                            operands = new List <PdfObject>();
                            operands.Add(new PdfLiteral("ID"));
                            InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);

                            // we should probably eventually do something to make the accumulated image content stream available

                            operands = new List <PdfObject>();
                            operands.Add(new PdfLiteral("EI"));
                            InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);

                            break;
                        }
                        else
                        {
                            accumulated.WriteTo(baos);
                            accumulated.SetLength(0);

                            baos.WriteByte((byte)ch);
                            found = 0;
                        }
                    }
                }
                InvokeOperator(oper, operands);
            }

            this.resources.Pop();
        }
Beispiel #14
0
        /// <summary>
        /// Parses a stream object and removes OCGs. </summary>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2 <PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream      xobject = (PRStream)xobjects.GetAsStream(name);
                    PdfDictionary oc      = xobject.GetAsDict(PdfName.OC);
                    if (oc != null)
                    {
                        PdfString ocname = oc.GetAsString(PdfName.NAME);
                        if (ocname != null && ocgs.Contains(ocname.ToString()))
                        {
                            xobj.Add(name);
                        }
                    }
                }
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }
            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);
                if ("BI".Equals(@operator.ToString()))
                {
                    int  found = 0;
                    int  ch;
                    bool immediateAfterBI = true;
                    while ((ch = tokeniser.Read()) != -1)
                    {
                        if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch))
                        {
                            baos.WriteByte((byte)ch);
                        }
                        immediateAfterBI = false;
                        if (found == 0 && PRTokeniser.IsWhitespace(ch))
                        {
                            found++;
                        }
                        else if (found == 1 && ch == 'E')
                        {
                            found++;
                        }
                        else if (found == 1 && PRTokeniser.IsWhitespace(ch))
                        {
                            // this clause is needed if we have a white space character that is part of the image data
                            // followed by a whitespace character that precedes the EI operator.  In this case, we need
                            // to flush the first whitespace, then treat the current whitespace as the first potential
                            // character for the end of stream check. Note that we don't increment 'found' here.
                        }
                        else if (found == 2 && ch == 'I')
                        {
                            found++;
                        }
                        else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                        {
                            break;
                        }
                        else
                        {
                            found = 0;
                        }
                    }
                }
            }
            baos.Flush();
            baos.Close();
            stream.SetData(baos.GetBuffer());
        }
Beispiel #15
0
        private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
        {
            if (level >= MAXLEVEL)
            {
                return;
            }
            PRTokeniser inp = location.GetLocation(cmapName);

            try {
                List <PdfObject> list = new List <PdfObject>();
                PdfContentParser cp   = new PdfContentParser(inp);
                int maxExc            = 50;
                while (true)
                {
                    try {
                        cp.Parse(list);
                    }
                    catch {
                        if (--maxExc < 0)
                        {
                            break;
                        }
                        continue;
                    }
                    if (list.Count == 0)
                    {
                        break;
                    }
                    String last = list[list.Count - 1].ToString();
                    if (level == 0 && list.Count == 3 && last.Equals(DEF))
                    {
                        PdfObject key = list[0];
                        if (PdfName.REGISTRY.Equals(key))
                        {
                            cmap.Registry = list[1].ToString();
                        }
                        else if (PdfName.ORDERING.Equals(key))
                        {
                            cmap.Ordering = list[1].ToString();
                        }
                        else if (CMAPNAME.Equals(key))
                        {
                            cmap.Name = list[1].ToString();
                        }
                        else if (PdfName.SUPPLEMENT.Equals(key))
                        {
                            try {
                                cmap.Supplement = ((PdfNumber)list[1]).IntValue;
                            }
                            catch {}
                        }
                    }
                    else if ((last.Equals(ENDCIDCHAR) || last.Equals(ENDBFCHAR)) && list.Count >= 3)
                    {
                        int lmax = list.Count - 2;
                        for (int k = 0; k < lmax; k += 2)
                        {
                            if (list[k] is PdfString)
                            {
                                cmap.AddChar((PdfString)list[k], list[k + 1]);
                            }
                        }
                    }
                    else if ((last.Equals(ENDCIDRANGE) || last.Equals(ENDBFRANGE)) && list.Count >= 4)
                    {
                        int lmax = list.Count - 3;
                        for (int k = 0; k < lmax; k += 3)
                        {
                            if (list[k] is PdfString && list[k + 1] is PdfString)
                            {
                                cmap.AddRange((PdfString)list[k], (PdfString)list[k + 1], list[k + 2]);
                            }
                        }
                    }
                    else if (last.Equals(USECMAP) && list.Count == 2 && list[0] is PdfName)
                    {
                        ParseCid(PdfName.DecodeName(list[0].ToString()), cmap, location, level + 1);
                    }
                }
            }
            finally {
                inp.Close();
            }
        }
 /**
  * Processes PDF syntax.
  * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
  * @param contentBytes  the bytes of a content stream
  * @param resources     the resources that come with the content stream
  */
 public void ProcessContent(byte[] contentBytes, PdfDictionary resources){
     this.resources.Push(resources);
     PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes)));
     PdfContentParser ps = new PdfContentParser(tokeniser);
     List<PdfObject> operands = new List<PdfObject>();
     while (ps.Parse(operands).Count > 0){
         PdfLiteral oper = (PdfLiteral)operands[operands.Count-1];
         if ("BI".Equals(oper.ToString())){
             // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
             PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
             HandleInlineImage(InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
         } else {
             InvokeOperator(oper, operands);
         }
     }
     this.resources.Pop();
 }
Beispiel #17
0
 /**
  * Parses the content of a page, replacing appearances of annotations
  * with Form XObjects.
  * @param page a page dictionary
  * @throws IOException
  */
 virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref) {
     LOGGER.Info("Parsing page with reference " + pageref);
     // initializing member variables
     baos = new MemoryStream();
     this.page = page;
     this.pageref = pageref;
     
     structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
     if(structParents == null)
         throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
     annots = page.GetAsArray(PdfName.ANNOTS);
     if(annots == null)
         annots = new PdfArray();
     PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
     xobjects = resources.GetAsDict(PdfName.XOBJECT);
     if (xobjects == null) {
         xobjects = new PdfDictionary();
         resources.Put(PdfName.XOBJECT, xobjects);
     }
     // parsing the content stream of the page
     PRStream stream = (PRStream) page.GetAsStream(PdfName.CONTENTS);
     byte[] contentBytes = PdfReader.GetStreamBytes(stream);
     PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
     PdfContentParser ps = new PdfContentParser(tokeniser);
     List<PdfObject> operands = new List<PdfObject>();
     while (ps.Parse(operands).Count > 0) {
         PdfLiteral opr = (PdfLiteral) operands[operands.Count - 1];
         ProcessOperator(opr, operands);
     }
     // dealing with orphans
     while (items.Count > 0 && items[0].GetPageref() == pageref.Number) {
         StructureItem item = items[0];
         if (item is StructureObject) {
             ConvertToXObject((StructureObject) item);
             items.RemoveAt(0);
         }
     }
     if(annots.Length == 0) {
         page.Remove(PdfName.ANNOTS);
     }
     else {
         PdfDictionary annot;
         for(int i = 0; i < annots.Size; i++) {
             annot = annots.GetAsDict(i);
             if(annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                 throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
         }
     }
     // replacing the content stream
     baos.Flush();
     baos.Close();
     stream.SetData(baos.ToArray());
     // showing how many items are left
     LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
 }
Beispiel #18
0
 /**
  * Parses an inline image from the provided content parser.  The parser must be positioned immediately following the BI operator in the content stream.
  * The parser will be left with current position immediately following the EI operator that terminates the inline image
  * @param ps the content parser to use for reading the image. 
  * @return the parsed image
  * @throws IOException if anything goes wring with the parsing
  * @throws InlineImageParseException if parsing of the inline image failed due to issues specific to inline image processing
  */
 public static InlineImageInfo ParseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic) {
     PdfDictionary inlineImageDictionary = ParseInlineImageDictionary(ps);
     byte[] samples = ParseInlineImageSamples(inlineImageDictionary, colorSpaceDic, ps);
     return new InlineImageInfo(samples, inlineImageDictionary);
 }
Beispiel #19
0
        /**
         * Parses the next inline image dictionary from the parser.  The parser must be positioned immediately following the EI operator.
         * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
         * @throws IOException if the parse fails
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps) {
            // by the time we get to here, we have already parsed the BI operator
            PdfDictionary dictionary = new PdfDictionary();
            
            for (PdfObject key = ps.ReadPRObject(); key != null && !"ID".Equals(key.ToString()); key = ps.ReadPRObject()){
                PdfObject value = ps.ReadPRObject();

                PdfName resolvedKey;
                inlineImageEntryAbbreviationMap.TryGetValue((PdfName)key, out resolvedKey);
                if (resolvedKey == null)
                    resolvedKey = (PdfName)key;

                dictionary.Put(resolvedKey, GetAlternateValue(resolvedKey, value));
            }

            int ch = ps.GetTokeniser().Read();
            if (!PRTokeniser.IsWhitespace(ch))
                throw new IOException("Unexpected character " + ch + " found after ID in inline image");
            
            return dictionary;
        }
        /**
         * Parses the samples of the image from the underlying content parser, accounting for filters
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * <b>Note:</b>This implementation does not actually apply the filters at this time
         * @param imageDictionary the dictionary of the inline image
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // by the time we get to here, we have already parsed the ID operator

            if (!imageDictionary.Contains(PdfName.FILTER))
            {
                return(ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps));
            }


            // read all content until we reach an EI operator surrounded by whitespace.
            // The following algorithm has two potential issues: what if the image stream
            // contains <ws>EI<ws> ?
            // Plus, there are some streams that don't have the <ws> before the EI operator
            // it sounds like we would have to actually decode the content stream, which
            // I'd rather avoid right now.
            MemoryStream baos        = new MemoryStream();
            MemoryStream accumulated = new MemoryStream();
            int          ch;
            int          found     = 0;
            PRTokeniser  tokeniser = ps.GetTokeniser();

            byte[] ff = null;

            while ((ch = tokeniser.Read()) != -1)
            {
                if (found == 0 && PRTokeniser.IsWhitespace(ch))
                {
                    found++;
                    accumulated.WriteByte((byte)ch);
                }
                else if (found == 1 && ch == 'E')
                {
                    found++;
                    accumulated.WriteByte((byte)ch);
                }
                else if (found == 1 && PRTokeniser.IsWhitespace(ch))
                {
                    // this clause is needed if we have a white space character that is part of the image data
                    // followed by a whitespace character that precedes the EI operator.  In this case, we need
                    // to flush the first whitespace, then treat the current whitespace as the first potential
                    // character for the end of stream check.  Note that we don't increment 'found' here.
                    baos.Write(ff = accumulated.ToArray(), 0, ff.Length);
                    accumulated.SetLength(0);
                    accumulated.WriteByte((byte)ch);
                }
                else if (found == 2 && ch == 'I')
                {
                    found++;
                    accumulated.WriteByte((byte)ch);
                }
                else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                {
                    byte[] tmp = baos.ToArray();
                    if (InlineImageStreamBytesAreComplete(tmp, imageDictionary))
                    {
                        return(tmp);
                    }
                    byte[] accumulatedArr = accumulated.ToArray();
                    baos.Write(accumulatedArr, 0, accumulatedArr.Length);
                    accumulated.SetLength(0);

                    baos.WriteByte((byte)ch);
                    found = 0;
                }
                else
                {
                    baos.Write(ff = accumulated.ToArray(), 0, ff.Length);
                    accumulated.SetLength(0);

                    baos.WriteByte((byte)ch);
                    found = 0;
                }
            }
            throw new InlineImageParseException("Could not find image data or EI");
        }
Beispiel #21
0
        /**
         * Parses the content of a page, replacing appearances of annotations
         * with Form XObjects.
         * @param page a page dictionary
         * @throws IOException
         */
        public void Parse(PdfDictionary page, PdfIndirectReference pageref)
        {
            LOGGER.Info("Parsing page with reference " + pageref);
            // initializing member variables
            baos         = new MemoryStream();
            this.page    = page;
            this.pageref = pageref;

            structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
            if (structParents == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
            }
            annots = page.GetAsArray(PdfName.ANNOTS);
            if (annots == null)
            {
                annots = new PdfArray();
            }
            PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);

            xobjects = resources.GetAsDict(PdfName.XOBJECT);
            if (xobjects == null)
            {
                xobjects = new PdfDictionary();
                resources.Put(PdfName.XOBJECT, xobjects);
            }
            // parsing the content stream of the page
            PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);

            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(opr, operands);
            }
            // dealing with orphans
            while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
            {
                StructureItem item = items[0];
                if (item is StructureObject)
                {
                    ConvertToXObject((StructureObject)item);
                    items.RemoveAt(0);
                }
            }
            if (annots.Length == 0)
            {
                page.Remove(PdfName.ANNOTS);
            }
            else
            {
                PdfDictionary annot;
                for (int i = 0; i < annots.Size; i++)
                {
                    annot = annots.GetAsDict(i);
                    if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                    {
                        throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
                    }
                }
            }
            // replacing the content stream
            baos.Flush();
            baos.Close();
            stream.SetData(baos.ToArray());
            // showing how many items are left
            LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
        }
Beispiel #22
0
        /**
         * Parses the samples of the image from the underlying content parser, accounting for filters
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * <b>Note:</b>This implementation does not actually apply the filters at this time
         * @param imageDictionary the dictionary of the inline image
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) {
            // by the time we get to here, we have already parsed the ID operator
            
            if (!imageDictionary.Contains(PdfName.FILTER)){
                return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
            }
            
            
            // read all content until we reach an EI operator surrounded by whitespace.
            // The following algorithm has two potential issues: what if the image stream 
            // contains <ws>EI<ws> ?
            // Plus, there are some streams that don't have the <ws> before the EI operator
            // it sounds like we would have to actually decode the content stream, which
            // I'd rather avoid right now.
            MemoryStream baos = new MemoryStream();
            MemoryStream accumulated = new MemoryStream();
            int ch;
            int found = 0;
            PRTokeniser tokeniser = ps.GetTokeniser();
            byte[] ff = null;
            
            while ((ch = tokeniser.Read()) != -1){
                if (found == 0 && PRTokeniser.IsWhitespace(ch)){
                    found++;
                    accumulated.WriteByte((byte)ch);
                } else if (found == 1 && ch == 'E'){
                    found++;
                    accumulated.WriteByte((byte)ch);
                } else if (found == 1 && PRTokeniser.IsWhitespace(ch)){
                    // this clause is needed if we have a white space character that is part of the image data
                    // followed by a whitespace character that precedes the EI operator.  In this case, we need
                    // to flush the first whitespace, then treat the current whitespace as the first potential
                    // character for the end of stream check.  Note that we don't increment 'found' here.
                    baos.Write(ff = accumulated.ToArray(), 0, ff.Length);
                    accumulated.SetLength(0);
                    accumulated.WriteByte((byte)ch);
                } else if (found == 2 && ch == 'I'){ 
                    found++;
                    accumulated.WriteByte((byte)ch);
                } else if (found == 3 && PRTokeniser.IsWhitespace(ch)){
                    try
                    {
                        byte[] tmp = baos.ToArray();
                        new PdfImageObject(imageDictionary, tmp, colorSpaceDic);
                        return tmp;
                    }
                    catch (Exception)
                    {
                        byte[] tmp = accumulated.ToArray();
                        baos.Write(tmp, 0, tmp.Length);
                        accumulated.SetLength(0);

                        baos.WriteByte((byte)ch);
                        found = 0;
                    }

                } else {
                    baos.Write(ff = accumulated.ToArray(), 0, ff.Length);
                    accumulated.SetLength(0);
                    
                    baos.WriteByte((byte)ch);
                    found = 0;
                }
            }
            throw new InlineImageParseException("Could not find image data or EI");
        }