/**
         * Processes PDF syntax.
         * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
            PdfContentParser ps        = new PdfContentParser(tokeniser);
            List <PdfObject> operands  = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
                if ("BI".Equals(oper.ToString()))
                {
                    // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
                    PdfDictionary colorSpaceDic = resources != null?resources.GetAsDict(PdfName.COLORSPACE) : null;

                    ImageRenderInfo renderInfo = ImageRenderInfo.CreateForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
                    renderListener.RenderImage(renderInfo);
                }
                else
                {
                    InvokeOperator(oper, operands);
                }
            }
            this.resources.Pop();
        }
        public byte[] Modify(byte[] contentBytes, PdfDictionary resourcesDictionary)
        {
            _contentStreamBuilderStack.Push(new PdfContentStreamBuilder());
            _resourceDictionaryStack.Push(resourcesDictionary);
            PRTokeniser      tokeniser = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps        = new PdfContentParser(tokeniser);

            List <PdfObject> operands = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                //System.Diagnostics.Debug.WriteLine("[Debug] Opr: " + oper.ToString());

                PdfContentOperatorHandler operHandler = null;

                if (_operators.TryGetValue(oper.ToString(), out operHandler))
                {
                    operands = operHandler(oper, operands);
                }

                _contentStreamBuilderStack.Peek().Push(operands);
            }

            _resourceDictionaryStack.Pop();
            return(_contentStreamBuilderStack.Pop().GetBytes());
        }
        /**
         * Processes PDF syntax.
         * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
            PdfContentParser ps        = new PdfContentParser(tokeniser);
            List <iTextSharp.text.pdf.PdfObject> operands = new List <iTextSharp.text.pdf.PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                // w.GetOperatorInfo(oper)
                //w.wr.Print("operator info {0} type {1} string {2}", oper.GetType().ToString(), oper.Type, oper.ToString());

                if ("BI".Equals(oper.ToString()))
                {
                    // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
                    PdfDictionary colorSpaceDic = resources != null?resources.GetAsDict(PdfName.COLORSPACE) : null;

                    // 'iTextSharp.text.pdf.parser.ImageRenderInfo.CreateForEmbeddedImage(iTextSharp.text.pdf.parser.Matrix, iTextSharp.text.pdf.parser.InlineImageInfo, iTextSharp.text.pdf.PdfDictionary)' is inaccessible due to its protection level
                    ImageRenderInfo renderInfo = ImageRenderInfo.CreateForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
                    renderListener.RenderImage(renderInfo);
                }
                else
                {
                    InvokeOperator(oper, operands);
                }
            }
            this.resources.Pop();
        }
Exemple #4
0
        /// <summary>
        /// Parses a stream object and removes OCGs. </summary>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2 <PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream      xobject = (PRStream)xobjects.GetAsStream(name);
                    PdfDictionary oc      = xobject.GetAsDict(PdfName.OC);
                    if (oc != null)
                    {
                        PdfString ocname = oc.GetAsString(PdfName.NAME);
                        if (ocname != null && ocgs.Contains(ocname.ToString()))
                        {
                            xobj.Add(name);
                        }
                    }
                }
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }
            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);
            }
            baos.Flush();
            baos.Close();
            stream.SetData(baos.GetBuffer());
        }
Exemple #5
0
        public static PdfData ConvertToPdfData(string fileName, int pageNum)
        {
            if ((string.IsNullOrEmpty(fileName) || string.IsNullOrWhiteSpace(fileName)) && pageNum <= 0)
            {
                return(null);
            }
            Helpers.D.Log("PdfConvertIText.ConvertToPdfData({0}, {1})", fileName, pageNum);
            PdfData          data = new PdfData();
            PdfContentParser parser;// = new PdfContentParser();
            PRTokeniser      tokeniser = new PRTokeniser(fileName);
            PdfDictionary    dict;
            ArrayList        items;

            parser = new PdfContentParser(tokeniser);

            dict = parser.ReadDictionary();
            //dict.Contains(PdfName.IMAGE)
            items = parser.Parse(parser.ReadArray().ArrayList);
            Helpers.D.Log("PdfConvertIText.ConvertToPdfData: {0} | {1}", items.Count, string.Join(", ", items.ToArray()));

            return(data);
        }
Exemple #6
0
        /**
         * Parses the content of a page, replacing appearances of annotations
         * with Form XObjects.
         * @param page a page dictionary
         * @throws IOException
         */
        public void Parse(PdfDictionary page, PdfIndirectReference pageref)
        {
            LOGGER.Info("Parsing page with reference " + pageref);
            // initializing member variables
            baos         = new MemoryStream();
            this.page    = page;
            this.pageref = pageref;

            structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
            if (structParents == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
            }
            annots = page.GetAsArray(PdfName.ANNOTS);
            if (annots == null)
            {
                annots = new PdfArray();
            }
            PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);

            xobjects = resources.GetAsDict(PdfName.XOBJECT);
            if (xobjects == null)
            {
                xobjects = new PdfDictionary();
                resources.Put(PdfName.XOBJECT, xobjects);
            }
            // parsing the content stream of the page
            PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);

            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(opr, operands);
            }
            // dealing with orphans
            while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
            {
                StructureItem item = items[0];
                if (item is StructureObject)
                {
                    ConvertToXObject((StructureObject)item);
                    items.RemoveAt(0);
                }
            }
            if (annots.Length == 0)
            {
                page.Remove(PdfName.ANNOTS);
            }
            else
            {
                PdfDictionary annot;
                for (int i = 0; i < annots.Size; i++)
                {
                    annot = annots.GetAsDict(i);
                    if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                    {
                        throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
                    }
                }
            }
            // replacing the content stream
            baos.Flush();
            baos.Close();
            stream.SetData(baos.ToArray());
            // showing how many items are left
            LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
        }
Exemple #7
0
        private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
        {
            if (level >= MAXLEVEL)
            {
                return;
            }
            PRTokeniser inp = location.GetLocation(cmapName);

            try {
                List <PdfObject> list = new List <PdfObject>();
                PdfContentParser cp   = new PdfContentParser(inp);
                int maxExc            = 50;
                while (true)
                {
                    try {
                        cp.Parse(list);
                    }
                    catch {
                        if (--maxExc < 0)
                        {
                            break;
                        }
                        continue;
                    }
                    if (list.Count == 0)
                    {
                        break;
                    }
                    String last = list[list.Count - 1].ToString();
                    if (level == 0 && list.Count == 3 && last.Equals(DEF))
                    {
                        PdfObject key = list[0];
                        if (PdfName.REGISTRY.Equals(key))
                        {
                            cmap.Registry = list[1].ToString();
                        }
                        else if (PdfName.ORDERING.Equals(key))
                        {
                            cmap.Ordering = list[1].ToString();
                        }
                        else if (CMAPNAME.Equals(key))
                        {
                            cmap.Name = list[1].ToString();
                        }
                        else if (PdfName.SUPPLEMENT.Equals(key))
                        {
                            try {
                                cmap.Supplement = ((PdfNumber)list[1]).IntValue;
                            }
                            catch {}
                        }
                    }
                    else if ((last.Equals(ENDCIDCHAR) || last.Equals(ENDBFCHAR)) && list.Count >= 3)
                    {
                        int lmax = list.Count - 2;
                        for (int k = 0; k < lmax; k += 2)
                        {
                            if (list[k] is PdfString)
                            {
                                cmap.AddChar((PdfString)list[k], list[k + 1]);
                            }
                        }
                    }
                    else if ((last.Equals(ENDCIDRANGE) || last.Equals(ENDBFRANGE)) && list.Count >= 4)
                    {
                        int lmax = list.Count - 3;
                        for (int k = 0; k < lmax; k += 3)
                        {
                            if (list[k] is PdfString && list[k + 1] is PdfString)
                            {
                                cmap.AddRange((PdfString)list[k], (PdfString)list[k + 1], list[k + 2]);
                            }
                        }
                    }
                    else if (last.Equals(USECMAP) && list.Count == 2 && list[0] is PdfName)
                    {
                        ParseCid(PdfName.DecodeName(list[0].ToString()), cmap, location, level + 1);
                    }
                }
            }
            finally {
                inp.Close();
            }
        }
Exemple #8
0
        /// <summary>
        /// Parses a stream object and removes OCGs. </summary>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2 <PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream      xobject = (PRStream)xobjects.GetAsStream(name);
                    PdfDictionary oc      = xobject.GetAsDict(PdfName.OC);
                    if (oc != null)
                    {
                        PdfString ocname = oc.GetAsString(PdfName.NAME);
                        if (ocname != null && ocgs.Contains(ocname.ToString()))
                        {
                            xobj.Add(name);
                        }
                    }
                }
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }
            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);
                if ("BI".Equals(@operator.ToString()))
                {
                    int  found = 0;
                    int  ch;
                    bool immediateAfterBI = true;
                    while ((ch = tokeniser.Read()) != -1)
                    {
                        if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch))
                        {
                            baos.WriteByte((byte)ch);
                        }
                        immediateAfterBI = false;
                        if (found == 0 && PRTokeniser.IsWhitespace(ch))
                        {
                            found++;
                        }
                        else if (found == 1 && ch == 'E')
                        {
                            found++;
                        }
                        else if (found == 1 && PRTokeniser.IsWhitespace(ch))
                        {
                            // this clause is needed if we have a white space character that is part of the image data
                            // followed by a whitespace character that precedes the EI operator.  In this case, we need
                            // to flush the first whitespace, then treat the current whitespace as the first potential
                            // character for the end of stream check. Note that we don't increment 'found' here.
                        }
                        else if (found == 2 && ch == 'I')
                        {
                            found++;
                        }
                        else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                        {
                            break;
                        }
                        else
                        {
                            found = 0;
                        }
                    }
                }
            }
            baos.Flush();
            baos.Close();
            stream.SetData(baos.GetBuffer());
        }
 /**
  * Processes PDF syntax.
  * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
  * @param contentBytes  the bytes of a content stream
  * @param resources     the resources that come with the content stream
  */
 public void ProcessContent(byte[] contentBytes, PdfDictionary resources){
     this.resources.Push(resources);
     PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes)));
     PdfContentParser ps = new PdfContentParser(tokeniser);
     List<PdfObject> operands = new List<PdfObject>();
     while (ps.Parse(operands).Count > 0){
         PdfLiteral oper = (PdfLiteral)operands[operands.Count-1];
         if ("BI".Equals(oper.ToString())){
             // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
             PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
             HandleInlineImage(InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
         } else {
             InvokeOperator(oper, operands);
         }
     }
     this.resources.Pop();
 }
Exemple #10
0
 /**
  * Parses the content of a page, replacing appearances of annotations
  * with Form XObjects.
  * @param page a page dictionary
  * @throws IOException
  */
 virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref) {
     LOGGER.Info("Parsing page with reference " + pageref);
     // initializing member variables
     baos = new MemoryStream();
     this.page = page;
     this.pageref = pageref;
     
     structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
     if(structParents == null)
         throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
     annots = page.GetAsArray(PdfName.ANNOTS);
     if(annots == null)
         annots = new PdfArray();
     PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
     xobjects = resources.GetAsDict(PdfName.XOBJECT);
     if (xobjects == null) {
         xobjects = new PdfDictionary();
         resources.Put(PdfName.XOBJECT, xobjects);
     }
     // parsing the content stream of the page
     PRStream stream = (PRStream) page.GetAsStream(PdfName.CONTENTS);
     byte[] contentBytes = PdfReader.GetStreamBytes(stream);
     PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
     PdfContentParser ps = new PdfContentParser(tokeniser);
     List<PdfObject> operands = new List<PdfObject>();
     while (ps.Parse(operands).Count > 0) {
         PdfLiteral opr = (PdfLiteral) operands[operands.Count - 1];
         ProcessOperator(opr, operands);
     }
     // dealing with orphans
     while (items.Count > 0 && items[0].GetPageref() == pageref.Number) {
         StructureItem item = items[0];
         if (item is StructureObject) {
             ConvertToXObject((StructureObject) item);
             items.RemoveAt(0);
         }
     }
     if(annots.Length == 0) {
         page.Remove(PdfName.ANNOTS);
     }
     else {
         PdfDictionary annot;
         for(int i = 0; i < annots.Size; i++) {
             annot = annots.GetAsDict(i);
             if(annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                 throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
         }
     }
     // replacing the content stream
     baos.Flush();
     baos.Close();
     stream.SetData(baos.ToArray());
     // showing how many items are left
     LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
 }
Exemple #11
0
        /**
         * Processes PDF syntax
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
            PdfContentParser ps        = new PdfContentParser(tokeniser);
            List <PdfObject> operands  = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                // special handling for embedded images.  If we hit an ID oper, we need
                // to skip all content until we reach an EI oper surrounded by whitespace.
                // The following algorithm has one potential issue: what if the image stream
                // contains <ws>EI<ws> ?
                // it sounds like we would have to actually decode the content stream, which
                // I'd rather avoid right now.
                if ("ID".Equals(oper.ToString()))
                {
                    MemoryStream baos        = new MemoryStream();
                    MemoryStream accumulated = new MemoryStream();
                    int          ch;
                    int          found = 0;
                    while ((ch = tokeniser.Read()) != -1)
                    {
                        if (found == 0 && PRTokeniser.IsWhitespace(ch))
                        {
                            found++;
                            accumulated.WriteByte((byte)ch);
                        }
                        else if (found == 1 && ch == 'E')
                        {
                            found++;
                            accumulated.WriteByte((byte)ch);
                        }
                        else if (found == 2 && ch == 'I')
                        {
                            found++;
                            accumulated.WriteByte((byte)ch);
                        }
                        else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                        {
                            operands = new List <PdfObject>();
                            operands.Add(new PdfLiteral("ID"));
                            InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);

                            // we should probably eventually do something to make the accumulated image content stream available

                            operands = new List <PdfObject>();
                            operands.Add(new PdfLiteral("EI"));
                            InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);

                            break;
                        }
                        else
                        {
                            accumulated.WriteTo(baos);
                            accumulated.SetLength(0);

                            baos.WriteByte((byte)ch);
                            found = 0;
                        }
                    }
                }
                InvokeOperator(oper, operands);
            }

            this.resources.Pop();
        }