/**
  * Processes PDF syntax.
  * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
  * @param contentBytes  the bytes of a content stream
  * @param resources     the resources that come with the content stream
  */
 public void ProcessContent(byte[] contentBytes, PdfDictionary resources){
     this.resources.Push(resources);
     PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes)));
     PdfContentParser ps = new PdfContentParser(tokeniser);
     List<PdfObject> operands = new List<PdfObject>();
     while (ps.Parse(operands).Count > 0){
         PdfLiteral oper = (PdfLiteral)operands[operands.Count-1];
         if ("BI".Equals(oper.ToString())){
             // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
             PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
             HandleInlineImage(InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
         } else {
             InvokeOperator(oper, operands);
         }
     }
     this.resources.Pop();
 }
Beispiel #2
0
 /// <summary>
 /// Removes ocgs from a page resources </summary>
 /// <param name="page">	a page dictionary </param>
 /// <param name="ocgs">	a set of names of OCG layers </param>
 private void RemoveProperties(PdfDictionary page, ICollection<string> ocgs) {
     PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
     if (resources == null) {
         return;
     }
     PdfDictionary properties = resources.GetAsDict(PdfName.PROPERTIES);
     if (properties == null) {
         return;
     }
     ICollection<PdfName> names = properties.Keys;
     IList<PdfName> remove = new List<PdfName>();
     foreach (PdfName name in names) {
         PdfDictionary dict = properties.GetAsDict(name);
         if (IsToBeRemoved(dict, ocgs)) {
             remove.Add(name);
         } else {
             RemoveOCGsFromArray(dict, PdfName.OCGS, ocgs);
         }
     }
     foreach (PdfName name in remove) {
         properties.Remove(name);
     }
 }
Beispiel #3
0
 /// <summary>
 /// Uses the OCGParser on a page </summary>
 /// <param name="parser">	the OCGParser </param>
 /// <param name="page">		the page dictionary of the page that needs to be parsed. </param>
 /// <exception cref="IOException"> </exception>
 private void Parse(OCGParser parser, PdfDictionary page) {
     PRStream stream = (PRStream) page.GetAsStream(PdfName.CONTENTS);
     PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
     parser.Parse(stream, resources);
 }
Beispiel #4
0
        /**
         * If the child of a structured element is a dictionary, we inspect the
         * child; we may also draw a tag.
         * 
         * @param k
         *            the child dictionary to inspect
         */
        virtual public void InspectChildDictionary(PdfDictionary k, bool inspectAttributes) {
            if (k == null)
                return;
            PdfName s = k.GetAsName(PdfName.S);
            if (s != null) {
                String tagN = PdfName.DecodeName(s.ToString());
			    String tag = FixTagName(tagN);
                outp.Write("<");
                outp.Write(tag);
                if (inspectAttributes) {
                    PdfDictionary a = k.GetAsDict(PdfName.A);
                    if (a != null) {
                        Dictionary<PdfName, PdfObject>.KeyCollection keys = a.Keys;
                        foreach (PdfName key in keys) {
                            outp.Write(' ');
                            PdfObject value = a.Get(key);
                            value = PdfReader.GetPdfObject(value);
                            outp.Write(XmlName(key));
                            outp.Write("=\"");
                            outp.Write(value.ToString());
                            outp.Write("\"");
                        }
                    }
                }
                outp.Write(">");
                PdfDictionary dict = k.GetAsDict(PdfName.PG);
                if (dict != null)
                    ParseTag(tagN, k.GetDirectObject(PdfName.K), dict);
                InspectChild(k.GetDirectObject(PdfName.K));
                outp.Write("</");
                outp.Write(tag);
                outp.WriteLine(">");
            } else
                InspectChild(k.GetDirectObject(PdfName.K));
        }
Beispiel #5
0
 /**
  * Searches for a tag in a page.
  * 
  * @param tag
  *            the name of the tag
  * @param obj
  *            an identifier to find the marked content
  * @param page
  *            a page dictionary
  * @throws IOException
  */
 public virtual void ParseTag(String tag, PdfObject obj, PdfDictionary page) {
     // if the identifier is a number, we can extract the content right away
     if (obj is PdfNumber) {
         PdfNumber mcid = (PdfNumber) obj;
         RenderFilter filter = new MarkedContentRenderFilter(mcid.IntValue);
         ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
         FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, new RenderFilter[]{filter});
         PdfContentStreamProcessor processor = new PdfContentStreamProcessor(
                 listener);
         processor.ProcessContent(PdfReader.GetPageContent(page), page
                 .GetAsDict(PdfName.RESOURCES));
         outp.Write(XMLUtil.EscapeXML(listener.GetResultantText(), true));
     }
     // if the identifier is an array, we call the parseTag method
     // recursively
     else if (obj is PdfArray) {
         PdfArray arr = (PdfArray) obj;
         int n = arr.Size;
         for (int i = 0; i < n; i++) {
             ParseTag(tag, arr[i], page);
             if (i < n - 1)
                 outp.WriteLine();
         }
     }
     // if the identifier is a dictionary, we get the resources from the
     // dictionary
     else if (obj is PdfDictionary) {
         PdfDictionary mcr = (PdfDictionary) obj;
         ParseTag(tag, mcr.GetDirectObject(PdfName.MCID), mcr
                 .GetAsDict(PdfName.PG));
     }
 }
 /**
  * Shows the detail of a dictionary.
  * @param dic   the dictionary of which you want the detail
  * @param depth the depth of the current dictionary (for nested dictionaries)
  * @return  a String representation of the dictionary
  */
 public static  String GetDictionaryDetail(PdfDictionary dic, int depth){
     StringBuilder builder = new StringBuilder();
     builder.Append('(');
     IList<PdfName> subDictionaries = new List<PdfName>();
     foreach (PdfName key in dic.Keys) {
         PdfObject val = dic.GetDirectObject(key);
         if (val.IsDictionary())
             subDictionaries.Add(key);
         builder.Append(key);
         builder.Append('=');
         builder.Append(val);
         builder.Append(", ");
     }
     builder.Length = builder.Length-2;
     builder.Append(')');
     foreach (PdfName pdfSubDictionaryName in subDictionaries) {
         builder.Append('\n');
         for (int i = 0; i < depth+1; i++){
             builder.Append('\t');
         }
         builder.Append("Subdictionary ");
         builder.Append(pdfSubDictionaryName);
         builder.Append(" = ");
         builder.Append(GetDictionaryDetail(dic.GetAsDict(pdfSubDictionaryName), depth+1));
     }
     return builder.ToString();
 }
Beispiel #7
0
 /**
  * Parses the content of a page, replacing appearances of annotations
  * with Form XObjects.
  * @param page a page dictionary
  * @throws IOException
  */
 virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref) {
     LOGGER.Info("Parsing page with reference " + pageref);
     // initializing member variables
     baos = new MemoryStream();
     this.page = page;
     this.pageref = pageref;
     
     structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
     if(structParents == null)
         throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
     annots = page.GetAsArray(PdfName.ANNOTS);
     if(annots == null)
         annots = new PdfArray();
     PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
     xobjects = resources.GetAsDict(PdfName.XOBJECT);
     if (xobjects == null) {
         xobjects = new PdfDictionary();
         resources.Put(PdfName.XOBJECT, xobjects);
     }
     // parsing the content stream of the page
     PRStream stream = (PRStream) page.GetAsStream(PdfName.CONTENTS);
     byte[] contentBytes = PdfReader.GetStreamBytes(stream);
     PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
     PdfContentParser ps = new PdfContentParser(tokeniser);
     List<PdfObject> operands = new List<PdfObject>();
     while (ps.Parse(operands).Count > 0) {
         PdfLiteral opr = (PdfLiteral) operands[operands.Count - 1];
         ProcessOperator(opr, operands);
     }
     // dealing with orphans
     while (items.Count > 0 && items[0].GetPageref() == pageref.Number) {
         StructureItem item = items[0];
         if (item is StructureObject) {
             ConvertToXObject((StructureObject) item);
             items.RemoveAt(0);
         }
     }
     if(annots.Length == 0) {
         page.Remove(PdfName.ANNOTS);
     }
     else {
         PdfDictionary annot;
         for(int i = 0; i < annots.Size; i++) {
             annot = annots.GetAsDict(i);
             if(annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                 throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
         }
     }
     // replacing the content stream
     baos.Flush();
     baos.Close();
     stream.SetData(baos.ToArray());
     // showing how many items are left
     LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
 }
        /**
         * Displays a summary of the entries in the XObject dictionary for the stream
         * @param resourceDic the resource dictionary for the stream
         * @return a string with the summary of the entries
         * @throws IOException
         * @since 5.0.2
         */
        public static String GetXObjectDetail(PdfDictionary resourceDic) {
            StringBuilder sb = new StringBuilder();
            
            PdfDictionary xobjects = resourceDic.GetAsDict(PdfName.XOBJECT);
            if (xobjects == null)
                return "No XObjects";
            foreach (PdfName entryName in xobjects.Keys) {
                PdfStream xobjectStream = xobjects.GetAsStream(entryName);
                
                sb.Append("------ " + entryName + " - subtype = " + xobjectStream.Get(PdfName.SUBTYPE) + " = " + xobjectStream.GetAsNumber(PdfName.LENGTH) + " bytes ------\n");
                
                if (!xobjectStream.Get(PdfName.SUBTYPE).Equals(PdfName.IMAGE)){
                
                    byte[] contentBytes = ContentByteUtils.GetContentBytesFromContentObject(xobjectStream);

                    foreach (byte b in contentBytes) {
                        sb.Append((char)b);
                    }
        
                    sb.Append("------ " + entryName + " - subtype = " + xobjectStream.Get(PdfName.SUBTYPE) + "End of Content" + "------\n");
                }
            }
           
            return sb.ToString();
        }
Beispiel #9
0
        /**
         * Parses the content of a page, replacing appearances of annotations
         * with Form XObjects.
         * @param page a page dictionary
         * @throws IOException
         */
        virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref)
        {
            LOGGER.Info("Parsing page with reference " + pageref);
            // initializing member variables
            baos         = new MemoryStream();
            this.page    = page;
            this.pageref = pageref;

            structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
            if (structParents == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
            }
            annots = page.GetAsArray(PdfName.ANNOTS);
            if (annots == null)
            {
                annots = new PdfArray();
            }
            PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);

            xobjects = resources.GetAsDict(PdfName.XOBJECT);
            if (xobjects == null)
            {
                xobjects = new PdfDictionary();
                resources.Put(PdfName.XOBJECT, xobjects);
            }
            // parsing the content stream of the page
            PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);

            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(opr, operands);
            }
            // dealing with orphans
            while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
            {
                StructureItem item = items[0];
                if (item is StructureObject)
                {
                    ConvertToXObject((StructureObject)item);
                    items.RemoveAt(0);
                }
            }
            if (annots.Length == 0)
            {
                page.Remove(PdfName.ANNOTS);
            }
            else
            {
                PdfDictionary annot;
                for (int i = 0; i < annots.Size; i++)
                {
                    annot = annots.GetAsDict(i);
                    if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                    {
                        throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
                    }
                }
            }
            // replacing the content stream
            baos.Flush();
            baos.Close();
            stream.SetData(baos.ToArray());
            // showing how many items are left
            LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
        }
Beispiel #10
0
        /**
         * Converts an annotation structure item to a Form XObject annotation.
         * @param item the structure item
         * @throws IOException
         */
        virtual protected void ConvertToXObject(StructureObject item)
        {
            PdfDictionary structElem = item.GetStructElem();

            if (structElem == null)
            {
                return;
            }
            PdfDictionary dict = item.GetObjAsDict();

            if (dict == null || !dict.CheckType(PdfName.ANNOT))
            {
                return;
            }
            PdfDictionary ap = dict.GetAsDict(PdfName.AP);

            if (ap == null)
            {
                return;
            }
            PdfNumber structParent = dict.GetAsNumber(PdfName.STRUCTPARENT);

            if (structParent == null)
            {
                return;
            }
            PdfStream stream = ap.GetAsStream(PdfName.N);

            if (stream == null)
            {
                return;
            }
            stream.Put(PdfName.STRUCTPARENT, structParent);
            PdfIndirectReference xobjr = ap.GetAsIndirectObject(PdfName.N);

            if (xobjr == null)
            {
                return;
            }
            // remove the annotation from the page
            for (int i = 0; i < annots.Length; i++)
            {
                PdfIndirectReference annotref = annots.GetAsIndirectObject(i);
                if (item.GetObjRef().Number == annotref.Number)
                {
                    annots.Remove(i);
                    break;
                }
            }
            // replace the existing attributes by a PrintField attribute
            PdfDictionary attribute = new PdfDictionary();

            attribute.Put(PdfName.O, PdfName.PRINTFIELD);
            PdfString description = dict.GetAsString(PdfName.TU);

            if (description == null)
            {
                description = dict.GetAsString(PdfName.T);
            }
            if (PdfName.BTN.Equals(dict.Get(PdfName.FT)))
            {
                PdfNumber fflags = dict.GetAsNumber(PdfName.FF);
                if (fflags != null)
                {
                    int ff = fflags.IntValue;
                    if ((ff & PdfFormField.FF_PUSHBUTTON) != 0)
                    {
                        attribute.Put(PdfName.ROLE, PdfName.PB);
                    }
                    // I don't think the condition below will ever be true
                    if ((ff & PdfFormField.FF_RADIO) != 0)
                    {
                        attribute.Put(PdfName.ROLE, PdfName.rb);
                    }
                    else
                    {
                        attribute.Put(PdfName.ROLE, PdfName.CB);
                    }
                }
            }
            else
            {
                attribute.Put(PdfName.ROLE, PdfName.TV);
            }
            attribute.Put(PdfName.DESC, description);
            // Updating the values of the StructElem dictionary
            PdfString t = structElem.GetAsString(PdfName.T);

            if (t == null || t.ToString().Trim().Length == 0)
            {
                structElem.Put(PdfName.T, dict.GetAsString(PdfName.T));
            }
            structElem.Put(PdfName.A, attribute);
            structElem.Put(PdfName.S, PdfName.P);
            structElem.Put(PdfName.PG, pageref);
            // Defining a new MCID
            int mcid = items.ProcessMCID(structParents, item.GetRef());

            LOGGER.Info("Using MCID " + mcid);
            structElem.Put(PdfName.K, new PdfNumber(mcid));
            // removing the annotation from the parent tree
            items.RemoveFromParentTree(structParent);
            // Adding the XObject to the page
            PdfName xobj = new PdfName("XObj" + structParent.IntValue);

            LOGGER.Info("Creating XObject with name " + xobj);
            xobjects.Put(xobj, xobjr);
            PdfArray array = dict.GetAsArray(PdfName.RECT);
            // Getting the position of the annotation
            Rectangle rect = new Rectangle(
                array.GetAsNumber(0).FloatValue, array.GetAsNumber(1).FloatValue,
                array.GetAsNumber(2).FloatValue, array.GetAsNumber(3).FloatValue);

            rect.Normalize();
            // A Do operator is forbidden inside a text block
            if (inText && !btWrite)
            {
                LOGGER.Debug("Introducing extra ET");
                byte[] bytes = Encoding.ASCII.GetBytes("ET\n");
                baos.Write(bytes, 0, bytes.Length);
                etExtra = true;
            }
            // Writing the marked-content sequence with the Do operator
            // Note that the position assumes that the CTM wasn't changed in the graphics state
            // TODO: do the math if the CTM did change!
            ByteBuffer buf = new ByteBuffer();

            buf.Append("/P <</MCID ");
            buf.Append(mcid);
            buf.Append(">> BDC\n");
            buf.Append("q 1 0 0 1 ");
            buf.Append(rect.Left.ToString(CultureInfo.InvariantCulture));
            buf.Append(" ");
            buf.Append(rect.Bottom.ToString(CultureInfo.InvariantCulture));
            buf.Append(" cm ");
            buf.Append(xobj.GetBytes());
            buf.Append(" Do Q\n");
            buf.Append("EMC\n");
            buf.Flush();
            buf.WriteTo(baos);
            // if we were inside a text block, we've introduced an ET, so we'll need to write a BT
            if (inText)
            {
                btWrite = true;
            }
        }
Beispiel #11
0
        /// <summary>
        /// Parses a stream object and removes OCGs. </summary>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2 <PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream      xobject = (PRStream)xobjects.GetAsStream(name);
                    PdfDictionary oc      = xobject.GetAsDict(PdfName.OC);
                    if (oc != null)
                    {
                        PdfString ocname = oc.GetAsString(PdfName.NAME);
                        if (ocname != null && ocgs.Contains(ocname.ToString()))
                        {
                            xobj.Add(name);
                        }
                    }
                }
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }
            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);
                if ("BI".Equals(@operator.ToString()))
                {
                    int  found = 0;
                    int  ch;
                    bool immediateAfterBI = true;
                    while ((ch = tokeniser.Read()) != -1)
                    {
                        if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch))
                        {
                            baos.WriteByte((byte)ch);
                        }
                        immediateAfterBI = false;
                        if (found == 0 && PRTokeniser.IsWhitespace(ch))
                        {
                            found++;
                        }
                        else if (found == 1 && ch == 'E')
                        {
                            found++;
                        }
                        else if (found == 1 && PRTokeniser.IsWhitespace(ch))
                        {
                            // this clause is needed if we have a white space character that is part of the image data
                            // followed by a whitespace character that precedes the EI operator.  In this case, we need
                            // to flush the first whitespace, then treat the current whitespace as the first potential
                            // character for the end of stream check. Note that we don't increment 'found' here.
                        }
                        else if (found == 2 && ch == 'I')
                        {
                            found++;
                        }
                        else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                        {
                            break;
                        }
                        else
                        {
                            found = 0;
                        }
                    }
                }
            }
            baos.Flush();
            baos.Close();
            stream.SetData(baos.GetBuffer());
        }
Beispiel #12
0
        public static List <Attachment> GetAttachments(Stream stream, string password = null)
        {
            PdfReader reader = null;

            if (!string.IsNullOrWhiteSpace(password))
            {
                reader = new PdfReader(stream, Encoding.ASCII.GetBytes(password));
            }
            else
            {
                reader = new PdfReader(stream);
            }

            #region Variables

            PdfDictionary catalog       = null;
            PdfDictionary documentNames = null;
            PdfDictionary embeddedFiles = null;
            PdfDictionary fileArray     = null;
            PdfDictionary file          = null;

            PRStream prstream = null;

            Attachment        fContent = null;
            List <Attachment> lstAtt   = null;

            #endregion Variables

            catalog = reader.Catalog;

            lstAtt = new List <Attachment>();

            documentNames = (PdfDictionary)PdfReader.GetPdfObject(catalog.Get(PdfName.NAMES));

            if (documentNames != null)
            {
                embeddedFiles = (PdfDictionary)PdfReader.GetPdfObject(documentNames.Get(PdfName.EMBEDDEDFILES));
                if (embeddedFiles != null)
                {
                    PdfArray filespecs = embeddedFiles.GetAsArray(PdfName.NAMES);

                    for (int i = 0; i < filespecs.Size; i++)
                    {
                        i++;
                        fileArray = filespecs.GetAsDict(i);

                        file = fileArray.GetAsDict(PdfName.EF);

                        foreach (PdfName key in file.Keys)
                        {
                            prstream = (PRStream)PdfReader.GetPdfObject(file.GetAsIndirectObject(key));

                            fContent      = new Attachment();
                            fContent.Name = fileArray.GetAsString(key).ToString();

                            fContent.Content = PdfReader.GetStreamBytes(prstream);
                            lstAtt.Add(fContent);
                        }
                    }
                }
            }

            return(lstAtt);
        }