Пример #1
0
 /// <summary>
 /// Creates a <c>CustomPdfReader</c> that takes its PDF data from a stream.
 /// </summary>
 /// <param name="isp">Stream supplying the PDF bytes; it is wrapped in a <c>RandomAccessFileOrArray</c> for the tokeniser.</param>
 /// <param name="certificate">Certificate stored on the reader before parsing starts.</param>
 /// <param name="certificateKey">Key parameters stored together with <paramref name="certificate"/>.</param>
 public CustomPdfReader(Stream isp, X509Certificate certificate, ICipherParameters certificateKey)
 {
     // Both credential fields are assigned before ReadPdf() runs
     // (presumably they are needed while reading encrypted documents - TODO confirm).
     this.certificateKey = certificateKey;
     this.certificate    = certificate;

     RandomAccessFileOrArray source = new RandomAccessFileOrArray(isp);
     tokens = new PRTokeniser(source);

     ReadPdf();
 }
Пример #2
0
        /**
         * Decodes the percent-escapes of a URL. Every "%xx" sequence whose two trailing
         * characters are hex digits is replaced by the character with that hex value.
         * A '%' that is too close to the end of the input, or that is not followed by
         * two hex digits, is copied through unchanged.
         * @param src the url to unescape
         * @return the unescaped value
         */
        public static String UnEscapeURL(String src)
        {
            int           length = src.Length;
            StringBuilder result = new StringBuilder();
            int           pos    = 0;

            while (pos < length)
            {
                char current = src[pos];

                // An escape needs two more characters after the '%'.
                if (current == '%' && pos + 2 < length)
                {
                    int high = PRTokeniser.GetHex((int)src[pos + 1]);
                    int low  = PRTokeniser.GetHex((int)src[pos + 2]);

                    // A negative result is treated as "not a hex digit".
                    if (high >= 0 && low >= 0)
                    {
                        result.Append((char)(high * 16 + low));
                        pos += 3;
                        continue;
                    }
                }

                result.Append(current);
                pos++;
            }

            return result.ToString();
        }
Пример #3
0
        /**
         * Reads the key/value entries of an inline image dictionary from the parser.
         * The parser must be positioned immediately after the BI operator. Entries are read until
         * the ID operator (or the end of the content) is reached; afterwards the single character
         * that follows ID is consumed and must be whitespace.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, with abbreviated keys replaced through the abbreviation map
         *         and each value passed through GetAlternateValue
         * @throws IOException if the character following ID is not whitespace
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
        {
            PdfDictionary imageDictionary = new PdfDictionary();

            PdfObject entryKey = ps.ReadPRObject();
            while (entryKey != null && !"ID".Equals(entryKey.ToString()))
            {
                PdfObject entryValue = ps.ReadPRObject();

                // Keys without an entry in the abbreviation map are used as they are.
                PdfName shortName = (PdfName)entryKey;
                PdfName fullName;
                inlineImageEntryAbbreviationMap.TryGetValue(shortName, out fullName);
                if (fullName == null)
                {
                    fullName = shortName;
                }

                imageDictionary.Put(fullName, GetAlternateValue(fullName, entryValue));

                entryKey = ps.ReadPRObject();
            }

            int next = ps.GetTokeniser().Read();
            if (PRTokeniser.IsWhitespace(next))
            {
                return imageDictionary;
            }

            throw new IOException("Unexpected character " + next + " found after ID in inline image");
        }
Пример #4
0
        /**
         * Runs the content parser over a block of PDF syntax and dispatches every operator it finds.
         * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream (may be null)
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);

            PdfContentParser parser = new PdfContentParser(new PRTokeniser(contentBytes));
            List <iTextSharp.text.pdf.PdfObject> tokens = new List <iTextSharp.text.pdf.PdfObject>();

            while (parser.Parse(tokens).Count > 0)
            {
                // The last parsed entry is taken as the operator; everything before it is its operands.
                PdfLiteral op = (PdfLiteral)tokens[tokens.Count - 1];

                if (!"BI".Equals(op.ToString()))
                {
                    InvokeOperator(op, tokens);
                    continue;
                }

                // Inline images are not routed through InvokeOperator: the image is parsed right here
                // and handed straight to the render listener.
                PdfDictionary colorSpaces = null;
                if (resources != null)
                {
                    colorSpaces = resources.GetAsDict(PdfName.COLORSPACE);
                }

                // NOTE(review): an earlier note in this file reports that CreateForEmbeddedImage is
                // inaccessible due to its protection level - confirm this compiles against the library in use.
                ImageRenderInfo imageInfo = ImageRenderInfo.CreateForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(parser, colorSpaces), colorSpaces);
                renderListener.RenderImage(imageInfo);
            }

            this.resources.Pop();
        }
        /// <summary>
        /// Re-writes a content stream: every parsed operation is passed to the handler registered for its
        /// operator (if any) and the resulting operands are appended to a fresh content stream builder.
        /// </summary>
        /// <param name="contentBytes">The bytes of the content stream to process.</param>
        /// <param name="resourcesDictionary">Resources pushed on the resource stack for the duration of the call.</param>
        /// <returns>The bytes produced by the content stream builder.</returns>
        public byte[] Modify(byte[] contentBytes, PdfDictionary resourcesDictionary)
        {
            _contentStreamBuilderStack.Push(new PdfContentStreamBuilder());
            _resourceDictionaryStack.Push(resourcesDictionary);

            RandomAccessFileOrArray source = new RandomAccessFileOrArray(contentBytes);
            PdfContentParser        parser = new PdfContentParser(new PRTokeniser(source));

            List <PdfObject> current = new List <PdfObject>();

            for (;;)
            {
                if (parser.Parse(current).Count <= 0)
                {
                    break;
                }

                PdfLiteral op = (PdfLiteral)current[current.Count - 1];

                // A registered handler may replace the operand list; that replacement is what gets
                // written, and it is also the list handed to the parser on the next iteration.
                PdfContentOperatorHandler handler = null;
                bool hasHandler = _operators.TryGetValue(op.ToString(), out handler);
                if (hasHandler)
                {
                    current = handler(op, current);
                }

                _contentStreamBuilderStack.Peek().Push(current);
            }

            _resourceDictionaryStack.Pop();

            PdfContentStreamBuilder finished = _contentStreamBuilderStack.Pop();
            return finished.GetBytes();
        }
        /// <summary>
        /// Tokenises the content of every page of a PDF and reports each operation to a strategy.
        /// </summary>
        /// <remarks>
        /// Tokens are collected as parameters until a token with <c>IsOperand</c> set is met; that token
        /// and the parameters collected so far are wrapped in a <c>PdfOperation</c> and executed.
        /// The parameter list is cleared and re-used afterwards, and parameters left over at the end
        /// of a page are discarded.
        /// </remarks>
        /// <param name="pdf">The complete PDF document as bytes.</param>
        /// <param name="strategy">Receives one <c>PdfOperation</c> per operation found.</param>
        private static void ParsePdf(byte[] pdf, IPdfParsingStrategy strategy)
        {
            PdfReader reader = new PdfReader(pdf);

            try
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    byte[] page = reader.GetPageContent(i);
                    if (page == null)
                    {
                        continue;
                    }

                    PRTokeniser     tokenizer  = new PRTokeniser(page);
                    List <PdfToken> parameters = new List <PdfToken>();
                    while (tokenizer.NextToken())
                    {
                        var token = PdfToken.Create(tokenizer);
                        if (token.IsOperand)
                        {
                            strategy.Execute(new PdfOperation(token, parameters));
                            parameters.Clear();
                        }
                        else
                        {
                            parameters.Add(token);
                        }
                    }
                }
            }
            finally
            {
                // Release the reader even when tokenising or the strategy throws.
                reader.Close();
            }
        }
Пример #7
0
        /**
         * Processes PDF syntax: parses the content stream and dispatches every operator found.
         * Inline images (BI operator) are parsed here and passed directly to the render listener.
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream (may be null)
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
            PdfContentParser ps        = new PdfContentParser(tokeniser);
            List <PdfObject> operands  = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];
                if ("BI".Equals(oper.ToString()))
                {
                    // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
                    // A content stream may come without resources; do not dereference a null dictionary.
                    PdfDictionary   colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
                    ImageRenderInfo renderInfo    = ImageRenderInfo.CreatedForEmbeddedImage(Gs().ctm, InlineImageUtils.ParseInlineImage(ps, colorSpaceDic));
                    renderListener.RenderImage(renderInfo);
                }
                else
                {
                    InvokeOperator(oper, operands);
                }
            }

            this.resources.Pop();
        }
Пример #8
0
        /// <summary>
        /// Tokenises <paramref name="data"/> and asserts that its first valid token is a number
        /// whose string value equals <paramref name="expectedValue"/>.
        /// </summary>
        /// <param name="data">Text to feed to the tokeniser.</param>
        /// <param name="expectedValue">Expected string value of the number token.</param>
        private void CheckNumberValue(String data, String expectedValue)
        {
            RandomAccessFileOrArray source    = new RandomAccessFileOrArray(GetBytes(data));
            PRTokeniser             tokeniser = new PRTokeniser(source);

            tokeniser.NextValidToken();

            PRTokeniser.TokType actualType = tokeniser.TokenType;
            Assert.AreEqual(PRTokeniser.TokType.NUMBER, actualType, "Wrong type");

            String actualValue = tokeniser.StringValue;
            Assert.AreEqual(expectedValue, actualValue, "Wrong multiple minus signs number handling");
        }
Пример #9
0
        /// <summary>
        /// Tokenises <paramref name="data"/> and asserts that the successive valid tokens have exactly
        /// the given types, in order.
        /// </summary>
        /// <param name="data">Text to feed to the tokeniser.</param>
        /// <param name="expectedTypes">Expected token type for each position.</param>
        private void CheckTokenTypes(String data, params PRTokeniser.TokType[] expectedTypes)
        {
            RandomAccessFileOrArray source    = new RandomAccessFileOrArray(GetBytes(data));
            PRTokeniser             tokeniser = new PRTokeniser(source);

            int position = 0;
            foreach (PRTokeniser.TokType expected in expectedTypes)
            {
                tokeniser.NextValidToken();
                // The message names the failing position so a mismatch is easy to locate.
                Assert.AreEqual(expected, tokeniser.TokenType, "Position " + position);
                position++;
            }
        }
Пример #10
0
        /// <summary>
        /// Splits a default-appearance (DA) string into its commands and their arguments.
        /// </summary>
        /// <remarks>
        /// Every token of type OTHER is treated as a command and closes the argument list collected so far.
        /// The commands RG, G and K are stored under <c>STROKE_COLOR</c>, and rg, g and k under
        /// <c>FILL_COLOR</c>; all other commands are stored under their own text. A later occurrence of
        /// the same key replaces the earlier one. Arguments that are not followed by a command are dropped.
        /// </remarks>
        /// <param name="DA">The DA string to parse.</param>
        /// <returns>Map from command key to its arguments (PdfNumber, PdfName, or the raw token string).</returns>
        IDictionary <string, IList <object> > ParseDAParam(PdfString DA)
        {
            IDictionary <string, IList <object> > result = new Dictionary <string, IList <object> >();

            RandomAccessFileOrArray source    = new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(DA.GetBytes()));
            PRTokeniser             tokeniser = new PRTokeniser(source);
            IList <object>          pending   = new List <object>();

            while (tokeniser.NextToken())
            {
                PRTokeniser.TokType type = tokeniser.TokenType;

                if (type == PRTokeniser.TokType.OTHER)
                {
                    String command = tokeniser.StringValue;

                    switch (command)
                    {
                    case "RG":
                    case "G":
                    case "K":
                        command = STROKE_COLOR;
                        break;

                    case "rg":
                    case "g":
                    case "k":
                        command = FILL_COLOR;
                        break;
                    }

                    // The indexer adds the key or overwrites an existing one.
                    result[command] = pending;
                    pending         = new List <object>();
                    continue;
                }

                if (type == PRTokeniser.TokType.NUMBER)
                {
                    pending.Add(new PdfNumber(tokeniser.StringValue));
                }
                else if (type == PRTokeniser.TokType.NAME)
                {
                    pending.Add(new PdfName(tokeniser.StringValue));
                }
                else
                {
                    pending.Add(tokeniser.StringValue);
                }
            }

            return result;
        }
Пример #11
0
        /// <summary>
        /// Extracts the string tokens of the first page of a PDF file, followed by the entries of
        /// the document's info dictionary.
        /// </summary>
        /// <remarks>
        /// The string tokens are concatenated without separators; each info entry is appended as
        /// <c>key =&gt; value</c>, also without separators.
        /// </remarks>
        /// <param name="filePath">Path of the PDF file to read.</param>
        /// <returns>The concatenated text described above.</returns>
        public string ParsePdf(string filePath)
        {
            StringBuilder text = new StringBuilder();

            PdfReader reader = new iTextSharp.text.pdf.PdfReader(filePath);

            try
            {
                // Only the page content is tokenised. The previous version also re-read the whole file
                // and re-encoded the content into buffers that were never used; one of those conversions
                // threw on an empty content stream (negative byte count).
                byte[]      streamBytes = reader.GetPageContent(1);
                PRTokeniser tokenizer   = new PRTokeniser(streamBytes);

                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                    {
                        text.Append(tokenizer.StringValue);
                    }
                }

                // file properties
                Hashtable infoHash = reader.Info;

                foreach (string key in infoHash.Keys)
                {
                    text.Append(key).Append(" => ").Append(infoHash[key]);
                }
            }
            finally
            {
                // Release the file even when parsing fails.
                reader.Close();
            }

            return text.ToString();
        }
Пример #12
0
        /// <summary>
        /// Compares the first-page content streams of two PDF files token by token.
        /// </summary>
        /// <param name="path1">Path of the first PDF file.</param>
        /// <param name="path2">Path of the second PDF file.</param>
        /// <returns>
        /// <c>true</c> when both streams yield the same number of tokens, every pair of tokens has the same
        /// type, numbers differ by at most 0.001 and all other tokens have equal string values;
        /// otherwise <c>false</c>.
        /// </returns>
        virtual public bool CompareInnerText(String path1, String path2)
        {
            PdfReader reader1 = new PdfReader(path1);

            try
            {
                PdfReader reader2 = new PdfReader(path2);

                try
                {
                    byte[]      streamBytes1 = reader1.GetPageContent(1);
                    PRTokeniser tokenizer1   =
                        new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes1)));

                    byte[]      streamBytes2 = reader2.GetPageContent(1);
                    PRTokeniser tokenizer2   =
                        new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(streamBytes2)));

                    while (tokenizer1.NextToken())
                    {
                        if (!tokenizer2.NextToken())
                        {
                            return(false);
                        }

                        if (tokenizer1.TokenType != tokenizer2.TokenType)
                        {
                            return(false);
                        }

                        if (tokenizer1.TokenType == PRTokeniser.TokType.NUMBER)
                        {
                            // Numbers are compared with a tolerance so that formatting differences do not matter.
                            float difference = float.Parse(tokenizer1.StringValue, CultureInfo.InvariantCulture)
                                               - float.Parse(tokenizer2.StringValue, CultureInfo.InvariantCulture);
                            if (Math.Abs(difference) > 0.001)
                            {
                                return(false);
                            }
                        }
                        else if (!tokenizer1.StringValue.Equals(tokenizer2.StringValue))
                        {
                            return(false);
                        }
                    }

                    // The first stream is exhausted; the streams only match if the second one is, too.
                    // Without this check a second stream with extra trailing tokens compared as equal.
                    return(!tokenizer2.NextToken());
                }
                finally
                {
                    reader2.Close();
                }
            }
            finally
            {
                // Closed here so that it is also released when opening the second file fails.
                reader1.Close();
            }
        }
Пример #13
0
        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image
         * @param colorSpaceDic the color space dictionary handed on to ComputeBytesPerRow
         * @param ps the content parser
         * @return the samples of the image
         * @throws ArgumentException if the image dictionary contains a filter entry
         * @throws IOException if the content ends before all samples are read, or if EI does not follow the samples
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // special case:  when no filter is specified, we just read the number of bits
            // per component, multiplied by the width and height.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);

            int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            int shouldBeWhiteSpace = tokeniser.Read(); // skip next character (which better be a whitespace character - I suppose we could check for this)
            // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
            // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
            int startIndex = 0;

            if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)  // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't)
            {
                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }
            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }

            // ReadPRObject() yields null once the content is exhausted; treat that like any other
            // missing EI instead of failing with a NullReferenceException.
            PdfObject ei = ps.ReadPRObject();

            if (ei == null || !ei.ToString().Equals("EI"))
            {
                // Some PDF producers seem to add another non-whitespace character after the image data.
                // Let's try to handle that case here.
                PdfObject ei2 = ps.ReadPRObject();
                if (ei2 == null || !ei2.ToString().Equals("EI"))
                {
                    throw new InlineImageParseException("EI not found after end of image data");
                }
            }

            return(bytes);
        }
Пример #14
0
// ---------------------------------------------------------------------------

        /**
         * Parses the PDF using PRTokeniser and collects the string tokens of the first page.
         * @param src the original PDF file
         * @return the values of all string tokens found in the content of page 1, one per line
         */
        public string ParsePdf(byte[] src)
        {
            PdfReader reader = new PdfReader(src);

            try
            {
                // we can inspect the syntax of the imported page
                byte[]        streamBytes = reader.GetPageContent(1);
                StringBuilder sb          = new StringBuilder();
                PRTokeniser   tokenizer   = new PRTokeniser(streamBytes);

                while (tokenizer.NextToken())
                {
                    if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                    {
                        sb.AppendLine(tokenizer.StringValue);
                    }
                }
                return(sb.ToString());
            }
            finally
            {
                // Release the reader even when tokenising throws.
                reader.Close();
            }
        }
Пример #15
0
        /// <summary>
        /// Parses a stream object and removes OCGs. </summary>
        /// <remarks>
        /// XObjects whose /OC dictionary names one of the layers in <c>ocgs</c> are removed from the
        /// resources first. Then every operation of the content stream is passed to <c>ProcessOperator</c>,
        /// and whatever has been written to <c>baos</c> becomes the new data of the stream.
        /// </remarks>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2 <PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream      xobject = (PRStream)xobjects.GetAsStream(name);
                    PdfDictionary oc      = xobject.GetAsDict(PdfName.OC);
                    if (oc != null)
                    {
                        PdfString ocname = oc.GetAsString(PdfName.NAME);
                        if (ocname != null && ocgs.Contains(ocname.ToString()))
                        {
                            xobj.Add(name);
                        }
                    }
                }
                // Removal happens in a second pass so the key collection is not modified while it is enumerated.
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }
            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List <PdfObject> operands     = new List <PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);
            }
            baos.Flush();
            baos.Close();
            // ToArray() copies exactly the bytes written. GetBuffer() would hand out the whole internal
            // buffer, including unused capacity, and so pad the stream with trailing zero bytes.
            stream.SetData(baos.ToArray());
        }
Пример #16
0
        /// <summary>
        /// Reads a PDF from a stream and writes the string tokens of all page contents to the console.
        /// </summary>
        /// <remarks>
        /// NOTE: the returned list is created here but never filled by this method.
        /// </remarks>
        /// <param name="stream">Stream holding the PDF document.</param>
        /// <returns>An empty list of tables.</returns>
        public List <DataTable> Load(MemoryStream stream)
        {
            var tables = new List <DataTable>();
            var sb     = new StringBuilder();
            var reader = new PdfReader(stream);

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // /Contents is not always an indirect reference to a single stream; it may be an
                    // array of streams or missing altogether. GetPageContent copes with all of these,
                    // whereas casting the entry to PRIndirectReference throws for them.
                    byte[] streamBytes = reader.GetPageContent(page);

                    if (streamBytes == null || streamBytes.Length == 0)
                    {
                        continue;
                    }

                    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                            {
                                string str = tokenizer.StringValue;
                                sb.AppendLine(str);
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // Release the reader even when a page fails to parse.
                reader.Close();
            }

            Console.WriteLine(sb.ToString());
            return(tables);
        }
Пример #17
0
        /// <summary>
        /// Old algorithm designed to work with iTextSharp 4.1.6. Use iTextSharp version 5 or later if possible (license changes were made).
        /// </summary>
        /// <remarks>
        /// String tokens are appended with every NUL character replaced by a space.
        /// The tokens "-600" and "TJ" each contribute a single space; all other tokens are ignored.
        /// </remarks>
        /// <param name="input">Bytes to tokenise; may be null or empty.</param>
        /// <returns>The extracted text, or an empty string when there is no input.</returns>
        internal static string ExtractTextFromPdfBytes(byte[] input)
        {
            bool hasInput = input != null && input.Length != 0;
            if (!hasInput)
            {
                return "";
            }

            var text   = new StringBuilder();
            var source = new PRTokeniser(input);

            try
            {
                while (source.NextToken())
                {
                    var kind  = source.TokenType;
                    var value = source.StringValue.Replace('\0', ' ');

                    if (kind == PRTokeniser.TK_STRING)
                    {
                        text.Append(value);
                        continue;
                    }

                    // Both markers stand in for a gap between pieces of text.
                    if (value == "-600" || value == "TJ")
                    {
                        text.Append(" ");
                    }
                }
            }
            finally
            {
                source.Close();
            }

            return text.ToString();
        }
Пример #18
0
        /// <summary>
        /// Tokenises an object whose indirect reference carries an object number far too large for an
        /// int and asserts that every reference token reports a negative reference.
        /// </summary>
        public void TestPRTokenizer()
        {
            String obj = "13 0 obj\n" +
                         "<< /Type /StructElem /Pg 111117220777773888836 0 R>>\n" +
                         "endobj";
            byte[]      raw    = Encoding.ASCII.GetBytes(obj);
            PRTokeniser tokens = new PRTokeniser(new RandomAccessFileOrArray(raw));

            // At most 11 tokens are inspected; the loop stops early at the end of the input.
            const int maxTokens = 11;
            int       inspected = 0;

            while (inspected < maxTokens)
            {
                tokens.NextValidToken();
                inspected++;

                if (tokens.TokenType == PRTokeniser.TokType.REF)
                {
                    Assert.IsTrue(tokens.Reference < 0);
                }

                if (tokens.TokenType == PRTokeniser.TokType.ENDOFFILE)
                {
                    break;
                }
            }
        }
        /// <summary>
        /// Prints the string tokens found in the page contents of a PDF file.
        /// </summary>
        /// <param name="args">
        /// Optional: the first argument is the path of the PDF to read. Without arguments the
        /// built-in default path is used.
        /// </param>
        static void Main(string[] args)
        {
            string        pdfPath = (args != null && args.Length > 0) ? args[0] : "C:\\mypdf.pdf";
            PdfReader     reader  = new PdfReader(pdfPath);
            StringBuilder sb      = new StringBuilder();

            try
            {
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // /Contents is not always an indirect reference to a single stream; it may be an
                    // array of streams or missing altogether. GetPageContent copes with all of these,
                    // whereas casting the entry to PRIndirectReference throws for them.
                    byte[] streamBytes = reader.GetPageContent(page);
                    if (streamBytes == null || streamBytes.Length == 0)
                    {
                        continue;
                    }

                    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));

                    try
                    {
                        while (tokenizer.NextToken())
                        {
                            if (tokenizer.TokenType == PRTokeniser.TK_STRING)
                            {
                                string str = tokenizer.StringValue;
                                sb.Append(str);
                            }
                        }
                    }
                    finally
                    {
                        tokenizer.Close();
                    }
                }
            }
            finally
            {
                // Release the file even when a page fails to parse.
                reader.Close();
            }

            Console.Write("PDF Content:" + Environment.NewLine);
            Console.Write(sb.ToString());
            Console.Write(Environment.NewLine + "--EOF--");
        }
Пример #20
0
        /// <summary>
        /// Reads a dictionary and an array from the given file with a content parser and logs the parsed items.
        /// </summary>
        /// <remarks>
        /// NOTE: the returned <c>PdfData</c> is created here but not filled by this method, and
        /// <paramref name="pageNum"/> is only validated and logged.
        /// </remarks>
        /// <param name="fileName">Path of the file to tokenise; must not be null or blank.</param>
        /// <param name="pageNum">Page number; must be greater than zero.</param>
        /// <returns>A new <c>PdfData</c>, or <c>null</c> when an argument is invalid.</returns>
        public static PdfData ConvertToPdfData(string fileName, int pageNum)
        {
            // Either invalid argument is enough to reject the call. The former "&&" only rejected the
            // call when both were invalid, letting a null file name through to the tokeniser.
            if (string.IsNullOrWhiteSpace(fileName) || pageNum <= 0)
            {
                return(null);
            }
            Helpers.D.Log("PdfConvertIText.ConvertToPdfData({0}, {1})", fileName, pageNum);
            PdfData     data      = new PdfData();
            PRTokeniser tokeniser = new PRTokeniser(fileName);

            try
            {
                PdfContentParser parser = new PdfContentParser(tokeniser);

                // The dictionary itself is not used, but reading it advances the parser to the array.
                parser.ReadDictionary();

                ArrayList items = parser.Parse(parser.ReadArray().ArrayList);
                Helpers.D.Log("PdfConvertIText.ConvertToPdfData: {0} | {1}", items.Count, string.Join(", ", items.ToArray()));
            }
            finally
            {
                // The tokeniser was opened on a file; release it even when parsing fails.
                tokeniser.Close();
            }

            return(data);
        }
Пример #21
0
        /**
         * Parses a content stream and dispatches each operation: inline images go to
         * HandleInlineImage, everything else to InvokeOperator.
         * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream (may be null)
         */
        virtual public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);

            RandomAccessFileOrArray source = new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes));
            PdfContentParser        parser = new PdfContentParser(new PRTokeniser(source));
            List <PdfObject>        parsed = new List <PdfObject>();

            while (parser.Parse(parsed).Count > 0)
            {
                // The last parsed entry is taken as the operator.
                PdfLiteral op            = (PdfLiteral)parsed[parsed.Count - 1];
                bool       isInlineImage = "BI".Equals(op.ToString());

                if (!isInlineImage)
                {
                    InvokeOperator(op, parsed);
                }
                else
                {
                    // Inline images are not routed through InvokeOperator; the image data is parsed here.
                    PdfDictionary colorSpaces = null;
                    if (resources != null)
                    {
                        colorSpaces = resources.GetAsDict(PdfName.COLORSPACE);
                    }

                    HandleInlineImage(InlineImageUtils.ParseInlineImage(parser, colorSpaces), colorSpaces);
                }
            }

            this.resources.Pop();
        }
Пример #22
0
        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image
         * @param ps the content parser
         * @return the samples of the image
         * @throws ArgumentException if the image dictionary contains a filter entry
         * @throws IOException if the content ends before all samples are read, or if EI does not follow the samples
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
        {
            // special case:  when no filter is specified, we just read the number of bits
            // per component, multiplied by the width and height.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);

            int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            tokeniser.Read(); // skip next character (which better be a whitespace character - I suppose we could check for this)
            for (int i = 0; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }

            // ReadPRObject() yields null once the content is exhausted; report that as a missing EI
            // instead of failing with a NullReferenceException.
            PdfObject ei = ps.ReadPRObject();

            if (ei == null || !ei.ToString().Equals("EI"))
            {
                throw new InlineImageParseException("EI not found after end of image data");
            }

            return(bytes);
        }
Пример #23
0
        /**
         * Extracts the sample bytes of an inline image from the content parser.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * On success the parser is left positioned immediately following the EI operator.
         * <b>Note:</b> the filters named in the dictionary are not applied here; the bytes are returned as found.
         * @param imageDictionary the dictionary of the inline image
         * @param colorSpaceDic passed through to ParseUnfilteredSamples when the image declares no filter
         * @param ps the content parser
         * @return the samples of the image
         * @throws InlineImageParseException if the content ends before a terminating EI sequence is accepted
         */
        private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // The ID operator has already been consumed when we get here.
            bool filtered = imageDictionary.Contains(PdfName.FILTER);
            if (!filtered)
            {
                return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
            }

            // Scan for <ws>E I<ws>.  Bytes that might belong to the terminator are parked in
            // 'pending' and only moved to 'imageData' once the match has failed.
            // Known limitations: image data may itself contain <ws>EI<ws>, and some streams
            // omit the whitespace before EI; handling those fully would require decoding the data.
            MemoryStream imageData = new MemoryStream();
            MemoryStream pending   = new MemoryStream();
            PRTokeniser  tokeniser = ps.GetTokeniser();
            int          matched   = 0; // number of terminator characters recognised so far (0..3)

            for (int ch = tokeniser.Read(); ch != -1; ch = tokeniser.Read())
            {
                switch (matched)
                {
                    case 0:
                        if (PRTokeniser.IsWhitespace(ch))
                        {
                            matched = 1;
                            pending.WriteByte((byte)ch);
                            continue;
                        }
                        break;

                    case 1:
                        if (ch == 'E')
                        {
                            matched = 2;
                            pending.WriteByte((byte)ch);
                            continue;
                        }
                        if (PRTokeniser.IsWhitespace(ch))
                        {
                            // A whitespace that is image data, followed by the whitespace that may
                            // precede EI: release the first one and restart the match on the
                            // current character.  The match state deliberately stays at 1.
                            byte[] released = pending.ToArray();
                            imageData.Write(released, 0, released.Length);
                            pending.SetLength(0);
                            pending.WriteByte((byte)ch);
                            continue;
                        }
                        break;

                    case 2:
                        if (ch == 'I')
                        {
                            matched = 3;
                            pending.WriteByte((byte)ch);
                            continue;
                        }
                        break;

                    case 3:
                        if (PRTokeniser.IsWhitespace(ch))
                        {
                            byte[] candidate = imageData.ToArray();
                            if (InlineImageStreamBytesAreComplete(candidate, imageDictionary))
                            {
                                return candidate;
                            }
                            // Not accepted as complete: treat the sequence as ordinary data below.
                        }
                        break;
                }

                // Match failed: everything parked so far, plus the current byte, is image data.
                byte[] parked = pending.ToArray();
                imageData.Write(parked, 0, parked.Length);
                pending.SetLength(0);

                imageData.WriteByte((byte)ch);
                matched = 0;
            }

            throw new InlineImageParseException("Could not find image data or EI");
        }
Пример #24
0
        /**
         * Processes PDF syntax: parses the content stream operator by operator and dispatches each
         * one through InvokeOperator.  The supplied resources are pushed for the duration of the call
         * and popped again when it ends, including when parsing or an operator handler throws.
         * @param contentBytes  the bytes of a content stream
         * @param resources     the resources that come with the content stream
         */
        public void ProcessContent(byte[] contentBytes, PdfDictionary resources)
        {
            this.resources.Push(resources);
            try
            {
                PRTokeniser      tokeniser = new PRTokeniser(contentBytes);
                PdfContentParser ps        = new PdfContentParser(tokeniser);
                List<PdfObject>  operands  = new List<PdfObject>();

                while (ps.Parse(operands).Count > 0)
                {
                    PdfLiteral oper = (PdfLiteral)operands[operands.Count - 1];

                    // special handling for embedded images.  If we hit an ID oper, we need
                    // to skip all content until we reach an EI oper surrounded by whitespace.
                    // The following algorithm has one potential issue: what if the image stream
                    // contains <ws>EI<ws> ?
                    // it sounds like we would have to actually decode the content stream, which
                    // I'd rather avoid right now.
                    if ("ID".Equals(oper.ToString()))
                    {
                        MemoryStream baos        = new MemoryStream(); // image bytes seen so far (currently not exposed)
                        MemoryStream accumulated = new MemoryStream(); // bytes that may turn out to be the <ws>EI terminator
                        int          ch;
                        int          found = 0;                        // how many terminator characters have matched
                        while ((ch = tokeniser.Read()) != -1)
                        {
                            if (found == 0 && PRTokeniser.IsWhitespace(ch))
                            {
                                found++;
                                accumulated.WriteByte((byte)ch);
                            }
                            else if (found == 1 && ch == 'E')
                            {
                                found++;
                                accumulated.WriteByte((byte)ch);
                            }
                            else if (found == 2 && ch == 'I')
                            {
                                found++;
                                accumulated.WriteByte((byte)ch);
                            }
                            else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                            {
                                operands = new List<PdfObject>();
                                operands.Add(new PdfLiteral("ID"));
                                InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);

                                // we should probably eventually do something to make the accumulated image content stream available

                                operands = new List<PdfObject>();
                                operands.Add(new PdfLiteral("EI"));
                                InvokeOperator((PdfLiteral)operands[operands.Count - 1], operands);

                                break;
                            }
                            else
                            {
                                // false alarm: the parked bytes and the current one are image data
                                accumulated.WriteTo(baos);
                                accumulated.SetLength(0);

                                baos.WriteByte((byte)ch);
                                found = 0;
                            }
                        }
                    }
                    // NOTE(review): after an inline image this call still runs with the original ID
                    // literal and the operand list that now holds only "EI" - confirm this is intended.
                    InvokeOperator(oper, operands);
                }
            }
            finally
            {
                // Always undo the Push above so a failing operator does not leave a stale
                // resource dictionary on the stack.
                this.resources.Pop();
            }
        }
Пример #25
0
 /// <summary>
 /// Builds a <see cref="PdfToken"/> from the token type and string value the tokeniser currently exposes.
 /// </summary>
 /// <param name="tokenizer">the tokeniser to read the current token from</param>
 /// <returns>a new token carrying the tokeniser's TokenType and StringValue</returns>
 public static PdfToken Create(PRTokeniser tokenizer)
 {
     var currentType  = tokenizer.TokenType;
     var currentValue = tokenizer.StringValue;

     return new PdfToken(currentType, currentValue);
 }
Пример #26
0
        /**
         * Reads the CMap named cmapName from the given location and feeds its contents into cmap.
         * Handles "def" entries (registry, ordering, name, supplement; top level only), char and range
         * sections, and "usecmap" references, which are followed recursively.
         * Parse errors are tolerated: a failing Parse call is retried, and reading stops once more
         * than 50 of them have occurred.  The tokeniser obtained from the location is always closed.
         * @param cmapName the name of the CMap to load
         * @param cmap the CMap object that receives the parsed data
         * @param location resolves a CMap name to a tokeniser
         * @param level current usecmap nesting depth; nothing is read once it reaches MAXLEVEL
         */
        private static void ParseCid(String cmapName, AbstractCMap cmap, ICidLocation location, int level)
        {
            if (level >= MAXLEVEL)
            {
                return;
            }

            PRTokeniser source = location.GetLocation(cmapName);
            try
            {
                List<PdfObject>  operands          = new List<PdfObject>();
                PdfContentParser parser            = new PdfContentParser(source);
                int              failuresTolerated = 50;

                for (;;)
                {
                    try
                    {
                        parser.Parse(operands);
                    }
                    catch
                    {
                        // best effort: skip the broken construct unless the error budget is used up
                        failuresTolerated--;
                        if (failuresTolerated < 0)
                        {
                            break;
                        }
                        continue;
                    }

                    int count = operands.Count;
                    if (count == 0)
                    {
                        break; // nothing more to read
                    }

                    String op = operands[count - 1].ToString();

                    if (level == 0 && count == 3 && op.Equals(DEF))
                    {
                        // "key value def" - only honoured for the outermost CMap
                        PdfObject key = operands[0];
                        if (PdfName.REGISTRY.Equals(key))
                        {
                            cmap.Registry = operands[1].ToString();
                        }
                        else if (PdfName.ORDERING.Equals(key))
                        {
                            cmap.Ordering = operands[1].ToString();
                        }
                        else if (CMAPNAME.Equals(key))
                        {
                            cmap.Name = operands[1].ToString();
                        }
                        else if (PdfName.SUPPLEMENT.Equals(key))
                        {
                            try
                            {
                                cmap.Supplement = ((PdfNumber)operands[1]).IntValue;
                            }
                            catch
                            {
                                // a supplement that cannot be read as a number is ignored
                            }
                        }
                        continue;
                    }

                    if ((op.Equals(ENDCIDCHAR) || op.Equals(ENDBFCHAR)) && count >= 3)
                    {
                        // operands come in (code, value) pairs followed by the operator
                        for (int k = 0; k + 2 < count; k += 2)
                        {
                            PdfString code = operands[k] as PdfString;
                            if (code != null)
                            {
                                cmap.AddChar(code, operands[k + 1]);
                            }
                        }
                        continue;
                    }

                    if ((op.Equals(ENDCIDRANGE) || op.Equals(ENDBFRANGE)) && count >= 4)
                    {
                        // operands come in (from, to, value) triples followed by the operator
                        for (int k = 0; k + 3 < count; k += 3)
                        {
                            if (operands[k] is PdfString && operands[k + 1] is PdfString)
                            {
                                cmap.AddRange((PdfString)operands[k], (PdfString)operands[k + 1], operands[k + 2]);
                            }
                        }
                        continue;
                    }

                    if (op.Equals(USECMAP) && count == 2 && operands[0] is PdfName)
                    {
                        string referenced = PdfName.DecodeName(operands[0].ToString());
                        ParseCid(referenced, cmap, location, level + 1);
                    }
                }
            }
            finally
            {
                source.Close();
            }
        }
Пример #27
0
        /// <summary>
        /// Exploratory test for PDFManager.ExtractText.  It first copies every page of a hard-coded
        /// source PDF into a new document, then tokenises each page of a second input and
        /// concatenates the string tokens it finds, and finally calls ExtractText on the raw input.
        /// The method contains no assertions, so it can only fail by throwing.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the input path passed to File.ReadAllBytes is an empty string, which is
        /// expected to throw before anything else runs - confirm and supply a real fixture path.
        /// NOTE(review): the M:\COL paths are machine specific.
        /// </remarks>
        public void ExtractTextTest1()
        {
            PDFManager pdfManager = new PDFManager(); // TODO: Initialize to an appropriate value

            // Earlier input source, kept for reference:
            //byte[] input = File.ReadAllBytes(DiscoveryManager.GetDiscoveryPath("M:\\DFD", "http://unicode.org/charts/PDF/U0590.pdf", ".pdf"));

            // NOTE(review): empty path - see remarks above.
            byte[] input = File.ReadAllBytes(@"");


            string path = @"M:\COL\hebrew.pdf";
            string destinationFileName = @"M:\COL\hebrew1.pdf";


            // Part 1: re-emit every page of 'path' into 'destinationFileName'.
            PdfReader reader   = new PdfReader(path);
            int       n        = reader.NumberOfPages;
            Document  document = new Document(PageSize.A4);

            // NOTE(review): the FileStream is never disposed explicitly here; presumably closing the writer releases it - verify.
            PdfWriter writer = PdfWriter.GetInstance(document, new FileStream(destinationFileName, FileMode.Create));

            int i = 0;

            document.Open();

            PdfContentByte cb = writer.DirectContent;


            // 'template' is created but not referenced again in this method.
            PdfTemplate template = cb.CreateTemplate(0, 0);


            while (i < n)
            {
                document.NewPage();
                i++; // page numbers passed to GetImportedPage start at 1

                PdfImportedPage importedPage = writer.GetImportedPage(reader, i);


                // The imported page is added twice: once wrapped as an image, once as a template.
                Image img = Image.GetInstance(importedPage);

                img.ScalePercent(100);
                document.Add(img);
                cb.AddTemplate(importedPage, 0, 100);
            }


            document.Close();
            writer.Close();


            // Part 2: walk the tokens of every page of 'input'.
            PdfReader pdfReader = new PdfReader(input);

            StringBuilder stringBuilder = new StringBuilder();

            // Collects the string tokens; the result is never inspected.
            string dingle = string.Empty;

            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                stringBuilder.Append(pdfManager.ExtractText(pdfReader.GetPageContent(page)) + " ");

                PRTokeniser prTokeniser = new PRTokeniser(pdfReader.GetPageContent(page));


                // The next three locals are assigned but not used afterwards.
                PdfDictionary pdfDictionary = pdfReader.GetPageN(page);

                byte[] dinas = pdfReader.GetPageContent(page);

                string winsdgf = Encoding.GetEncoding(1255).GetString(dinas);


                try
                {
                    while (prTokeniser.NextToken())
                    {
                        if (prTokeniser.TokenType == PRTokeniser.TokType.STRING)
                        {
                            dingle += prTokeniser.StringValue;

                            try
                            {
                                // Abandoned experiments for decoding the token:
                                //dingle += (char)(int.Parse(prTokeniser.StringValue));

                                //dingle += iTextSharp.text.Utilities.ConvertFromUtf32(prTokeniser.FilePointer);

                                //dingle += ((char)prTokeniser.Read()).ToString();

                                // NOTE(review): ReadString(2) presumably consumes input from the tokeniser in the middle of the token loop - confirm this is wanted.
                                dingle += prTokeniser.ReadString(2);
                                Chunk chunk = new Chunk(prTokeniser.StringValue);

                                //string wangle = PRTokeniser.GetHex(prTokeniser.IntValue).ToString();
                            }
                            catch (Exception)
                            {
                                // NOTE(review): failures while decoding a token are silently ignored.
                            }
                        }
                    }
                }
                catch (Exception)
                {
                    // NOTE(review): tokeniser failures for a page are silently ignored.
                    {
                    }
                    //throw;
                }

                //int ij = 0;

                // VB snippet this loop appears to have been adapted from (kept for context):
                //   If Not IsNothing(pageBytes) Then
                //       token = New PRTokeniser(pageBytes)
                //       While token.NextToken()
                //           tknType = token.TokenType()
                //           tknValue = token.StringValue
                //           If tknType = PRTokeniser.TK_STRING Then
                //               sb.Append(token.StringValue)
                //           'I need to add these additional tests to properly add whitespace to the output string
                //           ElseIf tknType = 1 AndAlso tknValue = "-600" Then
                //               sb.Append(" ")
                //           ElseIf tknType = 10 AndAlso tknValue = "TJ" Then
                //               sb.Append(" ")
                //           End If
                //       End While
            }

            // Part 3: the call under test; its result is not checked.
            string actual = pdfManager.ExtractText(input);
        }
Пример #28
0
        /// <summary>
        /// Parses a stream object and removes OCGs: XObjects whose OC dictionary names one of the
        /// layers in <c>ocgs</c> are dropped from the resources, the content stream is re-emitted
        /// through <c>ProcessOperator</c>, and the rewritten bytes replace the stream's data.
        /// </summary>
        /// <param name="stream">	a stream object </param>
        /// <param name="resources">	the resources dictionary of that object (containing info about the OCGs) </param>
        public virtual void Parse(PRStream stream, PdfDictionary resources)
        {
            baos       = new MemoryStream();
            properties = resources.GetAsDict(PdfName.PROPERTIES);
            xobj       = new HashSet2<PdfName>();
            PdfDictionary xobjects = resources.GetAsDict(PdfName.XOBJECT);

            if (xobjects != null)
            {
                // remove XObject (form or image) that belong to an OCG that needs to be removed
                foreach (PdfName name in xobjects.Keys)
                {
                    PRStream xobject = (PRStream)xobjects.GetAsStream(name);
                    if (xobject == null)
                    {
                        // entry does not resolve to a stream; there is nothing to inspect
                        continue;
                    }
                    PdfDictionary oc = xobject.GetAsDict(PdfName.OC);
                    if (oc != null)
                    {
                        PdfString ocname = oc.GetAsString(PdfName.NAME);
                        if (ocname != null && ocgs.Contains(ocname.ToString()))
                        {
                            xobj.Add(name);
                        }
                    }
                }
                // removal happens in a second pass so the dictionary is not modified while it is enumerated
                foreach (PdfName name in xobj)
                {
                    xobjects.Remove(name);
                }
            }
            // parse the content stream
            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(contentBytes));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List<PdfObject>  operands     = new List<PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral @operator = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(this, @operator, operands);
                if ("BI".Equals(@operator.ToString()))
                {
                    // Inline image: copy the raw bytes through to the output until <ws>EI<ws> has been seen.
                    int  found = 0;
                    int  ch;
                    bool immediateAfterBI = true;
                    while ((ch = tokeniser.Read()) != -1)
                    {
                        // the first character is dropped when it is whitespace; everything else is copied
                        if (!immediateAfterBI || !PRTokeniser.IsWhitespace(ch))
                        {
                            baos.WriteByte((byte)ch);
                        }
                        immediateAfterBI = false;
                        if (found == 0 && PRTokeniser.IsWhitespace(ch))
                        {
                            found++;
                        }
                        else if (found == 1 && ch == 'E')
                        {
                            found++;
                        }
                        else if (found == 1 && PRTokeniser.IsWhitespace(ch))
                        {
                            // this clause is needed if we have a white space character that is part of the image data
                            // followed by a whitespace character that precedes the EI operator.  In this case, we need
                            // to flush the first whitespace, then treat the current whitespace as the first potential
                            // character for the end of stream check. Note that we don't increment 'found' here.
                        }
                        else if (found == 2 && ch == 'I')
                        {
                            found++;
                        }
                        else if (found == 3 && PRTokeniser.IsWhitespace(ch))
                        {
                            break;
                        }
                        else
                        {
                            found = 0;
                        }
                    }
                }
            }
            baos.Flush();
            baos.Close();
            // ToArray copies exactly the bytes that were written.  GetBuffer would hand over the
            // whole backing array, including unused capacity, and pad the stream with stray bytes.
            stream.SetData(baos.ToArray());
        }
Пример #29
0
 /**
  * Processes PDF syntax.
  * <b>Note:</b> If you re-use a given {@link PdfContentStreamProcessor}, you must call {@link PdfContentStreamProcessor#reset()}
  * The supplied resources are pushed for the duration of the call and popped again when it ends,
  * including when parsing or an operator handler throws.
  * @param contentBytes  the bytes of a content stream
  * @param resources     the resources that come with the content stream
  */
 public void ProcessContent(byte[] contentBytes, PdfDictionary resources){
     this.resources.Push(resources);
     try {
         PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().CreateSource(contentBytes)));
         PdfContentParser ps = new PdfContentParser(tokeniser);
         List<PdfObject> operands = new List<PdfObject>();
         while (ps.Parse(operands).Count > 0){
             PdfLiteral oper = (PdfLiteral)operands[operands.Count-1];
             if ("BI".Equals(oper.ToString())){
                 // we don't call invokeOperator for embedded images - this is one area of the PDF spec that is particularly nasty and inconsistent
                 PdfDictionary colorSpaceDic = resources != null ? resources.GetAsDict(PdfName.COLORSPACE) : null;
                 HandleInlineImage(InlineImageUtils.ParseInlineImage(ps, colorSpaceDic), colorSpaceDic);
             } else {
                 InvokeOperator(oper, operands);
             }
         }
     } finally {
         // Always undo the Push above so a failing operator or malformed inline image
         // does not leave a stale resource dictionary on the stack.
         this.resources.Pop();
     }
 }
Пример #30
0
        /// <summary>
        /// Collects the line segments drawn on one page of a PDF.  Despite its name, the method
        /// currently reacts to the <c>l</c> operator rather than <c>re</c>: whenever <c>l</c> is
        /// seen, the last four numeric tokens are taken as x1, y1, x2, y2.
        /// </summary>
        /// <param name="sourceFile">path of the PDF to read</param>
        /// <param name="pageNumber">page whose content stream is scanned</param>
        /// <returns>the segments found, sorted with <c>List.Sort()</c></returns>
        /// <exception cref="Exception">an <c>l</c> operator is reached before four numbers were read</exception>
        private static List<Line> FindRectangles(string sourceFile, int pageNumber)
        {
            var listOfLines = new List<Line>();

            // PDF numbers always use '.' as the decimal separator and may be negative, so parse
            // them with the invariant culture and allow a leading sign.
            var numberStyle = System.Globalization.NumberStyles.AllowLeadingSign
                              | System.Globalization.NumberStyles.AllowDecimalPoint;
            var numberFormat = System.Globalization.CultureInfo.InvariantCulture;

            //Bind a reader to our PDF
            using (PdfReader reader = new PdfReader(sourceFile))
            {
                //Buffer of every numeric token seen so far; only its tail is ever read
                List<string> buf = new List<string>();

                //Get the raw bytes for the page and tokenise them
                byte[]      pageBytes = reader.GetPageContent(pageNumber);
                PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(pageBytes));

                while (tokeniser.NextToken())
                {
                    PRTokeniser.TokType tokenType  = tokeniser.TokenType;
                    string              tokenValue = tokeniser.StringValue;

                    if (tokenType == PRTokeniser.TokType.NUMBER)
                    {
                        //Store it in our buffer for later use
                        buf.Add(tokenValue);
                    }
                    else if (tokenType == PRTokeniser.TokType.OTHER && tokenValue == "l")
                    {
                        //Raw commands are categorised as "OTHER"; "l" ends a line segment.
                        //Four values are read below, so four must be present.
                        if (buf.Count < 4)
                        {
                            throw new Exception("Not enough elements in buffer for a line");
                        }

                        float x1 = float.Parse(buf[buf.Count - 4], numberStyle, numberFormat);
                        float y1 = float.Parse(buf[buf.Count - 3], numberStyle, numberFormat);
                        float x2 = float.Parse(buf[buf.Count - 2], numberStyle, numberFormat);
                        float y2 = float.Parse(buf[buf.Count - 1], numberStyle, numberFormat);

                        listOfLines.Add(new Line()
                        {
                            BeginX = x1, BeginY = y1, EndX = x2, EndY = y2
                        });
                    }
                }
            }

            listOfLines.Sort();

            return(listOfLines);
        }
Пример #31
0
        /**
         * Parses the content of a page, replacing appearances of annotations
         * with Form XObjects, and stores the rewritten content stream back into the page.
         * @param page a page dictionary
         * @param pageref the indirect reference of that page
         * @throws DocumentException if the page has no StructParents entry, or if an annotation
         *         without a StructParent entry is left on the page
         * @throws IOException
         */
        public void Parse(PdfDictionary page, PdfIndirectReference pageref)
        {
            LOGGER.Info("Parsing page with reference " + pageref);
            // initializing member variables
            baos         = new MemoryStream();
            this.page    = page;
            this.pageref = pageref;

            structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
            if (structParents == null)
            {
                throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
            }
            annots = page.GetAsArray(PdfName.ANNOTS);
            if (annots == null)
            {
                annots = new PdfArray();
            }
            PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);

            // make sure there is an XObject dictionary to register the new Form XObjects in
            xobjects = resources.GetAsDict(PdfName.XOBJECT);
            if (xobjects == null)
            {
                xobjects = new PdfDictionary();
                resources.Put(PdfName.XOBJECT, xobjects);
            }
            // parsing the content stream of the page
            PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);

            byte[]           contentBytes = PdfReader.GetStreamBytes(stream);
            PRTokeniser      tokeniser    = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
            PdfContentParser ps           = new PdfContentParser(tokeniser);
            List<PdfObject>  operands     = new List<PdfObject>();

            while (ps.Parse(operands).Count > 0)
            {
                PdfLiteral opr = (PdfLiteral)operands[operands.Count - 1];
                ProcessOperator(opr, operands);
            }
            // dealing with orphans
            while (items.Count > 0 && items[0].GetPageref() == pageref.Number)
            {
                StructureObject orphan = items[0] as StructureObject;
                if (orphan == null)
                {
                    // Nothing in this loop consumes a head item of another kind, so looping
                    // on would never terminate; leave it in the list and stop here.
                    break;
                }
                ConvertToXObject(orphan);
                items.RemoveAt(0);
            }
            // Emptiness is tested with Size, the same element count the loop below iterates over.
            if (annots.Size == 0)
            {
                page.Remove(PdfName.ANNOTS);
            }
            else
            {
                PdfDictionary annot;
                for (int i = 0; i < annots.Size; i++)
                {
                    annot = annots.GetAsDict(i);
                    if (annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                    {
                        throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
                    }
                }
            }
            // replacing the content stream
            baos.Flush();
            baos.Close();
            stream.SetData(baos.ToArray());
            // showing how many items are left
            LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
        }
Пример #32
0
 /**
  * Parses the content of a page, replacing appearances of annotations
  * with Form XObjects, and stores the rewritten content stream back into the page.
  * @param page a page dictionary
  * @param pageref the indirect reference of that page
  * @throws DocumentException if the page has no StructParents entry, or if an annotation
  *         without a StructParent entry is left on the page
  * @throws IOException
  */
 virtual public void Parse(PdfDictionary page, PdfIndirectReference pageref) {
     LOGGER.Info("Parsing page with reference " + pageref);
     // initializing member variables
     baos = new MemoryStream();
     this.page = page;
     this.pageref = pageref;
     
     structParents = page.GetAsNumber(PdfName.STRUCTPARENTS);
     if(structParents == null)
         throw new DocumentException(MessageLocalization.GetComposedMessage("can.t.read.document.structure"));
     annots = page.GetAsArray(PdfName.ANNOTS);
     if(annots == null)
         annots = new PdfArray();
     PdfDictionary resources = page.GetAsDict(PdfName.RESOURCES);
     // make sure there is an XObject dictionary to register the new Form XObjects in
     xobjects = resources.GetAsDict(PdfName.XOBJECT);
     if (xobjects == null) {
         xobjects = new PdfDictionary();
         resources.Put(PdfName.XOBJECT, xobjects);
     }
     // parsing the content stream of the page
     PRStream stream = (PRStream) page.GetAsStream(PdfName.CONTENTS);
     byte[] contentBytes = PdfReader.GetStreamBytes(stream);
     PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.CreateSource(contentBytes)));
     PdfContentParser ps = new PdfContentParser(tokeniser);
     List<PdfObject> operands = new List<PdfObject>();
     while (ps.Parse(operands).Count > 0) {
         PdfLiteral opr = (PdfLiteral) operands[operands.Count - 1];
         ProcessOperator(opr, operands);
     }
     // dealing with orphans
     while (items.Count > 0 && items[0].GetPageref() == pageref.Number) {
         StructureObject orphan = items[0] as StructureObject;
         if (orphan == null) {
             // Nothing in this loop consumes a head item of another kind, so looping
             // on would never terminate; leave it in the list and stop here.
             break;
         }
         ConvertToXObject(orphan);
         items.RemoveAt(0);
     }
     // Emptiness is tested with Size, the same element count the loop below iterates over.
     if(annots.Size == 0) {
         page.Remove(PdfName.ANNOTS);
     }
     else {
         PdfDictionary annot;
         for(int i = 0; i < annots.Size; i++) {
             annot = annots.GetAsDict(i);
             if(annot.GetAsNumber(PdfName.STRUCTPARENT) == null)
                 throw new DocumentException(MessageLocalization.GetComposedMessage("could.not.flatten.file.untagged.annotations.found"));
         }
     }
     // replacing the content stream
     baos.Flush();
     baos.Close();
     stream.SetData(baos.ToArray());
     // showing how many items are left
     LOGGER.Info(String.Format("There are {0} items left for processing", items.Count));
 }