PdfContentParser.GetTokeniser C# (CSharp)のコード例

コード例 #1

0

ファイルを表示

ファイル: InlineImageUtils.cs プロジェクト: RawanRadi/Simple-PDFMerge

        /**
         * Parses the next inline image dictionary from the parser.  The parser must be positioned immediately following the BI operator.
         * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
         * @throws IOException if the parse fails
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps)
        {
            // The caller has already consumed the BI operator; what follows is a run of
            // key/value objects that ends at the ID operator (or at the end of the content).
            PdfDictionary result = new PdfDictionary();

            PdfObject key = ps.ReadPRObject();
            while (key != null && !"ID".Equals(key.ToString()))
            {
                PdfObject rawValue = ps.ReadPRObject();

                // Look the key up in the abbreviation map; keys without a mapping are used unchanged.
                PdfName name = (PdfName)key;
                PdfName fullName;
                if (!inlineImageEntryAbbreviationMap.TryGetValue(name, out fullName) || fullName == null)
                {
                    fullName = name;
                }

                result.Put(fullName, GetAlternateValue(fullName, rawValue));

                key = ps.ReadPRObject();
            }

            // Exactly one character is consumed after the loop and it has to be whitespace.
            int next = ps.GetTokeniser().Read();
            if (PRTokeniser.IsWhitespace(next))
            {
                return result;
            }

            throw new IOException("Unexpected character " + next + " found after ID in inline image");
        }

コード例 #2

0

ファイルを表示

ファイル: InlineImageUtils.cs プロジェクト: RawanRadi/Simple-PDFMerge

        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image
         * @param colorSpaceDic the color space dictionary, passed on to ComputeBytesPerRow
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // Reads a fixed number of sample bytes (bytes per row * height) straight from the
            // tokeniser and then expects the EI operator.  Malformed input is reported as an
            // InlineImageParseException rather than as a NullReferenceException or
            // IndexOutOfRangeException.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
            if (h == null)
            {
                throw new InlineImageParseException("Inline image dictionary does not specify a height");
            }

            int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
            if (bytesToRead < 0)
            {
                // a negative size would otherwise blow up in the array allocation below
                throw new InlineImageParseException("Inline image has a negative data length");
            }

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
            // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
            int shouldBeWhiteSpace = tokeniser.Read();
            if (shouldBeWhiteSpace == -1)
            {
                // end of stream must not be mistaken for the data byte 0xFF
                throw new InlineImageParseException("End of content stream reached before end of image data");
            }

            int startIndex = 0;
            if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)  // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
            {
                if (bytesToRead == 0)
                {
                    // there is no room for a data byte in a zero-length image
                    throw new InlineImageParseException("Unexpected character " + shouldBeWhiteSpace + " found after ID in inline image");
                }

                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }

            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }

            // ReadPRObject may return null (the dictionary parser checks for that), so guard before calling ToString.
            PdfObject ei = ps.ReadPRObject();
            if (ei == null || !"EI".Equals(ei.ToString()))
            {
                // Some PDF producers seem to add another non-whitespace character after the image data.
                // Let's try to handle that case here.
                PdfObject ei2 = ps.ReadPRObject();
                if (ei2 == null || !"EI".Equals(ei2.ToString()))
                {
                    throw new InlineImageParseException("EI not found after end of image data");
                }
            }

            return bytes;
        }

コード例 #3

0

ファイルを表示

        /**
         * Parses the samples of the image from the underlying content parser, ignoring all filters.
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * This is primarily useful if no filters have been applied.
         * @param imageDictionary the dictionary of the inline image
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfContentParser ps)
        {
            // Reads a fixed number of sample bytes (bytes per row * height) straight from the
            // tokeniser and then expects the EI operator.  Malformed input is reported as an
            // InlineImageParseException.
            if (imageDictionary.Contains(PdfName.FILTER))
            {
                throw new ArgumentException("Dictionary contains filters");
            }

            PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
            if (h == null)
            {
                throw new InlineImageParseException("Inline image dictionary does not specify a height");
            }

            int bytesToRead = ComputeBytesPerRow(imageDictionary) * h.IntValue;
            if (bytesToRead < 0)
            {
                // a negative size would otherwise blow up in the array allocation below
                throw new InlineImageParseException("Inline image has a negative data length");
            }

            byte[]      bytes     = new byte[bytesToRead];
            PRTokeniser tokeniser = ps.GetTokeniser();

            // The character after ID should be a single whitespace, but it is not blindly discarded:
            // if it is not whitespace it is kept as the first data byte so the image stays aligned.
            int shouldBeWhiteSpace = tokeniser.Read();
            if (shouldBeWhiteSpace == -1)
            {
                throw new InlineImageParseException("End of content stream reached before end of image data");
            }

            int startIndex = 0;
            if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0)  // a 0 byte is treated as data here, not as whitespace
            {
                if (bytesToRead == 0)
                {
                    // there is no room for a data byte in a zero-length image
                    throw new InlineImageParseException("Unexpected character " + shouldBeWhiteSpace + " found after ID in inline image");
                }

                bytes[0] = (byte)shouldBeWhiteSpace;
                startIndex++;
            }

            for (int i = startIndex; i < bytesToRead; i++)
            {
                int ch = tokeniser.Read();
                if (ch == -1)
                {
                    throw new InlineImageParseException("End of content stream reached before end of image data");
                }

                bytes[i] = (byte)ch;
            }

            // ReadPRObject may return null when nothing is left to read, so guard before calling ToString.
            PdfObject ei = ps.ReadPRObject();
            if (ei == null || !"EI".Equals(ei.ToString()))
            {
                // Tolerate one stray token between the image data and EI.
                PdfObject ei2 = ps.ReadPRObject();
                if (ei2 == null || !"EI".Equals(ei2.ToString()))
                {
                    throw new InlineImageParseException("EI not found after end of image data");
                }
            }

            return bytes;
        }

コード例 #4

0

ファイルを表示

ファイル: InlineImageUtils.cs プロジェクト: RawanRadi/Simple-PDFMerge

        /**
         * Parses the samples of the image from the underlying content parser, accounting for filters
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * <b>Note:</b>This implementation does not actually apply the filters at this time
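         * @param imageDictionary the dictionary of the inline image
         * @param colorSpaceDic the color space dictionary, passed on to ParseUnfilteredSamples when the image has no filter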
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps)
        {
            // by the time we get to here, we have already parsed the ID operator

            if (!imageDictionary.Contains(PdfName.FILTER))
            {
                return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
            }

            // read all content until we reach an EI operator surrounded by whitespace.
            // Known limitations: the image stream itself may contain <ws>EI<ws> (mitigated by the
            // completeness check below), and some streams don't have the <ws> before the EI operator.
            MemoryStream baos        = new MemoryStream(); // bytes known to be image data
            MemoryStream accumulated = new MemoryStream(); // bytes of a potential <ws>EI terminator, not yet committed
            int          found       = 0;                  // how many characters of <ws>,E,I have been matched
            PRTokeniser  tokeniser   = ps.GetTokeniser();

            int ch;
            while ((ch = tokeniser.Read()) != -1)
            {
                bool isWhitespace = PRTokeniser.IsWhitespace(ch);

                if (found == 1 && ch == 'E')
                {
                    found = 2;
                    accumulated.WriteByte((byte)ch);
                    continue;
                }

                if (found == 2 && ch == 'I')
                {
                    found = 3;
                    accumulated.WriteByte((byte)ch);
                    continue;
                }

                if (found == 3 && isWhitespace)
                {
                    // <ws>EI<ws> seen: accept it only if the bytes collected so far form a complete image
                    byte[] tmp = baos.ToArray();
                    if (InlineImageStreamBytesAreComplete(tmp, imageDictionary))
                    {
                        return tmp;
                    }
                    // otherwise the match was image data; fall through and treat it as a mismatch
                }

                // Mismatch (or rejected match): whatever was held back is image data after all.
                byte[] pending = accumulated.ToArray();
                baos.Write(pending, 0, pending.Length);
                accumulated.SetLength(0);

                if (isWhitespace)
                {
                    // A whitespace character can always be the start of the real <ws>EI<ws>, including
                    // when it interrupts a partial match or follows a rejected one, so hold it back
                    // instead of committing it as data.
                    accumulated.WriteByte((byte)ch);
                    found = 1;
                }
                else
                {
                    baos.WriteByte((byte)ch);
                    found = 0;
                }
            }

            throw new InlineImageParseException("Could not find image data or EI");
        }

コード例 #5

0

ファイルを表示

        /**
         * Parses the samples of the image from the underlying content parser, accounting for filters
         * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
         * The parser will be left positioned immediately following the EI operator.
         * <b>Note:</b>This implementation does not actually apply the filters at this time
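         * @param imageDictionary the dictionary of the inline image
         * @param colorSpaceDic the color space dictionary, passed on to ParseUnfilteredSamples and PdfImageObject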
         * @param ps the content parser
         * @return the samples of the image
         * @throws IOException if anything bad happens during parsing
         */
        private static byte[] ParseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) {
            // by the time we get to here, we have already parsed the ID operator

            if (!imageDictionary.Contains(PdfName.FILTER)) {
                return ParseUnfilteredSamples(imageDictionary, colorSpaceDic, ps);
            }

            // read all content until we reach an EI operator surrounded by whitespace.
            // Known limitations: the image stream itself may contain <ws>EI<ws> (mitigated by the
            // validity check below), and some streams don't have the <ws> before the EI operator.
            MemoryStream baos = new MemoryStream();        // bytes known to be image data
            MemoryStream accumulated = new MemoryStream(); // bytes of a potential <ws>EI terminator, not yet committed
            int found = 0;                                 // how many characters of <ws>,E,I have been matched
            PRTokeniser tokeniser = ps.GetTokeniser();

            int ch;
            while ((ch = tokeniser.Read()) != -1) {
                bool isWhitespace = PRTokeniser.IsWhitespace(ch);

                if (found == 1 && ch == 'E') {
                    found = 2;
                    accumulated.WriteByte((byte)ch);
                    continue;
                }

                if (found == 2 && ch == 'I') {
                    found = 3;
                    accumulated.WriteByte((byte)ch);
                    continue;
                }

                if (found == 3 && isWhitespace) {
                    // <ws>EI<ws> seen: constructing a PdfImageObject is used as the validity check.
                    // If the constructor throws, the match is treated as part of the image data.
                    try {
                        byte[] tmp = baos.ToArray();
                        new PdfImageObject(imageDictionary, tmp, colorSpaceDic);
                        return tmp;
                    } catch (Exception) {
                        // deliberately ignored: fall through and handle as a mismatch
                    }
                }

                // Mismatch (or rejected match): whatever was held back is image data after all.
                byte[] pending = accumulated.ToArray();
                baos.Write(pending, 0, pending.Length);
                accumulated.SetLength(0);

                if (isWhitespace) {
                    // A whitespace character can always be the start of the real <ws>EI<ws>, including
                    // when it interrupts a partial match or follows a rejected one, so hold it back
                    // instead of committing it as data.
                    accumulated.WriteByte((byte)ch);
                    found = 1;
                } else {
                    baos.WriteByte((byte)ch);
                    found = 0;
                }
            }

            throw new InlineImageParseException("Could not find image data or EI");
        }

コード例 #6

0

ファイルを表示

 /**
  * Parses the samples of the image from the underlying content parser, ignoring all filters.
  * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary.
  * The parser will be left positioned immediately following the EI operator.
  * This is primarily useful if no filters have been applied. 
  * @param imageDictionary the dictionary of the inline image
  * @param colorSpaceDic the color space dictionary, passed on to ComputeBytesPerRow
  * @param ps the content parser
  * @return the samples of the image
  * @throws IOException if anything bad happens during parsing
  */
 private static byte[] ParseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) {
     // Reads a fixed number of sample bytes (bytes per row * height) straight from the
     // tokeniser and then expects the EI operator.  Malformed input is reported as an
     // InlineImageParseException rather than as a NullReferenceException or
     // IndexOutOfRangeException.
     if (imageDictionary.Contains(PdfName.FILTER)) {
         throw new ArgumentException("Dictionary contains filters");
     }

     PdfNumber h = imageDictionary.GetAsNumber(PdfName.HEIGHT);
     if (h == null) {
         throw new InlineImageParseException("Inline image dictionary does not specify a height");
     }

     int bytesToRead = ComputeBytesPerRow(imageDictionary, colorSpaceDic) * h.IntValue;
     if (bytesToRead < 0) {
         // a negative size would otherwise blow up in the array allocation below
         throw new InlineImageParseException("Inline image has a negative data length");
     }

     byte[] bytes = new byte[bytesToRead];
     PRTokeniser tokeniser = ps.GetTokeniser();

     // from the PDF spec:  Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data.
     // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it
     int shouldBeWhiteSpace = tokeniser.Read();
     if (shouldBeWhiteSpace == -1) {
         // end of stream must not be mistaken for the data byte 0xFF
         throw new InlineImageParseException("End of content stream reached before end of image data");
     }

     int startIndex = 0;
     if (!PRTokeniser.IsWhitespace(shouldBeWhiteSpace) || shouldBeWhiteSpace == 0) { // tokeniser treats 0 as whitespace, but for our purposes, we shouldn't
         if (bytesToRead == 0) {
             // there is no room for a data byte in a zero-length image
             throw new InlineImageParseException("Unexpected character " + shouldBeWhiteSpace + " found after ID in inline image");
         }
         bytes[0] = (byte)shouldBeWhiteSpace;
         startIndex++;
     }

     for (int i = startIndex; i < bytesToRead; i++) {
         int ch = tokeniser.Read();
         if (ch == -1) {
             throw new InlineImageParseException("End of content stream reached before end of image data");
         }

         bytes[i] = (byte)ch;
     }

     // ReadPRObject may return null when nothing is left to read, so guard before calling ToString.
     PdfObject ei = ps.ReadPRObject();
     if (ei == null || !"EI".Equals(ei.ToString())) {
         // Tolerate one stray token between the image data and EI.
         PdfObject ei2 = ps.ReadPRObject();
         if (ei2 == null || !"EI".Equals(ei2.ToString())) {
             throw new InlineImageParseException("EI not found after end of image data");
         }
     }

     return bytes;
 }

コード例 #7

0

ファイルを表示

        /**
         * Parses the next inline image dictionary from the parser.  The parser must be positioned immediately following the BI operator.
         * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary.
         * @param ps the parser to extract the embedded image information from
         * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values
         * @throws IOException if the parse fails
         */
        private static PdfDictionary ParseInlineImageDictionary(PdfContentParser ps) {
            // BI has already been consumed; collect key/value pairs until ID (or until nothing is left to read).
            PdfDictionary entries = new PdfDictionary();

            while (true) {
                PdfObject token = ps.ReadPRObject();
                if (token == null || "ID".Equals(token.ToString())) {
                    break;
                }

                PdfObject tokenValue = ps.ReadPRObject();

                // Use the mapped name when the abbreviation map has one, otherwise the key itself.
                PdfName shortName = (PdfName)token;
                PdfName mappedName;
                bool isMapped = inlineImageEntryAbbreviationMap.TryGetValue(shortName, out mappedName);
                PdfName entryKey = (isMapped && mappedName != null) ? mappedName : shortName;

                entries.Put(entryKey, GetAlternateValue(entryKey, tokenValue));
            }

            // The single character read after the loop has to be whitespace.
            int trailing = ps.GetTokeniser().Read();
            if (!PRTokeniser.IsWhitespace(trailing)) {
                throw new IOException("Unexpected character " + trailing + " found after ID in inline image");
            }

            return entries;
        }

C# (CSharp) PdfContentParser.GetTokeniserの例