예제 #1
0
        /// <summary>
        /// Builds the list of object numbers described by the Index entry of the stream dictionary.
        /// </summary>
        /// <remarks>
        /// The Index array is read as consecutive pairs of (first object number, count). When the
        /// entry is absent, a single pair made of zero and the dictionary's Size entry is used instead.
        /// </remarks>
        /// <param name="stream">The stream whose dictionary supplies the Index and Size entries.</param>
        /// <returns>Every object number covered by the pairs, in the order the pairs appear.</returns>
        private static List<long> GetObjectNumbers(PdfRawStream stream)
        {
            var indexArray = (COSArray)stream.Dictionary.GetDictionaryObject(CosName.INDEX);

            // If Index doesn't exist, fall back to one range starting at zero and spanning Size objects.
            if (indexArray == null)
            {
                indexArray = new COSArray();
                indexArray.add(CosInt.Zero);
                indexArray.add(stream.Dictionary.GetDictionaryObject(CosName.SIZE));
            }

            var objNums = new List<long>();

            // Walk the array two elements at a time. The bound is "i + 1 < Count" so that a trailing
            // element without a matching count is ignored instead of reading past the end of the array.
            for (int i = 0; i + 1 < indexArray.Count; i += 2)
            {
                var firstId = ((CosInt)indexArray.get(i)).AsLong();
                var count   = ((CosInt)indexArray.get(i + 1)).AsInt();

                for (int j = 0; j < count; j++)
                {
                    objNums.Add(firstId + j);
                }
            }

            return objNums;
        }
예제 #2
0
        /// <summary>
        /// Parses the stream that follows a dictionary and reads the line expected to close the object.
        /// </summary>
        /// <param name="reader">The input positioned at the stream data.</param>
        /// <param name="currentBase">The object read immediately before the stream; it must be a <see cref="PdfDictionary"/>.</param>
        /// <param name="offset">Only used to report the position in the error message.</param>
        /// <param name="isLenientParsing">Forwarded unchanged to the stream parser.</param>
        /// <param name="endObjectKey">
        /// Receives the line read after the stream. If that line starts with an extra "endstream", the
        /// remainder of the line is returned, or the following line when nothing else is left on it.
        /// </param>
        /// <returns>The stream produced by the stream parser.</returns>
        /// <exception cref="InvalidOperationException"><paramref name="currentBase"/> is not a dictionary.</exception>
        private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset,
                                               bool isLenientParsing,
                                               out string endObjectKey)
        {
            // A stream object is only complete as "dictionary + stream/endstream"; anything else is not legal.
            if (!(currentBase is PdfDictionary dictionary))
            {
                throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
            }

            PdfRawStream stream = streamParser.Parse(reader, dictionary, isLenientParsing, null);

            ReadHelper.SkipSpaces(reader);
            endObjectKey = ReadHelper.ReadLine(reader);

            // Some files carry a second 'endstream' before 'endobj'. The keyword is matched ordinally:
            // it is a fixed token, not linguistic text, so the current culture must not affect the result.
            // (A line starting with "endstream" cannot also start with "endobj", so one test is enough.)
            if (endObjectKey.StartsWith("endstream", StringComparison.Ordinal))
            {
                endObjectKey = endObjectKey.Substring("endstream".Length).Trim();

                if (endObjectKey.Length == 0)
                {
                    // Nothing else on the extra 'endstream' line: the closing key is on the next line.
                    endObjectKey = ReadHelper.ReadLine(reader);
                }
            }

            return stream;
        }
예제 #3
0
        /// <summary>
        /// Reads the objects stored inside an object stream.
        /// </summary>
        /// <param name="stream">The object stream; its dictionary entry N gives the number of header pairs to read.</param>
        /// <param name="pool">Passed through to the base parser for every object that is read.</param>
        /// <returns>The parsed objects, each with generation number zero and the object number taken from the header.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        public IReadOnlyList<CosObject> Parse(PdfRawStream stream, CosObjectPool pool)
        {
            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            var declaredCount = stream.Dictionary.GetIntOrDefault(CosName.N);
            var headerNumbers = new List<long>(declaredCount);
            var results       = new List<CosObject>(declaredCount);

            var reader = new RandomAccessBuffer(stream.Decode(filterProvider));

            // Header: one (object number, offset) pair per declared object. Only the number is kept.
            var pairsRead = 0;
            while (pairsRead < declaredCount)
            {
                long number = ObjectHelper.ReadObjectNumber(reader);
                ReadHelper.ReadLong(reader);
                headerNumbers.Add(number);
                pairsRead++;
            }

            // Body: keep parsing until the base parser has nothing more to return.
            // results.Count doubles as the index of the next header number to assign.
            while (true)
            {
                CosBase parsed = baseParser.Parse(reader, pool);
                if (parsed == null)
                {
                    break;
                }

                var wrapped = new CosObject(parsed);
                wrapped.SetGenerationNumber(0);

                if (results.Count >= headerNumbers.Count)
                {
                    log.Error("/ObjStm (object stream) has more objects than /N " + declaredCount);
                    break;
                }

                wrapped.SetObjectNumber(headerNumbers[results.Count]);
                results.Add(wrapped);

                // Objects in an object stream should not be wrapped in obj/endobj, but some files
                // do it anyway: when the next character is 'e', the rest of that line is skipped.
                var atEnd = reader.IsEof();
                if (!atEnd && reader.Peek() == 'e')
                {
                    ReadHelper.ReadLine(reader);
                }
            }

            return results;
        }
예제 #4
0
        /// <summary>
        /// Reads the data section of a stream object and returns it together with its dictionary.
        /// </summary>
        /// <param name="reader">The input, positioned at the "stream" keyword.</param>
        /// <param name="streamDictionary">The dictionary that precedes the stream; its Length and Type entries are read here.</param>
        /// <param name="isLenientParsing">When true, an "endobj" terminator or trailing characters after "endstream" are logged as warnings instead of causing an exception.</param>
        /// <param name="parser">Forwarded to GetLength.</param>
        /// <returns>A <see cref="PdfRawStream"/> built from the bytes copied out of the reader and <paramref name="streamDictionary"/>.</returns>
        /// <exception cref="InvalidOperationException">The token after the data is not "endstream" and is not one of the cases tolerated by lenient parsing.</exception>
        public PdfRawStream Parse(IRandomAccessRead reader, PdfDictionary streamDictionary, bool isLenientParsing, IPdfObjectParser parser)
        {
            PdfRawStream result;

            // Consume the 'stream' keyword that opens the data section.
            ReadHelper.ReadExpectedString(reader, "stream");

            skipWhiteSpaces(reader);

            // Length is fetched with GetItemOrDefault: per the original note, the underlying object
            // might still be null while parsing is in progress.
            ICosNumber streamLength = GetLength(reader, streamDictionary.GetItemOrDefault(CosName.LENGTH), streamDictionary.GetName(CosName.TYPE), isLenientParsing, parser);

            // NOTE(review): this helper and the lower-case validateStreamLength used below differ only
            // in casing and take different arguments - presumably two separate methods; confirm.
            ValidateStreamLength(reader, isLenientParsing, streamLength);

            // Copy the stream data into an in-memory buffer.
            using (var stream = new MemoryStream())
                using (var writer = new BinaryWriter(stream))
                {
                    // Use the declared length only when one is available and the check accepts it;
                    // otherwise fall back to reading until the end of the stream data is found.
                    if (streamLength != null && validateStreamLength(reader, streamLength.AsLong(), reader.Length()))
                    {
                        ReadValidStream(reader, writer, streamLength);
                    }
                    else
                    {
                        ReadUntilEndStream(reader, writer);
                    }

                    result = new PdfRawStream(stream.ToArray(), streamDictionary);
                }

            // The next token should be the 'endstream' keyword.
            String endStream = ReadHelper.ReadString(reader);

            if (endStream.Equals("endobj") && isLenientParsing)
            {
                log.Warn($"stream ends with \'endobj\' instead of \'endstream\' at offset {reader.GetPosition()}");

                // avoid follow-up warning about missing endobj
                reader.Rewind("endobj".Length);
            }
            else if (endStream.Length > 9 && isLenientParsing && endStream.Substring(0, 9).Equals("endstream"))
            {
                // 9 is the length of "endstream": the token starts with the keyword but has extra characters attached.
                log.Warn("stream ends with '" + endStream + "' instead of 'endstream' at offset " + reader.GetPosition());
                // unread the "extra" bytes
                reader.Rewind(OtherEncodings.StringAsLatin1Bytes(endStream.Substring(9)).Length);
            }
            else if (!endStream.Equals("endstream"))
            {
                // Reached for any other token, and also for the two cases above when parsing is not lenient.
                throw new InvalidOperationException("Error reading stream, expected='endstream' actual='"
                                                    + endStream + "' at offset " + reader.GetPosition());
            }

            return(result);
        }
예제 #5
0
        /// <summary>
        /// Decodes a cross-reference stream and collects its entries into a <see cref="CrossReferenceTablePart"/>.
        /// </summary>
        /// <param name="streamOffset">Stored as the offset of the resulting part.</param>
        /// <param name="stream">
        /// The cross-reference stream. Its dictionary supplies the W field widths, the Prev value and,
        /// through GetObjectNumbers, the object numbers that the rows are matched against.
        /// </param>
        /// <returns>The part built from every type 1 and type 2 row; type 0 rows are skipped.</returns>
        /// <exception cref="IOException">
        /// The W entry is missing or is not an array, or the three widths do not add up to a positive row size.
        /// </exception>
        public CrossReferenceTablePart Parse(long streamOffset, PdfRawStream stream)
        {
            var w = stream.Dictionary.GetDictionaryObject(CosName.W);

            if (!(w is COSArray format))
            {
                throw new IOException("/W array is missing in Xref stream");
            }

            var objNums = GetObjectNumbers(stream);

            // Width in bytes of each of the three fields of a row, and of the row as a whole.
            int w0       = format.getInt(0);
            int w1       = format.getInt(1);
            int w2       = format.getInt(2);
            int lineSize = w0 + w1 + w2;

            // A row size of zero would otherwise surface as a bare division by zero below.
            if (lineSize <= 0)
            {
                throw new IOException($"/W array in Xref stream describes an empty entry (widths: {w0}, {w1}, {w2}).");
            }

            var decoded = stream.Decode(filterProvider);

            // Integer division: trailing bytes that do not form a complete row are ignored.
            var lineCount  = decoded.Length / lineSize;
            var lineNumber = 0;

            var builder = new CrossReferenceTablePartBuilder
            {
                Offset     = streamOffset,
                Previous   = stream.Dictionary.GetLongOrDefault(CosName.PREV),
                Dictionary = stream.Dictionary,
                XRefType   = CrossReferenceType.Stream
            };

            using (IEnumerator<long> objIter = objNums.GetEnumerator())
            {
                var currLine = new byte[lineSize];

                // Rows and object numbers are consumed in lock step; stop when either runs out.
                while (lineNumber < lineCount && objIter.MoveNext())
                {
                    var byteOffset = lineNumber * lineSize;
                    for (int i = 0; i < lineSize; i++)
                    {
                        currLine[i] = decoded[byteOffset + i];
                    }

                    // "If the first element is zero, the type field shall not be present,
                    // and shall default to type 1".
                    int type = w0 == 0 ? 1 : ReadBigEndianField(currLine, 0, w0);

                    long objectId = objIter.Current;

                    switch (type)
                    {
                        case 0:
                            // Free object: nothing to record.
                            break;

                        case 1:
                            // Second field is the offset, third field the generation number.
                            int offset = ReadBigEndianField(currLine, w0, w1);
                            int genNum = ReadBigEndianField(currLine, w0 + w1, w2);

                            builder.Add(objectId, genNum, offset);
                            break;

                        case 2:
                            // Object stored in an object stream: the second field is the object number
                            // of that stream. It is recorded negated so that it can be told apart from
                            // a file offset; the third field (index inside the stream) is not used.
                            int objstmObjNr = ReadBigEndianField(currLine, w0, w1);

                            builder.Add(objectId, 0, -objstmObjNr);
                            break;
                    }

                    lineNumber++;
                }
            }

            return builder.Build();
        }

        /// <summary>
        /// Combines <paramref name="width"/> bytes of <paramref name="row"/>, starting at
        /// <paramref name="start"/>, into one integer with the first byte as the most significant.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the value is accumulated in a 32-bit int, exactly as the inlined loops did, so
        /// fields wider than four bytes or values above int.MaxValue are not represented correctly.
        /// </remarks>
        private static int ReadBigEndianField(byte[] row, int start, int width)
        {
            int value = 0;

            for (int i = 0; i < width; i++)
            {
                value += (row[start + i] & 0x00ff) << ((width - i - 1) * 8);
            }

            return value;
        }