Example #1
0
        /// <summary>
        /// Checks whether the object header built from <paramref name="objectKey"/> is present
        /// at <paramref name="offset"/> in <paramref name="source"/>.
        /// The position of <paramref name="source"/> is restored before returning.
        /// </summary>
        /// <param name="source">The input to probe.</param>
        /// <param name="objectKey">Object number and generation to look for.</param>
        /// <param name="offset">Offset at which the object header is expected.</param>
        /// <returns>
        /// <c>true</c> if the expected header bytes are found at the offset; <c>false</c> if the
        /// offset is below <c>MINIMUM_SEARCH_OFFSET</c>, the bytes differ, or probing raised an
        /// <see cref="InvalidOperationException"/>.
        /// </returns>
        private bool checkObjectKeys(IRandomAccessRead source, CosObjectKey objectKey, long offset)
        {
            // there can't be any object at the very beginning of a pdf
            if (offset < MINIMUM_SEARCH_OFFSET)
            {
                return false;
            }

            long   objectNr     = objectKey.Number;
            long   objectGen    = objectKey.Generation;
            long   originOffset = source.GetPosition();
            string objectString = ObjectHelper.createObjectString(objectNr, objectGen);

            try
            {
                source.Seek(offset);
                return ReadHelper.IsString(source, OtherEncodings.StringAsLatin1Bytes(objectString));
            }
            catch (InvalidOperationException)
            {
                // Deliberately swallowed: a failed probe just means there is no valid object here.
                return false;
            }
            finally
            {
                // Single place where the caller's position is restored, on every path.
                source.Seek(originOffset);
            }
        }
Example #2
0
        /// <summary>
        /// Verifies that a stream of <paramref name="streamLength"/> bytes starting at the current
        /// position of <paramref name="source"/> is followed (after optional spaces) by the
        /// <c>endstream</c> keyword.
        /// </summary>
        /// <param name="source">The input, positioned at the first byte of the stream data.</param>
        /// <param name="streamLength">The declared length of the stream.</param>
        /// <param name="fileLength">The total length of the input.</param>
        /// <returns>
        /// <c>true</c> if the declared length stays inside the file and leads to <c>endstream</c>;
        /// otherwise <c>false</c>, signalling that the caller should use a workaround to read the stream.
        /// </returns>
        private bool validateStreamLength(IRandomAccessRead source, long streamLength, long fileLength)
        {
            long originOffset = source.GetPosition();

            // Overflow-safe form of "originOffset + streamLength > fileLength": the declared length
            // comes from the (untrusted) document, so a huge value must not wrap around to a
            // negative end position, and a negative length can never be valid.
            if (streamLength < 0 || streamLength > fileLength - originOffset)
            {
                // The end of the stream is out of range.
                return false;
            }

            long expectedEndOfStream = originOffset + streamLength;

            source.Seek(expectedEndOfStream);
            ReadHelper.SkipSpaces(source);

            // If this is false, the declared length doesn't point to the correct offset.
            bool streamLengthIsValid = ReadHelper.IsString(source, "endstream");

            source.Seek(originOffset);

            return streamLengthIsValid;
        }
Example #3
0
 /// <summary>
 /// Forwards the seek to the wrapped reader unless the <c>Throw</c> flag is set.
 /// </summary>
 /// <param name="position">The position handed to the wrapped reader.</param>
 /// <exception cref="InvalidOperationException">Thrown when <c>Throw</c> is set.</exception>
 public void Seek(long position)
 {
     if (!Throw)
     {
         reader.Seek(position);
         return;
     }

     throw new InvalidOperationException();
 }
Example #4
0
        /// <summary>
        /// Checks whether <paramref name="startXRefOffset"/> points at an indirect object whose
        /// dictionary is of type <c>XREF</c>, i.e. a cross-reference stream.
        /// </summary>
        /// <param name="source">The input to inspect.</param>
        /// <param name="startXRefOffset">The offset to verify.</param>
        /// <param name="isLenient">When <c>false</c> no check is made and <c>true</c> is returned.</param>
        /// <param name="pool">Object pool handed to the dictionary parser.</param>
        /// <returns>
        /// <c>true</c> if the check is skipped (non-lenient mode or offset 0) or an XREF-typed
        /// object dictionary was read at the offset; otherwise <c>false</c>.
        /// </returns>
        private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
        {
            // repair mode isn't available in non-lenient mode
            if (!isLenient || startXRefOffset == 0)
            {
                return true;
            }

            // look at the byte directly in front of the offset: it has to be whitespace
            source.Seek(startXRefOffset - 1);
            int precedingByte = source.Read();
            if (!ReadHelper.IsWhitespace(precedingByte))
            {
                return false;
            }

            // ... and the first non-space character after it has to be a digit
            ReadHelper.SkipSpaces(source);
            if (!ReadHelper.IsDigit(source))
            {
                return false;
            }

            try
            {
                // read the "<number> <generation> obj" header of the candidate xref stream
                ObjectHelper.ReadObjectNumber(source);
                ObjectHelper.ReadGenerationNumber(source);
                ReadHelper.ReadExpectedString(source, "obj", true);

                // the dictionary type is checked to avoid false positives
                PdfDictionary candidate = dictionaryParser.Parse(source, baseParser, pool);
                source.Seek(startXRefOffset);

                return candidate.IsType(CosName.XREF);
            }
            catch (Exception ex)
            {
                log.Error("Couldn't read the xref stream object.", ex);
                // there wasn't an object of a xref stream; go back to the offset under test
                source.Seek(startXRefOffset);
                return false;
            }
        }
Example #5
0
        /// <summary>
        /// Resolves the length entry of a stream dictionary to a number.
        /// </summary>
        /// <param name="source">The input; its position is saved and restored around parsing a referenced length.</param>
        /// <param name="lengthBaseObj">The length entry: a number, an indirect object, or <c>null</c>.</param>
        /// <param name="streamType">Not used by this method.</param>
        /// <param name="isLenientParsing">Forwarded to <paramref name="parser"/>.</param>
        /// <param name="parser">Parser used when the referenced length object has not been loaded yet.</param>
        /// <returns>The length, or <c>null</c> when <paramref name="lengthBaseObj"/> is <c>null</c>.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the entry (or the object it references) is not a number, or when a parser
        /// is needed but <paramref name="parser"/> is <c>null</c>.
        /// </exception>
        private ICosNumber GetLength(IRandomAccessRead source, CosBase lengthBaseObj, CosName streamType, bool isLenientParsing, IPdfObjectParser parser)
        {
            switch (lengthBaseObj)
            {
                case null:
                    return null;

                // Length is given directly in the stream dictionary
                case ICosNumber directLength:
                    return directLength;

                // Length lives in a referenced object
                case CosObject lengthObj:
                {
                    var loaded = lengthObj.GetObject();

                    if (loaded is ICosNumber loadedLength)
                    {
                        return loadedLength;
                    }

                    if (loaded != null)
                    {
                        throw new InvalidOperationException("Wrong type of referenced length object " + lengthObj
                                                            + ": " + lengthObj.GetObject().GetType().Name);
                    }

                    // The referenced object has not been read yet, so it must be parsed now.
                    if (parser == null)
                    {
                        throw new InvalidOperationException("This method required access to the PDF object parser but it was not created yet. Figure out how to fix this.");
                    }

                    var resumeAt = source.GetPosition();
                    var parsed   = parser.Parse(lengthObj.ToIndirectReference(), source, isLenientParsing);
                    source.Seek(resumeAt);

                    if (parsed is ICosNumber parsedLength)
                    {
                        return parsedLength;
                    }

                    throw new InvalidOperationException("Length object content was not read.");
                }

                default:
                    throw new InvalidOperationException($"Wrong type of length object: {lengthBaseObj.GetType().Name}");
            }
        }
Example #6
0
        /// <summary>
        /// Reads the indirect object located at <paramref name="offset"/> and returns its content.
        /// </summary>
        /// <param name="offset">Offset of the object header.</param>
        /// <param name="reader">The input to read from.</param>
        /// <param name="key">The object number and generation the header must match.</param>
        /// <param name="pool">Object pool handed to the base parser.</param>
        /// <param name="isLenientParsing">
        /// When <c>true</c>, a missing <c>endobj</c> is only logged as a warning instead of throwing.
        /// </param>
        /// <returns>The parsed object; the result of <c>ReadNormalObjectStream</c> if a <c>stream</c> keyword follows it.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the header does not match <paramref name="key"/>, or (non-lenient mode only)
        /// when the object is not terminated by <c>endobj</c>.
        /// </exception>
        private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
                                            CosObjectKey key,
                                            CosObjectPool pool,
                                            bool isLenientParsing)
        {
            reader.Seek(offset);

            // header: "<number> <generation> obj"
            var readNumber     = ObjectHelper.ReadObjectNumber(reader);
            var readGeneration = ObjectHelper.ReadGenerationNumber(reader);
            ReadHelper.ReadExpectedString(reader, "obj", true);

            bool headerMatchesKey = readNumber == key.Number && readGeneration == key.Generation;
            if (!headerMatchesKey)
            {
                throw new InvalidOperationException($"Xref for {key} points to object {readNumber} {readGeneration} at {offset}");
            }

            ReadHelper.SkipSpaces(reader);

            var parsedObject = baseParser.Parse(reader, pool);
            var trailingKeyword = ReadHelper.ReadString(reader);

            if (string.Equals(trailingKeyword, "stream"))
            {
                // step back over the keyword so the stream reader sees it again
                reader.Rewind(OtherEncodings.StringAsLatin1Bytes(trailingKeyword).Length);

                parsedObject = ReadNormalObjectStream(reader, parsedObject, offset, isLenientParsing, out trailingKeyword);
            }

            if (string.Equals(trailingKeyword, "endobj"))
            {
                return parsedObject;
            }

            var message =
                $"Object ({readNumber}:{readGeneration}) at offset {offset} does not end with \'endobj\' but with \'{trailingKeyword}\'";

            if (!isLenientParsing)
            {
                throw new InvalidOperationException(message);
            }

            log.Warn(message);

            return parsedObject;
        }
Example #7
0
 /// <summary>
 /// Scans <paramref name="source"/> from <c>MINIMUM_SEARCH_OFFSET</c> for <c>%%EOF</c> markers and
 /// stores the position of the last one that is not followed by a readable object number and
 /// generation number in <c>lastEOFMarker</c>. Does nothing if <c>lastEOFMarker</c> is already set.
 /// The position of <paramref name="source"/> is restored after the scan.
 /// </summary>
 /// <param name="source">The input to scan.</param>
 private void bfSearchForLastEOFMarker(IRandomAccessRead source)
 {
     if (lastEOFMarker != null)
     {
         return;
     }

     const string eofMarker = "%%EOF";

     long startPosition = source.GetPosition();
     source.Seek(MINIMUM_SEARCH_OFFSET);

     while (!source.IsEof())
     {
         if (ReadHelper.IsString(source, eofMarker))
         {
             long candidate = source.GetPosition();
             source.Seek(candidate + eofMarker.Length);
             try
             {
                 // If an object header can be read right after the marker, the data that
                 // follows is most likely valid pdf content (linearized, updated or cut-off
                 // file), so this marker is not remembered.
                 ReadHelper.SkipSpaces(source);
                 ObjectHelper.ReadObjectNumber(source);
                 ObjectHelper.ReadGenerationNumber(source);
             }
             catch (InvalidOperationException)
             {
                 // the following data is most likely some garbage: remember this marker
                 lastEOFMarker = candidate;
             }
         }
         source.Read();
     }

     source.Seek(startPosition);

     // no EOF marker found
     if (lastEOFMarker == null)
     {
         lastEOFMarker = long.MaxValue;
     }
 }
Example #8
0
        /// <summary>
        /// Tests whether the bytes at the current position of <paramref name="reader"/> equal
        /// <paramref name="str"/>. The reader is moved back to where it started before returning.
        /// </summary>
        /// <param name="reader">The input to compare against.</param>
        /// <param name="str">The expected byte sequence.</param>
        /// <returns><c>true</c> if every expected byte was read in order; otherwise <c>false</c>.</returns>
        public static bool IsString(IRandomAccessRead reader, IEnumerable <byte> str)
        {
            long startPosition = reader.GetPosition();
            bool allMatch      = true;

            using (var expected = str.GetEnumerator())
            {
                // stop at the first mismatch without consuming further expected bytes
                while (allMatch && expected.MoveNext())
                {
                    allMatch = reader.Read() == expected.Current;
                }
            }

            reader.Seek(startPosition);

            return allMatch;
        }
            /// <summary>
            /// Attempts to parse a trailer dictionary at the current position of <paramref name="source"/>.
            /// </summary>
            /// <param name="source">The input, positioned where the <c>trailer</c> keyword is expected.</param>
            /// <param name="isLenientParsing">
            /// When <c>true</c>, lines starting with a digit in front of the keyword are skipped first.
            /// </param>
            /// <param name="pool">Object pool handed to the dictionary parser.</param>
            /// <param name="trailer">The parsed dictionary on success; <c>null</c> otherwise.</param>
            /// <returns><c>true</c> if the <c>trailer</c> keyword was found and its dictionary parsed; otherwise <c>false</c>.</returns>
            private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
            {
                trailer = null;

                // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
                if (isLenientParsing)
                {
                    int nextCharacter = source.Peek();
                    while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
                    {
                        ReadHelper.ReadLine(source);
                        nextCharacter = source.Peek();
                    }
                }

                if (source.Peek() != 't')
                {
                    return false;
                }

                // read "trailer"
                long   currentOffset = source.GetPosition();
                string nextLine      = ReadHelper.ReadLine(source);

                if (!nextLine.Trim().Equals("trailer"))
                {
                    // In some cases the EOL is missing and the trailer immediately continues with
                    // "<<" or with a blank character. This does not comply with the PDF reference
                    // but is supported to read as many PDFs as possible.
                    // The keyword is a fixed ASCII token, so the comparison must be ordinal rather
                    // than dependent on the current culture.
                    if (!nextLine.StartsWith("trailer", System.StringComparison.Ordinal))
                    {
                        return false;
                    }

                    // We can't just unread a portion of the read data as we don't know if the EOL
                    // consists of 1 or 2 bytes, so jump back to right after "trailer".
                    source.Seek(currentOffset + "trailer".Length);
                }

                // the dictionary may be preceded by spaces (" <<") when the EOL is missing
                ReadHelper.SkipSpaces(source);

                trailer = dictionaryParser.Parse(source, baseParser, pool);

                ReadHelper.SkipSpaces(source);
                return true;
            }
Example #10
0
        /// <summary>
        /// Scans the input for <c>"&lt;number&gt; &lt;generation&gt; obj"</c> headers and returns the
        /// offset found for each object key. The result is cached in <c>objectLocations</c>, so the
        /// scan runs at most once; the reader position is restored afterwards.
        /// </summary>
        /// <returns>A map from object key to the offset of its header.</returns>
        public IReadOnlyDictionary <CosObjectKey, long> GetObjectLocations()
        {
            if (objectLocations != null)
            {
                return objectLocations;
            }

            var scanLimit = GetLastEndOfFileMarker();

            var found = new Dictionary <CosObjectKey, long>();

            var startPosition = reader.GetPosition();

            long scanOffset        = MinimumSearchOffset;
            long pendingObjectId   = long.MinValue;
            int  pendingGeneration = int.MinValue;
            long pendingOffset     = long.MinValue;

            byte[] objMarker    = OtherEncodings.StringAsLatin1Bytes(" obj");
            byte[] endobjMarker = OtherEncodings.StringAsLatin1Bytes("endobj");

            bool sawEndobj = false;

            do
            {
                reader.Seek(scanOffset);
                if (ReadHelper.IsString(reader, objMarker))
                {
                    // the byte right in front of " obj" should be the (single digit) generation
                    long probe = scanOffset - 1;
                    reader.Seek(probe);
                    int generation = reader.Peek();

                    if (ReadHelper.IsDigit(generation))
                    {
                        // convert the ASCII digit to its numeric value
                        generation -= '0';
                        probe--;
                        reader.Seek(probe);
                        if (ReadHelper.IsSpace(reader))
                        {
                            // walk backwards over the spaces separating number and generation
                            while (probe > MinimumSearchOffset && ReadHelper.IsSpace(reader))
                            {
                                probe--;
                                reader.Seek(probe);
                            }

                            // walk backwards over the digits of the object number
                            bool sawObjectIdDigit = false;
                            while (probe > MinimumSearchOffset && ReadHelper.IsDigit(reader))
                            {
                                probe--;
                                reader.Seek(probe);
                                sawObjectIdDigit = true;
                            }

                            if (sawObjectIdDigit)
                            {
                                // step over the byte in front of the number, then read it forwards
                                reader.Read();
                                long objectId = ObjectHelper.ReadObjectNumber(reader);
                                if (pendingOffset > 0)
                                {
                                    // add the former object ID only if there was a subsequent object ID
                                    found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
                                }
                                pendingObjectId   = objectId;
                                pendingGeneration = generation;
                                pendingOffset     = probe + 1;
                                scanOffset       += objMarker.Length - 1;
                                sawEndobj         = false;
                            }
                        }
                    }
                }
                else if (ReadHelper.IsString(reader, "endobj"))
                {
                    sawEndobj   = true;
                    scanOffset += endobjMarker.Length - 1;
                }
                scanOffset++;
            } while (scanOffset < scanLimit && !reader.IsEof());

            // If the pdf wasn't cut off in the middle, or if the last object ends with an "endobj"
            // marker, the last object id has to be added here: there is no subsequent object id
            // that would have triggered adding it inside the loop.
            if (pendingOffset > 0 && (scanLimit < long.MaxValue || sawEndobj))
            {
                found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
            }

            // reestablish origin position
            reader.Seek(startPosition);

            objectLocations = found;

            return objectLocations;
        }
Example #11
0
        /// <summary>
        /// Validates the cross-reference offset and, in lenient mode, tries to repair it.
        /// </summary>
        /// <param name="startXRefOffset">The offset to check.</param>
        /// <param name="isLenientParsing">When <c>false</c> the offset is returned unchecked.</param>
        /// <returns>
        /// <paramref name="startXRefOffset"/> if it is accepted, the result of
        /// <c>CalculateXRefFixedOffset</c> if a positive offset was rejected, or <c>-1</c> when no
        /// valid offset can be found.
        /// </returns>
        public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
        {
            // repair mode isn't available in non-lenient mode
            if (!isLenientParsing)
            {
                return startXRefOffset;
            }

            source.Seek(startXRefOffset);
            ReadHelper.SkipSpaces(source);

            // a classic cross-reference table starts with the "xref" keyword
            bool pointsAtXrefKeyword = source.Peek() == 'x' && ReadHelper.IsString(source, "xref");
            if (pointsAtXrefKeyword)
            {
                return startXRefOffset;
            }

            if (startXRefOffset <= 0)
            {
                // can't find a valid offset
                return -1;
            }

            // otherwise it may be a cross-reference stream; if not, fall back to a computed offset
            return CheckXRefStreamOffset(source, startXRefOffset, true, pool)
                ? startXRefOffset
                : CalculateXRefFixedOffset(startXRefOffset);
        }
Example #12
0
        /// <summary>
        /// Brute-force scan of <paramref name="source"/> for <c>"&lt;number&gt; &lt;generation&gt; obj"</c>
        /// headers. The offset found for each object key is stored in
        /// <c>bfSearchCOSObjectKeyOffsets</c>, which is replaced by a fresh dictionary on every call.
        /// The position of <paramref name="source"/> is restored after the scan.
        /// </summary>
        /// <param name="source">The input to scan.</param>
        private void bfSearchForObjects(IRandomAccessRead source)
        {
            bfSearchForLastEOFMarker(source);
            bfSearchCOSObjectKeyOffsets = new Dictionary <CosObjectKey, long>();
            long originOffset  = source.GetPosition();
            long currentOffset = MINIMUM_SEARCH_OFFSET;
            long lastObjectId  = long.MinValue;
            int  lastGenID     = int.MinValue;
            long lastObjOffset = long.MinValue;

            // The marker includes the leading space: the backwards walk below relies on the byte
            // directly in front of the match being the generation digit, and the skip after a hit
            // relies on the 4-character length.
            const string objMarker    = " obj";
            const string endobjMarker = "endobj";
            bool         endobjFound  = false;

            do
            {
                source.Seek(currentOffset);
                if (ReadHelper.IsString(source, objMarker))
                {
                    long tempOffset = currentOffset - 1;
                    source.Seek(tempOffset);
                    int genID = source.Peek();
                    // is the char in front of " obj" a digit (the single digit generation)?
                    if (ReadHelper.IsDigit(genID))
                    {
                        // convert the ASCII digit to its numeric value
                        genID -= 48;
                        tempOffset--;
                        source.Seek(tempOffset);
                        if (ReadHelper.IsSpace(source))
                        {
                            // walk backwards over the spaces separating number and generation
                            while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsSpace(source))
                            {
                                source.Seek(--tempOffset);
                            }
                            // walk backwards over the digits of the object number
                            bool objectIDFound = false;
                            while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsDigit(source))
                            {
                                source.Seek(--tempOffset);
                                objectIDFound = true;
                            }
                            if (objectIDFound)
                            {
                                // step over the byte in front of the number, then read it forwards
                                source.Read();
                                long objectId = ObjectHelper.ReadObjectNumber(source);
                                if (lastObjOffset > 0)
                                {
                                    // add the former object ID only if there was a subsequent object ID
                                    bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
                                }
                                lastObjectId   = objectId;
                                lastGenID      = genID;
                                lastObjOffset  = tempOffset + 1;
                                currentOffset += objMarker.Length - 1;
                                endobjFound    = false;
                            }
                        }
                    }
                }
                else if (ReadHelper.IsString(source, endobjMarker))
                {
                    endobjFound    = true;
                    currentOffset += endobjMarker.Length - 1;
                }
                currentOffset++;
            } while (currentOffset < lastEOFMarker && !source.IsEof());

            if ((lastEOFMarker < long.MaxValue || endobjFound) && lastObjOffset > 0)
            {
                // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
                // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
                bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
            }

            // reestablish origin position
            source.Seek(originOffset);
        }