Beispiel #1
0
        /// <summary>
        /// Parses the stream that follows an object's dictionary and then reads the keyword line
        /// that comes after the stream.
        /// </summary>
        /// <param name="reader">The input to read from; it is handed to the stream parser as-is.</param>
        /// <param name="currentBase">The object parsed before the stream; must be a <c>PdfDictionary</c>.</param>
        /// <param name="offset">Offset of the enclosing object, used only in the error message.</param>
        /// <param name="isLenientParsing">Passed through to the stream parser.</param>
        /// <param name="endObjectKey">
        /// The text read after the stream. If that line begins with a further <c>endstream</c>, the keyword
        /// is stripped; if nothing remains on the line, the next line is returned instead.
        /// </param>
        /// <returns>The parsed stream.</returns>
        /// <exception cref="InvalidOperationException">
        /// <paramref name="currentBase"/> is not a <c>PdfDictionary</c>.
        /// </exception>
        private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset,
                                               bool isLenientParsing,
                                               out string endObjectKey)
        {
            const string endStreamKeyword = "endstream";

            // Only the combination of a dictionary and stream/endstream forms a complete stream object.
            if (!(currentBase is PdfDictionary dictionary))
            {
                throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
            }

            PdfRawStream stream = streamParser.Parse(reader, dictionary, isLenientParsing, null);
            currentBase = stream;

            ReadHelper.SkipSpaces(reader);
            endObjectKey = ReadHelper.ReadLine(reader);

            // Some files contain a second 'endstream' before 'endobj'.
            // PDF keywords are byte sequences, so they are matched ordinally rather than with the
            // current culture's collation rules. A line starting with "endstream" can never start
            // with "endobj" under ordinal comparison, so no separate "endobj" check is needed.
            if (endObjectKey.StartsWith(endStreamKeyword, StringComparison.Ordinal))
            {
                endObjectKey = endObjectKey.Substring(endStreamKeyword.Length).Trim();

                if (endObjectKey.Length == 0)
                {
                    // Nothing else on the extra 'endstream' line: the keyword is on the next line.
                    endObjectKey = ReadHelper.ReadLine(reader);
                }
            }

            return currentBase;
        }
Beispiel #2
0
        /// <summary>
        /// Checks whether <paramref name="startXRefOffset"/> points at cross-reference data and,
        /// in lenient mode, tries to find a corrected offset when it does not.
        /// </summary>
        /// <param name="startXRefOffset">The offset at which the cross-reference data is expected.</param>
        /// <param name="isLenientParsing">When false the offset is returned unchecked.</param>
        /// <returns>
        /// The unchanged offset when it is accepted (or when not lenient), the value computed by
        /// <c>CalculateXRefFixedOffset</c> when a positive offset is rejected, or -1 when the offset
        /// is not positive and no <c>xref</c> keyword was found there.
        /// </returns>
        public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
        {
            // No repair is attempted unless lenient parsing was requested.
            if (!isLenientParsing)
            {
                return startXRefOffset;
            }

            source.Seek(startXRefOffset);
            ReadHelper.SkipSpaces(source);

            // The keyword comparison only runs when the first character already matches.
            var startsWithXrefKeyword = source.Peek() == 'x' && ReadHelper.IsString(source, "xref");

            if (startsWithXrefKeyword)
            {
                return startXRefOffset;
            }

            if (startXRefOffset <= 0)
            {
                // There is no valid offset to work with.
                return -1;
            }

            // Accept the offset if it looks like a cross-reference stream, otherwise compute a replacement.
            return CheckXRefStreamOffset(source, startXRefOffset, true, pool)
                ? startXRefOffset
                : CalculateXRefFixedOffset(startXRefOffset);
        }
Beispiel #3
0
        /// <summary>
        /// Scans <paramref name="source"/> from <c>MINIMUM_SEARCH_OFFSET</c> onwards for <c>%%EOF</c> markers
        /// and stores the result in <c>lastEndOfFileMarker</c>. Does nothing if that field is already set.
        /// </summary>
        /// <remarks>
        /// A marker is recorded when it sits at or beyond the reported length of the input (which also ends
        /// the scan), or when reading an object number and generation number after it fails. If nothing is
        /// recorded the field is set to <see cref="long.MaxValue"/>. The read position of
        /// <paramref name="source"/> is restored before returning.
        /// </remarks>
        /// <param name="source">The bytes to search.</param>
        private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
        {
            // A previous search already produced an answer.
            if (lastEndOfFileMarker != null)
            {
                return;
            }

            long originalOffset = source.CurrentOffset;

            source.Seek(MINIMUM_SEARCH_OFFSET);

            for (; !source.IsAtEnd(); source.MoveNext())
            {
                if (!ReadHelper.IsString(source, "%%EOF"))
                {
                    continue;
                }

                long candidate = source.CurrentOffset;

                if (candidate >= source.Length)
                {
                    lastEndOfFileMarker = candidate;
                    break;
                }

                try
                {
                    // Step over the 5 characters of the marker and probe what follows. If an object
                    // header can be read there, the file most likely continues (linearized, updated
                    // or cut off in the middle), so this marker is not recorded.
                    source.Seek(candidate + 5);
                    ReadHelper.SkipSpaces(source);
                    ObjectHelper.ReadObjectNumber(source);
                    ObjectHelper.ReadGenerationNumber(source);
                }
                catch (Exception)
                {
                    // Whatever follows is most likely garbage, so remember this marker.
                    lastEndOfFileMarker = candidate;
                }
            }

            source.Seek(originalOffset);

            if (lastEndOfFileMarker == null)
            {
                // No marker was recorded.
                lastEndOfFileMarker = long.MaxValue;
            }
        }
Beispiel #4
0
        /// <summary>
        /// Seeks to <paramref name="offset"/> and parses the indirect object found there, including
        /// its stream when the object body is followed by the <c>stream</c> keyword.
        /// </summary>
        /// <param name="offset">Position of the object header in <paramref name="reader"/>.</param>
        /// <param name="reader">The input to read from.</param>
        /// <param name="key">The object number and generation the header is required to match.</param>
        /// <param name="pool">Passed through to the base parser.</param>
        /// <param name="isLenientParsing">
        /// When true a missing <c>endobj</c> is only logged as a warning; otherwise it is an error.
        /// </param>
        /// <returns>The parsed object.</returns>
        /// <exception cref="InvalidOperationException">
        /// The header does not match <paramref name="key"/>, or the object does not end with
        /// <c>endobj</c> and parsing is not lenient.
        /// </exception>
        private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
                                            CosObjectKey key,
                                            CosObjectPool pool,
                                            bool isLenientParsing)
        {
            reader.Seek(offset);

            var number = ObjectHelper.ReadObjectNumber(reader);
            var generation = ObjectHelper.ReadGenerationNumber(reader);

            ReadHelper.ReadExpectedString(reader, "obj", true);

            var headerMatchesKey = number == key.Number && generation == key.Generation;

            if (!headerMatchesKey)
            {
                throw new InvalidOperationException($"Xref for {key} points to object {number} {generation} at {offset}");
            }

            ReadHelper.SkipSpaces(reader);

            var parsed = baseParser.Parse(reader, pool);

            // The token after the body is either the start of a stream or the closing keyword.
            var keyword = ReadHelper.ReadString(reader);

            if (string.Equals(keyword, "stream"))
            {
                // Step back over the keyword just read before handing over to the stream reader.
                reader.Rewind(OtherEncodings.StringAsLatin1Bytes(keyword).Length);

                parsed = ReadNormalObjectStream(reader, parsed, offset, isLenientParsing, out keyword);
            }

            if (string.Equals(keyword, "endobj"))
            {
                return parsed;
            }

            var message =
                $"Object ({number}:{generation}) at offset {offset} does not end with \'endobj\' but with \'{keyword}\'";

            if (!isLenientParsing)
            {
                throw new InvalidOperationException(message);
            }

            log.Warn(message);

            return parsed;
        }
Beispiel #5
0
        /// <summary>
        /// Checks whether an object whose dictionary is of type <c>CosName.XREF</c> starts at
        /// <paramref name="startXRefOffset"/>.
        /// </summary>
        /// <param name="source">The input to read from.</param>
        /// <param name="startXRefOffset">The offset to test.</param>
        /// <param name="isLenient">When false no check is made and the offset is accepted.</param>
        /// <param name="pool">Passed through to the dictionary parser.</param>
        /// <returns>
        /// <c>true</c> when the check is skipped (not lenient, or offset 0) or when an object header followed
        /// by a dictionary of type <c>CosName.XREF</c> was read at the offset; otherwise <c>false</c>.
        /// </returns>
        private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
        {
            // Nothing is verified outside lenient mode or for offset 0.
            if (!isLenient || startXRefOffset == 0)
            {
                return true;
            }

            // Look at the byte immediately before the offset: it has to be whitespace ...
            source.Seek(startXRefOffset - 1);
            int precedingByte = source.Read();

            if (!ReadHelper.IsWhitespace(precedingByte))
            {
                return false;
            }

            // ... and the first non-space content after it has to be a digit.
            ReadHelper.SkipSpaces(source);

            if (!ReadHelper.IsDigit(source))
            {
                return false;
            }

            try
            {
                // Read the object header: "<number> <generation> obj".
                ObjectHelper.ReadObjectNumber(source);
                ObjectHelper.ReadGenerationNumber(source);
                ReadHelper.ReadExpectedString(source, "obj", true);

                // Inspect the dictionary type to avoid false positives.
                PdfDictionary candidate = dictionaryParser.Parse(source, baseParser, pool);
                source.Seek(startXRefOffset);

                return candidate.IsType(CosName.XREF);
            }
            catch (Exception ex)
            {
                log.Error("Couldn't read the xref stream object.", ex);
                // No xref stream object could be read here; go back to the offset under test.
                source.Seek(startXRefOffset);
            }

            return false;
        }
Beispiel #6
0
 /// <summary>
 /// Scans <paramref name="source"/> from <c>MINIMUM_SEARCH_OFFSET</c> to the end for <c>%%EOF</c> markers
 /// and stores the result in <c>lastEOFMarker</c>. Does nothing if that field is already set.
 /// </summary>
 /// <remarks>
 /// A marker is recorded when reading an object number and generation number after it throws an
 /// <see cref="InvalidOperationException"/>; later matches overwrite earlier ones. If nothing is
 /// recorded the field is set to <see cref="long.MaxValue"/>. The read position is restored afterwards.
 /// </remarks>
 /// <param name="source">The input to search.</param>
 private void bfSearchForLastEOFMarker(IRandomAccessRead source)
 {
     // The search has already been done.
     if (lastEOFMarker != null)
     {
         return;
     }

     long savedPosition = source.GetPosition();

     source.Seek(MINIMUM_SEARCH_OFFSET);

     while (!source.IsEof())
     {
         if (ReadHelper.IsString(source, "%%EOF"))
         {
             long candidate = source.GetPosition();

             // Step over the 5 characters of the marker.
             source.Seek(candidate + 5);

             try
             {
                 // If an object header can be read here the file most likely continues
                 // (linearized, updated or cut off in the middle), so the marker is not recorded.
                 ReadHelper.SkipSpaces(source);
                 ObjectHelper.ReadObjectNumber(source);
                 ObjectHelper.ReadGenerationNumber(source);
             }
             catch (InvalidOperationException)
             {
                 // What follows is most likely garbage, so remember this marker.
                 lastEOFMarker = candidate;
             }
         }

         source.Read();
     }

     source.Seek(savedPosition);

     if (lastEOFMarker == null)
     {
         // No marker was recorded.
         lastEOFMarker = long.MaxValue;
     }
 }
            /// <summary>
            /// Attempts to read a cross-reference table (the <c>xref</c> keyword, one or more subsections
            /// and the trailer dictionary) starting at the current position of <paramref name="source"/>.
            /// </summary>
            /// <param name="source">The input to read from. This method does not seek before reading.</param>
            /// <param name="offset">Stored on the builder and used in log and error messages.</param>
            /// <param name="isLenientParsing">
            /// When true an unreadable subsection header ends the table instead of failing the parse;
            /// the flag is also passed to trailer parsing.
            /// </param>
            /// <param name="pool">Passed through to trailer parsing.</param>
            /// <param name="builder">
            /// Receives the in-use entries, the trailer dictionary and the previous-table value on success.
            /// Only meaningful when the method returns <c>true</c>; it can be non-null on a <c>false</c>
            /// return when a subsection header was unreadable in non-lenient mode.
            /// </param>
            /// <returns>
            /// <c>false</c> when the input does not start with <c>xref</c>, when <c>xref</c> is directly followed
            /// by <c>trailer</c>, or when a subsection header is unreadable in non-lenient mode;
            /// <c>true</c> once the table and its trailer have been read.
            /// </returns>
            /// <exception cref="InvalidOperationException">
            /// An in-use entry has an offset inside the table itself, an entry contains a number that cannot
            /// be parsed (malformed or out of range), an entry's indicator is neither the in-use nor the free
            /// marker, or the trailer could not be read.
            /// </exception>
            public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
            {
                builder = null;

                var tableStartOffset = source.GetPosition();

                if (source.Peek() != 'x')
                {
                    return false;
                }

                var xref = ReadHelper.ReadString(source);

                if (!xref.Trim().Equals("xref"))
                {
                    return false;
                }

                // Look at the token after "xref" and then step back over it so the
                // subsection parsing below starts from the same place.
                var str = ReadHelper.ReadString(source);

                byte[] b = OtherEncodings.StringAsLatin1Bytes(str);

                source.Rewind(b.Length);

                // Keywords are byte sequences: compare ordinally, not with culture rules.
                if (str.StartsWith("trailer", StringComparison.Ordinal))
                {
                    log.Warn("skipping empty xref table");
                    return false;
                }

                builder = new CrossReferenceTablePartBuilder
                {
                    Offset   = offset,
                    XRefType = CrossReferenceType.Table
                };

                // Tables can have multiple sections. Each starts with a starting object id and a count.
                while (true)
                {
                    if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
                    {
                        log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

                        if (isLenientParsing)
                        {
                            break;
                        }

                        return false;
                    }

                    var currentObjectId = subsectionDefinition.FirstNumber;

                    ReadHelper.SkipSpaces(source);
                    for (var i = 0; i < subsectionDefinition.Count; i++)
                    {
                        if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
                        {
                            break;
                        }

                        // A 't' here is taken as the start of the trailer: the subsection is shorter than declared.
                        if (source.Peek() == 't')
                        {
                            break;
                        }

                        // Each entry line is expected to hold at least: offset, generation, indicator.
                        var currentLine = ReadHelper.ReadLine(source);
                        var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                        if (splitString.Length < 3)
                        {
                            log.Warn("invalid xref line: " + currentLine);
                            break;
                        }

                        // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
                        if (splitString[splitString.Length - 1].Equals(InUseEntry))
                        {
                            try
                            {
                                // File contents are not locale-dependent, so parse with the invariant culture.
                                var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                                if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                                {
                                    // PDFBOX-3923: offset points inside this table - that can't be good
                                    throw new InvalidOperationException(
                                              $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                                }

                                var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);
                                builder.Add(currentObjectId, generation, objectOffset);
                            }
                            catch (Exception e) when (e is FormatException || e is OverflowException)
                            {
                                // Parse reports out-of-range digits as OverflowException, not FormatException;
                                // both mean the entry is corrupt and are surfaced as the same exception type.
                                throw new InvalidOperationException(
                                          $"Invalid number in cross-reference table entry for object {currentObjectId}: '{currentLine}'.", e);
                            }
                        }
                        else if (!splitString[2].Equals(FreeEntry))
                        {
                            throw new InvalidOperationException(
                                      $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
                        }

                        currentObjectId++;

                        ReadHelper.SkipSpaces(source);
                    }

                    // Another subsection follows only if the next content is a digit.
                    ReadHelper.SkipSpaces(source);
                    if (!ReadHelper.IsDigit(source))
                    {
                        break;
                    }
                }

                if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
                {
                    throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
                }

                builder.Dictionary = trailer;
                builder.Previous   = trailer.GetLongOrDefault(CosName.PREV);

                return true;
            }
            /// <summary>
            /// Attempts to read the <c>trailer</c> keyword and the dictionary that follows it from the
            /// current position of <paramref name="source"/>.
            /// </summary>
            /// <param name="source">The input to read from.</param>
            /// <param name="isLenientParsing">
            /// When true, lines starting with a digit are skipped before the keyword is looked for.
            /// </param>
            /// <param name="pool">Passed through to the dictionary parser.</param>
            /// <param name="trailer">The parsed dictionary on success; <c>null</c> when the method returns <c>false</c>.</param>
            /// <returns>
            /// <c>false</c> when the next content does not start with <c>trailer</c>; otherwise <c>true</c>
            /// after the dictionary has been parsed.
            /// </returns>
            private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
            {
                const string trailerKeyword = "trailer";

                trailer = null;

                // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
                if (isLenientParsing)
                {
                    int nextCharacter = source.Peek();
                    while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
                    {
                        ReadHelper.ReadLine(source);
                        nextCharacter = source.Peek();
                    }
                }

                if (source.Peek() != 't')
                {
                    return false;
                }

                // Remember where the keyword line starts so the position can be corrected below.
                long   currentOffset = source.GetPosition();
                string nextLine      = ReadHelper.ReadLine(source);

                if (!nextLine.Trim().Equals(trailerKeyword))
                {
                    // In some cases the EOL is missing and the trailer immediately continues with "<<"
                    // or with a blank character. This does not comply with the PDF reference, but such
                    // files are supported anyway.
                    // Keywords are byte sequences: compare ordinally, not with culture rules.
                    if (!nextLine.StartsWith(trailerKeyword, StringComparison.Ordinal))
                    {
                        return false;
                    }

                    // The read data cannot simply be unread because the EOL may be 1 or 2 bytes long,
                    // so jump to the position right after the keyword instead.
                    source.Seek(currentOffset + trailerKeyword.Length);
                }

                // The dictionary may follow after blanks rather than on a new line.
                ReadHelper.SkipSpaces(source);

                trailer = dictionaryParser.Parse(source, baseParser, pool);

                ReadHelper.SkipSpaces(source);

                return true;
            }