Beispiel #1
0
        /// <summary>
        /// Validates the declared cross-reference offset and, in lenient mode, tries to repair it.
        /// </summary>
        /// <param name="startXRefOffset">The offset at which the cross-reference data is declared to start.</param>
        /// <param name="isLenientParsing">Whether repair attempts are allowed.</param>
        /// <returns>
        /// The unchanged offset when parsing is strict or the offset checks out, the result of
        /// <c>CalculateXRefFixedOffset</c> when a positive offset fails both checks, or -1 when the offset
        /// is not positive and no "xref" keyword was found there.
        /// </returns>
        public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
        {
            // Strict parsing never repairs: hand the declared offset straight back.
            if (!isLenientParsing)
            {
                return startXRefOffset;
            }

            source.Seek(startXRefOffset);
            ReadHelper.SkipSpaces(source);

            // A classic table announces itself with the "xref" keyword.
            var pointsAtTable = source.Peek() == 'x' && ReadHelper.IsString(source, "xref");
            if (pointsAtTable)
            {
                return startXRefOffset;
            }

            if (startXRefOffset <= 0)
            {
                // No usable starting point to search from.
                return -1;
            }

            // Not a table: accept the offset if the stream check passes, otherwise compute a fixed offset.
            return CheckXRefStreamOffset(source, startXRefOffset, true, pool)
                ? startXRefOffset
                : CalculateXRefFixedOffset(startXRefOffset);
        }
Beispiel #2
0
        /// <summary>
        /// Reads characters from <paramref name="reader"/> up to (not including) the next end-of-line marker.
        /// </summary>
        /// <param name="reader">The reader to consume from.</param>
        /// <returns>The characters read before the end-of-line marker or the end of input.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The reader is already at end of file.</exception>
        public static string ReadLine(IRandomAccessRead reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException(nameof(reader));
            }

            if (reader.IsEof())
            {
                throw new InvalidOperationException("Error: End-of-File, expected line");
            }

            var line = new StringBuilder(11);

            // Stop on end of input (-1) or on the first end-of-line character.
            int current = reader.Read();
            while (current != -1 && !IsEndOfLine(current))
            {
                line.Append((char)current);
                current = reader.Read();
            }

            // A CR immediately followed by LF counts as a single line terminator: swallow the LF too.
            if (IsCarriageReturn(current) && IsLineFeed(reader.Peek()))
            {
                reader.Read();
            }

            return line.ToString();
        }
Beispiel #3
0
        /// <summary>
        /// Parses a dictionary that starts with two '&lt;' characters at the reader's current position.
        /// </summary>
        /// <param name="reader">The reader positioned at the opening delimiter.</param>
        /// <param name="baseParser">Parser forwarded to the name/value pair parsing.</param>
        /// <param name="pool">Object pool forwarded to the name/value pair parsing.</param>
        /// <returns>
        /// The parsed dictionary, or a new empty dictionary when <c>ReadUntilEnd</c> returns true after an
        /// unexpected character was encountered.
        /// </returns>
        /// <exception cref="ArgumentNullException">Any argument is null.</exception>
        public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
        {
            if (reader == null)
            {
                throw new ArgumentNullException(nameof(reader));
            }

            if (baseParser == null)
            {
                throw new ArgumentNullException(nameof(baseParser));
            }

            if (pool == null)
            {
                throw new ArgumentNullException(nameof(pool));
            }

            // Opening delimiter.
            ReadHelper.ReadExpectedChar(reader, '<');
            ReadHelper.ReadExpectedChar(reader, '<');
            ReadHelper.SkipSpaces(reader);

            var dictionary = new PdfDictionary();

            while (true)
            {
                ReadHelper.SkipSpaces(reader);

                var next = (char)reader.Peek();

                // Start of the closing delimiter: leave the loop and consume it below.
                if (next == '>')
                {
                    break;
                }

                // A name introduces the next key/value pair.
                if (next == '/')
                {
                    var (key, value) = ParseCosDictionaryNameValuePair(reader, baseParser, pool);

                    // Pairs that failed to parse are silently dropped.
                    if (key != null && value != null)
                    {
                        dictionary.Set(key, value);
                    }

                    continue;
                }

                // Anything else is unexpected; ReadUntilEnd decides whether to abandon the dictionary.
                if (ReadUntilEnd(reader))
                {
                    return new PdfDictionary();
                }
            }

            ReadHelper.ReadExpectedString(reader, ">>");

            return dictionary;
        }
Beispiel #4
0
        /// <summary>
        /// Parses one dictionary entry: a name followed by its value.
        /// </summary>
        /// <param name="reader">The reader positioned at the entry's name.</param>
        /// <param name="baseParser">Parser forwarded to <c>ParseValue</c>.</param>
        /// <param name="pool">Object pool forwarded to <c>ParseValue</c>.</param>
        /// <returns>
        /// The parsed key and value, or <c>(null, null)</c> when no value could be parsed.
        /// </returns>
        private (CosName key, CosBase value) ParseCosDictionaryNameValuePair(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
        {
            var key   = nameParser.Parse(reader);
            var value = ParseValue(reader, baseParser, pool);

            ReadHelper.SkipSpaces(reader);

            if ((char)reader.Peek() == 'd')
            {
                // If the next token is 'def' we are parsing a cmap stream and want to ignore it;
                // any other token is pushed back so the caller sees it unchanged.
                var potentialDef = ReadHelper.ReadString(reader);
                if (potentialDef.Equals("def"))
                {
                    ReadHelper.SkipSpaces(reader);
                }
                else
                {
                    reader.Unread(OtherEncodings.StringAsLatin1Bytes(potentialDef));
                }
            }

            if (value == null)
            {
                // Read the offending token before logging. Previously the read was an argument of
                // log?.Warn(...), so it only happened when a logger was configured and the reader
                // position after a bad entry depended on logging being enabled.
                var badToken = ReadHelper.ReadString(reader);
                log?.Warn("Bad Dictionary Declaration " + badToken);
                return (null, null);
            }

            // label this item as direct, to avoid signature problems.
            value.Direct = true;

            return (key, value);
        }
Beispiel #5
0
 /// <summary>
 /// Forwards to the wrapped reader's <c>Peek</c>, unless <c>Throw</c> is set.
 /// </summary>
 /// <exception cref="InvalidOperationException"><c>Throw</c> is true.</exception>
 public int Peek()
 {
     if (!Throw)
     {
         return reader.Peek();
     }

     throw new InvalidOperationException();
 }
Beispiel #6
0
 /// <summary>
 /// Forwards to the wrapped reader's <c>Peek</c> and returns its result unchanged.
 /// </summary>
 public int peek() => reader.Peek();
            /// <summary>
            /// Attempts to read a cross-reference table (the "xref" keyword, its subsections and the trailer
            /// dictionary) starting at the current position of <paramref name="source"/>.
            /// </summary>
            /// <param name="source">The reader positioned where the table is expected to begin.</param>
            /// <param name="offset">Offset stored on the resulting builder and used in diagnostic messages.</param>
            /// <param name="isLenientParsing">
            /// When true, an unreadable subsection header ends the table instead of failing the parse.
            /// </param>
            /// <param name="pool">Object pool forwarded to the trailer parsing.</param>
            /// <param name="builder">
            /// Receives the populated builder on success. It is null when the "xref" keyword or a non-empty
            /// table is not found; it may be non-null when false is returned because a subsection header
            /// could not be read in strict mode.
            /// </param>
            /// <returns>True when a table and its trailer were read; otherwise false.</returns>
            /// <exception cref="InvalidOperationException">
            /// An in-use entry points inside the table itself, an entry's numbers cannot be parsed, an entry's
            /// indicator is neither in-use nor free, or the trailer could not be read.
            /// </exception>
            public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
            {
                builder = null;

                var tableStartOffset = source.GetPosition();

                if (source.Peek() != 'x')
                {
                    return false;
                }

                var xref = ReadHelper.ReadString(source);

                if (!xref.Trim().Equals("xref"))
                {
                    return false;
                }

                // check for trailer after xref: read the next token, then step back over it
                var str = ReadHelper.ReadString(source);

                byte[] b = OtherEncodings.StringAsLatin1Bytes(str);

                source.Rewind(b.Length);

                // Ordinal comparison: the token comes from file bytes, so culture-sensitive matching
                // (which ignores certain characters) must not be used to recognise a keyword.
                if (str.StartsWith("trailer", StringComparison.Ordinal))
                {
                    log.Warn("skipping empty xref table");
                    return false;
                }

                builder = new CrossReferenceTablePartBuilder
                {
                    Offset   = offset,
                    XRefType = CrossReferenceType.Table
                };

                // Tables can have multiple sections. Each starts with a starting object id and a count.
                while (true)
                {
                    if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
                    {
                        log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

                        if (isLenientParsing)
                        {
                            break;
                        }

                        return false;
                    }

                    var currentObjectId = subsectionDefinition.FirstNumber;

                    ReadHelper.SkipSpaces(source);
                    for (var i = 0; i < subsectionDefinition.Count; i++)
                    {
                        if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
                        {
                            break;
                        }

                        // A 't' here is taken as the start of the trailer keyword: the subsection is shorter than declared.
                        if (source.Peek() == 't')
                        {
                            break;
                        }

                        var currentLine = ReadHelper.ReadLine(source);
                        var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                        if (splitString.Length < 3)
                        {
                            log.Warn("invalid xref line: " + currentLine);
                            break;
                        }

                        // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
                        if (splitString[splitString.Length - 1].Equals(InUseEntry))
                        {
                            try
                            {
                                var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                                if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                                {
                                    // PDFBOX-3923: offset points inside this table - that can't be good
                                    throw new InvalidOperationException(
                                              $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                                }

                                var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);
                                builder.Add(currentObjectId, generation, objectOffset);
                            }
                            catch (Exception e) when (e is FormatException || e is OverflowException)
                            {
                                // Out-of-range numbers are reported the same way as malformed ones, so callers
                                // only have to deal with InvalidOperationException for corrupt entries.
                                throw new InvalidOperationException("Bad", e);
                            }
                        }
                        else if (!splitString[2].Equals(FreeEntry))
                        {
                            throw new InvalidOperationException(
                                      $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
                        }

                        currentObjectId++;

                        ReadHelper.SkipSpaces(source);
                    }

                    // Another subsection follows only if the next character is a digit.
                    ReadHelper.SkipSpaces(source);
                    if (!ReadHelper.IsDigit(source))
                    {
                        break;
                    }
                }

                if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
                {
                    throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
                }

                builder.Dictionary = trailer;
                builder.Previous   = trailer.GetLongOrDefault(CosName.PREV);

                return true;
            }
            /// <summary>
            /// Attempts to read the "trailer" keyword and the dictionary that follows it from the current
            /// position of <paramref name="source"/>.
            /// </summary>
            /// <param name="source">The reader positioned after the cross-reference entries.</param>
            /// <param name="isLenientParsing">
            /// When true, lines starting with a digit are skipped before looking for the keyword.
            /// </param>
            /// <param name="pool">Object pool forwarded to the dictionary parser.</param>
            /// <param name="trailer">Receives the parsed trailer dictionary on success; null otherwise.</param>
            /// <returns>True when the keyword was found and the dictionary parser was run; otherwise false.</returns>
            private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
            {
                trailer = null;

                // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
                if (isLenientParsing)
                {
                    int nextCharacter = source.Peek();
                    while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
                    {
                        ReadHelper.ReadLine(source);
                        nextCharacter = source.Peek();
                    }
                }

                if (source.Peek() != 't')
                {
                    return false;
                }

                // read "trailer"
                long   currentOffset = source.GetPosition();
                string nextLine      = ReadHelper.ReadLine(source);

                if (!nextLine.Trim().Equals("trailer"))
                {
                    // in some cases the EOL is missing and the trailer immediately
                    // continues with the dictionary or with a blank character
                    // even if this does not comply with PDF reference we want to support as many PDFs as possible
                    // Acrobat reader can also deal with this.
                    //
                    // Ordinal comparison is required: a culture-sensitive match can succeed on text that is
                    // not literally "trailer", and the seek below assumes the keyword is exactly 7 bytes long.
                    if (!nextLine.StartsWith("trailer", StringComparison.Ordinal))
                    {
                        return false;
                    }

                    // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes
                    // jump back right after "trailer"
                    source.Seek(currentOffset + "trailer".Length);
                }

                // in some cases the EOL is missing and the trailer continues after a blank
                // even if this does not comply with PDF reference we want to support as many PDFs as possible
                // Acrobat reader can also deal with this.
                ReadHelper.SkipSpaces(source);

                trailer = dictionaryParser.Parse(source, baseParser, pool);

                ReadHelper.SkipSpaces(source);
                return true;
            }
Beispiel #9
0
        /// <summary>
        /// Scans the reader for "N G obj" headers and returns the offset at which each object's number starts,
        /// keyed by object number and generation. The result is cached in <c>objectLocations</c>; later calls
        /// return the cached dictionary without scanning again.
        /// </summary>
        /// <returns>A map from object key to the offset of that object's header.</returns>
        public IReadOnlyDictionary <CosObjectKey, long> GetObjectLocations()
        {
            // Reuse the result of an earlier scan.
            if (objectLocations != null)
            {
                return(objectLocations);
            }

            // Upper bound for the scan.
            var lastEndOfFile = GetLastEndOfFileMarker();

            var results = new Dictionary <CosObjectKey, long>();

            // Remembered so the reader can be put back where the caller left it.
            var originPosition = reader.GetPosition();

            long currentOffset    = MinimumSearchOffset;
            // Sentinels meaning "no object header seen yet".
            long lastObjectId     = long.MinValue;
            int  lastGenerationId = int.MinValue;
            long lastObjOffset    = long.MinValue;

            byte[] objString    = OtherEncodings.StringAsLatin1Bytes(" obj");
            byte[] endobjString = OtherEncodings.StringAsLatin1Bytes("endobj");

            bool endobjFound = false;

            do
            {
                reader.Seek(currentOffset);
                // The match includes the leading space, so currentOffset is the space before "obj".
                if (ReadHelper.IsString(reader, objString))
                {
                    // Step back one position to the candidate generation number.
                    long tempOffset = currentOffset - 1;
                    reader.Seek(tempOffset);
                    int generationId = reader.Peek();

                    // Only a single character is inspected for the generation number.
                    if (ReadHelper.IsDigit(generationId))
                    {
                        // 48 is the character code of '0': convert the digit character to its numeric value.
                        generationId -= 48;
                        tempOffset--;
                        reader.Seek(tempOffset);
                        // A space must separate the object number from the generation number.
                        if (ReadHelper.IsSpace(reader))
                        {
                            // Walk backwards over the separating whitespace.
                            while (tempOffset > MinimumSearchOffset && ReadHelper.IsSpace(reader))
                            {
                                reader.Seek(--tempOffset);
                            }

                            // Walk backwards over the digits of the object number.
                            bool objectIdFound = false;
                            while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
                            {
                                reader.Seek(--tempOffset);
                                objectIdFound = true;
                            }

                            if (objectIdFound)
                            {
                                // The backward walk stopped one position before the first digit; consume that
                                // character so the object number can be read forwards.
                                reader.Read();
                                long objectId = ObjectHelper.ReadObjectNumber(reader);
                                if (lastObjOffset > 0)
                                {
                                    // add the former object ID only if there was a subsequent object ID
                                    results[new CosObjectKey(lastObjectId, lastGenerationId)] = lastObjOffset;
                                }
                                lastObjectId     = objectId;
                                lastGenerationId = generationId;
                                // tempOffset is one before the first digit, so the object starts at tempOffset + 1.
                                lastObjOffset    = tempOffset + 1;
                                // Skip the rest of the matched " obj"; the increment below covers the final character.
                                currentOffset   += objString.Length - 1;
                                endobjFound      = false;
                            }
                        }
                    }
                }
                else if (ReadHelper.IsString(reader, "endobj"))
                {
                    // Remember that the most recent object was terminated, and skip past the keyword.
                    endobjFound    = true;
                    currentOffset += endobjString.Length - 1;
                }
                currentOffset++;
            } while (currentOffset < lastEndOfFile && !reader.IsEof());
            if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)
            {
                // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
                // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
                results[new CosObjectKey(lastObjectId, lastGenerationId)] = lastObjOffset;
            }

            // reestablish origin position
            reader.Seek(originPosition);

            // Cache for subsequent calls.
            objectLocations = results;

            return(objectLocations);
        }
Beispiel #10
0
 /// <summary>
 /// Applies the value-based <c>IsSpace</c> check to whatever <paramref name="reader"/> returns from <c>Peek</c>.
 /// </summary>
 /// <param name="reader">The reader whose next value is inspected.</param>
 public static bool IsSpace(IRandomAccessRead reader) => IsSpace(reader.Peek());
Beispiel #11
0
 /// <summary>
 /// Applies the value-based <c>IsDigit</c> check to whatever <paramref name="reader"/> returns from <c>Peek</c>.
 /// </summary>
 /// <param name="reader">The reader whose next value is inspected.</param>
 public static bool IsDigit(IRandomAccessRead reader) => IsDigit(reader.Peek());
Beispiel #12
0
        /// <summary>
        /// Parses an array that starts with '[' at the reader's current position.
        /// </summary>
        /// <param name="reader">The reader positioned at the opening bracket.</param>
        /// <param name="baseParser">Parser used for each element.</param>
        /// <param name="pool">Object pool used to resolve indirect references.</param>
        /// <returns>
        /// The parsed array. Parsing stops early, without consuming a closing bracket, when an "endobj" or
        /// "endstream" token follows an element that could not be parsed.
        /// </returns>
        public COSArray Parse(IRandomAccessRead reader, CosBaseParser baseParser, CosObjectPool pool)
        {
            ReadHelper.ReadExpectedChar(reader, '[');
            var po = new COSArray();

            ReadHelper.SkipSpaces(reader);
            int i;

            while (((i = reader.Peek()) > 0) && ((char)i != ']'))
            {
                CosBase pbo = baseParser.Parse(reader, pool);
                if (pbo is CosObject)
                {
                    // An indirect-reference marker was parsed: the two preceding elements must be the
                    // object number and generation number (PDFBOX-385). The size guards keep a marker that
                    // appears before enough elements exist from indexing the array at -1.
                    if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                    {
                        var genNumber = (CosInt)po.remove(po.size() - 1);
                        if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                        {
                            var number = (CosInt)po.remove(po.size() - 1);
                            var key    = new CosObjectKey(number.AsLong(), genNumber.AsInt());
                            pbo = pool.Get(key);
                        }
                        else
                        {
                            // the object reference is somehow wrong
                            pbo = null;
                        }
                    }
                    else
                    {
                        pbo = null;
                    }
                }

                if (pbo != null)
                {
                    po.add(pbo);
                }
                else
                {
                    // It could be a bad object in the array, which is just skipped.
                    // It could also be an "endobj" or "endstream", which means we can assume that
                    // the array has ended. The token is pushed back either way.
                    string isThisTheEnd = ReadHelper.ReadString(reader);
                    reader.Unread(OtherEncodings.StringAsLatin1Bytes(isThisTheEnd));
                    if (string.Equals(isThisTheEnd, "endobj") || string.Equals(isThisTheEnd, "endstream"))
                    {
                        return po;
                    }
                }

                ReadHelper.SkipSpaces(reader);
            }

            // read ']'
            reader.Read();
            ReadHelper.SkipSpaces(reader);
            return po;
        }
Beispiel #13
0
        /// <summary>
        /// Brute-force scan of <paramref name="source"/> for "N G obj" headers. The offset of each object
        /// found is stored in <c>bfSearchCOSObjectKeyOffsets</c>, keyed by object number and generation.
        /// The reader is returned to its original position afterwards.
        /// </summary>
        /// <param name="source">The reader to scan.</param>
        private void bfSearchForObjects(IRandomAccessRead source)
        {
            bfSearchForLastEOFMarker(source);
            bfSearchCOSObjectKeyOffsets = new Dictionary <CosObjectKey, long>();
            long originOffset  = source.GetPosition();
            long currentOffset = MINIMUM_SEARCH_OFFSET;
            // Sentinels meaning "no object header seen yet".
            long lastObjectId  = long.MinValue;
            int  lastGenID     = int.MinValue;
            long lastObjOffset = long.MinValue;

            // The leading space is part of the pattern: the backward walk below assumes that
            // currentOffset is the space before "obj", so currentOffset - 1 is the generation digit.
            const string objString    = " obj";
            const string endobjString = "endobj";
            bool         endobjFound  = false;

            do
            {
                source.Seek(currentOffset);
                // Previously this matched "obj" without the space, which put currentOffset - 1 on the
                // separating space rather than on the generation digit, so the digit check below failed
                // for a normally formatted "N G obj" header.
                if (ReadHelper.IsString(source, objString))
                {
                    long tempOffset = currentOffset - 1;
                    source.Seek(tempOffset);
                    int genID = source.Peek();
                    // Only a single character is inspected for the generation number.
                    if (ReadHelper.IsDigit(genID))
                    {
                        // 48 is the character code of '0': convert the digit character to its numeric value.
                        genID -= 48;
                        tempOffset--;
                        source.Seek(tempOffset);
                        if (ReadHelper.IsSpace(source))
                        {
                            // Walk backwards over the separating whitespace.
                            while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsSpace(source))
                            {
                                source.Seek(--tempOffset);
                            }

                            // Walk backwards over the digits of the object number.
                            bool objectIDFound = false;
                            while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsDigit(source))
                            {
                                source.Seek(--tempOffset);
                                objectIDFound = true;
                            }

                            if (objectIDFound)
                            {
                                // Consume the character before the first digit, then read the number forwards.
                                source.Read();
                                long objectId = ObjectHelper.ReadObjectNumber(source);
                                if (lastObjOffset > 0)
                                {
                                    // add the former object ID only if there was a subsequent object ID
                                    bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
                                }
                                lastObjectId   = objectId;
                                lastGenID      = genID;
                                lastObjOffset  = tempOffset + 1;
                                // Skip the rest of the matched pattern; the increment below covers the final character.
                                currentOffset += objString.Length - 1;
                                endobjFound    = false;
                            }
                        }
                    }
                }
                else if (ReadHelper.IsString(source, endobjString))
                {
                    endobjFound    = true;
                    currentOffset += endobjString.Length - 1;
                }
                currentOffset++;
            } while (currentOffset < lastEOFMarker && !source.IsEof());

            if ((lastEOFMarker < long.MaxValue || endobjFound) && lastObjOffset > 0)
            {
                // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
                // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
                bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
            }

            // reestablish origin position
            source.Seek(originOffset);
        }
Beispiel #14
0
        /// <summary>
        /// Parses the next object from <paramref name="reader"/>, dispatching on its first non-space character.
        /// </summary>
        /// <param name="reader">The reader to consume from.</param>
        /// <param name="pool">Object pool forwarded to the dictionary and array parsers.</param>
        /// <returns>
        /// The parsed object, or null at end of input or when an unrecognised token was read.
        /// </returns>
        /// <exception cref="IOException">
        /// A token starting with 't' or 'f' is not "true"/"false", or an unrecognised token is empty.
        /// </exception>
        public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
        {
            ReadHelper.SkipSpaces(reader);

            int peeked = reader.Peek();
            if (peeked == -1)
            {
                return null;
            }

            char c = (char)peeked;

            switch (c)
            {
            case '<':
            {
                // One '<' opens a string, two open a dictionary: look one character further, then put
                // the first bracket back so the chosen parser sees the full delimiter.
                int  firstBracket = reader.Read();
                char following    = (char)reader.Peek();
                reader.Unread(firstBracket);

                if (following != '<')
                {
                    return stringParser.Parse(reader);
                }

                CosBase dictionary = dictionaryParser.Parse(reader, this, pool);
                ReadHelper.SkipSpaces(reader);
                return dictionary;
            }

            case '[':
                // array
                return arrayParser.Parse(reader, this, pool);

            case '(':
                return stringParser.Parse(reader);

            case '/':
                // name
                return nameParser.Parse(reader);

            case 'n':
                // null
                ReadHelper.ReadExpectedString(reader, "null");
                return CosNull.Null;

            case 't':
            {
                string literal = OtherEncodings.BytesAsLatin1String(reader.ReadFully(4));
                if (!literal.Equals("true"))
                {
                    throw new IOException("expected true actual='" + literal + "' " + reader +
                                          "' at offset " + reader.GetPosition());
                }

                return PdfBoolean.True;
            }

            case 'f':
            {
                string literal = OtherEncodings.BytesAsLatin1String(reader.ReadFully(5));
                if (!literal.Equals("false"))
                {
                    throw new IOException("expected false actual='" + literal + "' " + reader +
                                          "' at offset " + reader.GetPosition());
                }

                return PdfBoolean.False;
            }

            case 'R':
                // Reference marker: consume it and return an object wrapping null.
                reader.Read();
                return new CosObject(null);
            }

            bool startsNumber = char.IsDigit(c) || c == '-' || c == '+' || c == '.';
            if (startsNumber)
            {
                var digits = new StringBuilder();
                int raw;

                // Collect every character that can belong to a number, including exponent markers.
                while (true)
                {
                    raw = reader.Read();
                    char ch = (char)raw;

                    bool partOfNumber = char.IsDigit(ch) ||
                                        ch == '-' ||
                                        ch == '+' ||
                                        ch == '.' ||
                                        ch == 'E' ||
                                        ch == 'e';
                    if (!partOfNumber)
                    {
                        break;
                    }

                    digits.Append(ch);
                }

                // The terminating character is not part of the number; push it back unless it was end of input.
                if (raw != -1)
                {
                    reader.Unread(raw);
                }

                return CosNumberFactory.get(digits.ToString()) as CosBase;
            }

            // This is not supposed to happen, but we allow for it
            // so we are more compatible with PDF writers that don't
            // follow the spec.
            string unknownToken = ReadHelper.ReadString(reader);
            if (unknownToken == string.Empty)
            {
                int peek = reader.Peek();
                // we can end up in an infinite loop otherwise
                throw new IOException("Unknown dir object c='" + c +
                                      "' cInt=" + (int)c + " peek='" + (char)peek
                                      + "' peekInt=" + peek + " at offset " + reader.GetPosition());
            }

            // if it's an endstream/endobj, we want to put it back so the caller will see it
            if (string.Equals("endobj", unknownToken) || string.Equals("endstream", unknownToken))
            {
                reader.Unread(OtherEncodings.StringAsLatin1Bytes(unknownToken));
            }

            return null;
        }