Ejemplo n.º 1
0
        /// <summary>
        /// Reads a stream body from <paramref name="reader"/>, beginning at the 'stream' keyword,
        /// and packages the collected bytes together with <paramref name="streamDictionary"/>.
        /// </summary>
        /// <param name="reader">Input expected to be positioned at the 'stream' keyword.</param>
        /// <param name="streamDictionary">Dictionary whose LENGTH and TYPE entries are consulted, and which is attached to the result.</param>
        /// <param name="isLenientParsing">
        /// When true, a terminating 'endobj', or an 'endstream' immediately followed by extra characters,
        /// is logged as a warning instead of being treated as an error.
        /// </param>
        /// <param name="parser">Forwarded to <c>GetLength</c>.</param>
        /// <returns>The raw stream created from the bytes that were read.</returns>
        /// <exception cref="InvalidOperationException">
        /// The token following the stream data is not an accepted terminator.
        /// </exception>
        public PdfRawStream Parse(IRandomAccessRead reader, PdfDictionary streamDictionary, bool isLenientParsing, IPdfObjectParser parser)
        {
            const string endStreamKeyword = "endstream";
            const string endObjectKeyword = "endobj";

            // Consume the 'stream' keyword (per the original note it was already tested in
            // parseObjectsDynamically()) and any whitespace that follows it.
            ReadHelper.ReadExpectedString(reader, "stream");
            skipWhiteSpaces(reader);

            // The length entry is fetched with GetItemOrDefault because, while parsing,
            // the underlying object might still be null.
            var lengthEntry = streamDictionary.GetItemOrDefault(CosName.LENGTH);
            var typeName = streamDictionary.GetName(CosName.TYPE);
            ICosNumber declaredLength = GetLength(reader, lengthEntry, typeName, isLenientParsing, parser);

            ValidateStreamLength(reader, isLenientParsing, declaredLength);

            PdfRawStream parsed;

            // Stream data is copied into an in-memory buffer before being wrapped.
            using (var buffer = new MemoryStream())
            using (var sink = new BinaryWriter(buffer))
            {
                var canUseDeclaredLength = declaredLength != null
                                           && validateStreamLength(reader, declaredLength.AsLong(), reader.Length());

                if (canUseDeclaredLength)
                {
                    ReadValidStream(reader, sink, declaredLength);
                }
                else
                {
                    // No usable length: scan forward for the terminating keyword instead.
                    ReadUntilEndStream(reader, sink);
                }

                parsed = new PdfRawStream(buffer.ToArray(), streamDictionary);
            }

            string terminator = ReadHelper.ReadString(reader);

            // Well-formed case: nothing more to do.
            if (terminator.Equals(endStreamKeyword))
            {
                return parsed;
            }

            if (isLenientParsing)
            {
                if (terminator.Equals(endObjectKeyword))
                {
                    log.Warn($"stream ends with \'endobj\' instead of \'endstream\' at offset {reader.GetPosition()}");

                    // Put 'endobj' back so no follow-up warning about a missing endobj is raised.
                    reader.Rewind(endObjectKeyword.Length);

                    return parsed;
                }

                if (terminator.Length > endStreamKeyword.Length
                    && terminator.Substring(0, endStreamKeyword.Length).Equals(endStreamKeyword))
                {
                    log.Warn("stream ends with '" + terminator + "' instead of 'endstream' at offset " + reader.GetPosition());

                    // Unread the "extra" bytes that were glued onto the keyword.
                    var surplus = OtherEncodings.StringAsLatin1Bytes(terminator.Substring(endStreamKeyword.Length));
                    reader.Rewind(surplus.Length);

                    return parsed;
                }
            }

            throw new InvalidOperationException("Error reading stream, expected='endstream' actual='"
                                                + terminator + "' at offset " + reader.GetPosition());
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Passes the rewind request on to the wrapped reader, unless <c>Throw</c> is set.
 /// </summary>
 /// <param name="bytes">Value handed unchanged to the wrapped reader's <c>Rewind</c>.</param>
 /// <exception cref="InvalidOperationException">Raised instead of rewinding when <c>Throw</c> is true.</exception>
 public void Rewind(int bytes)
 {
     if (!Throw)
     {
         reader.Rewind(bytes);
         return;
     }

     throw new InvalidOperationException();
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Seeks to <paramref name="offset"/>, reads the object header found there, verifies it
        /// matches <paramref name="key"/>, and parses the object (including a stream body when
        /// the object is followed by the 'stream' keyword).
        /// </summary>
        /// <param name="offset">Position in <paramref name="reader"/> to seek to before reading.</param>
        /// <param name="reader">Input the object is read from.</param>
        /// <param name="key">Expected object number and generation.</param>
        /// <param name="pool">Forwarded to the base parser.</param>
        /// <param name="isLenientParsing">
        /// When true, a missing 'endobj' terminator is logged as a warning rather than thrown.
        /// </param>
        /// <returns>The parsed object.</returns>
        /// <exception cref="InvalidOperationException">
        /// The header does not match <paramref name="key"/>, or (when not lenient) the object
        /// does not end with 'endobj'.
        /// </exception>
        private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
                                            CosObjectKey key,
                                            CosObjectPool pool,
                                            bool isLenientParsing)
        {
            reader.Seek(offset);

            var number = ObjectHelper.ReadObjectNumber(reader);
            var generation = ObjectHelper.ReadGenerationNumber(reader);

            ReadHelper.ReadExpectedString(reader, "obj", true);

            // The header is validated only after the 'obj' keyword has been consumed.
            var headerMismatch = number != key.Number || generation != key.Generation;
            if (headerMismatch)
            {
                throw new InvalidOperationException($"Xref for {key} points to object {number} {generation} at {offset}");
            }

            ReadHelper.SkipSpaces(reader);

            var parsed = baseParser.Parse(reader, pool);
            var trailingKeyword = ReadHelper.ReadString(reader);

            if (string.Equals(trailingKeyword, "stream"))
            {
                // Push the keyword back so the stream reader sees it again.
                var keywordBytes = OtherEncodings.StringAsLatin1Bytes(trailingKeyword);
                reader.Rewind(keywordBytes.Length);

                parsed = ReadNormalObjectStream(reader, parsed, offset, isLenientParsing, out trailingKeyword);
            }

            // Properly terminated object: done.
            if (string.Equals(trailingKeyword, "endobj"))
            {
                return parsed;
            }

            var complaint =
                $"Object ({number}:{generation}) at offset {offset} does not end with \'endobj\' but with \'{trailingKeyword}\'";

            if (!isLenientParsing)
            {
                throw new InvalidOperationException(complaint);
            }

            log.Warn(complaint);

            return parsed;
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Moves the wrapped reader back by exactly one byte.
 /// </summary>
 /// <param name="b">Not used; the rewind distance is always one regardless of this value.</param>
 public void unread(int b) => reader.Rewind(1);
Ejemplo n.º 5
0
            /// <summary>
            /// Attempts to read a classic cross-reference table ('xref' ... 'trailer') starting at the
            /// current position of <paramref name="source"/>.
            /// </summary>
            /// <param name="source">Input to read from; reading starts at its current position.</param>
            /// <param name="offset">
            /// Recorded as the table's offset on the builder and used in diagnostics; this method does
            /// not seek to it.
            /// </param>
            /// <param name="isLenientParsing">
            /// When true, an unreadable subsection header ends the entry loop instead of failing the parse.
            /// </param>
            /// <param name="pool">Forwarded to the trailer parser.</param>
            /// <param name="builder">
            /// Receives the populated table part on success. It is null when the input does not start with
            /// 'xref' or the table is empty; on the non-lenient subsection failure path it has already
            /// been assigned even though false is returned.
            /// </param>
            /// <returns>True when a table and its trailer were read; otherwise false.</returns>
            /// <exception cref="InvalidOperationException">
            /// An in-use entry has an offset pointing inside this table, an entry's numbers cannot be
            /// parsed (malformed or out of range), an entry's indicator is neither in-use nor free, or the
            /// trailer could not be read.
            /// </exception>
            public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
            {
                builder = null;

                // Remembered so entries pointing back into this very table can be rejected below.
                var tableStartOffset = source.GetPosition();

                if (source.Peek() != 'x')
                {
                    return false;
                }

                var xref = ReadHelper.ReadString(source);

                if (!xref.Trim().Equals("xref"))
                {
                    return false;
                }

                // check for trailer after xref: read the next token, then push it back again
                var str = ReadHelper.ReadString(source);

                byte[] b = OtherEncodings.StringAsLatin1Bytes(str);

                source.Rewind(b.Length);

                // Ordinal comparison: this is file syntax, not linguistic text. A culture-sensitive
                // StartsWith can ignore certain characters and report a false match.
                if (str.StartsWith("trailer", StringComparison.Ordinal))
                {
                    log.Warn("skipping empty xref table");
                    return false;
                }

                builder = new CrossReferenceTablePartBuilder
                {
                    Offset   = offset,
                    XRefType = CrossReferenceType.Table
                };

                // Tables can have multiple sections. Each starts with a starting object id and a count.
                while (true)
                {
                    if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
                    {
                        log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

                        if (isLenientParsing)
                        {
                            break;
                        }

                        return false;
                    }

                    var currentObjectId = subsectionDefinition.FirstNumber;

                    ReadHelper.SkipSpaces(source);
                    for (var i = 0; i < subsectionDefinition.Count; i++)
                    {
                        if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
                        {
                            break;
                        }

                        // A 't' here is taken as the start of 'trailer': the subsection is shorter than declared.
                        if (source.Peek() == 't')
                        {
                            break;
                        }

                        var currentLine = ReadHelper.ReadLine(source);
                        var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                        if (splitString.Length < 3)
                        {
                            log.Warn("invalid xref line: " + currentLine);
                            break;
                        }

                        // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
                        // NOTE(review): the in-use test looks at the last token while the free test below
                        // looks at index 2, so a similarly corrupt free entry would be rejected - confirm intended.
                        if (splitString[splitString.Length - 1].Equals(InUseEntry))
                        {
                            try
                            {
                                // Invariant culture: the digits are file syntax and must not depend on the
                                // culture of the machine doing the parsing.
                                var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                                if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                                {
                                    // PDFBOX-3923: offset points inside this table - that can't be good
                                    throw new InvalidOperationException(
                                              $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                                }

                                var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);
                                builder.Add(currentObjectId, generation, objectOffset);
                            }
                            catch (Exception e) when (e is FormatException || e is OverflowException)
                            {
                                // Out-of-range numbers are just as malformed as non-numeric ones; report
                                // both through the same exception type as the other corrupt-entry errors.
                                throw new InvalidOperationException("Bad", e);
                            }
                        }
                        else if (!splitString[2].Equals(FreeEntry))
                        {
                            throw new InvalidOperationException(
                                      $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
                        }

                        currentObjectId++;

                        ReadHelper.SkipSpaces(source);
                    }

                    // Another subsection follows only if the next non-space character is a digit.
                    ReadHelper.SkipSpaces(source);
                    if (!ReadHelper.IsDigit(source))
                    {
                        break;
                    }
                }

                if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
                {
                    throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
                }

                builder.Dictionary = trailer;
                builder.Previous   = trailer.GetLongOrDefault(CosName.PREV);

                return true;
            }
Ejemplo n.º 6
0
        /// <summary>
        /// Copies bytes from <paramref name="source"/> to <paramref name="output"/> until the
        /// ENDSTREAM keyword (or, as a fallback, the ENDOBJ keyword) is found, then rewinds
        /// <paramref name="source"/> so the keyword itself is the next thing to be read.
        /// </summary>
        /// <param name="source">Input to scan; read in chunks into the shared <c>strmBuf</c> buffer.</param>
        /// <param name="output">Receives every byte that precedes the matched keyword.</param>
        /// <remarks>
        /// NOTE(review): if the input ends while a partial keyword match is pending, those pending
        /// bytes are not written to <paramref name="output"/> - nothing after the loop flushes them.
        /// </remarks>
        private void ReadUntilEndStream(IRandomAccessRead source, BinaryWriter output)
        {
            // number of bytes available in strmBuf for the current pass (carried-over + newly read)
            int bufSize;
            // how many leading bytes of the current keyword have been matched so far
            int charMatchCount = 0;

            // keyword currently being matched; starts as ENDSTREAM and may temporarily switch to ENDOBJ
            byte[] keyw = ENDSTREAM;

            // last character position of shortest keyword ('endobj')
            int quickTestOffset = 5;

            // read next chunk into buffer; already matched chars are added to beginning of buffer
            // (presumably Read returns the number of bytes read and a non-positive value at end of input)
            while ((bufSize = source.Read(strmBuf, charMatchCount, STRMBUFLEN - charMatchCount)) > 0)
            {
                // account for the partial match that was copied to the front of the buffer
                bufSize += charMatchCount;

                // scanning resumes right after the carried-over bytes
                int bIdx = charMatchCount;
                int quickTestIdx;

                // iterate over buffer, trying to find keyword match
                for (int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++)
                {
                    // reduce compare operations by first test last character we would have to
                    // match if current one matches; if it is not a character from keywords
                    // we can move behind the test character; this shortcut is inspired by the
                    // Boyer-Moore string search algorithm and can reduce parsing time by approx. 20%
                    quickTestIdx = bIdx + quickTestOffset;
                    // only applied when no match is in progress and the probe position is inside the buffer
                    if (charMatchCount == 0 && quickTestIdx < maxQuicktestIdx)
                    {
                        byte ch = strmBuf[quickTestIdx];
                        if ((ch > 't') || (ch < 'a'))
                        {
                            // last character we would have to match if current character would match
                            // is not a character from keywords -> jump behind and start over
                            // (the loop increment then moves one past the probed position)
                            bIdx = quickTestIdx;
                            continue;
                        }
                    }

                    // bytes are unsigned here; the value is only ever compared against keyword bytes
                    byte ch1 = strmBuf[bIdx];

                    if (ch1 == keyw[charMatchCount])
                    {
                        if (++charMatchCount == keyw.Length)
                        {
                            // match found; step past the final keyword byte so that
                            // bIdx - charMatchCount below is the index where the keyword starts
                            bIdx++;
                            break;
                        }
                    }
                    else
                    {
                        // both keywords share their first three bytes, so a mismatch at index 3
                        // may still be a match against ENDOBJ
                        if ((charMatchCount == 3) && (ch1 == ENDOBJ[charMatchCount]))
                        {
                            // maybe ENDSTREAM is missing but we could have ENDOBJ
                            keyw = ENDOBJ;
                            charMatchCount++;
                        }
                        else
                        {
                            // no match; restarting one byte after the match start would be wasteful since
                            // the bytes matched so far are known. Depending on the byte just read a new
                            // match may already have begun:
                            //   'e'                       -> first keyword byte matched (count 1)
                            //   'n' at match position 7   -> the 'e' before it was already read, so two
                            //                                bytes are matched (count 2); assumes ENDSTREAM
                            //                                spells 'endstream' - TODO confirm
                            //   anything else             -> start over with the next byte (count 0)
                            charMatchCount = (ch1 == 'e') ? 1 : ((ch1 == 'n') && (charMatchCount == 7)) ? 2 : 0;
                            // search again for 'endstream'
                            keyw = ENDSTREAM;
                        }
                    }
                }

                // everything before the (possibly partial) match is stream content
                int contentBytes = Math.Max(0, bIdx - charMatchCount);

                // write buffer content until first matched char to output stream
                if (contentBytes > 0)
                {
                    output.Write(strmBuf, 0, contentBytes);
                }
                if (charMatchCount == keyw.Length)
                {
                    // keyword matched; unread matched keyword (endstream/endobj) and following buffered content
                    source.Rewind(bufSize - contentBytes);
                    break;
                }

                // copy matched chars at start of buffer so the next read appends right after them
                Array.Copy(keyw, 0, strmBuf, 0, charMatchCount);
            }
            // this writes a lonely CR or drops trailing CR LF and LF
            // output.flush();
        }