/// <summary>
/// Reads a dictionary from <paramref name="reader"/>: the opening "&lt;&lt;", a run of
/// name/value entries and the closing "&gt;&gt;".
/// </summary>
/// <param name="reader">Source positioned on the first '&lt;' of the dictionary.</param>
/// <param name="baseParser">Parser used to read each entry's value.</param>
/// <param name="pool">Object pool passed through to the value parser.</param>
/// <returns>
/// The dictionary holding every entry whose key and value were both non-null, or a new
/// empty dictionary when <c>ReadUntilEnd</c> returns true for an unexpected token.
/// </returns>
/// <exception cref="ArgumentNullException">Any of the arguments is null.</exception>
public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (baseParser == null)
    {
        throw new ArgumentNullException(nameof(baseParser));
    }

    if (pool == null)
    {
        throw new ArgumentNullException(nameof(pool));
    }

    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.SkipSpaces(reader);

    var result = new PdfDictionary();

    while (true)
    {
        ReadHelper.SkipSpaces(reader);
        var next = (char)reader.Peek();

        // '>' starts the closing delimiter, which is consumed after the loop.
        if (next == '>')
        {
            break;
        }

        // '/' starts a name, i.e. the key of the next entry.
        if (next == '/')
        {
            var entry = ParseCosDictionaryNameValuePair(reader, baseParser, pool);
            if (entry.key != null && entry.value != null)
            {
                result.Set(entry.key, entry.value);
            }

            continue;
        }

        // Neither an entry nor the end of the dictionary: defer to ReadUntilEnd and
        // give up with an empty dictionary when it returns true.
        if (ReadUntilEnd(reader))
        {
            return new PdfDictionary();
        }
    }

    ReadHelper.ReadExpectedString(reader, ">>");

    return result;
}
/// <summary>
/// Resolves the object identified by <paramref name="objectNumber"/> and
/// <paramref name="objectGeneration"/>, using the pool as a cache and the cross-reference
/// table (or, in lenient mode, the brute force searcher) to locate it.
/// </summary>
/// <param name="reader">Source the object is read from.</param>
/// <param name="objectNumber">Number of the requested object.</param>
/// <param name="objectGeneration">Generation of the requested object.</param>
/// <param name="pool">Pool consulted first for an already-loaded object.</param>
/// <param name="crossReferenceTable">Table providing the object's offset or, as a non-positive value, its negated object stream number.</param>
/// <param name="bruteForceSearcher">Fallback source of object locations, used only when <paramref name="isLenient"/> is true and the table has no entry.</param>
/// <param name="isLenient">Enables the brute force fallback and lenient parsing of the object.</param>
/// <param name="requireExistingObject">When true, a missing or non-positive table entry is an error.</param>
/// <returns>The pooled or parsed object, or <c>CosNull.Null</c> when no location is known.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pool"/> is null, or <paramref name="crossReferenceTable"/> is null and the object was not already pooled.</exception>
/// <exception cref="InvalidOperationException"><paramref name="requireExistingObject"/> is true and the table entry is missing or non-positive.</exception>
public CosBase Parse(IRandomAccessRead reader, long objectNumber, int objectGeneration, CosObjectPool pool, CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenient, bool requireExistingObject)
{
    if (pool == null)
    {
        throw new ArgumentNullException(nameof(pool));
    }

    var key = new CosObjectKey(objectNumber, objectGeneration);

    // A pooled entry that already carries an object short-circuits everything else.
    var pooled = pool.GetOrCreateDefault(key);
    if (pooled.GetObject() != null)
    {
        return pooled.GetObject();
    }

    if (crossReferenceTable == null)
    {
        throw new ArgumentNullException(nameof(crossReferenceTable));
    }

    var offsetOrStreamNumber = TryGet(key, crossReferenceTable.ObjectOffsets);

    if (requireExistingObject)
    {
        var missingOrCompressed = !offsetOrStreamNumber.HasValue || offsetOrStreamNumber.Value <= 0;
        if (missingOrCompressed)
        {
            throw new InvalidOperationException("Object must be defined and not compressed: " + key);
        }
    }

    if (isLenient && offsetOrStreamNumber == null)
    {
        // Fall back to the searcher's locations and remember a hit in the table.
        offsetOrStreamNumber = TryGet(key, bruteForceSearcher.GetObjectLocations());
        if (offsetOrStreamNumber != null)
        {
            crossReferenceTable.UpdateOffset(key, offsetOrStreamNumber.Value);
        }
    }

    if (offsetOrStreamNumber == null)
    {
        return CosNull.Null;
    }

    var location = offsetOrStreamNumber.Value;

    // Positive values are file offsets; anything else is the negated object stream number.
    if (location > 0)
    {
        return ParseObjectFromFile(location, reader, key, pool, isLenient);
    }

    return ParseCompressedStreamObject(reader, -location, objectNumber, pool, crossReferenceTable, bruteForceSearcher, isLenient);
}
/// <summary>
/// Reads the objects stored in an object stream.
/// </summary>
/// <param name="stream">The raw object stream; its /N entry gives the number of header records.</param>
/// <param name="pool">Object pool passed through to the base parser.</param>
/// <returns>The parsed objects, each given generation 0 and the object number from the matching header record.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public IReadOnlyList<CosObject> Parse(PdfRawStream stream, CosObjectPool pool)
{
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }

    var declaredCount = stream.Dictionary.GetIntOrDefault(CosName.N);

    var headerNumbers = new List<long>(declaredCount);
    var results = new List<CosObject>(declaredCount);

    var decoded = stream.Decode(filterProvider);
    var reader = new RandomAccessBuffer(decoded);

    // Header: one record per declared object, an object number followed by a value
    // that is read and discarded (the offset).
    while (headerNumbers.Count < declaredCount)
    {
        long number = ObjectHelper.ReadObjectNumber(reader);
        ReadHelper.ReadLong(reader);
        headerNumbers.Add(number);
    }

    // Body: keep parsing until the base parser yields null.
    int index = 0;
    while (true)
    {
        CosBase parsed = baseParser.Parse(reader, pool);
        if (parsed == null)
        {
            break;
        }

        var wrapped = new CosObject(parsed);
        wrapped.SetGenerationNumber(0);

        if (index >= headerNumbers.Count)
        {
            log.Error("/ObjStm (object stream) has more objects than /N " + declaredCount);
            break;
        }

        wrapped.SetObjectNumber(headerNumbers[index]);
        results.Add(wrapped);

        // Objects inside an object stream should not be wrapped in obj/endobj, but some
        // files do it anyway: when the next byte is 'e', drop the rest of that line.
        if (!reader.IsEof() && reader.Peek() == 'e')
        {
            ReadHelper.ReadLine(reader);
        }

        index++;
    }

    return results;
}
/// <summary>
/// Creates the validator by storing the supplied collaborators; no argument is validated here.
/// </summary>
/// <param name="log">Logger stored for later use.</param>
/// <param name="source">Source stored for later use.</param>
/// <param name="dictionaryParser">Dictionary parser stored for later use.</param>
/// <param name="baseParser">Base parser stored for later use.</param>
/// <param name="pool">Object pool stored for later use.</param>
public XrefOffsetValidator(ILog log, IRandomAccessRead source, CosDictionaryParser dictionaryParser, CosBaseParser baseParser, CosObjectPool pool)
{
    this.pool = pool;
    this.baseParser = baseParser;
    this.dictionaryParser = dictionaryParser;
    this.source = source;
    this.log = log;
}
/// <summary>
/// Convenience overload: resolves the object referenced by <paramref name="obj"/> using its
/// object and generation numbers.
/// </summary>
/// <param name="reader">Source the object is read from.</param>
/// <param name="obj">Reference whose object and generation numbers identify the target.</param>
/// <param name="pool">Object pool forwarded to the numeric overload.</param>
/// <param name="crossReferenceTable">Cross-reference table forwarded to the numeric overload.</param>
/// <param name="bruteForceSearcher">Brute force searcher forwarded to the numeric overload.</param>
/// <param name="isLenient">Lenient flag forwarded to the numeric overload.</param>
/// <param name="requireExistingObject">Flag forwarded to the numeric overload.</param>
/// <returns>Whatever the numeric overload returns.</returns>
/// <exception cref="ArgumentNullException"><paramref name="obj"/> is null.</exception>
public CosBase Parse(IRandomAccessRead reader, CosObject obj, CosObjectPool pool, CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenient, bool requireExistingObject)
{
    if (obj == null)
    {
        throw new ArgumentNullException(nameof(obj));
    }

    return Parse(
        reader,
        obj.GetObjectNumber(),
        obj.GetGenerationNumber(),
        pool,
        crossReferenceTable,
        bruteForceSearcher,
        isLenient,
        requireExistingObject);
}
/// <summary>
/// Creates the parser from its collaborators. A null <paramref name="log"/> is replaced by a
/// <c>NoOpLog</c>; every other argument is required.
/// </summary>
/// <exception cref="ArgumentNullException">Any argument other than <paramref name="log"/> is null.</exception>
public PdfObjectParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, CosObjectPool objectPool, ObjectStreamParser objectStreamParser)
{
    if (log == null)
    {
        log = new NoOpLog();
    }

    // Guards run in parameter order so the first missing argument is the one reported.
    if (baseParser == null)
    {
        throw new ArgumentNullException(nameof(baseParser));
    }

    if (streamParser == null)
    {
        throw new ArgumentNullException(nameof(streamParser));
    }

    if (crossReferenceTable == null)
    {
        throw new ArgumentNullException(nameof(crossReferenceTable));
    }

    if (bruteForceSearcher == null)
    {
        throw new ArgumentNullException(nameof(bruteForceSearcher));
    }

    if (objectPool == null)
    {
        throw new ArgumentNullException(nameof(objectPool));
    }

    if (objectStreamParser == null)
    {
        throw new ArgumentNullException(nameof(objectStreamParser));
    }

    this.log = log;
    this.baseParser = baseParser;
    this.streamParser = streamParser;
    this.crossReferenceTable = crossReferenceTable;
    this.bruteForceSearcher = bruteForceSearcher;
    this.objectPool = objectPool;
    this.objectStreamParser = objectStreamParser;
}
/// <summary>
/// Reads the object located at <paramref name="offset"/>: the "number generation obj" header,
/// the object body, an optional stream, and the terminating keyword.
/// </summary>
/// <param name="offset">Position in <paramref name="reader"/> to seek to before reading.</param>
/// <param name="reader">Source to read from.</param>
/// <param name="key">Expected object number and generation.</param>
/// <param name="pool">Object pool passed through to the base parser.</param>
/// <param name="isLenientParsing">When true, a wrong terminating keyword is logged instead of thrown.</param>
/// <returns>The parsed object, or the result of <c>ReadNormalObjectStream</c> when a "stream" keyword follows the body.</returns>
/// <exception cref="InvalidOperationException">
/// The header does not match <paramref name="key"/>, or the object does not end with "endobj"
/// and <paramref name="isLenientParsing"/> is false.
/// </exception>
private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader, CosObjectKey key, CosObjectPool pool, bool isLenientParsing)
{
    reader.Seek(offset);

    var foundNumber = ObjectHelper.ReadObjectNumber(reader);
    var foundGeneration = ObjectHelper.ReadGenerationNumber(reader);
    ReadHelper.ReadExpectedString(reader, "obj", true);

    var headerMatchesKey = foundNumber == key.Number && foundGeneration == key.Generation;
    if (!headerMatchesKey)
    {
        throw new InvalidOperationException($"Xref for {key} points to object {foundNumber} {foundGeneration} at {offset}");
    }

    ReadHelper.SkipSpaces(reader);

    var result = baseParser.Parse(reader, pool);

    // The token after the body is either the end marker or the start of a stream.
    var trailingKeyword = ReadHelper.ReadString(reader);

    if (string.Equals(trailingKeyword, "stream"))
    {
        // Step back over the keyword just read before handing over to the stream reader,
        // which also reports the keyword that follows the stream.
        var keywordBytes = OtherEncodings.StringAsLatin1Bytes(trailingKeyword);
        reader.Rewind(keywordBytes.Length);

        result = ReadNormalObjectStream(reader, result, offset, isLenientParsing, out trailingKeyword);
    }

    if (string.Equals(trailingKeyword, "endobj"))
    {
        return result;
    }

    var message = $"Object ({foundNumber}:{foundGeneration}) at offset {offset} does not end with \'endobj\' but with \'{trailingKeyword}\'";

    if (!isLenientParsing)
    {
        throw new InvalidOperationException(message);
    }

    log.Warn(message);

    return result;
}
/// <summary>
/// Bundles the object pool, brute force searcher and resource store; all three are required.
/// </summary>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public ParsingCachingProviders(CosObjectPool objectPool, BruteForceSearcher bruteForceSearcher, IResourceStore resourceContainer)
{
    // Guards run in parameter order so the first missing argument is the one reported.
    if (objectPool == null)
    {
        throw new ArgumentNullException(nameof(objectPool));
    }

    if (bruteForceSearcher == null)
    {
        throw new ArgumentNullException(nameof(bruteForceSearcher));
    }

    if (resourceContainer == null)
    {
        throw new ArgumentNullException(nameof(resourceContainer));
    }

    ObjectPool = objectPool;
    BruteForceSearcher = bruteForceSearcher;
    ResourceContainer = resourceContainer;
}
/// <summary>
/// Loads the object stream numbered <paramref name="streamObjectNumber"/> (generation 0),
/// registers its contents in the pool and returns the entry numbered
/// <paramref name="requestedNumber"/>.
/// </summary>
/// <param name="reader">Source to read from.</param>
/// <param name="streamObjectNumber">Object number of the containing object stream.</param>
/// <param name="requestedNumber">Object number to look for inside the stream.</param>
/// <param name="objectPool">Pool that receives the stream's objects.</param>
/// <param name="crossReferenceTable">Table used to confirm which objects belong to this stream.</param>
/// <param name="bruteForceSearcher">Forwarded when loading the object stream itself.</param>
/// <param name="isLenientParsing">Forwarded when loading the object stream itself.</param>
/// <returns>The matching stream entry, or <c>CosNull.Null</c> when the stream or the entry cannot be found.</returns>
private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, CosObjectPool objectPool, CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenientParsing)
{
    // The containing stream must exist as an uncompressed object, hence requireExistingObject = true.
    var container = Parse(reader, streamObjectNumber, 0, objectPool, crossReferenceTable, bruteForceSearcher, isLenientParsing, true);

    if (!(container is PdfRawStream stream))
    {
        log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}");

        return CosNull.Null;
    }

    var contents = objectStreamParser.Parse(stream, objectPool);

    // Only populate pool entries that the cross-reference table attributes to this
    // stream, i.e. whose table value is the negated stream number.
    foreach (var entry in contents)
    {
        var entryKey = new CosObjectKey(entry);
        var recorded = TryGet(entryKey, crossReferenceTable.ObjectOffsets);

        if (!recorded.HasValue || recorded.Value != -streamObjectNumber)
        {
            continue;
        }

        var pooled = objectPool.Get(entryKey);
        pooled.SetObject(entry.GetObject());
    }

    foreach (var candidate in contents)
    {
        if (candidate.GetObjectNumber() == requestedNumber)
        {
            return candidate;
        }
    }

    log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull.");

    return CosNull.Null;
}
/// <summary>
/// Stub implementation: ignores both arguments and always yields <c>CosNull.Null</c>.
/// </summary>
/// <param name="reader">Unused.</param>
/// <param name="pool">Unused.</param>
/// <returns><c>CosNull.Null</c>.</returns>
public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool) => CosNull.Null;
/// <summary>
/// Loads every object referenced directly from the trailer dictionary and returns the parsed
/// root object.
/// </summary>
/// <param name="reader">Source to read from.</param>
/// <param name="crossReferenceTable">Table whose dictionary is the trailer dictionary.</param>
/// <param name="dynamicParser">Parser used to resolve each reference.</param>
/// <param name="bruteForceSearcher">Forwarded to the parser.</param>
/// <param name="pool">Pool forwarded to the parser.</param>
/// <param name="isLenientParsing">Forwarded to the parser.</param>
/// <returns>The result of parsing the trailer's ROOT entry.</returns>
/// <exception cref="InvalidOperationException">The trailer has no ROOT entry.</exception>
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable, DynamicParser dynamicParser, BruteForceSearcher bruteForceSearcher, CosObjectPool pool, bool isLenientParsing)
{
    var trailerDictionary = crossReferenceTable.Dictionary;

    foreach (var entry in trailerDictionary.Values)
    {
        if (!(entry is CosObject reference))
        {
            continue;
        }

        // The result is discarded on purpose: the call is made so the object is
        // available from the pool later.
        dynamicParser.Parse(reader, reference, pool, crossReferenceTable, bruteForceSearcher, isLenientParsing, false);
    }

    var rootReference = (CosObject)crossReferenceTable.Dictionary.GetItemOrDefault(CosName.ROOT);

    if (rootReference == null)
    {
        throw new InvalidOperationException("Missing root object specification in trailer.");
    }

    return dynamicParser.Parse(reader, rootReference, pool, crossReferenceTable, bruteForceSearcher, isLenientParsing, false);
}
/// <summary>
/// Reads a dictionary value. A plain value is returned as parsed; a number followed by another
/// number and 'R' is treated as an indirect reference and resolved through the pool.
/// </summary>
/// <param name="reader">Source positioned at the start of the value.</param>
/// <param name="baseParser">Parser used for the value and, for references, the generation number.</param>
/// <param name="pool">Pool the referenced object is fetched from.</param>
/// <returns>The parsed value, or the pooled object for an indirect reference.</returns>
/// <exception cref="InvalidOperationException">
/// The input looks like a reference but the object number or the generation number is not a <c>CosInt</c>.
/// </exception>
private static CosBase ParseValue(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var numOffset = reader.GetPosition();
    var value = baseParser.Parse(reader, pool);

    ReadHelper.SkipSpaces(reader);

    // Only a number followed by another digit can be the start of "<number> <generation> R".
    if (!(value is ICosNumber) || !ReadHelper.IsDigit(reader))
    {
        return value;
    }

    // Read the remaining parts of the reference: generation number, then the 'R' keyword.
    var genOffset = reader.GetPosition();
    var generationNumber = baseParser.Parse(reader, pool);

    ReadHelper.SkipSpaces(reader);
    ReadHelper.ReadExpectedChar(reader, 'R');

    if (!(value is CosInt objectNumberValue))
    {
        throw new InvalidOperationException("expected number, actual=" + value + " at offset " + numOffset);
    }

    if (!(generationNumber is CosInt generationValue))
    {
        // Report the generation token itself; the previous message printed the object
        // number here, which pointed at the wrong token.
        throw new InvalidOperationException("expected number, actual=" + generationNumber + " at offset " + genOffset);
    }

    var key = new CosObjectKey(objectNumberValue.AsLong(), generationValue.AsInt());

    // Dereference through the pool.
    return pool.Get(key);
}
/// <summary>
/// Reads one dictionary entry: a name followed by its value.
/// </summary>
/// <param name="reader">Source positioned at the entry's name.</param>
/// <param name="baseParser">Parser used for the value.</param>
/// <param name="pool">Object pool passed through to the value parser.</param>
/// <returns>The key and value, or <c>(null, null)</c> when no value could be parsed.</returns>
private (CosName key, CosBase value) ParseCosDictionaryNameValuePair(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var entryKey = nameParser.Parse(reader);
    var entryValue = ParseValue(reader, baseParser, pool);

    ReadHelper.SkipSpaces(reader);

    if ((char)reader.Peek() == 'd')
    {
        // A trailing "def" appears when parsing a cmap stream and is swallowed together
        // with the whitespace after it; any other word is pushed back untouched.
        var word = ReadHelper.ReadString(reader);

        if (word.Equals("def"))
        {
            ReadHelper.SkipSpaces(reader);
        }
        else
        {
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(word));
        }
    }

    if (entryValue == null)
    {
        log?.Warn("Bad Dictionary Declaration " + ReadHelper.ReadString(reader));

        return (null, null);
    }

    // Mark the item as direct, to avoid signature problems.
    entryValue.Direct = true;

    return (entryKey, entryValue);
}
/// <summary>
/// Reads an array: '[', a sequence of objects, and ']'.
/// </summary>
/// <remarks>
/// The base parser yields a <c>CosObject</c> for an 'R' token. When that happens the two
/// integers already collected (object number, then generation) are removed from the array and
/// replaced by the pooled object they reference. An 'R' that is not preceded by two integers is
/// skipped rather than indexing before the start of the array.
/// </remarks>
/// <param name="reader">Source positioned on '['.</param>
/// <param name="baseParser">Parser used for each element.</param>
/// <param name="pool">Pool indirect references are resolved through.</param>
/// <returns>
/// The parsed array. Returned early, without consuming ']', when an "endobj" or "endstream"
/// keyword is met.
/// </returns>
public COSArray Parse(IRandomAccessRead reader, CosBaseParser baseParser, CosObjectPool pool)
{
    ReadHelper.ReadExpectedChar(reader, '[');

    var po = new COSArray();

    ReadHelper.SkipSpaces(reader);

    int i;
    while (((i = reader.Peek()) > 0) && ((char)i != ']'))
    {
        CosBase pbo = baseParser.Parse(reader, pool);

        if (pbo is CosObject)
        {
            // Assume the reference is malformed until both integers are found (PDFBOX-385).
            pbo = null;

            // The size checks stop po.get from being called with index -1 when 'R'
            // appears with fewer than two preceding elements.
            if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
            {
                var genNumber = (CosInt)po.remove(po.size() - 1);

                if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                {
                    var number = (CosInt)po.remove(po.size() - 1);
                    var key = new CosObjectKey(number.AsLong(), genNumber.AsInt());
                    pbo = pool.Get(key);
                }
            }
        }

        if (pbo != null)
        {
            po.add(pbo);
        }
        else
        {
            // A bad element is simply skipped, but "endobj" or "endstream" here means the
            // array was never closed, so the word is pushed back and the array ends.
            string isThisTheEnd = ReadHelper.ReadString(reader);
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(isThisTheEnd));

            if (string.Equals(isThisTheEnd, "endobj") || string.Equals(isThisTheEnd, "endstream"))
            {
                return po;
            }
        }

        ReadHelper.SkipSpaces(reader);
    }

    // Consume the closing ']'.
    reader.Read();
    ReadHelper.SkipSpaces(reader);

    return po;
}
/// <summary>
/// Stub implementation: ignores every argument and always yields a new, empty dictionary.
/// </summary>
/// <param name="reader">Unused.</param>
/// <param name="baseParser">Unused.</param>
/// <param name="pool">Unused.</param>
/// <returns>A new empty <c>PdfDictionary</c>.</returns>
public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool) => new PdfDictionary();
/// <summary>
/// Tries to read the "trailer" keyword and the trailer dictionary that follows it.
/// </summary>
/// <param name="source">Source positioned where the trailer is expected.</param>
/// <param name="isLenientParsing">When true, leading lines that start with a digit are skipped first.</param>
/// <param name="pool">Object pool passed through to the dictionary parser.</param>
/// <param name="trailer">The parsed trailer dictionary, or null when the method returns false.</param>
/// <returns>True when a trailer was found and parsed; false when the keyword is missing.</returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
    trailer = null;

    // PDFBOX-1739: skip extra xref entries in RegisSTAR documents.
    if (isLenientParsing)
    {
        int nextCharacter = source.Peek();
        while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
        {
            ReadHelper.ReadLine(source);
            nextCharacter = source.Peek();
        }
    }

    if (source.Peek() != 't')
    {
        return false;
    }

    // Read "trailer".
    long currentOffset = source.GetPosition();
    string nextLine = ReadHelper.ReadLine(source);

    if (!nextLine.Trim().Equals("trailer"))
    {
        // In some files the EOL is missing and the keyword is immediately followed by
        // "<<" or a blank. That does not comply with the PDF reference, but Acrobat
        // accepts it, so it is accepted here too. The keyword is matched ordinally:
        // it is a fixed token, not linguistic text.
        if (!nextLine.StartsWith("trailer", System.StringComparison.Ordinal))
        {
            return false;
        }

        // The line cannot simply be partially unread because the EOL may be one or two
        // bytes long; jump back to just after the keyword instead.
        int len = "trailer".Length;
        source.Seek(currentOffset + len);
    }

    // Also tolerate whitespace between the keyword and the dictionary (" <<").
    ReadHelper.SkipSpaces(source);

    trailer = dictionaryParser.Parse(source, baseParser, pool);

    ReadHelper.SkipSpaces(source);

    return true;
}
/// <summary>
/// Tries to read a cross-reference table (the "xref" keyword, its subsections and the trailer)
/// starting at the current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">Source positioned where the table is expected.</param>
/// <param name="offset">Offset recorded on the builder and used in messages.</param>
/// <param name="isLenientParsing">When true, an unreadable subsection header ends the table instead of failing.</param>
/// <param name="pool">Object pool passed through when parsing the trailer.</param>
/// <param name="builder">
/// Receives the table contents. Null when the input does not start with a non-empty table.
/// Note that it is already assigned when a subsection header fails in non-lenient mode.
/// </param>
/// <returns>True when a table and its trailer were read; otherwise false.</returns>
/// <exception cref="InvalidOperationException">
/// An entry is corrupt, an in-use entry points inside the table itself, or the trailer cannot be read.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
    builder = null;

    var tableStartOffset = source.GetPosition();

    if (source.Peek() != 'x')
    {
        return false;
    }

    var xref = ReadHelper.ReadString(source);
    if (!xref.Trim().Equals("xref"))
    {
        return false;
    }

    // Look ahead for a trailer directly after "xref", then step back over what was read.
    var str = ReadHelper.ReadString(source);
    byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
    source.Rewind(b.Length);

    // The keyword is a fixed token, so it is compared ordinally.
    if (str.StartsWith("trailer", System.StringComparison.Ordinal))
    {
        log.Warn("skipping empty xref table");
        return false;
    }

    builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    // Tables can have multiple sections. Each starts with a starting object id and a count.
    while (true)
    {
        if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
        {
            log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

            if (isLenientParsing)
            {
                break;
            }

            return false;
        }

        var currentObjectId = subsectionDefinition.FirstNumber;

        ReadHelper.SkipSpaces(source);

        for (var i = 0; i < subsectionDefinition.Count; i++)
        {
            if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
            {
                break;
            }

            // A 't' here is taken as the start of the trailer: the section is shorter than declared.
            if (source.Peek() == 't')
            {
                break;
            }

            // One entry per line, split on spaces; at least three fields are required.
            var currentLine = ReadHelper.ReadLine(source);
            var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (splitString.Length < 3)
            {
                log.Warn("invalid xref line: " + currentLine);
                break;
            }

            // Checking the last field supports the corrupt table reported in PDFBOX-474 (XXXX XXX XX n).
            if (splitString[splitString.Length - 1].Equals(InUseEntry))
            {
                try
                {
                    // Parsed with the invariant culture: the fields are file data, not user text.
                    var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                    if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                    {
                        // PDFBOX-3923: offset points inside this table - that can't be good.
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                    }

                    var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);
                    builder.Add(currentObjectId, generation, objectOffset);
                }
                catch (FormatException e)
                {
                    throw new InvalidOperationException("Bad", e);
                }
            }
            else if (!splitString[2].Equals(FreeEntry))
            {
                throw new InvalidOperationException(
                    $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
            }

            currentObjectId++;

            ReadHelper.SkipSpaces(source);
        }

        ReadHelper.SkipSpaces(source);

        // Another subsection starts with a digit; anything else ends the table.
        if (!ReadHelper.IsDigit(source))
        {
            break;
        }
    }

    if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
    {
        throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
    }

    builder.Dictionary = trailer;
    builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

    return true;
}
/// <summary>
/// Checks whether <paramref name="startXRefOffset"/> points at an object whose dictionary has
/// the XREF type.
/// </summary>
/// <param name="source">Source to probe; it is repositioned by this method.</param>
/// <param name="startXRefOffset">Candidate offset.</param>
/// <param name="isLenient">When false the check is skipped and the offset is accepted.</param>
/// <param name="pool">Object pool passed through to the dictionary parser.</param>
/// <returns>
/// True when the check is skipped (non-lenient mode or offset 0) or an XREF-typed dictionary is
/// found; otherwise false.
/// </returns>
private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
{
    // Repair mode isn't available in non-lenient mode.
    if (!isLenient || startXRefOffset == 0)
    {
        return true;
    }

    // The byte just before the offset has to be whitespace...
    source.Seek(startXRefOffset - 1);
    int preceding = source.Read();
    if (!ReadHelper.IsWhitespace(preceding))
    {
        return false;
    }

    // ...and the object itself has to start with a digit.
    ReadHelper.SkipSpaces(source);
    if (!ReadHelper.IsDigit(source))
    {
        return false;
    }

    try
    {
        // Consume the "number generation obj" header.
        ObjectHelper.ReadObjectNumber(source);
        ObjectHelper.ReadGenerationNumber(source);
        ReadHelper.ReadExpectedString(source, "obj", true);

        // Check the dictionary type to avoid false positives.
        PdfDictionary candidate = dictionaryParser.Parse(source, baseParser, pool);
        source.Seek(startXRefOffset);

        if (candidate.IsType(CosName.XREF))
        {
            return true;
        }
    }
    catch (Exception ex)
    {
        // There was no readable xref stream object here; restore the position.
        log.Error("Couldn't read the xref stream object.", ex);
        source.Seek(startXRefOffset);
    }

    return false;
}
/// <summary>
/// Builds a <c>PdfDocument</c>: reads the header version, locates and parses the cross-reference
/// data, wires up the parsers and factories, resolves the root dictionary and creates the
/// catalog and document information.
/// </summary>
/// <param name="reader">Random access source for the parsers that are not scanner based.</param>
/// <param name="inputBytes">Raw bytes used to locate the first cross-reference offset.</param>
/// <param name="scanner">Token scanner used for the header and cross-reference offset.</param>
/// <param name="container">Container the shared parser instances are resolved from.</param>
/// <param name="isLenientParsing">Lenient flag forwarded to every parsing step.</param>
/// <returns>The opened document.</returns>
/// <exception cref="InvalidOperationException">The root object is not a <c>PdfDictionary</c>.</exception>
private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
{
    var log = container.Get<ILog>();

    var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);

    var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);

    var pool = new CosObjectPool();

    // TODO: make this use the scanner.
    var validatorDictionaryParser = container.Get<CosDictionaryParser>();
    var validatorBaseParser = container.Get<CosBaseParser>();
    var xrefValidator = new XrefOffsetValidator(log, reader, validatorDictionaryParser, validatorBaseParser, pool);
    var validator = new CrossReferenceOffsetValidator(xrefValidator);

    crossReferenceOffset = validator.Validate(crossReferenceOffset, isLenientParsing);

    var crossReferenceTable = container.Get<CrossReferenceParser>()
        .Parse(reader, isLenientParsing, crossReferenceOffset, pool);

    // The result of the scanner-based parse is not used.
    container.Get<CrossReferenceParser>().ParseNew(crossReferenceOffset, scanner, isLenientParsing);

    var filterProvider = container.Get<IFilterProvider>();
    var bruteForceSearcher = new BruteForceSearcher(reader);

    var pdfObjectParser = new PdfObjectParser(
        container.Get<ILog>(),
        container.Get<CosBaseParser>(),
        container.Get<CosStreamParser>(),
        crossReferenceTable,
        bruteForceSearcher,
        pool,
        container.Get<ObjectStreamParser>());

    // Font support.
    var trueTypeFontParser = new TrueTypeFontParser();
    var fontDescriptorFactory = new FontDescriptorFactory();
    var cidFontFactory = new CidFontFactory(fontDescriptorFactory, trueTypeFontParser, pdfObjectParser, filterProvider);
    var cMapParser = new CMapParser();
    var cMapCache = new CMapCache(cMapParser);
    var type0Handler = new Type0FontHandler(cidFontFactory, cMapCache, filterProvider, pdfObjectParser);
    var trueTypeHandler = new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser);
    var fontFactory = new FontFactory(log, type0Handler, trueTypeHandler);

    var dynamicParser = container.Get<DynamicParser>();

    // Page support.
    var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
    var operationFactory = new ReflectionGraphicsStateOperationFactory();
    var pageContentParser = new PageContentParser(operationFactory);
    var pageFactory = new PageFactory(resourceContainer, pdfObjectParser, filterProvider, pageContentParser);

    var informationFactory = new DocumentInformationFactory();
    var catalogFactory = new CatalogFactory(pdfObjectParser);

    var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool, isLenientParsing);

    if (!(root is PdfDictionary rootDictionary))
    {
        throw new InvalidOperationException("Expected root dictionary, but got this: " + root);
    }

    // In some pdfs the type value "Catalog" is missing in the root object.
    if (isLenientParsing && !rootDictionary.ContainsKey(CosName.TYPE))
    {
        rootDictionary.Set(CosName.TYPE, CosName.CATALOG);
    }

    var information = informationFactory.Create(pdfObjectParser, crossReferenceTable.Dictionary, reader, isLenientParsing);

    var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);

    var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);

    return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information);
}
/// <summary>
/// Reads the next object from <paramref name="reader"/>, choosing the concrete parser from the
/// first non-space character.
/// </summary>
/// <param name="reader">Source to read from.</param>
/// <param name="pool">Object pool passed through to the dictionary and array parsers.</param>
/// <returns>
/// The parsed object. Null at end of input, and null when an unrecognised word was consumed
/// ("endobj" and "endstream" are pushed back first).
/// </returns>
/// <exception cref="IOException">
/// A word starting with 't' or 'f' is not "true"/"false", or an unrecognised character yields
/// an empty word.
/// </exception>
public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
{
    ReadHelper.SkipSpaces(reader);

    int nextByte = reader.Peek();
    if (nextByte == -1)
    {
        return null;
    }

    char c = (char)nextByte;

    switch (c)
    {
        case '<':
        {
            // Look one character past the first '<' to tell "<<" (dictionary) from
            // "<" (string), then restore the bracket.
            int leftBracket = reader.Read();
            char second = (char)reader.Peek();
            reader.Unread(leftBracket);

            if (second != '<')
            {
                return stringParser.Parse(reader);
            }

            CosBase dictionary = dictionaryParser.Parse(reader, this, pool);
            ReadHelper.SkipSpaces(reader);
            return dictionary;
        }
        case '[':
            return arrayParser.Parse(reader, this, pool);
        case '(':
            return stringParser.Parse(reader);
        case '/':
            return nameParser.Parse(reader);
        case 'n':
            ReadHelper.ReadExpectedString(reader, "null");
            return CosNull.Null;
        case 't':
        {
            string truestring = OtherEncodings.BytesAsLatin1String(reader.ReadFully(4));
            if (!truestring.Equals("true"))
            {
                throw new IOException("expected true actual='" + truestring + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.True;
        }
        case 'f':
        {
            string falsestring = OtherEncodings.BytesAsLatin1String(reader.ReadFully(5));
            if (!falsestring.Equals("false"))
            {
                throw new IOException("expected false actual='" + falsestring + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.False;
        }
        case 'R':
            // An 'R' yields an empty CosObject placeholder.
            reader.Read();
            return new CosObject(null);
    }

    bool startsNumber = char.IsDigit(c) || c == '-' || c == '+' || c == '.';

    if (!startsNumber)
    {
        // This is not supposed to happen, but it is tolerated for compatibility with
        // writers that don't follow the spec.
        string badstring = ReadHelper.ReadString(reader);

        if (badstring == string.Empty)
        {
            // Nothing was consumed, so returning would loop forever on the same byte.
            int peek = reader.Peek();
            throw new IOException("Unknown dir object c='" + c + "' cInt=" + (int)c + " peek='" + (char)peek + "' peekInt=" + peek + " at offset " + reader.GetPosition());
        }

        // Put endobj/endstream back so the caller sees it.
        if (string.Equals("endobj", badstring) || string.Equals("endstream", badstring))
        {
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(badstring));
        }

        return null;
    }

    // Collect every character that can be part of a number, including exponent markers.
    var digits = new StringBuilder();
    int raw = reader.Read();
    char current = (char)raw;

    while (char.IsDigit(current) || current == '-' || current == '+' || current == '.' || current == 'E' || current == 'e')
    {
        digits.Append(current);
        raw = reader.Read();
        current = (char)raw;
    }

    // The character that ended the number belongs to whatever follows.
    if (raw != -1)
    {
        reader.Unread(raw);
    }

    return CosNumberFactory.get(digits.ToString()) as CosBase;
}