/// <summary>
/// Tests whether the object header built from <paramref name="objectKey"/> is present at
/// <paramref name="offset"/>. The read position of <paramref name="source"/> is put back to where it
/// was on entry before the method returns.
/// </summary>
/// <param name="source">The input to probe.</param>
/// <param name="objectKey">Supplies the object number and generation of the expected header.</param>
/// <param name="offset">The position at which the header is expected.</param>
/// <returns><c>true</c> if the expected header bytes were found at the offset, otherwise <c>false</c>.</returns>
private bool checkObjectKeys(IRandomAccessRead source, CosObjectKey objectKey, long offset)
{
    // Nothing can be located before the minimum search offset.
    if (offset < MINIMUM_SEARCH_OFFSET)
    {
        return false;
    }

    long number = objectKey.Number;
    long generation = objectKey.Generation;
    long startPosition = source.GetPosition();
    string expectedHeader = ObjectHelper.createObjectString(number, generation);

    bool headerFound = false;

    try
    {
        source.Seek(offset);

        if (ReadHelper.IsString(source, OtherEncodings.StringAsLatin1Bytes(expectedHeader)))
        {
            // The header matched; go back to where we started before reporting success.
            source.Seek(startPosition);
            headerFound = true;
        }
    }
    catch (InvalidOperationException)
    {
        // Deliberately ignored: a failure here simply means there is no valid object at the offset.
    }
    finally
    {
        source.Seek(startPosition);
    }

    return headerFound;
}
/// <summary>
/// Checks whether a stream of <paramref name="streamLength"/> bytes starting at the current position
/// of <paramref name="source"/> is followed by the <c>endstream</c> keyword.
/// </summary>
/// <param name="source">The input, positioned at the start of the stream data.</param>
/// <param name="streamLength">The declared length of the stream.</param>
/// <param name="fileLength">The upper bound the expected end of the stream is compared against.</param>
/// <returns>
/// <c>false</c> if the expected end lies beyond <paramref name="fileLength"/> or if, after skipping
/// spaces at the expected end, <c>endstream</c> is not found; otherwise <c>true</c>.
/// </returns>
private bool validateStreamLength(IRandomAccessRead source, long streamLength, long fileLength)
{
    long streamStart = source.GetPosition();
    long streamEnd = streamStart + streamLength;

    // The declared length runs past the end of the file: the position was never moved, so just report it.
    if (streamEnd > fileLength)
    {
        return false;
    }

    source.Seek(streamEnd);
    ReadHelper.SkipSpaces(source);
    bool endMarkerPresent = ReadHelper.IsString(source, "endstream");

    // Put the reader back at the start of the stream data for the caller.
    source.Seek(streamStart);

    return endMarkerPresent;
}
/// <summary>
/// Forwards the seek to the wrapped reader, unless <c>Throw</c> is set.
/// </summary>
/// <param name="position">The position passed on to the wrapped reader.</param>
/// <exception cref="InvalidOperationException">Thrown when <c>Throw</c> is set.</exception>
public void Seek(long position)
{
    if (!Throw)
    {
        reader.Seek(position);
        return;
    }

    throw new InvalidOperationException();
}
/// <summary>
/// Checks whether <paramref name="startXRefOffset"/> points at an object whose dictionary is of type XRef.
/// </summary>
/// <param name="source">The input to probe.</param>
/// <param name="startXRefOffset">The offset to verify.</param>
/// <param name="isLenient">When <c>false</c> no check is performed and the offset is accepted.</param>
/// <param name="pool">The object pool handed to the dictionary parser.</param>
/// <returns>
/// <c>true</c> if the check is skipped (non-lenient mode or an offset of zero) or an XRef-typed
/// dictionary was parsed at the offset; otherwise <c>false</c>.
/// </returns>
private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
{
    // Repair mode is only available when parsing leniently.
    if (!isLenient || startXRefOffset == 0)
    {
        return true;
    }

    // Look at the byte immediately before the offset: it has to be whitespace...
    source.Seek(startXRefOffset - 1);
    int precedingByte = source.Read();

    if (!ReadHelper.IsWhitespace(precedingByte))
    {
        return false;
    }

    // ...and the first non-space content has to begin with a digit.
    ReadHelper.SkipSpaces(source);

    if (!ReadHelper.IsDigit(source))
    {
        return false;
    }

    try
    {
        // Read the "<number> <generation> obj" header.
        ObjectHelper.ReadObjectNumber(source);
        ObjectHelper.ReadGenerationNumber(source);
        ReadHelper.ReadExpectedString(source, "obj", true);

        // Inspect the dictionary type to avoid false positives.
        PdfDictionary candidate = dictionaryParser.Parse(source, baseParser, pool);
        source.Seek(startXRefOffset);

        return candidate.IsType(CosName.XREF);
    }
    catch (Exception ex)
    {
        log.Error("Couldn't read the xref stream object.", ex);

        // Whatever was at the offset could not be read as an xref stream object.
        source.Seek(startXRefOffset);

        return false;
    }
}
/// <summary>
/// Resolves the length entry of a stream to a number, following an indirect reference if necessary.
/// </summary>
/// <param name="source">The input; its position is put back after an indirect length object has been parsed.</param>
/// <param name="lengthBaseObj">The raw length entry: a number, an object reference, or <c>null</c>.</param>
/// <param name="streamType">Not used by this method.</param>
/// <param name="isLenientParsing">Passed through to <paramref name="parser"/>.</param>
/// <param name="parser">Used to load the referenced length object when it has not been resolved yet.</param>
/// <returns>The length, or <c>null</c> when <paramref name="lengthBaseObj"/> is <c>null</c>.</returns>
/// <exception cref="InvalidOperationException">
/// The length entry, or the object it refers to, is not a number; or the referenced object is unresolved
/// and <paramref name="parser"/> is <c>null</c>.
/// </exception>
private ICosNumber GetLength(IRandomAccessRead source, CosBase lengthBaseObj, CosName streamType, bool isLenientParsing, IPdfObjectParser parser)
{
    if (lengthBaseObj == null)
    {
        return null;
    }

    // The length is stored directly in the stream dictionary.
    if (lengthBaseObj is ICosNumber directLength)
    {
        return directLength;
    }

    if (!(lengthBaseObj is CosObject lengthObj))
    {
        throw new InvalidOperationException($"Wrong type of length object: {lengthBaseObj.GetType().Name}");
    }

    // The length lives in a referenced object.
    var resolved = lengthObj.GetObject();

    if (resolved is ICosNumber resolvedLength)
    {
        return resolvedLength;
    }

    if (resolved != null)
    {
        throw new InvalidOperationException("Wrong type of referenced length object " + lengthObj + ": " + lengthObj.GetObject().GetType().Name);
    }

    // The referenced object has not been loaded yet, so it must be parsed now.
    if (parser == null)
    {
        throw new InvalidOperationException("This method required access to the PDF object parser but it was not created yet. Figure out how to fix this.");
    }

    var positionBeforeParse = source.GetPosition();
    var parsed = parser.Parse(lengthObj.ToIndirectReference(), source, isLenientParsing);
    source.Seek(positionBeforeParse);

    if (parsed is ICosNumber parsedLength)
    {
        return parsedLength;
    }

    throw new InvalidOperationException("Length object content was not read.");
}
/// <summary>
/// Parses the indirect object located at <paramref name="offset"/> and verifies that its header matches
/// <paramref name="key"/>.
/// </summary>
/// <param name="offset">Position of the object header.</param>
/// <param name="reader">The input to read from.</param>
/// <param name="key">The object number and generation the header is expected to carry.</param>
/// <param name="pool">The object pool handed to the base parser.</param>
/// <param name="isLenientParsing">
/// When <c>true</c>, a missing <c>endobj</c> is only logged as a warning instead of raising an exception.
/// </param>
/// <returns>The parsed object, or the result of reading the stream when the object is followed by <c>stream</c>.</returns>
/// <exception cref="InvalidOperationException">
/// The header does not match <paramref name="key"/>, or the object does not end with <c>endobj</c>
/// and parsing is not lenient.
/// </exception>
private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader, CosObjectKey key, CosObjectPool pool, bool isLenientParsing)
{
    reader.Seek(offset);

    var readNumber = ObjectHelper.ReadObjectNumber(reader);
    var readGeneration = ObjectHelper.ReadGenerationNumber(reader);
    ReadHelper.ReadExpectedString(reader, "obj", true);

    var headerMatchesKey = readNumber == key.Number && readGeneration == key.Generation;

    if (!headerMatchesKey)
    {
        throw new InvalidOperationException($"Xref for {key} points to object {readNumber} {readGeneration} at {offset}");
    }

    ReadHelper.SkipSpaces(reader);

    var parsedObject = baseParser.Parse(reader, pool);

    // The token after the object body is either "endobj" or the start of a stream.
    var terminator = ReadHelper.ReadString(reader);

    if (string.Equals(terminator, "stream"))
    {
        // Step back over the keyword so the stream reader sees it again.
        var keywordBytes = OtherEncodings.StringAsLatin1Bytes(terminator);
        reader.Rewind(keywordBytes.Length);

        parsedObject = ReadNormalObjectStream(reader, parsedObject, offset, isLenientParsing, out terminator);
    }

    if (string.Equals(terminator, "endobj"))
    {
        return parsedObject;
    }

    var message = $"Object ({readNumber}:{readGeneration}) at offset {offset} does not end with \'endobj\' but with \'{terminator}\'";

    if (!isLenientParsing)
    {
        throw new InvalidOperationException(message);
    }

    log.Warn(message);

    return parsedObject;
}
/// <summary>
/// Scans <paramref name="source"/> byte by byte for <c>%%EOF</c> markers and stores the position of the
/// last marker that is not followed by a readable object number and generation number in
/// <c>lastEOFMarker</c>. Does nothing if <c>lastEOFMarker</c> is already set. The read position is put
/// back to where it was on entry once the scan has finished.
/// </summary>
/// <param name="source">The input to scan.</param>
private void bfSearchForLastEOFMarker(IRandomAccessRead source)
{
    if (lastEOFMarker != null)
    {
        return;
    }

    long startPosition = source.GetPosition();
    source.Seek(MINIMUM_SEARCH_OFFSET);

    while (!source.IsEof())
    {
        if (ReadHelper.IsString(source, "%%EOF"))
        {
            long markerPosition = source.GetPosition();

            // Step over the five bytes of the marker itself.
            source.Seek(markerPosition + 5);

            try
            {
                // Readable PDF content after the marker most likely means the file is linearized,
                // was updated, or was cut off somewhere in the middle, so this marker is not the last.
                ReadHelper.SkipSpaces(source);
                ObjectHelper.ReadObjectNumber(source);
                ObjectHelper.ReadGenerationNumber(source);
            }
            catch (InvalidOperationException)
            {
                // What follows is most likely garbage: remember this marker.
                lastEOFMarker = markerPosition;
            }
        }

        source.Read();
    }

    source.Seek(startPosition);

    // No marker was found at all.
    if (lastEOFMarker == null)
    {
        lastEOFMarker = long.MaxValue;
    }
}
/// <summary>
/// Tests whether the bytes at the current position of <paramref name="reader"/> equal
/// <paramref name="str"/>. The reader is moved back to the position it had on entry before returning.
/// </summary>
/// <param name="reader">The input to compare against.</param>
/// <param name="str">The expected byte sequence.</param>
/// <returns><c>true</c> if every expected byte was read in order; <c>false</c> on the first mismatch.</returns>
public static bool IsString(IRandomAccessRead reader, IEnumerable<byte> str)
{
    long startPosition = reader.GetPosition();
    bool allBytesMatch = true;

    using (IEnumerator<byte> expected = str.GetEnumerator())
    {
        // Stop reading as soon as one byte differs.
        while (allBytesMatch && expected.MoveNext())
        {
            allBytesMatch = reader.Read() == expected.Current;
        }
    }

    reader.Seek(startPosition);

    return allBytesMatch;
}
/// <summary>
/// Attempts to read a <c>trailer</c> keyword followed by a dictionary at the current position of
/// <paramref name="source"/>.
/// </summary>
/// <param name="source">The input, positioned where the trailer is expected.</param>
/// <param name="isLenientParsing">
/// When <c>true</c>, lines that begin with a digit are skipped before the keyword is looked for.
/// </param>
/// <param name="pool">The object pool handed to the dictionary parser.</param>
/// <param name="trailer">Receives the parsed dictionary on success; <c>null</c> otherwise.</param>
/// <returns>
/// <c>true</c> if the keyword was found and the dictionary parser was run; <c>false</c> if the content
/// at the position does not start with <c>trailer</c>.
/// </returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
    const string trailerKeyword = "trailer";

    trailer = null;

    // PDFBOX-1739: skip extra xref entries in RegisSTAR documents.
    if (isLenientParsing)
    {
        int nextCharacter = source.Peek();

        while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
        {
            ReadHelper.ReadLine(source);
            nextCharacter = source.Peek();
        }
    }

    if (source.Peek() != 't')
    {
        return false;
    }

    // Read the line holding the "trailer" keyword, remembering where it began.
    long keywordOffset = source.GetPosition();
    string nextLine = ReadHelper.ReadLine(source);

    if (!string.Equals(nextLine.Trim(), trailerKeyword, StringComparison.Ordinal))
    {
        // In some files the end-of-line is missing and the keyword is immediately followed by the
        // dictionary or a blank. This does not comply with the PDF reference but is tolerated.
        // The comparison is ordinal: the keyword is a fixed byte sequence, and a culture-sensitive
        // match could accept ignorable characters and make the seek below land at the wrong byte.
        if (!nextLine.StartsWith(trailerKeyword, StringComparison.Ordinal))
        {
            return false;
        }

        // The line cannot simply be "unread" because the end-of-line may be one or two bytes long,
        // so jump to the position right after the keyword instead.
        source.Seek(keywordOffset + trailerKeyword.Length);
    }

    // The keyword may be followed by spaces before the dictionary starts.
    ReadHelper.SkipSpaces(source);

    trailer = dictionaryParser.Parse(source, baseParser, pool);

    ReadHelper.SkipSpaces(source);

    return true;
}
/// <summary>
/// Scans the input for <c>&lt;number&gt; &lt;generation&gt; obj</c> headers and returns the offset of
/// each object found, keyed by object number and generation. The result is cached in
/// <c>objectLocations</c> and returned directly on later calls. The reader position is put back to
/// where it was on entry once the scan has finished.
/// </summary>
/// <returns>The offsets of the objects that were found.</returns>
public IReadOnlyDictionary<CosObjectKey, long> GetObjectLocations()
{
    if (objectLocations != null)
    {
        return objectLocations;
    }

    var scanLimit = GetLastEndOfFileMarker();
    var found = new Dictionary<CosObjectKey, long>();
    var startPosition = reader.GetPosition();

    long scanOffset = MinimumSearchOffset;

    // The most recently seen object; it is only recorded once the next object (or the end) is reached.
    long pendingObjectId = long.MinValue;
    int pendingGeneration = int.MinValue;
    long pendingOffset = long.MinValue;

    byte[] objMarker = OtherEncodings.StringAsLatin1Bytes(" obj");
    byte[] endObjMarker = OtherEncodings.StringAsLatin1Bytes("endobj");
    int objMarkerSkip = objMarker.Length - 1;
    int endObjMarkerSkip = endObjMarker.Length - 1;

    bool sawEndObj = false;

    do
    {
        reader.Seek(scanOffset);

        if (ReadHelper.IsString(reader, objMarker))
        {
            // The byte just before " obj" is taken as the (single digit) generation number.
            long probe = scanOffset - 1;
            reader.Seek(probe);
            int generation = reader.Peek();

            if (ReadHelper.IsDigit(generation))
            {
                generation -= '0';
                probe--;
                reader.Seek(probe);

                if (ReadHelper.IsSpace(reader))
                {
                    // Walk backwards over the spaces separating the number from the generation...
                    while (probe > MinimumSearchOffset && ReadHelper.IsSpace(reader))
                    {
                        reader.Seek(--probe);
                    }

                    // ...and then backwards over the digits of the object number.
                    bool sawObjectIdDigits = false;

                    while (probe > MinimumSearchOffset && ReadHelper.IsDigit(reader))
                    {
                        reader.Seek(--probe);
                        sawObjectIdDigits = true;
                    }

                    if (sawObjectIdDigits)
                    {
                        // Step over the byte preceding the number, then read the number forwards.
                        reader.Read();
                        long objectId = ObjectHelper.ReadObjectNumber(reader);

                        if (pendingOffset > 0)
                        {
                            // The former object is recorded only now that a subsequent one was found.
                            found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
                        }

                        pendingObjectId = objectId;
                        pendingGeneration = generation;
                        pendingOffset = probe + 1;
                        scanOffset += objMarkerSkip;
                        sawEndObj = false;
                    }
                }
            }
        }
        else if (ReadHelper.IsString(reader, "endobj"))
        {
            sawEndObj = true;
            scanOffset += endObjMarkerSkip;
        }

        scanOffset++;
    }
    while (scanOffset < scanLimit && !reader.IsEof());

    // If the file was not cut off in the middle, or the last object is closed by "endobj", the last
    // object has to be recorded here because no subsequent object will trigger it.
    if ((scanLimit < long.MaxValue || sawEndObj) && pendingOffset > 0)
    {
        found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
    }

    reader.Seek(startPosition);

    objectLocations = found;

    return objectLocations;
}
/// <summary>
/// Verifies that <paramref name="startXRefOffset"/> points at a cross reference table or stream and,
/// if it does not, asks <c>CalculateXRefFixedOffset</c> for a replacement.
/// </summary>
/// <param name="startXRefOffset">The offset to verify.</param>
/// <param name="isLenientParsing">When <c>false</c> the offset is returned unchecked.</param>
/// <returns>
/// The given offset if it is accepted, the value returned by <c>CalculateXRefFixedOffset</c> if it is
/// not, or <c>-1</c> when the offset is not positive and no <c>xref</c> keyword was found.
/// </returns>
public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
{
    // Repair mode is only available when parsing leniently.
    if (!isLenientParsing)
    {
        return startXRefOffset;
    }

    source.Seek(startXRefOffset);
    ReadHelper.SkipSpaces(source);

    bool pointsAtTable = source.Peek() == 'x' && ReadHelper.IsString(source, "xref");

    if (pointsAtTable)
    {
        return startXRefOffset;
    }

    if (startXRefOffset <= 0)
    {
        // No valid offset can be determined.
        return -1;
    }

    if (CheckXRefStreamOffset(source, startXRefOffset, true, pool))
    {
        return startXRefOffset;
    }

    return CalculateXRefFixedOffset(startXRefOffset);
}
/// <summary>
/// Scans <paramref name="source"/> for <c>&lt;number&gt; &lt;generation&gt; obj</c> headers and stores
/// the offset of each object found in <c>bfSearchCOSObjectKeyOffsets</c>, keyed by object number and
/// generation. The scan is bounded by <c>lastEOFMarker</c>, which is determined first. The read position
/// is put back to where it was on entry once the scan has finished.
/// </summary>
/// <param name="source">The input to scan.</param>
private void bfSearchForObjects(IRandomAccessRead source)
{
    // The marker includes the leading space: the scan position must sit on that space so that the
    // byte right before it is the generation digit, and so that the skip below is the right length.
    const string objMarker = " obj";
    const string endObjMarker = "endobj";

    bfSearchForLastEOFMarker(source);
    bfSearchCOSObjectKeyOffsets = new Dictionary<CosObjectKey, long>();

    long originOffset = source.GetPosition();
    long currentOffset = MINIMUM_SEARCH_OFFSET;

    // The most recently seen object; it is only recorded once the next object (or the end) is reached.
    long lastObjectId = long.MinValue;
    int lastGenID = int.MinValue;
    long lastObjOffset = long.MinValue;

    bool endobjFound = false;

    do
    {
        source.Seek(currentOffset);

        if (ReadHelper.IsString(source, objMarker))
        {
            // The byte just before " obj" is taken as the (single digit) generation number.
            long tempOffset = currentOffset - 1;
            source.Seek(tempOffset);
            int genID = source.Peek();

            if (ReadHelper.IsDigit(genID))
            {
                genID -= '0';
                tempOffset--;
                source.Seek(tempOffset);

                if (ReadHelper.IsSpace(source))
                {
                    // Walk backwards over the spaces separating the number from the generation...
                    while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsSpace(source))
                    {
                        source.Seek(--tempOffset);
                    }

                    // ...and then backwards over the digits of the object number.
                    bool objectIDFound = false;

                    while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsDigit(source))
                    {
                        source.Seek(--tempOffset);
                        objectIDFound = true;
                    }

                    if (objectIDFound)
                    {
                        // Step over the byte preceding the number, then read the number forwards.
                        source.Read();
                        long objectId = ObjectHelper.ReadObjectNumber(source);

                        if (lastObjOffset > 0)
                        {
                            // The former object is recorded only now that a subsequent one was found.
                            bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
                        }

                        lastObjectId = objectId;
                        lastGenID = genID;
                        lastObjOffset = tempOffset + 1;
                        currentOffset += objMarker.Length - 1;
                        endobjFound = false;
                    }
                }
            }
        }
        else if (ReadHelper.IsString(source, endObjMarker))
        {
            endobjFound = true;
            currentOffset += endObjMarker.Length - 1;
        }

        currentOffset++;
    }
    while (currentOffset < lastEOFMarker && !source.IsEof());

    // If the file was not cut off in the middle, or the last object is closed by "endobj", the last
    // object has to be recorded here because no subsequent object will trigger it.
    if ((lastEOFMarker < long.MaxValue || endobjFound) && lastObjOffset > 0)
    {
        bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
    }

    // Re-establish the origin position.
    source.Seek(originOffset);
}