/// <summary>
/// Parses the stream belonging to a dictionary and then reads the line that follows it,
/// which is reported back through <paramref name="endObjectKey"/>.
/// </summary>
/// <param name="reader">The input handed to the stream parser and then read for the trailing keyword line.</param>
/// <param name="currentBase">The object read immediately before the stream; it must be a <see cref="PdfDictionary"/>.</param>
/// <param name="offset">The offset of the object, used only in the error message.</param>
/// <param name="isLenientParsing">Forwarded unchanged to the stream parser.</param>
/// <param name="endObjectKey">
/// The line read after the stream. If that line starts with an extra 'endstream', the keyword is stripped;
/// if nothing remains, the next line is read instead.
/// </param>
/// <returns>The stream produced by the stream parser.</returns>
/// <exception cref="InvalidOperationException">Thrown when <paramref name="currentBase"/> is not a dictionary.</exception>
private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset, bool isLenientParsing, out string endObjectKey)
{
    if (!(currentBase is PdfDictionary dictionary))
    {
        // This is not legal: the combination of a dictionary and the stream/endstream
        // forms a complete stream object.
        throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
    }

    PdfRawStream stream = streamParser.Parse(reader, dictionary, isLenientParsing, null);

    ReadHelper.SkipSpaces(reader);
    endObjectKey = ReadHelper.ReadLine(reader);

    // Some files carry a second 'endstream' before 'endobj'.
    // PDF keywords are plain bytes, so compare ordinally rather than with the current culture.
    const string endStreamKeyword = "endstream";
    if (endObjectKey.StartsWith(endStreamKeyword, StringComparison.Ordinal))
    {
        endObjectKey = endObjectKey.Substring(endStreamKeyword.Length).Trim();

        if (endObjectKey.Length == 0)
        {
            // No other characters on the extra 'endstream' line: the keyword is on the next line.
            endObjectKey = ReadHelper.ReadLine(reader);
        }
    }

    return stream;
}
/// <summary>
/// Checks whether the given offset points at a cross-reference table or stream and, if not,
/// asks <c>CalculateXRefFixedOffset</c> for a replacement.
/// </summary>
/// <param name="startXRefOffset">The offset to verify.</param>
/// <param name="isLenientParsing">When <c>false</c> no verification is done and the offset is returned as given.</param>
/// <returns>
/// The original offset when it is accepted, the value from <c>CalculateXRefFixedOffset</c> when it is
/// rejected but greater than zero, or <c>-1</c> otherwise.
/// </returns>
public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
{
    // Repair mode isn't available in non-lenient mode.
    if (!isLenientParsing)
    {
        return startXRefOffset;
    }

    source.Seek(startXRefOffset);
    ReadHelper.SkipSpaces(source);

    bool pointsAtTable = source.Peek() == 'x' && ReadHelper.IsString(source, "xref");
    if (pointsAtTable)
    {
        return startXRefOffset;
    }

    if (startXRefOffset <= 0)
    {
        // Can't find a valid offset.
        return -1;
    }

    // Not a table: accept the offset if it points at an xref stream, otherwise try to repair it.
    return CheckXRefStreamOffset(source, startXRefOffset, true, pool)
        ? startXRefOffset
        : CalculateXRefFixedOffset(startXRefOffset);
}
/// <summary>
/// Scans the input from <c>MINIMUM_SEARCH_OFFSET</c> for "%%EOF" markers and stores the chosen
/// marker offset in <c>lastEndOfFileMarker</c>. Does nothing if that field is already set.
/// </summary>
/// <remarks>
/// A marker is stored when it sits at or beyond the end of the input (which also stops the scan),
/// or when the data after it cannot be read as an object number followed by a generation number.
/// If no marker is stored the field is set to <see cref="long.MaxValue"/>.
/// The input is returned to the offset it had on entry.
/// </remarks>
/// <param name="source">The bytes to scan.</param>
private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
{
    if (lastEndOfFileMarker != null)
    {
        return;
    }

    long startOffset = source.CurrentOffset;
    source.Seek(MINIMUM_SEARCH_OFFSET);

    // 'continue' still advances the input through the iterator clause; 'break' does not.
    for (; !source.IsAtEnd(); source.MoveNext())
    {
        if (!ReadHelper.IsString(source, "%%EOF"))
        {
            continue;
        }

        long candidate = source.CurrentOffset;

        if (candidate >= source.Length)
        {
            lastEndOfFileMarker = candidate;
            break;
        }

        try
        {
            source.Seek(candidate + 5);

            // If valid PDF content follows, the file is most likely linearized,
            // updated or cut off somewhere in the middle, so this marker is not the last one.
            ReadHelper.SkipSpaces(source);
            ObjectHelper.ReadObjectNumber(source);
            ObjectHelper.ReadGenerationNumber(source);
        }
        catch (Exception)
        {
            // The following data is most likely garbage, so keep this marker.
            lastEndOfFileMarker = candidate;
        }
    }

    source.Seek(startOffset);

    // No EOF marker found.
    if (lastEndOfFileMarker == null)
    {
        lastEndOfFileMarker = long.MaxValue;
    }
}
/// <summary>
/// Seeks to <paramref name="offset"/>, reads the "N G obj" header, verifies it against
/// <paramref name="key"/> and parses the object that follows, including its stream if one is present.
/// </summary>
/// <param name="offset">The offset at which the object is expected to start.</param>
/// <param name="reader">The input to read from.</param>
/// <param name="key">The object number and generation the header must match.</param>
/// <param name="pool">Forwarded to the base parser.</param>
/// <param name="isLenientParsing">
/// When <c>true</c> a missing 'endobj' is only logged as a warning; when <c>false</c> it throws.
/// </param>
/// <returns>The parsed object, or the parsed stream when the object is followed by 'stream'.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the header does not match <paramref name="key"/>, or when the object does not end
/// with 'endobj' and parsing is not lenient.
/// </exception>
private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader, CosObjectKey key, CosObjectPool pool, bool isLenientParsing)
{
    reader.Seek(offset);

    var objectNumber = ObjectHelper.ReadObjectNumber(reader);
    var objectGeneration = ObjectHelper.ReadGenerationNumber(reader);
    ReadHelper.ReadExpectedString(reader, "obj", true);

    if (objectNumber != key.Number || objectGeneration != key.Generation)
    {
        throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}");
    }

    ReadHelper.SkipSpaces(reader);

    var parsed = baseParser.Parse(reader, pool);
    var trailingToken = ReadHelper.ReadString(reader);

    if (string.Equals(trailingToken, "stream"))
    {
        // Step back over the keyword that was just read before handing over to the stream reader.
        reader.Rewind(OtherEncodings.StringAsLatin1Bytes(trailingToken).Length);
        parsed = ReadNormalObjectStream(reader, parsed, offset, isLenientParsing, out trailingToken);
    }

    if (string.Equals(trailingToken, "endobj"))
    {
        return parsed;
    }

    var message = $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{trailingToken}\'";

    if (!isLenientParsing)
    {
        throw new InvalidOperationException(message);
    }

    log.Warn(message);

    return parsed;
}
/// <summary>
/// Checks whether <paramref name="startXRefOffset"/> points at an indirect object whose dictionary
/// is of type XRef, i.e. a cross-reference stream.
/// </summary>
/// <param name="source">The input to inspect.</param>
/// <param name="startXRefOffset">The offset to verify.</param>
/// <param name="isLenient">When <c>false</c> the offset is accepted without inspection.</param>
/// <param name="pool">Forwarded to the dictionary parser.</param>
/// <returns>
/// <c>true</c> when the check is skipped (non-lenient mode or offset 0) or when an XRef-typed
/// dictionary is found at the offset; otherwise <c>false</c>.
/// </returns>
private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
{
    // Repair mode isn't available in non-lenient mode.
    if (!isLenient || startXRefOffset == 0)
    {
        return true;
    }

    // Look at the byte just before the offset: it has to be whitespace...
    source.Seek(startXRefOffset - 1);
    int precedingByte = source.Read();
    if (!ReadHelper.IsWhitespace(precedingByte))
    {
        return false;
    }

    // ...followed by a digit.
    ReadHelper.SkipSpaces(source);
    if (!ReadHelper.IsDigit(source))
    {
        return false;
    }

    try
    {
        // It looks like an XRef stream: read the "N G obj" header.
        ObjectHelper.ReadObjectNumber(source);
        ObjectHelper.ReadGenerationNumber(source);
        ReadHelper.ReadExpectedString(source, "obj", true);

        // Check the dictionary type to avoid false positives.
        PdfDictionary candidate = dictionaryParser.Parse(source, baseParser, pool);
        source.Seek(startXRefOffset);

        if (candidate.IsType(CosName.XREF))
        {
            return true;
        }
    }
    catch (Exception ex)
    {
        log.Error("Couldn't read the xref stream object.", ex);

        // There wasn't an XRef stream object here; go back to the offset.
        source.Seek(startXRefOffset);
    }

    return false;
}
/// <summary>
/// Scans the input from <c>MINIMUM_SEARCH_OFFSET</c> for "%%EOF" markers and stores in
/// <c>lastEOFMarker</c> the position of the last marker whose following data could not be read
/// as an object number followed by a generation number. Does nothing if the field is already set.
/// </summary>
/// <remarks>
/// If no marker is stored the field is set to <see cref="long.MaxValue"/>.
/// The input is returned to the position it had on entry.
/// </remarks>
/// <param name="source">The input to scan.</param>
private void bfSearchForLastEOFMarker(IRandomAccessRead source)
{
    if (lastEOFMarker != null)
    {
        return;
    }

    long originOffset = source.GetPosition();
    source.Seek(MINIMUM_SEARCH_OFFSET);

    while (!source.IsEof())
    {
        // Search for an EOF marker.
        if (ReadHelper.IsString(source, "%%EOF"))
        {
            long tempMarker = source.GetPosition();
            source.Seek(tempMarker + 5);

            try
            {
                // If valid PDF content follows, the file is most likely linearized,
                // updated or cut off somewhere in the middle, so this marker is not the last one.
                ReadHelper.SkipSpaces(source);
                ObjectHelper.ReadObjectNumber(source);
                ObjectHelper.ReadGenerationNumber(source);
            }
            catch (InvalidOperationException)
            {
                // The following data is most likely garbage, so keep this marker.
                lastEOFMarker = tempMarker;
            }
        }

        source.Read();
    }

    source.Seek(originOffset);

    // No EOF marker found.
    if (lastEOFMarker == null)
    {
        lastEOFMarker = long.MaxValue;
    }
}
/// <summary>
/// Attempts to read a cross-reference table ("xref" keyword, subsections and trailer) at the
/// current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">The input, positioned where the table is expected to start.</param>
/// <param name="offset">The offset recorded on the builder and used in log and error messages.</param>
/// <param name="isLenientParsing">
/// When <c>true</c> an unreadable subsection definition ends the table instead of failing the parse.
/// </param>
/// <param name="pool">Forwarded to the trailer parser.</param>
/// <param name="builder">The populated builder on success; <c>null</c> whenever <c>false</c> is returned.</param>
/// <returns><c>true</c> when a table and its trailer were read; otherwise <c>false</c>.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when an entry is corrupt, an in-use entry points inside the table itself,
/// or the trailer cannot be read.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
    builder = null;

    var tableStartOffset = source.GetPosition();

    if (source.Peek() != 'x')
    {
        return false;
    }

    var xref = ReadHelper.ReadString(source);
    if (!xref.Trim().Equals("xref"))
    {
        return false;
    }

    // Check for a trailer directly after xref; the token is put back afterwards.
    var str = ReadHelper.ReadString(source);
    byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
    source.Rewind(b.Length);

    if (str.StartsWith("trailer", StringComparison.Ordinal))
    {
        log.Warn("skipping empty xref table");
        return false;
    }

    builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    // Tables can have multiple sections. Each starts with a starting object id and a count.
    while (true)
    {
        if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
        {
            log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

            if (isLenientParsing)
            {
                break;
            }

            // Do not hand a partially filled builder back to the caller on failure.
            builder = null;
            return false;
        }

        var currentObjectId = subsectionDefinition.FirstNumber;

        ReadHelper.SkipSpaces(source);

        for (var i = 0; i < subsectionDefinition.Count; i++)
        {
            if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
            {
                break;
            }

            // A 't' here is taken to be the start of the trailer.
            if (source.Peek() == 't')
            {
                break;
            }

            var currentLine = ReadHelper.ReadLine(source);
            var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (splitString.Length < 3)
            {
                log.Warn("invalid xref line: " + currentLine);
                break;
            }

            // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
            if (splitString[splitString.Length - 1].Equals(InUseEntry))
            {
                try
                {
                    // The table is machine-readable data, so parse with the invariant culture.
                    var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                    if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                    {
                        // PDFBOX-3923: offset points inside this table - that can't be good
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                    }

                    var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);
                    builder.Add(currentObjectId, generation, objectOffset);
                }
                catch (Exception e) when (e is FormatException || e is OverflowException)
                {
                    // Non-numeric and out-of-range values are both reported as a corrupt entry.
                    throw new InvalidOperationException(
                        $"Invalid number in cross-reference table entry for object {currentObjectId}: '{currentLine}'.", e);
                }
            }
            else if (!splitString[2].Equals(FreeEntry))
            {
                throw new InvalidOperationException(
                    $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
            }

            currentObjectId++;

            ReadHelper.SkipSpaces(source);
        }

        ReadHelper.SkipSpaces(source);

        // Another subsection starts with a digit; anything else ends the table.
        if (!ReadHelper.IsDigit(source))
        {
            break;
        }
    }

    if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
    {
        throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
    }

    builder.Dictionary = trailer;
    builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

    return true;
}
/// <summary>
/// Attempts to read the "trailer" keyword and the trailer dictionary at the current position
/// of <paramref name="source"/>.
/// </summary>
/// <param name="source">The input, positioned where the trailer is expected.</param>
/// <param name="isLenientParsing">When <c>true</c>, leading lines that start with a digit are skipped first.</param>
/// <param name="pool">Forwarded to the dictionary parser.</param>
/// <param name="trailer">The parsed trailer dictionary on success; <c>null</c> otherwise.</param>
/// <returns><c>true</c> when the keyword was found and the dictionary was parsed; otherwise <c>false</c>.</returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
    const string trailerKeyword = "trailer";

    trailer = null;

    // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
    if (isLenientParsing)
    {
        int nextCharacter = source.Peek();
        while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
        {
            ReadHelper.ReadLine(source);
            nextCharacter = source.Peek();
        }
    }

    if (source.Peek() != 't')
    {
        return false;
    }

    // Read "trailer".
    long currentOffset = source.GetPosition();
    string nextLine = ReadHelper.ReadLine(source);

    if (!nextLine.Trim().Equals(trailerKeyword))
    {
        // In some cases the EOL is missing and the trailer immediately continues with "<<"
        // or with a blank character. This does not comply with the PDF reference, but
        // Acrobat Reader copes with it and we want to support as many PDFs as possible.
        if (!nextLine.StartsWith(trailerKeyword, StringComparison.Ordinal))
        {
            return false;
        }

        // We can't just unread part of the line because we don't know whether the EOL
        // was 1 or 2 bytes, so jump back to right after "trailer".
        source.Seek(currentOffset + trailerKeyword.Length);
    }

    // The trailer may also continue with " <<" on the same line, so skip any spaces first.
    ReadHelper.SkipSpaces(source);

    trailer = dictionaryParser.Parse(source, baseParser, pool);

    ReadHelper.SkipSpaces(source);

    return true;
}