/// <summary>
/// Checks whether <paramref name="startXRefOffset"/> is the start of an indirect object whose
/// dictionary is typed as a cross-reference stream.
/// </summary>
/// <param name="source">The reader to inspect. Its position is changed by this method.</param>
/// <param name="startXRefOffset">The candidate offset of the cross-reference stream object.</param>
/// <param name="isLenient">When false no validation is attempted and the offset is accepted.</param>
/// <param name="pool">Passed through to the dictionary parser.</param>
/// <returns>
/// True when validation is skipped (non-lenient mode or an offset of zero) or when the parsed
/// dictionary reports the XREF type; false otherwise.
/// </returns>
private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
{
    // Repair mode isn't available in non-lenient mode, and an offset of zero is never validated.
    if (!isLenient || startXRefOffset == 0)
    {
        return true;
    }

    // Look at the byte immediately before the candidate offset.
    source.Seek(startXRefOffset - 1);
    int precedingByte = source.Read();

    // That byte has to be whitespace...
    if (!ReadHelper.IsWhitespace(precedingByte))
    {
        return false;
    }

    // ...and, once any further spaces are skipped, a digit has to follow.
    ReadHelper.SkipSpaces(source);

    if (!ReadHelper.IsDigit(source))
    {
        return false;
    }

    try
    {
        // Consume the object header: object number, generation number and the "obj" keyword.
        ObjectHelper.ReadObjectNumber(source);
        ObjectHelper.ReadGenerationNumber(source);
        ReadHelper.ReadExpectedString(source, "obj", true);

        // Inspect the dictionary itself so that an arbitrary object is not mistaken for an xref stream.
        PdfDictionary candidate = dictionaryParser.Parse(source, baseParser, pool);

        // The reader is put back on the candidate offset before the type is checked.
        source.Seek(startXRefOffset);

        if (candidate.IsType(CosName.XREF))
        {
            return true;
        }
    }
    catch (Exception ex)
    {
        // Whatever is stored here could not be read as an xref stream object.
        log.Error("Couldn't read the xref stream object.", ex);
        source.Seek(startXRefOffset);
    }

    return false;
}
/// <summary>
/// Brute-force scan of <paramref name="bytes"/> for "xref" tokens. For each token the preceding
/// bytes are searched for an object header and, when one is found, an offset at the start of that
/// header is stored in <c>bfSearchXRefStreamsOffsets</c>.
/// </summary>
/// <param name="bytes">The input to scan. Its offset is restored to the entry value before returning.</param>
private void BfSearchForXRefStreams(IInputBytes bytes)
{
    // The scan only ever runs once; later calls reuse the cached list.
    if (bfSearchXRefStreamsOffsets != null)
    {
        return;
    }

    // A pdf may contain more than one /XRef entry.
    bfSearchXRefStreamsOffsets = new List<long>();

    var resumeOffset = bytes.CurrentOffset;
    bytes.Seek(MinimumSearchOffset);

    const string objectMarker = " obj";

    // Steps backwards from the " obj" marker over a single digit, a space and a run of digits.
    // Returns the offset reported after stepping forward once from the end of that run, or -1
    // when the bytes before the marker do not have that shape.
    long LocateObjectStart(long markerOffset)
    {
        var cursor = markerOffset - 1;
        bytes.Seek(cursor);

        // The marker has to be preceded by a digit (the generation number).
        var generationByte = bytes.Peek();
        if (!generationByte.HasValue || !ReadHelper.IsDigit(generationByte.Value))
        {
            return -1;
        }

        // The digit has to be preceded by a space.
        bytes.Seek(--cursor);
        if (!ReadHelper.IsSpace(bytes.CurrentByte))
        {
            return -1;
        }

        // Walk backwards over the digits in front of that space, never going below the minimum search offset.
        bytes.Seek(--cursor);
        var sawDigit = false;
        while (cursor > MinimumSearchOffset && ReadHelper.IsDigit(bytes.CurrentByte))
        {
            bytes.Seek(--cursor);
            sawDigit = true;
        }

        if (!sawDigit)
        {
            return -1;
        }

        bytes.MoveNext();
        return bytes.CurrentOffset;
    }

    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, "xref"))
        {
            continue;
        }

        // Search backwards for the beginning of the stream object.
        long xrefOffset = bytes.CurrentOffset;
        long objectStart = -1;
        var markerFound = false;

        // Up to 39 windows of 10 bytes each are inspected, nearest window first.
        for (var window = 1; window < 40 && !markerFound; window++)
        {
            long probe = xrefOffset - (window * 10);
            if (probe <= 0)
            {
                continue;
            }

            bytes.Seek(probe);

            for (var step = 0; step < 10 && !markerFound; step++)
            {
                if (ReadHelper.IsString(bytes, objectMarker))
                {
                    // The first marker found ends the search, whether or not a header precedes it.
                    objectStart = LocateObjectStart(probe);
                    markerFound = true;
                }
                else
                {
                    probe++;
                    bytes.MoveNext();
                }
            }
        }

        if (objectStart > -1)
        {
            bfSearchXRefStreamsOffsets.Add(objectStart);
        }

        // Continue scanning after the token that was just handled.
        bytes.Seek(xrefOffset + 5);
    }

    bytes.Seek(resumeOffset);
}
/// <summary>
/// Attempts to read a cross-reference table (the "xref" keyword, its subsections and the trailer
/// dictionary) starting at the current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">The reader, expected to be positioned on the "xref" keyword.</param>
/// <param name="offset">
/// The offset recorded on the resulting builder and quoted in diagnostics. This method does not seek to it.
/// </param>
/// <param name="isLenientParsing">
/// When true an unreadable subsection definition ends the table instead of failing the parse, and the
/// trailer search tolerates extra entries.
/// </param>
/// <param name="pool">Passed through to the trailer dictionary parser.</param>
/// <param name="builder">
/// The populated builder on success. Null when the input does not start with a non-empty table; note that
/// it is already assigned when a subsection definition fails in non-lenient mode and false is returned.
/// </param>
/// <returns>True when a table and its trailer were read; false when no usable table is present.</returns>
/// <exception cref="InvalidOperationException">
/// An in-use entry points inside the table itself, an entry contains numbers which cannot be parsed,
/// an entry indicator is neither in-use nor free, or the trailer cannot be read.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
    builder = null;

    var tableStartOffset = source.GetPosition();

    if (source.Peek() != 'x')
    {
        return false;
    }

    var xref = ReadHelper.ReadString(source);
    if (!xref.Trim().Equals("xref"))
    {
        return false;
    }

    // Check for a trailer directly after "xref": read the next token, then push it back.
    var str = ReadHelper.ReadString(source);
    byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
    source.Rewind(b.Length);

    // Keyword comparison must be ordinal; the default overload is culture-sensitive.
    if (str.StartsWith("trailer", StringComparison.Ordinal))
    {
        log.Warn("skipping empty xref table");
        return false;
    }

    builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    // Tables can have multiple sections. Each starts with a starting object id and a count.
    while (true)
    {
        if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
        {
            log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

            if (isLenientParsing)
            {
                break;
            }

            return false;
        }

        var currentObjectId = subsectionDefinition.FirstNumber;

        ReadHelper.SkipSpaces(source);

        for (var i = 0; i < subsectionDefinition.Count; i++)
        {
            if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
            {
                break;
            }

            // A 't' here means the subsection is shorter than declared and the trailer has started.
            if (source.Peek() == 't')
            {
                break;
            }

            //Ignore table contents
            var currentLine = ReadHelper.ReadLine(source);
            var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            if (splitString.Length < 3)
            {
                log.Warn("invalid xref line: " + currentLine);
                break;
            }

            // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n):
            // the indicator is taken from the last token rather than from a fixed position.
            var indicator = splitString[splitString.Length - 1];

            if (indicator.Equals(InUseEntry))
            {
                try
                {
                    // The numbers are machine-written, so they are parsed culture-invariantly.
                    var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                    if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                    {
                        // PDFBOX-3923: offset points inside this table - that can't be good
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                    }

                    var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);

                    builder.Add(currentObjectId, generation, objectOffset);
                }
                catch (Exception e) when (e is FormatException || e is OverflowException)
                {
                    // Out-of-range numbers are reported the same way as malformed ones.
                    throw new InvalidOperationException("Bad", e);
                }
            }
            else if (!splitString[2].Equals(FreeEntry) && !indicator.Equals(FreeEntry))
            {
                // A free entry is accepted at the usual third position or, mirroring the in-use
                // handling above, as the last token of a line with extra fields.
                throw new InvalidOperationException(
                    $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
            }

            currentObjectId++;

            ReadHelper.SkipSpaces(source);
        }

        ReadHelper.SkipSpaces(source);

        // Another subsection starts with a digit; anything else ends the table.
        if (!ReadHelper.IsDigit(source))
        {
            break;
        }
    }

    if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
    {
        throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
    }

    builder.Dictionary = trailer;
    builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

    return true;
}
/// <summary>
/// Attempts to read the "trailer" keyword and the dictionary following it from the current
/// position of <paramref name="source"/>.
/// </summary>
/// <param name="source">The reader, expected to be positioned at the end of the cross-reference entries.</param>
/// <param name="isLenientParsing">When true, leading lines which start with a digit are skipped first.</param>
/// <param name="pool">Passed through to the dictionary parser.</param>
/// <param name="trailer">The parsed trailer dictionary on success, otherwise null.</param>
/// <returns>True when a trailer dictionary was parsed; false when the "trailer" keyword was not found.</returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
    trailer = null;

    // parse the last trailer.
    var trailerOffset = source.GetPosition();

    // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
    if (isLenientParsing)
    {
        int nextCharacter = source.Peek();
        while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
        {
            // The position only equals the starting offset before the first line is skipped,
            // so the warning is written once.
            if (source.GetPosition() == trailerOffset)
            {
                log.Warn($"Expected trailer object at position {trailerOffset}, keep trying");
            }

            ReadHelper.ReadLine(source);
            nextCharacter = source.Peek();
        }
    }

    if (source.Peek() != 't')
    {
        return false;
    }

    // read "trailer"
    long currentOffset = source.GetPosition();
    string nextLine = ReadHelper.ReadLine(source);

    if (!nextLine.Trim().Equals("trailer"))
    {
        // In some cases the EOL is missing and the trailer immediately continues with "<<" or with a
        // blank character. Even if this does not comply with the PDF reference we want to support as
        // many PDFs as possible; Acrobat reader can also deal with this.
        // Keyword comparison must be ordinal; the default overload is culture-sensitive.
        if (!nextLine.StartsWith("trailer", StringComparison.Ordinal))
        {
            return false;
        }

        // We can't just unread a portion of the read data as we don't know if the EOL consists of
        // 1 or 2 bytes, so jump back to right after "trailer" instead.
        source.Seek(currentOffset + "trailer".Length);
    }

    // In some cases the EOL is missing and the trailer continues with " <<", so skip any spaces
    // before handing over to the dictionary parser.
    ReadHelper.SkipSpaces(source);

    trailer = dictionaryParser.Parse(source, baseParser, pool);

    ReadHelper.SkipSpaces(source);

    return true;
}
/// <summary>
/// Brute-force scan of <c>source</c> for "xref" tokens. For each token the preceding bytes are
/// searched for an object header and, when one is found, a position at the start of that header is
/// stored in <c>bfSearchXRefStreamsOffsets</c>. The reader position is restored before returning.
/// </summary>
private void BfSearchForXRefStreams()
{
    // The scan only ever runs once; later calls reuse the cached list.
    if (bfSearchXRefStreamsOffsets != null)
    {
        return;
    }

    // A pdf may contain more than one /XRef entry.
    bfSearchXRefStreamsOffsets = new List<long>();

    long savedPosition = source.GetPosition();
    source.Seek(MinimumSearchOffset);

    const string objectMarker = " obj";

    // One byte is consumed at the end of every pass, whether or not a token was found.
    for (; !source.IsEof(); source.Read())
    {
        if (!ReadHelper.IsString(source, "xref"))
        {
            continue;
        }

        // Search backwards for the beginning of the stream object.
        long xrefOffset = source.GetPosition();
        long objectStart = -1;
        bool markerFound = false;

        // Up to 39 windows of 10 bytes each are inspected, nearest window first.
        for (int window = 1; window < 40 && !markerFound; window++)
        {
            long probe = xrefOffset - (window * 10);
            if (probe <= 0)
            {
                continue;
            }

            source.Seek(probe);

            for (int step = 0; step < 10; step++)
            {
                if (!ReadHelper.IsString(source, objectMarker))
                {
                    probe++;
                    source.Read();
                    continue;
                }

                long cursor = probe - 1;
                source.Seek(cursor);

                // The marker has to be preceded by a digit (the generation number)...
                int generationByte = source.Peek();
                if (ReadHelper.IsDigit(generationByte))
                {
                    source.Seek(--cursor);

                    // ...which in turn has to be preceded by a space.
                    if (ReadHelper.IsSpace(source))
                    {
                        source.Seek(--cursor);

                        // Walk backwards over the digits in front of that space, never going
                        // below the minimum search offset.
                        long digitsEnd = cursor;
                        while (cursor > MinimumSearchOffset && ReadHelper.IsDigit(source))
                        {
                            source.Seek(--cursor);
                        }

                        // At least one digit has to have been stepped over.
                        if (cursor < digitsEnd)
                        {
                            source.Read();
                            objectStart = source.GetPosition();
                        }
                    }
                }

                // The first marker found ends the search, whether or not a header precedes it.
                markerFound = true;
                break;
            }
        }

        if (objectStart > -1)
        {
            bfSearchXRefStreamsOffsets.Add(objectStart);
        }

        // Continue scanning after the token that was just handled.
        source.Seek(xrefOffset + 5);
    }

    source.Seek(savedPosition);
}
/// <summary>
/// Brute-force scan of <paramref name="source"/> for indirect object headers of the form
/// "objectNumber generation obj". The offset of every header found is stored in
/// <c>bfSearchCOSObjectKeyOffsets</c>, keyed by object number and generation.
/// </summary>
/// <remarks>
/// Only the single digit directly in front of the marker is used as the generation number.
/// The reader position is restored before returning.
/// </remarks>
/// <param name="source">The reader to scan.</param>
private void bfSearchForObjects(IRandomAccessRead source)
{
    // NOTE(review): presumably this establishes lastEOFMarker, which bounds the scan below - confirm.
    bfSearchForLastEOFMarker(source);

    bfSearchCOSObjectKeyOffsets = new Dictionary<CosObjectKey, long>();

    // The marker includes the leading space: the byte just before the match is then the generation
    // digit, which is what the backward scan below relies on. Matching "obj" without the space made
    // that byte the separating space, so no object was ever recorded.
    const string objMarker = " obj";
    const string endObjMarker = "endobj";

    long originOffset = source.GetPosition();
    long currentOffset = MINIMUM_SEARCH_OFFSET;

    long lastObjectId = long.MinValue;
    int lastGenID = int.MinValue;
    long lastObjOffset = long.MinValue;

    bool endobjFound = false;

    do
    {
        source.Seek(currentOffset);

        if (ReadHelper.IsString(source, objMarker))
        {
            long tempOffset = currentOffset - 1;
            source.Seek(tempOffset);
            int genID = source.Peek();

            // is the char before the marker a digit?
            if (ReadHelper.IsDigit(genID))
            {
                // Convert the ASCII digit (48 is '0') to its numeric value.
                genID -= 48;

                tempOffset--;
                source.Seek(tempOffset);

                if (ReadHelper.IsSpace(source))
                {
                    // Step back over the spaces separating the two numbers...
                    while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsSpace(source))
                    {
                        source.Seek(--tempOffset);
                    }

                    // ...and then over the digits of the object number.
                    bool objectIDFound = false;
                    while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsDigit(source))
                    {
                        source.Seek(--tempOffset);
                        objectIDFound = true;
                    }

                    if (objectIDFound)
                    {
                        // Move forward onto the first digit and read the number from there.
                        source.Read();
                        long objectId = ObjectHelper.ReadObjectNumber(source);

                        if (lastObjOffset > 0)
                        {
                            // add the former object ID only if there was a subsequent object ID
                            bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
                        }

                        lastObjectId = objectId;
                        lastGenID = genID;
                        lastObjOffset = tempOffset + 1;

                        // Skip the rest of the marker; the final byte is covered by the increment below.
                        currentOffset += objMarker.Length - 1;
                        endobjFound = false;
                    }
                }
            }
        }
        else if (ReadHelper.IsString(source, endObjMarker))
        {
            endobjFound = true;
            currentOffset += endObjMarker.Length - 1;
        }

        currentOffset++;
    } while (currentOffset < lastEOFMarker && !source.IsEof());

    if ((lastEOFMarker < long.MaxValue || endobjFound) && lastObjOffset > 0)
    {
        // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
        // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
        bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
    }

    // reestablish origin position
    source.Seek(originOffset);
}