/// <summary>
/// Checks whether <paramref name="startXRefOffset"/> points at a cross-reference section and,
/// in lenient mode, tries to repair it when it does not.
/// </summary>
/// <param name="startXRefOffset">The offset to validate.</param>
/// <param name="isLenientParsing">When false the offset is returned as-is, no repair is attempted.</param>
/// <returns>
/// The unchanged offset when it is accepted, the result of <c>CalculateXRefFixedOffset</c> when a
/// positive offset is rejected, or -1 when no valid offset can be found.
/// </returns>
public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
{
    if (isLenientParsing)
    {
        source.Seek(startXRefOffset);
        ReadHelper.SkipSpaces(source);

        var pointsAtTable = source.Peek() == 'x' && ReadHelper.IsString(source, "xref");
        if (!pointsAtTable)
        {
            if (startXRefOffset <= 0)
            {
                // can't find a valid offset
                return -1;
            }

            // Not a table: accept the offset if it points at a cross-reference stream, otherwise repair it.
            return CheckXRefStreamOffset(source, startXRefOffset, true, pool)
                ? startXRefOffset
                : CalculateXRefFixedOffset(startXRefOffset);
        }
    }

    // Either repair mode is unavailable (non-lenient) or the "xref" keyword was found.
    return startXRefOffset;
}
/// <summary>
/// Reads characters from <paramref name="reader"/> up to the next end-of-line marker or the end of input.
/// The end-of-line marker is consumed but not included in the result.
/// </summary>
/// <param name="reader">The input to read from.</param>
/// <returns>The text of the line without its terminator.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
/// <exception cref="InvalidOperationException">The reader is already at end-of-file.</exception>
public static string ReadLine(IRandomAccessRead reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (reader.IsEof())
    {
        throw new InvalidOperationException("Error: End-of-File, expected line");
    }

    var line = new StringBuilder(11);

    // A single CR or LF terminates the line.
    int current = reader.Read();
    while (current != -1 && !IsEndOfLine(current))
    {
        line.Append((char)current);
        current = reader.Read();
    }

    // CR immediately followed by LF counts as one terminator, so swallow the LF too.
    if (IsCarriageReturn(current))
    {
        if (IsLineFeed(reader.Peek()))
        {
            reader.Read();
        }
    }

    return line.ToString();
}
/// <summary>
/// Parses a dictionary delimited by <c>&lt;&lt;</c> and <c>&gt;&gt;</c> from <paramref name="reader"/>.
/// </summary>
/// <param name="reader">The input positioned at the opening <c>&lt;&lt;</c>.</param>
/// <param name="baseParser">Parser passed on to the name/value pair parsing.</param>
/// <param name="pool">Object pool passed on to the name/value pair parsing.</param>
/// <returns>
/// The parsed dictionary, or a new empty dictionary when unexpected content is met and
/// <c>ReadUntilEnd</c> returns true.
/// </returns>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (baseParser == null)
    {
        throw new ArgumentNullException(nameof(baseParser));
    }

    if (pool == null)
    {
        throw new ArgumentNullException(nameof(pool));
    }

    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.SkipSpaces(reader);

    var dictionary = new PdfDictionary();

    while (true)
    {
        ReadHelper.SkipSpaces(reader);
        var next = (char)reader.Peek();

        if (next == '>')
        {
            // Start of the closing ">>".
            break;
        }

        if (next == '/')
        {
            // A name starts a key/value entry; entries with a missing key or value are dropped.
            var (key, value) = ParseCosDictionaryNameValuePair(reader, baseParser, pool);
            if (key != null && value != null)
            {
                dictionary.Set(key, value);
            }
        }
        else if (ReadUntilEnd(reader))
        {
            // Anything else is invalid here; give up on this dictionary.
            return new PdfDictionary();
        }
    }

    ReadHelper.ReadExpectedString(reader, ">>");

    return dictionary;
}
/// <summary>
/// Reads one dictionary entry: a name followed by its value.
/// </summary>
/// <returns>
/// The key and value, or <c>(null, null)</c> when no value could be parsed.
/// A returned value has its <c>Direct</c> flag set.
/// </returns>
private (CosName key, CosBase value) ParseCosDictionaryNameValuePair(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var entryKey = nameParser.Parse(reader);
    var entryValue = ParseValue(reader, baseParser, pool);

    ReadHelper.SkipSpaces(reader);

    if ((char)reader.Peek() == 'd')
    {
        // A trailing 'def' means we are inside a cmap stream: swallow the keyword.
        // Any other token starting with 'd' is pushed back untouched.
        var token = ReadHelper.ReadString(reader);
        if (token.Equals("def"))
        {
            ReadHelper.SkipSpaces(reader);
        }
        else
        {
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(token));
        }
    }

    if (entryValue == null)
    {
        // The offending token is only read from the reader when a log is attached.
        log?.Warn("Bad Dictionary Declaration " + ReadHelper.ReadString(reader));
        return (null, null);
    }

    // label this item as direct, to avoid signature problems.
    entryValue.Direct = true;

    return (entryKey, entryValue);
}
/// <summary>
/// Forwards to the wrapped reader's <c>Peek</c>, unless <c>Throw</c> is set.
/// </summary>
/// <exception cref="InvalidOperationException"><c>Throw</c> is true.</exception>
public int Peek() => Throw ? throw new InvalidOperationException() : reader.Peek();
/// <summary>
/// Returns the result of the wrapped reader's <c>Peek</c>.
/// </summary>
public int peek() => reader.Peek();
/// <summary>
/// Tries to read a cross-reference table (the <c>xref</c> keyword, its subsections and the trailer
/// dictionary) starting at the current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">The input positioned at the expected <c>xref</c> keyword.</param>
/// <param name="offset">The offset recorded on the builder and used in diagnostics.</param>
/// <param name="isLenientParsing">When true an unreadable subsection header ends the table instead of failing.</param>
/// <param name="pool">Object pool passed on to trailer parsing.</param>
/// <param name="builder">Receives the table part; null when the input does not start with <c>xref</c> or the table is empty.</param>
/// <returns>True when a table and its trailer were read.</returns>
/// <exception cref="InvalidOperationException">
/// An entry is malformed, an object offset points inside the table itself, or the trailer cannot be read.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
    builder = null;

    var tableStartOffset = source.GetPosition();

    if (source.Peek() != 'x')
    {
        return false;
    }

    var xref = ReadHelper.ReadString(source);
    if (!xref.Trim().Equals("xref"))
    {
        return false;
    }

    // check for trailer after xref: read the next token, then step back over it
    var str = ReadHelper.ReadString(source);
    byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
    source.Rewind(b.Length);

    // Ordinal: this is a file-format keyword, not linguistic text.
    if (str.StartsWith("trailer", StringComparison.Ordinal))
    {
        log.Warn("skipping empty xref table");
        return false;
    }

    builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Tables can have multiple sections. Each starts with a starting object id and a count.
    while (true)
    {
        if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
        {
            log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

            if (isLenientParsing)
            {
                break;
            }

            return false;
        }

        var currentObjectId = subsectionDefinition.FirstNumber;

        ReadHelper.SkipSpaces(source);

        for (var i = 0; i < subsectionDefinition.Count; i++)
        {
            if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
            {
                break;
            }

            // A 't' here is taken as the start of the trailer: the subsection is shorter than declared.
            if (source.Peek() == 't')
            {
                break;
            }

            var currentLine = ReadHelper.ReadLine(source);
            var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (splitString.Length < 3)
            {
                log.Warn("invalid xref line: " + currentLine);
                break;
            }

            // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
            // NOTE(review): the free-entry check below looks at index 2 rather than the last token,
            // so a four-token free entry is rejected - confirm whether that is intended.
            if (splitString[splitString.Length - 1].Equals(InUseEntry))
            {
                try
                {
                    var objectOffset = long.Parse(splitString[0], System.Globalization.NumberStyles.Integer, invariant);

                    if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                    {
                        // PDFBOX-3923: offset points inside this table - that can't be good
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                    }

                    var generation = int.Parse(splitString[1], System.Globalization.NumberStyles.Integer, invariant);
                    builder.Add(currentObjectId, generation, objectOffset);
                }
                catch (Exception e) when (e is FormatException || e is OverflowException)
                {
                    // Out-of-range numbers in a corrupt table are reported the same way as non-numeric ones.
                    throw new InvalidOperationException("Bad", e);
                }
            }
            else if (!splitString[2].Equals(FreeEntry))
            {
                throw new InvalidOperationException(
                    $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
            }

            currentObjectId++;

            ReadHelper.SkipSpaces(source);
        }

        ReadHelper.SkipSpaces(source);

        // Another subsection starts with a digit; anything else ends the table.
        if (!ReadHelper.IsDigit(source))
        {
            break;
        }
    }

    if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
    {
        throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
    }

    builder.Dictionary = trailer;
    builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

    return true;
}
/// <summary>
/// Tries to read the <c>trailer</c> keyword and the dictionary that follows it from the current
/// position of <paramref name="source"/>.
/// </summary>
/// <param name="source">The input positioned after the cross-reference entries.</param>
/// <param name="isLenientParsing">When true, leading lines that start with a digit are skipped first.</param>
/// <param name="pool">Object pool passed on to the dictionary parser.</param>
/// <param name="trailer">Receives the parsed trailer dictionary; null when this method returns false.</param>
/// <returns>True when the keyword was found and the dictionary parser returned.</returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
    trailer = null;

    // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
    if (isLenientParsing)
    {
        int nextCharacter = source.Peek();
        while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
        {
            ReadHelper.ReadLine(source);
            nextCharacter = source.Peek();
        }
    }

    if (source.Peek() != 't')
    {
        return false;
    }

    // read "trailer"
    long currentOffset = source.GetPosition();
    string nextLine = ReadHelper.ReadLine(source);

    if (!nextLine.Trim().Equals("trailer"))
    {
        // in some cases the EOL is missing and the trailer immediately
        // continues with "<<" or with a blank character
        // even if this does not comply with PDF reference we want to support as many PDFs as possible
        // Acrobat reader can also deal with this.
        // Ordinal comparison: a culture-sensitive prefix test may ignore some characters, which would
        // make the fixed-length seek below land at the wrong position.
        if (!nextLine.StartsWith("trailer", StringComparison.Ordinal))
        {
            return false;
        }

        // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes
        // so jump back right after "trailer"
        source.Seek(currentOffset + "trailer".Length);
    }

    // in some cases the EOL is missing and the trailer continues with " <<"
    ReadHelper.SkipSpaces(source);

    trailer = dictionaryParser.Parse(source, baseParser, pool);

    ReadHelper.SkipSpaces(source);

    return true;
}
/// <summary>
/// Scans the input for <c>" obj"</c> markers and records, for each object header found, the offset
/// at which its object number starts. The result is cached after the first call.
/// </summary>
/// <returns>A map from object key (object number and generation) to the offset of the object header.</returns>
public IReadOnlyDictionary<CosObjectKey, long> GetObjectLocations()
{
    if (objectLocations != null)
    {
        return objectLocations;
    }

    var endOfFileOffset = GetLastEndOfFileMarker();
    var found = new Dictionary<CosObjectKey, long>();
    var startPosition = reader.GetPosition();

    long scanOffset = MinimumSearchOffset;

    // The most recently seen object is held back until we know whether to keep it.
    long pendingObjectId = long.MinValue;
    int pendingGeneration = int.MinValue;
    long pendingOffset = long.MinValue;

    byte[] objMarker = OtherEncodings.StringAsLatin1Bytes(" obj");
    byte[] endobjMarker = OtherEncodings.StringAsLatin1Bytes("endobj");
    bool sawEndobj = false;

    while (true)
    {
        reader.Seek(scanOffset);

        if (ReadHelper.IsString(reader, objMarker))
        {
            // Walk backwards from the marker: generation digit, spaces, object number digits.
            long cursor = scanOffset - 1;
            reader.Seek(cursor);
            int generation = reader.Peek();

            // is the next char a digit?
            if (ReadHelper.IsDigit(generation))
            {
                generation -= '0';
                cursor--;
                reader.Seek(cursor);

                if (ReadHelper.IsSpace(reader))
                {
                    while (cursor > MinimumSearchOffset && ReadHelper.IsSpace(reader))
                    {
                        cursor--;
                        reader.Seek(cursor);
                    }

                    bool sawObjectNumber = false;
                    while (cursor > MinimumSearchOffset && ReadHelper.IsDigit(reader))
                    {
                        cursor--;
                        reader.Seek(cursor);
                        sawObjectNumber = true;
                    }

                    if (sawObjectNumber)
                    {
                        // Step over the character before the number, then read the number forwards.
                        reader.Read();
                        long objectId = ObjectHelper.ReadObjectNumber(reader);

                        if (pendingOffset > 0)
                        {
                            // add the former object ID only if there was a subsequent object ID
                            found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
                        }

                        pendingObjectId = objectId;
                        pendingGeneration = generation;
                        pendingOffset = cursor + 1;
                        scanOffset += objMarker.Length - 1;
                        sawEndobj = false;
                    }
                }
            }
        }
        else if (ReadHelper.IsString(reader, "endobj"))
        {
            sawEndobj = true;
            scanOffset += endobjMarker.Length - 1;
        }

        scanOffset++;

        if (scanOffset >= endOfFileOffset || reader.IsEof())
        {
            break;
        }
    }

    if ((endOfFileOffset < long.MaxValue || sawEndobj) && pendingOffset > 0)
    {
        // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
        // the last object id has to be added here so that it can't get lost as there isn't any
        // subsequent object id
        found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
    }

    // reestablish origin position
    reader.Seek(startPosition);

    objectLocations = found;

    return objectLocations;
}
/// <summary>
/// Applies the integer <c>IsSpace</c> check to the next value peeked from
/// <paramref name="reader"/>. The reader is only peeked, not read.
/// </summary>
public static bool IsSpace(IRandomAccessRead reader) => IsSpace(reader.Peek());
/// <summary>
/// Applies the integer <c>IsDigit</c> check to the next value peeked from
/// <paramref name="reader"/>. The reader is only peeked, not read.
/// </summary>
public static bool IsDigit(IRandomAccessRead reader) => IsDigit(reader.Peek());
/// <summary>
/// Parses an array delimited by <c>[</c> and <c>]</c> from <paramref name="reader"/>.
/// </summary>
/// <remarks>
/// An indirect reference arrives as two integers followed by a reference marker from the base parser.
/// When the marker is seen, the two preceding integers are removed from the array and replaced by
/// the pooled object for that key. Malformed references and unparsable items are skipped.
/// </remarks>
/// <param name="reader">The input positioned at the opening <c>[</c>.</param>
/// <param name="baseParser">Parser used for each array item.</param>
/// <param name="pool">Pool used to resolve indirect references.</param>
/// <returns>The parsed array; returned early if <c>endobj</c> or <c>endstream</c> is met.</returns>
public COSArray Parse(IRandomAccessRead reader, CosBaseParser baseParser, CosObjectPool pool)
{
    ReadHelper.ReadExpectedChar(reader, '[');
    var po = new COSArray();
    CosBase pbo;
    ReadHelper.SkipSpaces(reader);
    int i;
    while (((i = reader.Peek()) > 0) && ((char)i != ']'))
    {
        pbo = baseParser.Parse(reader, pool);
        if (pbo is CosObject)
        {
            // We have to check if the expected values are there or not PDFBOX-385
            // The size checks stop a reference marker with fewer than two preceding items
            // (e.g. "[R]" or "[0 R]") from indexing the array at -1.
            if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
            {
                var genNumber = (CosInt)po.remove(po.size() - 1);
                if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                {
                    var number = (CosInt)po.remove(po.size() - 1);
                    CosObjectKey key = new CosObjectKey(number.AsLong(), genNumber.AsInt());
                    pbo = pool.Get(key);
                }
                else
                {
                    // the object reference is somehow wrong
                    pbo = null;
                }
            }
            else
            {
                pbo = null;
            }
        }

        if (pbo != null)
        {
            po.add(pbo);
        }
        else
        {
            // it could be a bad object in the array which is just skipped.
            // This could also be an "endobj" or "endstream" which means we can assume that
            // the array has ended.
            string isThisTheEnd = ReadHelper.ReadString(reader);
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(isThisTheEnd));
            if (string.Equals(isThisTheEnd, "endobj") || string.Equals(isThisTheEnd, "endstream"))
            {
                return po;
            }
        }

        ReadHelper.SkipSpaces(reader);
    }

    // read ']'
    reader.Read();
    ReadHelper.SkipSpaces(reader);
    return po;
}
/// <summary>
/// Scans <paramref name="source"/> for <c>" obj"</c> markers and stores, for each object header found,
/// the offset at which its object number starts in <c>bfSearchCOSObjectKeyOffsets</c>.
/// The reader is returned to its original position afterwards.
/// </summary>
/// <param name="source">The input to scan.</param>
private void bfSearchForObjects(IRandomAccessRead source)
{
    // The marker includes the leading space: the backward scan below expects the character just
    // before the match to be the generation digit, and the skip distance is the marker length.
    const string objMarker = " obj";
    const string endobjMarker = "endobj";

    bfSearchForLastEOFMarker(source);
    bfSearchCOSObjectKeyOffsets = new Dictionary<CosObjectKey, long>();
    long originOffset = source.GetPosition();
    long currentOffset = MINIMUM_SEARCH_OFFSET;
    long lastObjectId = long.MinValue;
    int lastGenID = int.MinValue;
    long lastObjOffset = long.MinValue;
    bool endobjFound = false;

    do
    {
        source.Seek(currentOffset);
        if (ReadHelper.IsString(source, objMarker))
        {
            // Walk backwards from the marker: generation digit, spaces, object number digits.
            long tempOffset = currentOffset - 1;
            source.Seek(tempOffset);
            int genID = source.Peek();

            // is the next char a digit?
            if (ReadHelper.IsDigit(genID))
            {
                genID -= 48;
                tempOffset--;
                source.Seek(tempOffset);
                if (ReadHelper.IsSpace(source))
                {
                    while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsSpace(source))
                    {
                        source.Seek(--tempOffset);
                    }

                    bool objectIDFound = false;
                    while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsDigit(source))
                    {
                        source.Seek(--tempOffset);
                        objectIDFound = true;
                    }

                    if (objectIDFound)
                    {
                        // Step over the character before the number, then read the number forwards.
                        source.Read();
                        long objectId = ObjectHelper.ReadObjectNumber(source);
                        if (lastObjOffset > 0)
                        {
                            // add the former object ID only if there was a subsequent object ID
                            bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
                        }

                        lastObjectId = objectId;
                        lastGenID = genID;
                        lastObjOffset = tempOffset + 1;
                        currentOffset += objMarker.Length - 1;
                        endobjFound = false;
                    }
                }
            }
        }
        else if (ReadHelper.IsString(source, endobjMarker))
        {
            endobjFound = true;
            currentOffset += endobjMarker.Length - 1;
        }

        currentOffset++;
    } while (currentOffset < lastEOFMarker && !source.IsEof());

    if ((lastEOFMarker < long.MaxValue || endobjFound) && lastObjOffset > 0)
    {
        // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
        // the last object id has to be added here so that it can't get lost as there isn't any
        // subsequent object id
        bfSearchCOSObjectKeyOffsets[new CosObjectKey(lastObjectId, lastGenID)] = lastObjOffset;
    }

    // reestablish origin position
    source.Seek(originOffset);
}
/// <summary>
/// Parses the next object from <paramref name="reader"/>, choosing the concrete parser from the
/// first non-space character.
/// </summary>
/// <param name="reader">The input to read from.</param>
/// <param name="pool">Object pool passed on to the dictionary and array parsers.</param>
/// <returns>
/// The parsed object, or null at end of input or when an unrecognised token was consumed.
/// </returns>
/// <exception cref="IOException">
/// A token starting with 't' or 'f' is not <c>true</c>/<c>false</c>, or an unrecognised character
/// yields an empty token.
/// </exception>
public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
{
    bool StartsNumber(char ch) => char.IsDigit(ch) || ch == '-' || ch == '+' || ch == '.';
    bool ContinuesNumber(char ch) => StartsNumber(ch) || ch == 'E' || ch == 'e';

    ReadHelper.SkipSpaces(reader);

    int peeked = reader.Peek();
    if (peeked == -1)
    {
        return null;
    }

    char first = (char)peeked;

    switch (first)
    {
        case '<':
            // "<<" opens a dictionary, a single "<" opens a (hex) string. Look one character
            // ahead, then put the bracket back so the chosen parser sees the full token.
            int opening = reader.Read();
            bool isDictionary = (char)reader.Peek() == '<';
            reader.Unread(opening);

            if (!isDictionary)
            {
                return stringParser.Parse(reader);
            }

            var dictionary = dictionaryParser.Parse(reader, this, pool);
            ReadHelper.SkipSpaces(reader);
            return dictionary;

        case '[':
            return arrayParser.Parse(reader, this, pool);

        case '(':
            return stringParser.Parse(reader);

        case '/':
            return nameParser.Parse(reader);

        case 'n':
            ReadHelper.ReadExpectedString(reader, "null");
            return CosNull.Null;

        case 't':
            string trueToken = OtherEncodings.BytesAsLatin1String(reader.ReadFully(4));
            if (!trueToken.Equals("true"))
            {
                throw new IOException("expected true actual='" + trueToken + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.True;

        case 'f':
            string falseToken = OtherEncodings.BytesAsLatin1String(reader.ReadFully(5));
            if (!falseToken.Equals("false"))
            {
                throw new IOException("expected false actual='" + falseToken + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.False;

        case 'R':
            // Reference marker: the caller pairs it with the preceding numbers.
            reader.Read();
            return new CosObject(null);
    }

    if (StartsNumber(first))
    {
        var digits = new StringBuilder();
        int raw = reader.Read();
        while (ContinuesNumber((char)raw))
        {
            digits.Append((char)raw);
            raw = reader.Read();
        }

        // Give back the character that ended the number, unless it was end of input.
        if (raw != -1)
        {
            reader.Unread(raw);
        }

        return CosNumberFactory.get(digits.ToString()) as CosBase;
    }

    // This is not supposed to happen, but we allow for it so we are more compatible
    // with writers that don't follow the spec.
    string unknownToken = ReadHelper.ReadString(reader);
    if (unknownToken == string.Empty)
    {
        int peek = reader.Peek();

        // we can end up in an infinite loop otherwise
        throw new IOException("Unknown dir object c='" + first + "' cInt=" + (int)first + " peek='" + (char)peek + "' peekInt=" + peek + " at offset " + reader.GetPosition());
    }

    // if it's an endstream/endobj, we want to put it back so the caller will see it
    if (string.Equals("endobj", unknownToken) || string.Equals("endstream", unknownToken))
    {
        reader.Unread(OtherEncodings.StringAsLatin1Bytes(unknownToken));
    }

    return null;
}