/// <summary>
/// Scans backwards from the end of the input for the last <c>%%EOF</c> marker.
/// </summary>
/// <remarks>
/// The reader is moved back to the position it had on entry before the method returns.
/// Offset zero itself is never probed because the scan stops once the reader position reaches zero.
/// </remarks>
/// <returns>
/// The reader position reported right after the marker was matched (presumably the offset of the marker,
/// assuming <c>ReadHelper.IsString</c> does not advance the reader - confirm), or
/// <see cref="long.MaxValue"/> when no marker was found.
/// </returns>
private long GetLastEndOfFileMarker()
{
    var originalOffset = reader.GetPosition();
    var searchTerm = OtherEncodings.StringAsLatin1Bytes("%%EOF");

    // Highest offset at which the complete marker could still fit into the input.
    var candidateOffset = reader.Length() - searchTerm.Length;

    // The input is shorter than the marker: nothing can match and a negative seek has to be avoided.
    if (candidateOffset < 0)
    {
        return long.MaxValue;
    }

    reader.Seek(candidateOffset);

    while (reader.GetPosition() > 0)
    {
        if (ReadHelper.IsString(reader, searchTerm))
        {
            var position = reader.GetPosition();
            reader.Seek(originalOffset);
            return position;
        }

        // Pre-decrement so that every offset is probed exactly once while walking towards the start.
        reader.Seek(--candidateOffset);
    }

    reader.Seek(originalOffset);
    return long.MaxValue;
}
/// <summary>
/// Checks whether the declared stream length leads to an <c>endstream</c> keyword.
/// </summary>
/// <param name="source">Reader whose current position is taken as the start of the stream data.</param>
/// <param name="streamLength">Declared length of the stream data.</param>
/// <param name="fileLength">Length of the input the declared end is compared against.</param>
/// <returns>
/// <c>true</c> when start plus length does not exceed <paramref name="fileLength"/> and, after skipping
/// spaces at that offset, <c>endstream</c> follows; otherwise <c>false</c>.
/// </returns>
private bool validateStreamLength(IRandomAccessRead source, long streamLength, long fileLength)
{
    long startOffset = source.GetPosition();
    long declaredEnd = startOffset + streamLength;

    if (declaredEnd > fileLength)
    {
        // The declared end lies beyond the input, the reader has not been moved so nothing to restore.
        return false;
    }

    // Jump to where the data should end and look for the closing keyword.
    source.Seek(declaredEnd);
    ReadHelper.SkipSpaces(source);
    bool endMarkerFollows = ReadHelper.IsString(source, "endstream");

    // Put the reader back so the caller can read the data from its start.
    source.Seek(startOffset);

    return endMarkerFollows;
}
/// <summary>
/// Parses a dictionary delimited by <c>&lt;&lt;</c> and <c>&gt;&gt;</c> from the reader.
/// </summary>
/// <param name="reader">Reader positioned on the opening <c>&lt;&lt;</c>.</param>
/// <param name="baseParser">Parser used for the values of the dictionary entries.</param>
/// <param name="pool">Object pool handed on to the value parsing.</param>
/// <returns>
/// The parsed dictionary, or a new empty dictionary when <c>ReadUntilEnd</c> reports <c>true</c> after an
/// unexpected character was met.
/// </returns>
/// <exception cref="ArgumentNullException">Any of the arguments is <c>null</c>.</exception>
public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (baseParser == null)
    {
        throw new ArgumentNullException(nameof(baseParser));
    }

    if (pool == null)
    {
        throw new ArgumentNullException(nameof(pool));
    }

    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.SkipSpaces(reader);

    var result = new PdfDictionary();

    while (true)
    {
        ReadHelper.SkipSpaces(reader);
        var upcoming = (char)reader.Peek();

        if (upcoming == '>')
        {
            // Start of the closing ">>".
            break;
        }

        if (upcoming == '/')
        {
            var entry = ParseCosDictionaryNameValuePair(reader, baseParser, pool);

            // Incomplete pairs are dropped.
            if (entry.key != null && entry.value != null)
            {
                result.Set(entry.key, entry.value);
            }

            continue;
        }

        // Neither a name nor the end of the dictionary.
        if (ReadUntilEnd(reader))
        {
            return new PdfDictionary();
        }
    }

    ReadHelper.ReadExpectedString(reader, ">>");

    return result;
}
/// <summary>
/// Parses the value of a dictionary entry, resolving an indirect reference (<c>number generation R</c>)
/// through the object pool.
/// </summary>
/// <param name="reader">Reader positioned at the start of the value.</param>
/// <param name="baseParser">Parser used to read the value and, for references, the generation number.</param>
/// <param name="pool">Pool the referenced object is fetched from.</param>
/// <returns>
/// The directly parsed value, or the result of <c>pool.Get</c> when the value turned out to be an
/// indirect reference.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The object number or the generation number of a reference is not a <c>CosInt</c>.
/// </exception>
private static CosBase ParseValue(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var numOffset = reader.GetPosition();
    var value = baseParser.Parse(reader, pool);

    ReadHelper.SkipSpaces(reader);

    // Only a number that is followed by another digit can be the start of a reference.
    if (!(value is ICosNumber) || !ReadHelper.IsDigit(reader))
    {
        return value;
    }

    // Read the remaining parts of the reference: the generation number and the 'R' keyword.
    var genOffset = reader.GetPosition();
    var generationNumber = baseParser.Parse(reader, pool);

    ReadHelper.SkipSpaces(reader);
    ReadHelper.ReadExpectedChar(reader, 'R');

    if (!(value is CosInt))
    {
        throw new InvalidOperationException("expected number, actual=" + value + " at offset " + numOffset);
    }

    if (!(generationNumber is CosInt))
    {
        // Report the token that actually failed the check, not the already validated object number.
        throw new InvalidOperationException("expected number, actual=" + generationNumber + " at offset " + genOffset);
    }

    var key = new CosObjectKey(((CosInt)value).AsLong(), ((CosInt)generationNumber).AsInt());

    // Dereference the object.
    return pool.Get(key);
}
/// <summary>
/// Reads one name/value pair of a dictionary.
/// </summary>
/// <param name="reader">Reader positioned on the name of the entry.</param>
/// <param name="baseParser">Parser used for the value.</param>
/// <param name="pool">Object pool handed on to the value parsing.</param>
/// <returns>
/// The name and the value of the entry, or <c>(null, null)</c> when no value could be parsed.
/// </returns>
private (CosName key, CosBase value) ParseCosDictionaryNameValuePair(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var entryName = nameParser.Parse(reader);
    var entryValue = ParseValue(reader, baseParser, pool);

    ReadHelper.SkipSpaces(reader);

    if ((char)reader.Peek() == 'd')
    {
        // A trailing 'def' (as seen in cmap streams) is swallowed together with the spaces after it,
        // any other token starting with 'd' is pushed back for the caller.
        var token = ReadHelper.ReadString(reader);

        if (token.Equals("def"))
        {
            ReadHelper.SkipSpaces(reader);
        }
        else
        {
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(token));
        }
    }

    if (entryValue != null)
    {
        // Label this item as direct, to avoid signature problems.
        entryValue.Direct = true;

        return (entryName, entryValue);
    }

    log?.Warn("Bad Dictionary Declaration " + ReadHelper.ReadString(reader));

    return (null, null);
}
/// <summary>
/// Reads a generation number from the reader and checks that it is within the accepted range.
/// </summary>
/// <param name="reader">Reader positioned on the generation number.</param>
/// <returns>The generation number, between zero and <c>GenerationNumberThreshold</c> inclusive.</returns>
/// <exception cref="FormatException">The value is negative or above <c>GenerationNumberThreshold</c>.</exception>
public static int ReadGenerationNumber(IRandomAccessRead reader)
{
    int generation = ReadHelper.ReadInt(reader);

    bool withinRange = generation >= 0 && generation <= GenerationNumberThreshold;
    if (withinRange)
    {
        return generation;
    }

    throw new FormatException("Generation Number '" + generation + "' has more than 5 digits");
}
/// <summary>
/// Reads an object number from the reader and checks that it is within the accepted range.
/// </summary>
/// <param name="reader">Reader positioned on the object number.</param>
/// <returns>The object number, at least zero and below <c>ObjectNumberThreshold</c>.</returns>
/// <exception cref="FormatException">The value is negative or not below <c>ObjectNumberThreshold</c>.</exception>
public static long ReadObjectNumber(IRandomAccessRead reader)
{
    long objectNumber = ReadHelper.ReadLong(reader);

    bool withinRange = objectNumber >= 0 && objectNumber < ObjectNumberThreshold;
    if (withinRange)
    {
        return objectNumber;
    }

    throw new FormatException($"Object Number \'{objectNumber}\' has more than 10 digits or is negative");
}
/// <summary>
/// Reads the raw data of a stream object, from the <c>stream</c> keyword up to and including the closing keyword.
/// </summary>
/// <param name="reader">Reader positioned on the <c>stream</c> keyword.</param>
/// <param name="streamDictionary">Dictionary of the stream; its length and type entries are looked up.</param>
/// <param name="isLenientParsing">When <c>true</c>, some malformed closing keywords are logged instead of rejected.</param>
/// <param name="parser">Object parser handed on to the length lookup.</param>
/// <returns>The raw stream made of the bytes read and <paramref name="streamDictionary"/>.</returns>
/// <exception cref="InvalidOperationException">The data is not followed by an accepted closing keyword.</exception>
public PdfRawStream Parse(IRandomAccessRead reader, PdfDictionary streamDictionary, bool isLenientParsing, IPdfObjectParser parser)
{
    ReadHelper.ReadExpectedString(reader, "stream");
    skipWhiteSpaces(reader);

    // The raw dictionary item is used for the length: while parsing, the underlying object might still be null.
    ICosNumber streamLength = GetLength(
        reader,
        streamDictionary.GetItemOrDefault(CosName.LENGTH),
        streamDictionary.GetName(CosName.TYPE),
        isLenientParsing,
        parser);

    ValidateStreamLength(reader, isLenientParsing, streamLength);

    PdfRawStream result;

    using (var buffer = new MemoryStream())
    using (var writer = new BinaryWriter(buffer))
    {
        bool lengthIsUsable = streamLength != null
                              && validateStreamLength(reader, streamLength.AsLong(), reader.Length());

        if (lengthIsUsable)
        {
            ReadValidStream(reader, writer, streamLength);
        }
        else
        {
            // No trustworthy length, collect data until the end marker shows up.
            ReadUntilEndStream(reader, writer);
        }

        result = new PdfRawStream(buffer.ToArray(), streamDictionary);
    }

    String closingKeyword = ReadHelper.ReadString(reader);

    if (!closingKeyword.Equals("endstream"))
    {
        bool endsWithEndObj = isLenientParsing && closingKeyword.Equals("endobj");
        bool hasTrailingCharacters = isLenientParsing
                                     && closingKeyword.Length > 9
                                     && closingKeyword.Substring(0, 9).Equals("endstream");

        if (endsWithEndObj)
        {
            log.Warn($"stream ends with \'endobj\' instead of \'endstream\' at offset {reader.GetPosition()}");

            // Step back so that no follow-up warning about a missing endobj is raised.
            reader.Rewind("endobj".Length);
        }
        else if (hasTrailingCharacters)
        {
            log.Warn("stream ends with '" + closingKeyword + "' instead of 'endstream' at offset " + reader.GetPosition());

            // Give back the characters that followed the keyword.
            reader.Rewind(OtherEncodings.StringAsLatin1Bytes(closingKeyword.Substring(9)).Length);
        }
        else
        {
            throw new InvalidOperationException("Error reading stream, expected='endstream' actual='" + closingKeyword + "' at offset " + reader.GetPosition());
        }
    }

    return result;
}
/// <summary>
/// Parses a hexadecimal PDF string, stopping at the first character that is not allowed in one.
/// Everything up to the closing bracket is then discarded, which lets callers continue behind a malformed string.
/// The opening '&lt;' is expected to have been read already.
/// </summary>
/// <param name="reader">Reader positioned directly behind the opening '&lt;'.</param>
/// <returns>The string produced by <c>CosString.ParseHex</c> from the collected hex digits.</returns>
/// <exception cref="IOException">The input ended before a closing '&gt;' was read.</exception>
private static CosString ParseHexString(IRandomAccessRead reader)
{
    var hexDigits = new StringBuilder();

    while (true)
    {
        int current = reader.Read();

        if (ReadHelper.IsHexDigit((char)current))
        {
            hexDigits.Append((char)current);
            continue;
        }

        if (current == '>')
        {
            break;
        }

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        bool isIgnored = current == ' ' || current == '\n' || current == '\t'
                         || current == '\r' || current == '\b' || current == '\f';
        if (isIgnored)
        {
            continue;
        }

        // An invalid character: a dangling half of a digit pair is thrown away.
        if (hexDigits.Length % 2 != 0)
        {
            hexDigits.Remove(hexDigits.Length - 1, 1);
        }

        // Consume the rest of the string; the end-of-input check keeps this from looping forever on malformed data.
        while (true)
        {
            current = reader.Read();
            if (current == '>' || current < 0)
            {
                break;
            }
        }

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        break;
    }

    return CosString.ParseHex(hexDigits.ToString());
}
/// <summary>
/// Parses a PDF name object (for example <c>/Type</c>) from the reader.
/// </summary>
/// <remarks>
/// A '#' is treated as an escape only when it is followed by two hex digits; otherwise it is kept literally.
/// The collected bytes are decoded as UTF-8 when <c>ReadHelper.IsValidUtf8</c> accepts them and as
/// windows-1252 otherwise.
/// NOTE(review): on .NET Core the windows-1252 encoding is only available after the code pages encoding
/// provider has been registered - confirm this happens elsewhere.
/// </remarks>
/// <param name="reader">Reader positioned on the leading '/'.</param>
/// <returns>The name created by <c>CosName.Create</c> from the decoded text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
public CosName Parse([NotNull] IRandomAccessRead reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    ReadHelper.ReadExpectedChar(reader, '/');

    using (var memoryStream = new MemoryStream())
    using (var writer = new BinaryWriter(memoryStream))
    {
        int c = reader.Read();

        while (c != -1)
        {
            byte ch = (byte)c;

            if (ch == '#')
            {
                int ch1 = reader.Read();
                int ch2 = reader.Read();

                // Prior to PDF v1.2 the # was not a special character and some tools ignore the escape rule
                // even for later versions, so # only counts as an escape when two valid hex digits follow.
                if (ReadHelper.IsHexDigit((char)ch1) && ReadHelper.IsHexDigit((char)ch2))
                {
                    string hex = "" + (char)ch1 + (char)ch2;

                    try
                    {
                        var byteToWrite = (byte)Convert.ToInt32(hex, 16);
                        writer.Write(byteToWrite);
                    }
                    catch (FormatException e)
                    {
                        throw new IOException("Error: expected hex digit, actual='" + hex + "'", e);
                    }

                    c = reader.Read();
                }
                else
                {
                    // Premature end of input: stop without pushing anything back.
                    if (ch2 == -1 || ch1 == -1)
                    {
                        c = -1;
                        break;
                    }

                    // Keep the '#' literally and continue with the character that followed it.
                    reader.Unread(ch2);
                    c = ch1;
                    writer.Write(ch);
                }
            }
            else if (ReadHelper.IsEndOfName(ch))
            {
                break;
            }
            else
            {
                writer.Write(ch);
                c = reader.Read();
            }
        }

        // The terminating character belongs to whatever follows the name.
        if (c != -1)
        {
            reader.Unread(c);
        }

        // ToArray copies the buffer on every call, so take one snapshot and reuse it.
        byte[] bytes = memoryStream.ToArray();

        var str = ReadHelper.IsValidUtf8(bytes)
            ? Encoding.UTF8.GetString(bytes)
            : Encoding.GetEncoding("windows-1252").GetString(bytes);

        return CosName.Create(str);
    }
}
/// <summary>
/// Finds the offsets of the objects in the input by scanning it for <c>" obj"</c> markers.
/// </summary>
/// <remarks>
/// The result is stored in <c>objectLocations</c> and returned directly on later calls.
/// The scan starts at <c>MinimumSearchOffset</c> and runs up to the value returned by
/// <c>GetLastEndOfFileMarker</c> or the end of the input. Only the single byte directly in front of the
/// marker is used as generation number. The reader is moved back to its initial position at the end.
/// </remarks>
/// <returns>The recorded offset for each object key found.</returns>
public IReadOnlyDictionary<CosObjectKey, long> GetObjectLocations()
{
    if (objectLocations != null)
    {
        return objectLocations;
    }

    var endOfFileOffset = GetLastEndOfFileMarker();
    var found = new Dictionary<CosObjectKey, long>();
    var startPosition = reader.GetPosition();

    byte[] objMarker = OtherEncodings.StringAsLatin1Bytes(" obj");
    byte[] endObjMarker = OtherEncodings.StringAsLatin1Bytes("endobj");

    long scanOffset = MinimumSearchOffset;

    // The most recently detected object; it is only stored once it has been confirmed (see below).
    long pendingObjectId = long.MinValue;
    int pendingGeneration = int.MinValue;
    long pendingOffset = long.MinValue;
    bool sawEndObj = false;

    do
    {
        reader.Seek(scanOffset);

        if (ReadHelper.IsString(reader, objMarker))
        {
            // Walk backwards from the marker: generation digit, space(s), object number digits.
            long probe = scanOffset - 1;
            reader.Seek(probe);
            int generation = reader.Peek();

            if (ReadHelper.IsDigit(generation))
            {
                // Turn the ASCII digit into its numeric value.
                generation -= '0';
                probe -= 1;
                reader.Seek(probe);

                if (ReadHelper.IsSpace(reader))
                {
                    while (probe > MinimumSearchOffset && ReadHelper.IsSpace(reader))
                    {
                        probe -= 1;
                        reader.Seek(probe);
                    }

                    bool hasObjectId = false;
                    while (probe > MinimumSearchOffset && ReadHelper.IsDigit(reader))
                    {
                        probe -= 1;
                        reader.Seek(probe);
                        hasObjectId = true;
                    }

                    if (hasObjectId)
                    {
                        // Step over the character in front of the number, then read the number itself.
                        reader.Read();
                        long objectId = ObjectHelper.ReadObjectNumber(reader);

                        if (pendingOffset > 0)
                        {
                            // The former object is stored now that a subsequent object has been found.
                            found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
                        }

                        pendingObjectId = objectId;
                        pendingGeneration = generation;
                        pendingOffset = probe + 1;
                        scanOffset += objMarker.Length - 1;
                        sawEndObj = false;
                    }
                }
            }
        }
        else if (ReadHelper.IsString(reader, "endobj"))
        {
            sawEndObj = true;
            scanOffset += endObjMarker.Length - 1;
        }

        scanOffset += 1;
    }
    while (scanOffset < endOfFileOffset && !reader.IsEof());

    bool endOfFileMarkerFound = endOfFileOffset < long.MaxValue;

    if ((endOfFileMarkerFound || sawEndObj) && pendingOffset > 0)
    {
        // If the input was not cut off in the middle, or the last object ends with "endobj", the last
        // object has to be stored here because no subsequent object will trigger it.
        found[new CosObjectKey(pendingObjectId, pendingGeneration)] = pendingOffset;
    }

    // Re-establish the origin position.
    reader.Seek(startPosition);

    objectLocations = found;

    return objectLocations;
}
/// <summary>
/// Parses a PDF string object: a literal string in parentheses or, when the first character is '&lt;',
/// a hexadecimal string (handled by <c>ParseHexString</c>).
/// </summary>
/// <remarks>
/// Escape sequences of literal strings are resolved: the single character escapes, octal escapes of up
/// to three digits and a backslash followed by a line break, which continues the string on the next line.
/// For any other escaped character the backslash is dropped.
/// NOTE(review): the content is collected through a <see cref="StreamWriter"/>, which encodes characters
/// with its default UTF-8 encoding, so source bytes above 0x7F end up as multi-byte sequences in the
/// array passed to <c>CosString</c> - confirm that this is what <c>CosString</c> expects.
/// </remarks>
/// <param name="seqSource">Reader positioned on the opening '(' or '&lt;'.</param>
/// <returns>The string built from the decoded content.</returns>
/// <exception cref="IOException">The first character is neither '(' nor '&lt;'.</exception>
public CosString Parse(IRandomAccessRead seqSource)
{
    char nextChar = (char)seqSource.Read();

    if (nextChar == '<')
    {
        return ParseHexString(seqSource);
    }

    if (nextChar != '(')
    {
        throw new IOException("parseCOSstring string should start with '(' or '<' and not '" + nextChar + "' " + seqSource);
    }

    using (var memoryStream = new MemoryStream())
    using (var writer = new StreamWriter(memoryStream))
    {
        // Number of currently open parentheses; the string ends once it drops to zero.
        int braces = 1;
        int c = seqSource.Read();

        while (braces > 0 && c != -1)
        {
            char ch = (char)c;

            // -2 means "nothing read ahead"; any other value is a character the escape handling
            // already consumed and which has to be processed in the next iteration.
            int nextc = -2;

            if (ch == ')')
            {
                braces--;
                braces = CheckForEndOfString(seqSource, braces);

                if (braces != 0)
                {
                    writer.Write(ch);
                }
            }
            else if (ch == '(')
            {
                braces++;
                writer.Write(ch);
            }
            else if (ch == '\\')
            {
                char next = (char)seqSource.Read();

                switch (next)
                {
                    case 'n':
                        writer.Write('\n');
                        break;
                    case 'r':
                        writer.Write('\r');
                        break;
                    case 't':
                        writer.Write('\t');
                        break;
                    case 'b':
                        writer.Write('\b');
                        break;
                    case 'f':
                        writer.Write('\f');
                        break;
                    case ')':
                        // Handles unescaped trailing backslashes such as "/Title (c:\)" (PDFBox 276).
                        braces = CheckForEndOfString(seqSource, braces);

                        if (braces != 0)
                        {
                            writer.Write(next);
                        }
                        else
                        {
                            writer.Write('\\');
                        }

                        break;
                    case '(':
                    case '\\':
                        writer.Write(next);
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    {
                        // Octal escape of one to three digits.
                        var octal = new StringBuilder();
                        octal.Append(next);

                        c = seqSource.Read();
                        char digit = (char)c;

                        if (digit >= '0' && digit <= '7')
                        {
                            octal.Append(digit);

                            c = seqSource.Read();
                            digit = (char)c;

                            if (digit >= '0' && digit <= '7')
                            {
                                octal.Append(digit);
                            }
                            else
                            {
                                nextc = c;
                            }
                        }
                        else
                        {
                            nextc = c;
                        }

                        int character;
                        try
                        {
                            character = Convert.ToInt32(octal.ToString(), 8);
                        }
                        catch (FormatException e)
                        {
                            throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
                        }

                        // Write the character itself: the int overload of Write would emit the decimal
                        // text of the value (e.g. "65" for \101). Overflow beyond one byte is ignored.
                        writer.Write((char)(character & 0xFF));
                        break;
                    }
                    default:
                        // The character after the backslash decides whether this is a line continuation.
                        if (next == ReadHelper.AsciiCarriageReturn || next == ReadHelper.AsciiLineFeed)
                        {
                            // A break in the line: ignore the backslash and the end-of-line characters.
                            c = seqSource.Read();

                            while (ReadHelper.IsEndOfLine(c) && c != -1)
                            {
                                c = seqSource.Read();
                            }

                            nextc = c;
                            break;
                        }

                        // Dropping the backslash, see 7.3.4.2 Literal strings for further information.
                        writer.Write(next);
                        break;
                }
            }
            else
            {
                writer.Write(ch);
            }

            c = nextc != -2 ? nextc : seqSource.Read();
        }

        // The character read behind the end of the string belongs to whatever follows.
        if (c != -1)
        {
            seqSource.Unread(c);
        }

        writer.Flush();

        return new CosString(memoryStream.ToArray());
    }
}
/// <summary>
/// Parses an array delimited by '[' and ']' from the reader.
/// </summary>
/// <remarks>
/// When the base parser returns a <c>CosObject</c>, the two preceding elements are taken as object number
/// and generation number of an indirect reference and replaced by the object fetched from the pool.
/// If those elements are missing or are not integers the reference is skipped.
/// Parsing also stops when an <c>endobj</c> or <c>endstream</c> keyword is met inside the array.
/// </remarks>
/// <param name="reader">Reader positioned on the opening '['.</param>
/// <param name="baseParser">Parser used for the elements.</param>
/// <param name="pool">Pool indirect references are resolved from.</param>
/// <returns>The array with the elements that could be parsed.</returns>
public COSArray Parse(IRandomAccessRead reader, CosBaseParser baseParser, CosObjectPool pool)
{
    ReadHelper.ReadExpectedChar(reader, '[');
    var po = new COSArray();

    ReadHelper.SkipSpaces(reader);

    int i;
    while (((i = reader.Peek()) > 0) && ((char)i != ']'))
    {
        CosBase pbo = baseParser.Parse(reader, pool);

        if (pbo is CosObject)
        {
            // Check that the expected values are there (PDFBOX-385). The size checks keep a reference
            // without enough preceding elements from indexing the array at -1.
            if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
            {
                var genNumber = (CosInt)po.remove(po.size() - 1);

                if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                {
                    var number = (CosInt)po.remove(po.size() - 1);
                    CosObjectKey key = new CosObjectKey(number.AsLong(), genNumber.AsInt());
                    pbo = pool.Get(key);
                }
                else
                {
                    // The object reference is somehow wrong.
                    pbo = null;
                }
            }
            else
            {
                pbo = null;
            }
        }

        if (pbo != null)
        {
            po.add(pbo);
        }
        else
        {
            // A bad object in the array is just skipped. It could also be an "endobj" or "endstream",
            // in which case the array is assumed to have ended; the keyword is left for the caller.
            string isThisTheEnd = ReadHelper.ReadString(reader);
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(isThisTheEnd));

            if (string.Equals(isThisTheEnd, "endobj") || string.Equals(isThisTheEnd, "endstream"))
            {
                return po;
            }
        }

        ReadHelper.SkipSpaces(reader);
    }

    // Read the closing ']'.
    reader.Read();
    ReadHelper.SkipSpaces(reader);

    return po;
}
/// <summary>
/// Parses the next object from the reader, choosing the specialised parser from its first character.
/// </summary>
/// <param name="reader">Reader positioned in front of the object; leading spaces are skipped.</param>
/// <param name="pool">Object pool handed on to the dictionary and array parsers.</param>
/// <returns>
/// The parsed object, or <c>null</c> at the end of the input or when an unrecognised token was read.
/// </returns>
/// <exception cref="IOException">
/// A token starting with 't' or 'f' is not <c>true</c> or <c>false</c>, or an unrecognised token is empty.
/// </exception>
public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
{
    ReadHelper.SkipSpaces(reader);

    int peeked = reader.Peek();
    if (peeked == -1)
    {
        return null;
    }

    char first = (char)peeked;

    switch (first)
    {
        case '<':
        {
            // Look at the character behind the bracket to tell a dictionary from a hex string,
            // then put the bracket back for the chosen parser.
            int leftBracket = reader.Read();
            char second = (char)reader.Peek();
            reader.Unread(leftBracket);

            if (second != '<')
            {
                return stringParser.Parse(reader);
            }

            var dictionary = dictionaryParser.Parse(reader, this, pool);
            ReadHelper.SkipSpaces(reader);
            return dictionary;
        }

        case '[':
            return arrayParser.Parse(reader, this, pool);

        case '(':
            return stringParser.Parse(reader);

        case '/':
            return nameParser.Parse(reader);

        case 'n':
            ReadHelper.ReadExpectedString(reader, "null");
            return CosNull.Null;

        case 't':
        {
            string literal = OtherEncodings.BytesAsLatin1String(reader.ReadFully(4));
            if (!literal.Equals("true"))
            {
                throw new IOException("expected true actual='" + literal + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.True;
        }

        case 'f':
        {
            string literal = OtherEncodings.BytesAsLatin1String(reader.ReadFully(5));
            if (!literal.Equals("false"))
            {
                throw new IOException("expected false actual='" + literal + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.False;
        }

        case 'R':
            // The keyword of an indirect reference; an empty object is returned for it.
            reader.Read();
            return new CosObject(null);
    }

    bool startsNumber = char.IsDigit(first) || first == '-' || first == '+' || first == '.';

    if (startsNumber)
    {
        var numberText = new StringBuilder();
        int raw = reader.Read();
        char current = (char)raw;

        while (char.IsDigit(current) || current == '-' || current == '+' || current == '.'
               || current == 'E' || current == 'e')
        {
            numberText.Append(current);
            raw = reader.Read();
            current = (char)raw;
        }

        // The character that ended the number is not part of it.
        if (raw != -1)
        {
            reader.Unread(raw);
        }

        return CosNumberFactory.get(numberText.ToString()) as CosBase;
    }

    // This is not supposed to happen, but it is tolerated to stay compatible with writers that
    // do not follow the specification.
    string unexpected = ReadHelper.ReadString(reader);

    if (unexpected == string.Empty)
    {
        // Nothing was consumed, callers could end up in an infinite loop otherwise.
        int peek = reader.Peek();
        throw new IOException("Unknown dir object c='" + first + "' cInt=" + (int)first + " peek='" + (char)peek + "' peekInt=" + peek + " at offset " + reader.GetPosition());
    }

    // An endstream/endobj keyword is put back so the caller will see it.
    if (string.Equals("endobj", unexpected) || string.Equals("endstream", unexpected))
    {
        reader.Unread(OtherEncodings.StringAsLatin1Bytes(unexpected));
    }

    return null;
}