/// <summary>
/// Consumes the optional spaces and the end-of-line marker that follow a stream keyword,
/// leaving the reader positioned on the first byte after them.
/// </summary>
/// <param name="reader">The reader to advance.</param>
protected void skipWhiteSpaces(IRandomAccessRead reader)
{
    // PDF Ref 3.2.7: a stream must be followed by either a CRLF or LF but nothing else.
    int whitespace = reader.Read();

    // see brother_scan_cover.pdf, it adds whitespaces after the stream but before
    // the start of the data, so just read those first
    while (whitespace == ' ')
    {
        whitespace = reader.Read();
    }

    if (whitespace == ReadHelper.AsciiCarriageReturn)
    {
        whitespace = reader.Read();

        // A lone CR is invalid according to the spec but it happens in the real world,
        // so the byte after it is pushed back. The end-of-input marker (-1) is not a
        // byte that was consumed, so it must never be pushed back.
        if (whitespace != ReadHelper.AsciiLineFeed && whitespace != -1)
        {
            reader.Unread(whitespace);
        }
    }
    else if (whitespace != ReadHelper.AsciiLineFeed && whitespace != -1)
    {
        // We are in an error, but again we do a lenient parsing and just assume that
        // everything is fine: the byte belongs to the data and is pushed back.
        reader.Unread(whitespace);
    }
}
/// <summary>
/// Passes the unread request on to the wrapped reader, or fails when <c>Throw</c> is set.
/// </summary>
/// <param name="b">The value handed to the wrapped reader's <c>Unread</c>.</param>
/// <exception cref="InvalidOperationException">Thrown when <c>Throw</c> is true.</exception>
public void Unread(int b)
{
    if (!Throw)
    {
        reader.Unread(b);
        return;
    }

    throw new InvalidOperationException();
}
/// <summary>
/// Reads characters up to (but not including) the next delimiter and returns them
/// as the textual form of a number.
/// </summary>
/// <param name="reader">The reader to consume from.</param>
/// <returns>The characters read before the delimiter; may be empty.</returns>
/// <exception cref="IOException">
/// Thrown when more characters are collected than the decimal form of <c>long.MaxValue</c> has.
/// </exception>
private static StringBuilder ReadStringNumber(IRandomAccessRead reader)
{
    // Loop-invariant: computed once instead of allocating a string per character read.
    int maximumLength = long.MaxValue.ToString("D").Length;

    int lastByte;
    StringBuilder buffer = new StringBuilder();

    while ((lastByte = reader.Read()) != ' '
           && lastByte != AsciiLineFeed
           && lastByte != AsciiCarriageReturn
           && lastByte != '<' // see sourceforge bug 1714707
           && lastByte != '[' // PDFBOX-1845
           && lastByte != '(' // PDFBOX-2579
           && lastByte != 0 // See sourceforge bug 853328
           && lastByte != -1)
    {
        buffer.Append((char)lastByte);

        if (buffer.Length > maximumLength)
        {
            throw new IOException("Number '" + buffer + "' is getting too long, stop reading at offset " + reader.GetPosition());
        }
    }

    // The delimiter belongs to whatever follows the number; only a real byte is pushed back.
    if (lastByte != -1)
    {
        reader.Unread(lastByte);
    }

    return buffer;
}
/// <summary>
/// Looks at the next three bytes to decide whether a closing parenthesis really ends
/// the string, then pushes the inspected bytes back.
/// </summary>
/// <param name="reader">The reader positioned just after the candidate closing parenthesis.</param>
/// <param name="bracesParameter">The current count of open parentheses.</param>
/// <returns>
/// 0 when the following bytes indicate the end of the string, otherwise
/// <paramref name="bracesParameter"/> unchanged.
/// </returns>
private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter)
{
    int braces = bracesParameter;
    byte[] nextThreeBytes = new byte[3];
    int amountRead = reader.Read(nextThreeBytes);

    // Check the next 3 bytes if available.
    // The following cases are valid indicators for the end of the string:
    // 1. Next line contains another COSObject: CR + LF + '/'
    // 2. CosDictionary ends in the next line: CR + LF + '>'
    // 3. Next line contains another COSObject: CR + '/'
    // 4. CosDictionary ends in the next line: CR + '>'
    if (amountRead == 3 && nextThreeBytes[0] == ReadHelper.AsciiCarriageReturn)
    {
        byte second = nextThreeBytes[1];
        byte third = nextThreeBytes[2];

        // Cases 1 and 2: the third byte only counts when the second one is a line feed.
        bool endsAfterCrLf = second == ReadHelper.AsciiLineFeed && (third == '/' || third == '>');

        // Cases 3 and 4.
        bool endsAfterCr = second == '/' || second == '>';

        if (endsAfterCrLf || endsAfterCr)
        {
            braces = 0;
        }
    }

    // This is a look-ahead only: everything that was read is handed back.
    if (amountRead > 0)
    {
        reader.Unread(nextThreeBytes, 0, amountRead);
    }

    return braces;
}
/// <summary>
/// Advances the reader past whitespace and comments, stopping on the first other byte,
/// which is pushed back.
/// </summary>
/// <param name="reader">The reader to advance.</param>
public static void SkipSpaces(IRandomAccessRead reader)
{
    const int commentCharacter = 37; // ASCII '%'

    int current = reader.Read();

    while (true)
    {
        if (current == commentCharacter)
        {
            // A comment runs to the end of the line (or the end of input).
            do
            {
                current = reader.Read();
            }
            while (!IsEndOfLine(current) && current != -1);
        }
        else if (IsWhitespace(current))
        {
            current = reader.Read();
        }
        else
        {
            break;
        }
    }

    if (current != -1)
    {
        reader.Unread(current);
    }
}
/// <summary>
/// Parses one name/value entry of a dictionary.
/// </summary>
/// <param name="reader">The reader positioned at the entry's name.</param>
/// <param name="baseParser">Passed through to <c>ParseValue</c>.</param>
/// <param name="pool">Passed through to <c>ParseValue</c>.</param>
/// <returns>
/// The parsed key and value, or <c>(null, null)</c> when no value could be parsed.
/// </returns>
private (CosName key, CosBase value) ParseCosDictionaryNameValuePair(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var entryName = nameParser.Parse(reader);
    var entryValue = ParseValue(reader, baseParser, pool);

    ReadHelper.SkipSpaces(reader);

    bool mayBeDef = (char)reader.Peek() == 'd';
    if (mayBeDef)
    {
        // A trailing 'def' means we are inside a cmap stream and the keyword is ignored;
        // any other token is pushed back untouched.
        var token = ReadHelper.ReadString(reader);
        if (token.Equals("def"))
        {
            ReadHelper.SkipSpaces(reader);
        }
        else
        {
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(token));
        }
    }

    if (entryValue == null)
    {
        log?.Warn("Bad Dictionary Declaration " + ReadHelper.ReadString(reader));

        return (null, null);
    }

    // label this item as direct, to avoid signature problems.
    entryValue.Direct = true;

    return (entryName, entryValue);
}
/// <summary>
/// Skips leading whitespace and comments, then reads characters until an end-of-name
/// character or the end of input is met.
/// </summary>
/// <param name="reader">The reader to consume from.</param>
/// <returns>The characters read; empty when the first character already ends the token.</returns>
public static string ReadString(IRandomAccessRead reader)
{
    SkipSpaces(reader);

    var text = new StringBuilder();
    int current;

    while (true)
    {
        current = reader.Read();

        if (IsEndOfName((char)current) || current == -1)
        {
            break;
        }

        text.Append((char)current);
    }

    // The terminating character is not part of the token; hand it back unless input ended.
    if (current != -1)
    {
        reader.Unread(current);
    }

    return text.ToString();
}
/// <summary>
/// Skips whitespace and reads the next token as a <see cref="long"/>.
/// </summary>
/// <param name="reader">The reader to consume from.</param>
/// <returns>The parsed value.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the token is not a valid long, either malformed or out of range.
/// The token is pushed back onto the reader before the exception is thrown.
/// </exception>
public static long ReadLong(IRandomAccessRead reader)
{
    SkipSpaces(reader);

    StringBuilder longBuffer = ReadStringNumber(reader);
    string text = longBuffer.ToString();

    try
    {
        // PDF numbers are plain ASCII; parsing must not depend on the machine's culture.
        return long.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // A token of the maximum permitted length can still exceed long.MaxValue, so
        // overflow is treated like any other "not a long" input and the reader is rewound.
        var bytesToReverse = OtherEncodings.StringAsLatin1Bytes(text);
        reader.Unread(bytesToReverse);

        throw new InvalidOperationException($"Error: Expected a long type at offset {reader.GetPosition()}, instead got \'{longBuffer}\'", e);
    }
}
/// <summary>
/// Consumes input until a '/' or '>' is found, the end of input is reached, or the
/// sequence matched through the E, N, D, S, T, R, A, M / O, B, J constants is read.
/// </summary>
/// <remarks>
/// The single-letter constants are declared outside this block; presumably they hold the
/// character codes spelling "endstream" and "endobj" (verify against their declarations).
/// NOTE(review): when a partial match fails (for example E followed by something other
/// than N), the mismatching character held in <c>c</c> is overwritten by the read at the
/// bottom of the loop without being tested against '/' or '>' - confirm this is intended.
/// </remarks>
/// <param name="reader">The reader to consume from.</param>
/// <returns>
/// <c>true</c> when the end keyword was matched or the input ended; <c>false</c> when the
/// scan stopped on '/' or '>', in which case that character is pushed back.
/// </returns>
private static bool ReadUntilEnd(IRandomAccessRead reader)
{
    var c = reader.Read();
    while (c != -1 && c != '/' && c != '>')
    {
        // in addition to stopping when we find / or >, we also want
        // to stop when we find endstream or endobj.
        if (c == E)
        {
            c = reader.Read();
            if (c == N)
            {
                c = reader.Read();
                if (c == D)
                {
                    c = reader.Read();
                    // Each && only reads the next character when all previous ones matched,
                    // so a failed comparison leaves the remaining input unread.
                    var isStream = c == S && reader.Read() == T && reader.Read() == R && reader.Read() == E && reader.Read() == A && reader.Read() == M;
                    // Only attempted when the stream variant did not match; reuses the
                    // character already held in c as the first letter.
                    var isObj = !isStream && c == O && reader.Read() == B && reader.Read() == J;
                    if (isStream || isObj)
                    {
                        // we're done reading this object!
                        return(true);
                    }
                }
            }
        }
        c = reader.Read();
    }
    // Running out of input counts as having reached the end.
    if (c == -1)
    {
        return(true);
    }
    // Stopped on '/' or '>': give it back so the caller can parse it.
    reader.Unread(c);
    return(false);
}
/// <summary>
/// Skips whitespace and reads the next token as an <see cref="int"/>.
/// </summary>
/// <param name="reader">The reader to consume from.</param>
/// <returns>The parsed value.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
/// <exception cref="IOException">
/// Thrown when the token cannot be parsed as an int. The token is pushed back onto the
/// reader before the exception is thrown.
/// </exception>
public static int ReadInt(IRandomAccessRead reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    SkipSpaces(reader);

    var intBuffer = ReadStringNumber(reader);
    var text = intBuffer.ToString();

    try
    {
        // PDF numbers are plain ASCII; parsing must not depend on the machine's culture.
        return int.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e)
    {
        // Rewind first so the reported offset is where the bad token starts.
        reader.Unread(OtherEncodings.StringAsLatin1Bytes(text));

        throw new IOException("Error: Expected an integer type at offset " + reader.GetPosition(), e);
    }
}
/// <summary>
/// Parses a name object: a '/' followed by the name's characters, with <c>#xx</c>
/// hexadecimal escapes decoded.
/// </summary>
/// <param name="reader">The reader positioned at the leading '/'.</param>
/// <returns>The name created from the decoded bytes.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
public CosName Parse([NotNull] IRandomAccessRead reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    ReadHelper.ReadExpectedChar(reader, '/');

    using (var memoryStream = new MemoryStream())
    using (var writer = new BinaryWriter(memoryStream))
    {
        int c = reader.Read();
        while (c != -1)
        {
            byte ch = (byte)c;
            if (ch == '#')
            {
                int ch1 = reader.Read();
                int ch2 = reader.Read();

                // Prior to PDF v1.2, the # was not a special character. Also,
                // it has been observed that various PDF tools do not follow the
                // spec with respect to the # escape, even though they report
                // PDF versions of 1.2 or later. The solution here is that we
                // interpret the # as an escape only when it is followed by two
                // valid hex digits.
                if (ReadHelper.IsHexDigit((char)ch1) && ReadHelper.IsHexDigit((char)ch2))
                {
                    string hex = "" + (char)ch1 + (char)ch2;
                    try
                    {
                        var byteToWrite = (byte)Convert.ToInt32(hex, 16);
                        writer.Write(byteToWrite);
                    }
                    catch (FormatException e)
                    {
                        throw new IOException("Error: expected hex digit, actual='" + hex + "'", e);
                    }

                    c = reader.Read();
                }
                else
                {
                    // check for premature EOF
                    if (ch2 == -1 || ch1 == -1)
                    {
                        c = -1;
                        break;
                    }

                    // Not an escape: keep the '#' literally, push the second character
                    // back and continue the loop with the first one.
                    reader.Unread(ch2);
                    c = ch1;
                    writer.Write(ch);
                }
            }
            else if (ReadHelper.IsEndOfName(ch))
            {
                break;
            }
            else
            {
                writer.Write(ch);
                c = reader.Read();
            }
        }

        // The character that ended the name belongs to the next token.
        if (c != -1)
        {
            reader.Unread(c);
        }

        // Take a single copy of the collected bytes and reuse it for both the
        // validity check and the decoding.
        byte[] bytes = memoryStream.ToArray();
        var str = ReadHelper.IsValidUtf8(bytes)
            ? Encoding.UTF8.GetString(bytes)
            : Encoding.GetEncoding("windows-1252").GetString(bytes);

        return CosName.Create(str);
    }
}
/// <summary>
/// Parses a string object. A leading '&lt;' delegates to <c>ParseHexString</c>; a leading
/// '(' starts a literal string whose escapes and nested parentheses are processed here.
/// </summary>
/// <remarks>
/// NOTE(review): characters are emitted through a <see cref="StreamWriter"/>, so the bytes
/// handed to <c>CosString</c> are the writer's text encoding of those characters - confirm
/// that this is what consumers of the byte array expect for values above 127.
/// </remarks>
/// <param name="seqSource">The reader positioned at the opening '(' or '&lt;'.</param>
/// <returns>The parsed string.</returns>
/// <exception cref="IOException">
/// Thrown when the first character is neither '(' nor '&lt;', or an octal escape cannot be converted.
/// </exception>
public CosString Parse(IRandomAccessRead seqSource)
{
    char nextChar = (char)seqSource.Read();

    if (nextChar == '<')
    {
        return ParseHexString(seqSource);
    }

    if (nextChar != '(')
    {
        throw new IOException("parseCOSstring string should start with '(' or '<' and not '" + nextChar + "' " + seqSource);
    }

    using (var memoryStream = new MemoryStream())
    using (var writer = new StreamWriter(memoryStream))
    {
        // This is the number of braces read
        int braces = 1;
        int c = seqSource.Read();
        while (braces > 0 && c != -1)
        {
            char ch = (char)c;

            // -2 means "no look-ahead character was read in this iteration".
            int nextc = -2;

            if (ch == ')')
            {
                braces--;
                braces = CheckForEndOfString(seqSource, braces);
                if (braces != 0)
                {
                    writer.Write(ch);
                }
            }
            else if (ch == '(')
            {
                braces++;
                writer.Write(ch);
            }
            else if (ch == '\\')
            {
                char next = (char)seqSource.Read();
                switch (next)
                {
                    case 'n':
                        writer.Write('\n');
                        break;
                    case 'r':
                        writer.Write('\r');
                        break;
                    case 't':
                        writer.Write('\t');
                        break;
                    case 'b':
                        writer.Write('\b');
                        break;
                    case 'f':
                        writer.Write('\f');
                        break;
                    case ')':
                        // PDFBox 276 /Title (c:\)
                        braces = CheckForEndOfString(seqSource, braces);
                        if (braces != 0)
                        {
                            writer.Write(next);
                        }
                        else
                        {
                            writer.Write('\\');
                        }

                        break;
                    case '(':
                    case '\\':
                        writer.Write(next);
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    {
                        // An octal escape has one to three digits; the first character
                        // that is not an octal digit is kept as look-ahead.
                        var octal = new StringBuilder();
                        octal.Append(next);
                        c = seqSource.Read();
                        char digit = (char)c;
                        if (digit >= '0' && digit <= '7')
                        {
                            octal.Append(digit);
                            c = seqSource.Read();
                            digit = (char)c;
                            if (digit >= '0' && digit <= '7')
                            {
                                octal.Append(digit);
                            }
                            else
                            {
                                nextc = c;
                            }
                        }
                        else
                        {
                            nextc = c;
                        }

                        int character;
                        try
                        {
                            character = Convert.ToInt32(octal.ToString(), 8);
                        }
                        catch (FormatException e)
                        {
                            throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
                        }

                        // Write the character itself: passing the int would select the
                        // overload that writes its decimal digits. High-order overflow of
                        // a three digit escape is ignored, see 7.3.4.2 Literal strings.
                        writer.Write((char)(character & 0xFF));
                        break;
                    }
                    default:
                        // The character after the backslash decides whether this is a
                        // line continuation.
                        if (next == ReadHelper.AsciiCarriageReturn || next == ReadHelper.AsciiLineFeed)
                        {
                            // this is a break in the line so ignore it and the newline and continue
                            c = seqSource.Read();
                            while (ReadHelper.IsEndOfLine(c) && c != -1)
                            {
                                c = seqSource.Read();
                            }

                            nextc = c;
                            break;
                        }

                        // dropping the backslash
                        // see 7.3.4.2 Literal strings for further information
                        writer.Write(next);
                        break;
                }
            }
            else
            {
                writer.Write(ch);
            }

            if (nextc != -2)
            {
                c = nextc;
            }
            else
            {
                c = seqSource.Read();
            }
        }

        if (c != -1)
        {
            seqSource.Unread(c);
        }

        // The writer buffers; flush before taking the bytes.
        writer.Flush();

        return new CosString(memoryStream.ToArray());
    }
}
/// <summary>
/// Parses an array object: '[' followed by elements up to the closing ']'.
/// An element parsed as a <c>CosObject</c> marks an indirect reference, which is
/// resolved from the two preceding integer elements.
/// </summary>
/// <param name="reader">The reader positioned at the opening '['.</param>
/// <param name="baseParser">The parser used for each element.</param>
/// <param name="pool">The pool references are resolved against.</param>
/// <returns>The parsed array; may be returned early when "endobj" or "endstream" is met.</returns>
public COSArray Parse(IRandomAccessRead reader, CosBaseParser baseParser, CosObjectPool pool)
{
    ReadHelper.ReadExpectedChar(reader, '[');
    var po = new COSArray();
    CosBase pbo;
    ReadHelper.SkipSpaces(reader);
    int i;
    while (((i = reader.Peek()) > 0) && ((char)i != ']'))
    {
        pbo = baseParser.Parse(reader, pool);
        if (pbo is CosObject)
        {
            // We have to check if the expected values are there or not PDFBOX-385.
            // The size checks keep a reference marker with fewer than two preceding
            // elements from indexing position -1.
            if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
            {
                var genNumber = (CosInt)po.remove(po.size() - 1);
                if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                {
                    var number = (CosInt)po.remove(po.size() - 1);
                    CosObjectKey key = new CosObjectKey(number.AsLong(), genNumber.AsInt());
                    pbo = pool.Get(key);
                }
                else
                {
                    // the object reference is somehow wrong
                    pbo = null;
                }
            }
            else
            {
                pbo = null;
            }
        }

        if (pbo != null)
        {
            po.add(pbo);
        }
        else
        {
            // It could be a bad object in the array which is just skipped.
            // This could also be an "endobj" or "endstream" which means we can assume that
            // the array has ended. The token is only inspected, so it is pushed back.
            string isThisTheEnd = ReadHelper.ReadString(reader);
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(isThisTheEnd));
            if (string.Equals(isThisTheEnd, "endobj") || string.Equals(isThisTheEnd, "endstream"))
            {
                return po;
            }
        }

        ReadHelper.SkipSpaces(reader);
    }

    // read ']'
    reader.Read();
    ReadHelper.SkipSpaces(reader);

    return po;
}
/// <summary>
/// Parses the next object, choosing the concrete parser from the first
/// non-whitespace character.
/// </summary>
/// <param name="reader">The reader to consume from.</param>
/// <param name="pool">Passed on to the dictionary and array parsers.</param>
/// <returns>
/// The parsed object, or <c>null</c> at the end of input or when an unrecognised token
/// was consumed.
/// </returns>
/// <exception cref="IOException">
/// Thrown for a malformed <c>true</c>/<c>false</c> keyword, or when an unknown character
/// yields an empty token.
/// </exception>
public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
{
    ReadHelper.SkipSpaces(reader);

    int nextByte = reader.Peek();
    if (nextByte == -1)
    {
        return null;
    }

    char c = (char)nextByte;

    switch (c)
    {
        case '<':
        {
            // Take the first '<' off only to look at what follows it, then restore it.
            int leftBracket = reader.Read();
            char following = (char)reader.Peek();
            reader.Unread(leftBracket);

            if (following != '<')
            {
                return stringParser.Parse(reader);
            }

            CosBase dictionary = dictionaryParser.Parse(reader, this, pool);
            ReadHelper.SkipSpaces(reader);
            return dictionary;
        }
        case '[':
            return arrayParser.Parse(reader, this, pool);
        case '(':
            return stringParser.Parse(reader);
        case '/':
            return nameParser.Parse(reader);
        case 'n':
            ReadHelper.ReadExpectedString(reader, "null");
            return CosNull.Null;
        case 't':
        {
            string truestring = OtherEncodings.BytesAsLatin1String(reader.ReadFully(4));
            if (!truestring.Equals("true"))
            {
                throw new IOException("expected true actual='" + truestring + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.True;
        }
        case 'f':
        {
            string falsestring = OtherEncodings.BytesAsLatin1String(reader.ReadFully(5));
            if (!falsestring.Equals("false"))
            {
                throw new IOException("expected false actual='" + falsestring + "' " + reader + "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.False;
        }
        case 'R':
            reader.Read();
            return new CosObject(null);
        default:
            break;
    }

    if (char.IsDigit(c) || c == '-' || c == '+' || c == '.')
    {
        // Collect every character that can be part of a number, then hand back the
        // first one that is not.
        StringBuilder digits = new StringBuilder();
        int ic = reader.Read();
        c = (char)ic;
        while (char.IsDigit(c) || c == '-' || c == '+' || c == '.' || c == 'E' || c == 'e')
        {
            digits.Append(c);
            ic = reader.Read();
            c = (char)ic;
        }

        if (ic != -1)
        {
            reader.Unread(ic);
        }

        return CosNumberFactory.get(digits.ToString()) as CosBase;
    }

    // This is not supposed to happen, but we allow for it so we are more
    // compatible with writers that don't follow the spec.
    string badstring = ReadHelper.ReadString(reader);
    if (badstring == string.Empty)
    {
        int peek = reader.Peek();

        // we can end up in an infinite loop otherwise
        throw new IOException("Unknown dir object c='" + c + "' cInt=" + (int)c + " peek='" + (char)peek + "' peekInt=" + peek + " at offset " + reader.GetPosition());
    }

    // if it's an endstream/endobj, we want to put it back so the caller will see it
    if (string.Equals("endobj", badstring) || string.Equals("endstream", badstring))
    {
        reader.Unread(OtherEncodings.StringAsLatin1Bytes(badstring));
    }

    return null;
}