/**
  <summary>Advances the tokenizer to the next token in the stream.</summary>
  <remarks>
    <para>Leading white-space bytes are discarded, then the first significant byte selects
    the token kind (name, number, array/dictionary delimiter, hexadecimal string, literal
    string, comment or keyword). The recognized kind is stored in <c>tokenType</c> and its
    value, if any, in <c>token</c>.</para>
    <para>Names, numbers and keywords are terminated by a byte that does not belong to them;
    that byte is pushed back (<c>stream.Skip(-1)</c>) unless the end of the stream was hit,
    so the following call starts from it.</para>
    <para>A <c>ParseException</c> is thrown for an opening angle bracket at the end of the
    stream, an unterminated hexadecimal string, a closing angle bracket that is not doubled,
    and an unterminated literal string.</para>
  </remarks>
  <returns><c>false</c> if the stream ended before any significant byte was found;
  <c>true</c> otherwise.</returns>
*/
public virtual bool MoveNext(
  )
{
  token = null;
  StringBuilder text = null;

  // Discard leading white space; stop as soon as the stream is exhausted.
  int c = stream.ReadByte();
  while (c != -1 && IsWhitespace(c))
  {
    c = stream.ReadByte();
  }
  if (c == -1)
  {
    return false;
  }

  // The first significant byte decides which kind of token follows.
  switch (c)
  {
    case Symbol.Slash: // Name.
      tokenType = TokenTypeEnum.Name;
      /*
        NOTE: Name bytes are passed through verbatim (no unescaping), as names are plain
        symbols identified by their byte sequence rather than text.
      */
      text = new StringBuilder();
      for (
        c = stream.ReadByte();
        c != -1 && !IsDelimiter(c) && !IsWhitespace(c);
        c = stream.ReadByte()
        )
      {
        text.Append((char)c);
      }
      if (c >= 0)
      {
        stream.Skip(-1); // Gives back the byte that terminated the name.
      }
      break;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case '.': case '-': case '+': // Number.
      // A leading digit or sign is provisionally an integer; any decimal point makes it real.
      tokenType = (c == '.') ? TokenTypeEnum.Real : TokenTypeEnum.Integer;
      text = new StringBuilder();
      do
      {
        text.Append((char)c);
        c = stream.ReadByte();
        if (c == '.')
        {
          tokenType = TokenTypeEnum.Real;
        }
      } while (c == '.' || (c >= '0' && c <= '9'));
      if (c >= 0)
      {
        stream.Skip(-1); // Gives back the byte that terminated the number.
      }
      break;

    case Symbol.OpenSquareBracket: // Array (begin).
      tokenType = TokenTypeEnum.ArrayBegin;
      break;

    case Symbol.CloseSquareBracket: // Array (end).
      tokenType = TokenTypeEnum.ArrayEnd;
      break;

    case Symbol.OpenAngleBracket: // Dictionary (begin) | Hexadecimal string.
      c = stream.ReadByte();
      if (c == -1)
      {
        throw new ParseException("Unexpected EOF (isolated opening angle-bracket character).");
      }

      if (c == Symbol.OpenAngleBracket)
      {
        // Doubled angle bracket: dictionary opening.
        tokenType = TokenTypeEnum.DictionaryBegin;
      }
      else
      {
        // Single angle bracket: hexadecimal string, collected up to the closing bracket.
        tokenType = TokenTypeEnum.Hex;
        text = new StringBuilder();
        while (c != Symbol.CloseAngleBracket)
        {
          bool significant = !IsWhitespace(c);
          if (significant)
          {
            text.Append((char)c);
          }

          c = stream.ReadByte();
          if (c == -1)
          {
            throw new ParseException("Unexpected EOF (malformed hex string).");
          }
        }
      }
      break;

    case Symbol.CloseAngleBracket: // Dictionary (end).
      c = stream.ReadByte();
      if (c != Symbol.CloseAngleBracket)
      {
        throw new ParseException("Malformed dictionary.", stream.Position);
      }

      tokenType = TokenTypeEnum.DictionaryEnd;
      break;

    case Symbol.OpenRoundBracket: // Literal string.
    {
      tokenType = TokenTypeEnum.Literal;
      text = new StringBuilder();
      int depth = 0; // Nesting of unescaped round brackets; -1 means the string is closed.
      for (;;)
      {
        c = stream.ReadByte();
        if (c == -1)
        {
          break;
        }

        if (c == Symbol.OpenRoundBracket)
        {
          depth++;
        }
        else if (c == Symbol.CloseRoundBracket)
        {
          depth--;
        }
        else if (c == '\\')
        {
          // Escape sequence: the byte after the backslash selects the replacement.
          bool lineContinuation = false;
          c = stream.ReadByte();
          switch (c)
          {
            case 'n':
              c = Symbol.LineFeed;
              break;
            case 'r':
              c = Symbol.CarriageReturn;
              break;
            case 't':
              c = '\t';
              break;
            case 'b':
              c = '\b';
              break;
            case 'f':
              c = '\f';
              break;
            case Symbol.OpenRoundBracket:
            case Symbol.CloseRoundBracket:
            case '\\':
              // The escaped byte stands for itself.
              break;
            case Symbol.CarriageReturn:
              // Backslash + end-of-line: the line break is dropped (CR or CR LF).
              lineContinuation = true;
              c = stream.ReadByte();
              if (c != Symbol.LineFeed)
              {
                stream.Skip(-1);
              }
              break;
            case Symbol.LineFeed:
              lineContinuation = true;
              break;
            default:
              // Anything that is not an octal digit is kept as is.
              if (c >= '0' && c <= '7')
              {
                // Octal character code: one digit already read, up to two more may follow.
                int code = c - '0';
                int extraDigits = 0;
                for (;;)
                {
                  c = stream.ReadByte();
                  if (c < '0' || c > '7')
                  {
                    // Shorter code: emit what was gathered and give the byte back.
                    c = code;
                    stream.Skip(-1);
                    break;
                  }

                  code = (code << 3) + c - '0';
                  if (++extraDigits == 2)
                  {
                    // Three digits read: only the low-order byte is retained.
                    c = code & 0xff;
                    break;
                  }
                }
              }
              break;
          }

          if (lineContinuation)
          {
            continue;
          }
          if (c == -1)
          {
            break;
          }
        }
        else if (c == Symbol.CarriageReturn)
        {
          // Unescaped CR or CR LF inside the string is stored as a single line feed.
          c = stream.ReadByte();
          if (c == -1)
          {
            break;
          }
          if (c != Symbol.LineFeed)
          {
            c = Symbol.LineFeed;
            stream.Skip(-1);
          }
        }

        if (depth == -1)
        {
          // The bracket that opened the string has been matched.
          break;
        }

        text.Append((char)c);
      }

      if (c == -1)
      {
        throw new ParseException("Malformed literal string.");
      }
      break;
    }

    case Symbol.Percent: // Comment.
      tokenType = TokenTypeEnum.Comment;
      text = new StringBuilder();
      // The comment extends to the end of the line (or of the stream).
      for (c = stream.ReadByte(); c != -1 && !IsEOL(c); c = stream.ReadByte())
      {
        text.Append((char)c);
      }
      break;

    default: // Keyword.
      tokenType = TokenTypeEnum.Keyword;
      text = new StringBuilder();
      do
      {
        text.Append((char)c);
        c = stream.ReadByte();
      } while (c != -1 && !IsDelimiter(c) && !IsWhitespace(c));
      if (c >= 0)
      {
        stream.Skip(-1); // Gives back the byte that terminated the keyword.
      }
      break;
  }

  // Tokens without textual content (array/dictionary delimiters) carry no value.
  if (text != null)
  {
    string content = text.ToString();
    switch (tokenType)
    {
      case TokenTypeEnum.Keyword:
        token = content;
        // Some keywords are actually boolean or null objects.
        switch (content)
        {
          case Keyword.False:
          case Keyword.True:
            tokenType = TokenTypeEnum.Boolean;
            token = bool.Parse(content);
            break;
          case Keyword.Null:
            tokenType = TokenTypeEnum.Null;
            token = null;
            break;
        }
        break;

      case TokenTypeEnum.Name:
      case TokenTypeEnum.Literal:
      case TokenTypeEnum.Hex:
      case TokenTypeEnum.Comment:
        token = content;
        break;

      case TokenTypeEnum.Integer:
        token = Int32.Parse(content, NumberStyles.Integer, StandardNumberFormatInfo);
        break;

      case TokenTypeEnum.Real:
        token = Double.Parse(content, NumberStyles.Float, StandardNumberFormatInfo);
        break;
    }
  }

  return true;
}
/**
  <summary>Parses the next token [PDF:1.6:3.1].</summary>
  <remarks>
    <para>Leading white-space bytes are skipped, then the first significant byte selects the
    token kind. The recognized kind is stored in <c>tokenType</c> and its value, if any, in
    <c>token</c>.</para>
    <para>When a token starts with a digit, the method looks ahead for the pattern
    <c>int int 'R'</c> (indirect reference, postfix notation [PDF:1.6:3.2.9]) by calling
    itself with <c>multipleTokenParsing</c> enabled; if the pattern does not match, the
    stream is rewound to just after the first integer, which is returned on its own.</para>
    <para>Names, numbers and keywords are terminated by a byte that does not belong to them
    or by the end of the stream; a terminating byte is pushed back so that the following
    call starts from it.</para>
  </remarks>
  <returns><c>false</c> if the stream ended before any significant byte was found;
  <c>true</c> otherwise.</returns>
  <exception cref="FileFormatException">An opening angle bracket is the last byte of the
  stream, a hexadecimal or literal string is not terminated, or a closing angle bracket is
  not doubled.</exception>
*/
public bool MoveNext(
  )
{
  StringBuilder buffer = null;
  token = null;
  int c;

  // Skip white-space characters [PDF:1.6:3.1.1].
  do
  {
    c = stream.ReadByte();
    if (c == -1)
    {
      return false;
    }
  } while (IsWhitespace(c));

  // Which character is it?
  switch (c)
  {
    case Symbol.Slash: // Name.
    {
      tokenType = TokenTypeEnum.Name;

      buffer = new StringBuilder();
      while (true)
      {
        c = stream.ReadByte();
        // The end of the stream terminates the name just like a delimiter does.
        if (c == -1 || IsDelimiter(c) || IsWhitespace(c))
        {
          break;
        }

        buffer.Append((char)c);
      }
      if (c != -1)
      {
        stream.Skip(-1); // Recover the first byte after the current token.
      }
      break;
    }

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case '.': case '-': case '+': // Number [PDF:1.6:3.2.2] | Indirect reference.
    {
      switch (c)
      {
        case '.': // Decimal point.
          tokenType = TokenTypeEnum.Real;
          break;
        case '-':
        case '+': // Signum.
          tokenType = TokenTypeEnum.Integer; // By default (it may be real).
          break;
        default: // Digit.
          if (!multipleTokenParsing) // Maybe an indirect reference.
          {
            /*
              NOTE: We need to identify this pattern:
                ref := { int int 'R' }
              The nested calls below run with multiple token parsing enabled so that they
              parse plain numbers instead of recursing into this look-ahead again.
            */
            multipleTokenParsing = true;
            try
            {
              // 1. Object number.
              stream.Skip(-1);
              MoveNext();
              if (tokenType != TokenTypeEnum.Integer)
              {
                // Not an integer (e.g. a real number): return it as parsed.
                return true;
              }
              int objectNumber = (int)token;

              // Backup the recovery position!
              long oldOffset = stream.Position;

              // 2. Generation number, 3. Reference keyword.
              // NOTE: The result of each nested call is checked because at the end of the
              // stream it leaves a stale token type and a null token behind.
              if (MoveNext() && tokenType == TokenTypeEnum.Integer)
              {
                int generationNumber = (int)token;
                if (MoveNext() && tokenType == TokenTypeEnum.Reference)
                {
                  token = new Reference(objectNumber, generationNumber);
                  return true;
                }
              }

              // Rollback: it was just a plain integer.
              stream.Seek(oldOffset);
              token = objectNumber;
              tokenType = TokenTypeEnum.Integer;
              return true;
            }
            finally
            {
              // This state MUST be disabled before leaving, even if parsing failed.
              multipleTokenParsing = false;
            }
          }

          // Plain number (multiple token parsing -- see indirect reference search).
          tokenType = TokenTypeEnum.Integer; // By default (it may be real).
          break;
      }

      // Building the number...
      buffer = new StringBuilder();
      while (true)
      {
        buffer.Append((char)c);
        c = stream.ReadByte();
        if (c == '.')
        {
          tokenType = TokenTypeEnum.Real;
        }
        else if (c < '0' || c > '9') // Terminating byte or end of the stream (-1).
        {
          break;
        }
      }
      if (c != -1)
      {
        stream.Skip(-1); // Recover the first byte after the current token.
      }
      break;
    }

    case Symbol.OpenSquareBracket: // Array (begin).
      tokenType = TokenTypeEnum.ArrayBegin;
      break;

    case Symbol.CloseSquareBracket: // Array (end).
      tokenType = TokenTypeEnum.ArrayEnd;
      break;

    case Symbol.OpenAngleBracket: // Dictionary (begin) | Hexadecimal string.
    {
      c = stream.ReadByte();
      if (c == -1)
      {
        throw new FileFormatException("Unexpected EOF (isolated opening angle-bracket character).", stream.Position);
      }

      // Is it a dictionary (2nd angle bracket [PDF:1.6:3.2.6])?
      if (c == Symbol.OpenAngleBracket)
      {
        tokenType = TokenTypeEnum.DictionaryBegin;
        break;
      }

      // Hexadecimal string (single angle bracket [PDF:1.6:3.2.3]).
      tokenType = TokenTypeEnum.Hex;

      // [FIX:0.0.4:4] It skipped after the first hexadecimal character, missing it.
      buffer = new StringBuilder();
      while (c != Symbol.CloseAngleBracket) // NOT string end.
      {
        // White space inside a hexadecimal string is not part of its value.
        if (!IsWhitespace(c))
        {
          buffer.Append((char)c);
        }

        c = stream.ReadByte();
        if (c == -1)
        {
          throw new FileFormatException("Unexpected EOF (malformed hex string).", stream.Position);
        }
      }
      break;
    }

    case Symbol.CloseAngleBracket: // Dictionary (end).
      c = stream.ReadByte();
      if (c != Symbol.CloseAngleBracket)
      {
        throw new FileFormatException("Malformed dictionary.", stream.Position);
      }

      tokenType = TokenTypeEnum.DictionaryEnd;
      break;

    case Symbol.OpenRoundBracket: // Literal string.
    {
      tokenType = TokenTypeEnum.Literal;

      buffer = new StringBuilder();
      int level = 0; // Nesting of unescaped round brackets; -1 means the string is closed.
      while (true)
      {
        c = stream.ReadByte();
        if (c == -1)
        {
          break;
        }

        if (c == Symbol.OpenRoundBracket)
        {
          level++;
        }
        else if (c == Symbol.CloseRoundBracket)
        {
          level--;
        }
        else if (c == '\\')
        {
          // Escape sequence: the byte after the backslash selects the replacement.
          bool lineBreak = false;
          c = stream.ReadByte();
          switch (c)
          {
            case 'n':
              c = Symbol.LineFeed;
              break;
            case 'r':
              c = Symbol.CarriageReturn;
              break;
            case 't':
              c = '\t';
              break;
            case 'b':
              c = '\b';
              break;
            case 'f':
              c = '\f';
              break;
            case Symbol.OpenRoundBracket:
            case Symbol.CloseRoundBracket:
            case '\\':
              // The escaped byte stands for itself.
              break;
            case Symbol.CarriageReturn:
              // Backslash + end-of-line: the line break is dropped (CR or CR LF).
              lineBreak = true;
              c = stream.ReadByte();
              if (c != Symbol.LineFeed)
              {
                stream.Skip(-1);
              }
              break;
            case Symbol.LineFeed:
              lineBreak = true;
              break;
            default:
            {
              // Is it outside the octal encoding?
              if (c < '0' || c > '7')
              {
                break;
              }

              // Octal [PDF:1.6:3.2.3]: one to three digits.
              int octal = c - '0';
              c = stream.ReadByte();
              // Octal end?
              if (c < '0' || c > '7')
              {
                c = octal;
                stream.Skip(-1);
                break;
              }
              octal = (octal << 3) + c - '0';
              c = stream.ReadByte();
              // Octal end?
              if (c < '0' || c > '7')
              {
                c = octal;
                stream.Skip(-1);
                break;
              }
              octal = (octal << 3) + c - '0';
              c = octal & 0xff; // Only the low-order byte is retained.
              break;
            }
          }
          if (lineBreak)
          {
            continue;
          }
          if (c == -1)
          {
            break;
          }
        }
        else if (c == Symbol.CarriageReturn)
        {
          // Unescaped CR or CR LF inside the string is stored as a single line feed.
          c = stream.ReadByte();
          if (c == -1)
          {
            break;
          }
          if (c != Symbol.LineFeed)
          {
            c = Symbol.LineFeed;
            stream.Skip(-1);
          }
        }

        if (level == -1)
        {
          // The bracket that opened the string has been matched.
          break;
        }

        buffer.Append((char)c);
      }
      if (c == -1)
      {
        throw new FileFormatException("Malformed literal string.", stream.Position);
      }
      break;
    }

    case Symbol.CapitalR: // Indirect reference | Keyword beginning with 'R'.
    {
      // Peek at the following byte: 'R' is the reference keyword only when it stands alone.
      int next = stream.ReadByte();
      if (next != -1)
      {
        stream.Skip(-1); // Give the peeked byte back.

        if (!IsDelimiter(next) && !IsWhitespace(next))
        {
          // Longer keyword (c still holds 'R'): parse it as any other keyword.
          goto default;
        }
      }

      tokenType = TokenTypeEnum.Reference;
      break;
    }

    case Symbol.Percent: // Comment [PDF:1.6:3.1.2].
      tokenType = TokenTypeEnum.Comment;

      buffer = new StringBuilder();
      while (true)
      {
        c = stream.ReadByte();
        if (c == -1 || IsEOL(c))
        {
          break;
        }

        buffer.Append((char)c);
      }
      break;

    default: // Keyword object.
      tokenType = TokenTypeEnum.Keyword;

      buffer = new StringBuilder();
      do
      {
        buffer.Append((char)c);
        c = stream.ReadByte();
      } while (c != -1 && !IsDelimiter(c) && !IsWhitespace(c));
      // NOTE: Rewinding after the end of the stream would make the last byte of the
      // keyword be parsed again on the next call.
      if (c != -1)
      {
        stream.Skip(-1); // Recover the first byte after the current token.
      }
      break;
  }

  if (buffer != null)
  {
    /*
      Current token initialization.
    */
    // Which token type?
    switch (tokenType)
    {
      case TokenTypeEnum.Keyword:
        token = buffer.ToString();
        // Late recognition.
        switch ((string)token)
        {
          case Keyword.False:
          case Keyword.True: // Boolean.
            tokenType = TokenTypeEnum.Boolean;
            token = bool.Parse((string)token);
            break;
          case Keyword.Null: // Null.
            tokenType = TokenTypeEnum.Null;
            token = null;
            break;
        }
        break;

      case TokenTypeEnum.Comment:
      case TokenTypeEnum.Hex:
      case TokenTypeEnum.Name:
        token = buffer.ToString();
        break;

      case TokenTypeEnum.Literal:
        token = buffer.ToString();
        // Late recognition.
        // NOTE: Ordinal comparison, as the string holds raw bytes rather than linguistic text.
        if (((string)token).StartsWith(Keyword.DatePrefix, StringComparison.Ordinal)) // Date.
        {
          tokenType = TokenTypeEnum.Date;
          token = PdfDate.ToDate((string)token);
        }
        break;

      case TokenTypeEnum.Integer:
        token = Int32.Parse(
          buffer.ToString(),
          NumberStyles.Integer,
          StandardNumberFormatInfo
          );
        break;

      case TokenTypeEnum.Real:
        // [FIX:1668410] Parsing of float numbers was buggy (localized default number format).
        token = Single.Parse(
          buffer.ToString(),
          NumberStyles.Float,
          StandardNumberFormatInfo
          );
        break;
    }
  }

  return true;
}