/// <summary>
/// Get the next token.  The last token will be an EofToken unless
/// there's an unterminated quote or unterminated block comment
/// and Settings.DoUntermCheck is true, in which case this throws
/// an exception of type StreamTokenizerUntermException or sub-class.
/// </summary>
/// <remarks>
/// The character that terminates a token is left in nextTokenSb, so the
/// next call starts its state machine from that character instead of
/// reading a fresh one.
/// </remarks>
/// <param name="token">The output token.  Left null only when the state
/// machine hits an unrepresented state.</param>
/// <returns>bool - true when a token other than EofToken was produced;
/// false when the EofToken was produced or an unrepresented state
/// was hit.</returns>
/// <exception cref="StreamTokenizerUntermQuoteException">End of input was
/// reached inside a quote and settings.DoUntermCheck is true.</exception>
/// <exception cref="StreamTokenizerUntermCommentException">End of input was
/// reached inside a block comment and settings.DoUntermCheck is true.</exception>
public bool NextToken(out Token token)
{
    token = null;
    int thisChar = 0; // current character
    byte ctype; // type of this character
    NextTokenState state = NextTokenState.Start;
    int prevChar = 0; // previous character
    byte prevCtype = (byte)CharTypeBits.Eof;

    // get previous char from nextTokenSb if there
    // (nextTokenSb is the buffer containing the characters
    //  of the next token to be emitted)
    if (nextTokenSb.Length > 0)
    {
        prevChar = nextTokenSb[nextTokenSb.Length - 1];

        // The leftover character may lie outside the CharTypes table
        // (greater than 7-bit ascii).  Classify it as a word character,
        // exactly as the main loop below does, instead of indexing the
        // table out of range.
        prevCtype = (prevChar >= settings.CharTypes.Length)
            ? (byte)CharTypeBits.Word
            : settings.CharTypes[prevChar];
        state = PickNextState(prevCtype, prevChar);
    }

    // extra state for number parse
    int seenDot = 0; // how many .'s in the number
    int seenE = 0; // how many e's or E's have we seen in the number
    bool seenDigit = false; // seen any digits (numbers can start with -)

    // lineNumber can change with each GetNextChar()
    // tokenLineNumber is the line on which the token started
    int tokenLineNumber = lineNumber;

    // State Machine: Produces a single token.
    // Enter a state based on a single character.
    // Generally, being in a state means we're currently collecting chars
    // in that type of token.
    // We do state machine until it builds a token (Eof is a token), then
    // return that token.
    thisChar = prevChar;  // for first iteration, since prevChar is set to this
    bool done = false; // optimization
    while (!done)
    {
        prevChar = thisChar;
        thisChar = GetNextChar();
        if (thisChar >= settings.CharTypes.Length)
        {
            // greater than 7-bit ascii, treat as word character
            ctype = (byte)CharTypeBits.Word;
        }
        else
        {
            ctype = settings.CharTypes[thisChar];
        }
#if DEBUG
        log.Debug("Before switch: state = {0}, thisChar = '{1}'", state, (char)thisChar);
#endif
        // see if we need to change states, or emit a token
        switch (state)
        {
            case NextTokenState.Start:
                // RESET
                state = PickNextState(ctype, thisChar);
                tokenLineNumber = lineNumber;
                break;

            case NextTokenState.Char:
                // the single character was read on the previous iteration
                token = new CharToken((char)prevChar, tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
                break;

            case NextTokenState.Word:
                if ((!settings.IsCharType(ctype, CharTypeBits.Word))
                    && (!settings.IsCharType(ctype, CharTypeBits.Digit)))
                {
                    // end of word, emit
                    token = new WordToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                break;

            case NextTokenState.Whitespace:
                if (!settings.IsCharType(ctype, CharTypeBits.Whitespace)
                    || (settings.GrabEol && (thisChar == 10)))
                {
                    // end of whitespace, emit
                    if (settings.GrabWhitespace)
                    {
                        token = new WhitespaceToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // RESET
                        nextTokenSb.Length = 0;
                        tokenLineNumber = lineNumber;
                        state = PickNextState(ctype, thisChar);
                    }
                }
                break;

            case NextTokenState.EndQuote:
                // we're now 1 char after end of quote
                token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
                break;

            case NextTokenState.Quote:
                // looking for end quote matching char that started the quote
                if (thisChar == nextTokenSb[0])
                {
                    // handle escaped backslashes: count the immediately prior backslashes
                    // - even (including 0) means it's not escaped
                    // - odd means it is escaped
                    int backSlashCount = 0;
                    for (int i = nextTokenSb.Length - 1; i >= 0; i--)
                    {
                        if (nextTokenSb[i] == '\\') backSlashCount++;
                        else break;
                    }

                    if ((backSlashCount % 2) == 0)
                    {
                        state = NextTokenState.EndQuote;
                    }
                }

                if ((state != NextTokenState.EndQuote) && (thisChar == Eof))
                {
                    if (settings.DoUntermCheck)
                    {
                        nextTokenSb.Length = 0;
                        throw new StreamTokenizerUntermQuoteException("Unterminated quote");
                    }

                    // unterminated quote without the check: emit what we have
                    token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                break;

            case NextTokenState.MaybeComment:
                if (thisChar == Eof)
                {
                    token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                else
                {
                    // if we get the right char, we're in a comment
                    if (settings.SlashSlashComments && (thisChar == '/'))
                    {
                        state = NextTokenState.LineComment;
                    }
                    else if (settings.SlashStarComments && (thisChar == '*'))
                    {
                        state = NextTokenState.BlockComment;
                    }
                    else
                    {
                        token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                }
                break;

            case NextTokenState.LineComment:
                if (thisChar == Eof)
                {
                    if (settings.GrabComments)
                    {
                        token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // RESET
                        nextTokenSb.Length = 0;
                        tokenLineNumber = lineNumber;
                        state = PickNextState(ctype, thisChar);
                    }
                }
                else
                {
                    if (thisChar == '\n')
                    {
                        if (settings.GrabComments)
                        {
                            token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                            done = true;
                            nextTokenSb.Length = 0;
                        }
                        else
                        {
                            // RESET
                            nextTokenSb.Length = 0;
                            tokenLineNumber = lineNumber;
                            state = PickNextState(ctype, thisChar);
                        }
                    }
                }
                break;

            case NextTokenState.BlockComment:
                if (thisChar == Eof)
                {
                    if (settings.DoUntermCheck)
                    {
                        nextTokenSb.Length = 0;
                        throw new StreamTokenizerUntermCommentException("Unterminated comment.");
                    }

                    if (settings.GrabComments)
                    {
                        token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // RESET
                        nextTokenSb.Length = 0;
                        tokenLineNumber = lineNumber;
                        state = PickNextState(ctype, thisChar);
                    }
                }
                else
                {
                    if ((thisChar == '/') && (prevChar == '*'))
                    {
                        state = NextTokenState.EndBlockComment;
                    }
                }
                break;

            // special case for 2-character token termination
            case NextTokenState.EndBlockComment:
                if (settings.GrabComments)
                {
                    token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                else
                {
                    // RESET
                    nextTokenSb.Length = 0;
                    tokenLineNumber = lineNumber;
                    state = PickNextState(ctype, thisChar);
                }
                break;

            case NextTokenState.MaybeHex:
                // previous char was 0
                if (thisChar != 'x')
                {
                    // back up and try non-hex
                    // back up to the 0
                    nextTokenSb.Append((char)thisChar);
                    backString.Append(nextTokenSb);
                    nextTokenSb.Length = 0;

                    // reset state; MaybeHex is passed to PickNextState,
                    // presumably as the state not to pick again.
                    // pull char from backString
                    thisChar = backString[0];
                    backString.Remove(0, 1);
                    state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                        NextTokenState.MaybeHex);
#if DEBUG
                    log.Debug("MaybeHex: Next state on '{0}' is {1}", (char)thisChar, state);
#endif
                }
                else
                {
                    state = NextTokenState.HexGot0x;
                }
                break;

            case NextTokenState.HexGot0x:
                if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
                {
                    // got 0x but now a non-hex char
                    // back up to the 0
                    nextTokenSb.Append((char)thisChar);
                    backString.Append(nextTokenSb);
                    nextTokenSb.Length = 0;

                    // reset state; MaybeHex is passed to PickNextState,
                    // presumably as the state not to pick again.
                    // pull char from backString
                    thisChar = backString[0];
                    backString.Remove(0, 1);
                    state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                        NextTokenState.MaybeHex);
#if DEBUG
                    log.Debug("HexGot0x: Next state on '{0}' is {1}", ((char)thisChar).ToString(), state.ToString());
#endif
                }
                else
                {
                    state = NextTokenState.HexNumber;
                }
                break;

            case NextTokenState.HexNumber:
                if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
                {
                    // emit the hex number we've collected
#if DEBUG
                    log.Debug("Emit hex IntToken from string '{0}'", nextTokenSb);
#endif
                    token = IntToken.ParseHex(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                break;

            case NextTokenState.MaybeNumber:
                //
                // Determine whether or not to stop collecting characters for
                // the number parse.  We terminate when it's clear it's not
                // a number or no longer a number.
                //
                bool term = false;

                if (settings.IsCharType(ctype, CharTypeBits.Digit)
                    || settings.IsCharType(prevChar, CharTypeBits.Digit))
                {
                    seenDigit = true;
                }

                // term conditions
                if (thisChar == '.')
                {
                    seenDot++;
                    if (seenDot > 1) term = true;  // more than one dot, it aint a number
                }
                else if (((thisChar == 'e') || (thisChar == 'E')))
                {
                    seenE++;
                    if (!seenDigit) term = true;  // e before any digits is bad
                    else if (seenE > 1) term = true;  // more than 1 e is bad
                    else
                    {
                        term = true; // done regardless

                        // scan the exponent, put its characters into
                        // nextTokenSb, if there are any
                        char c;
                        expSb.Clear();
                        expSb.Append((char)thisChar);
                        if (GrabInt(expSb, true, out c))
                        {
                            // we got a good exponent, tack it on
                            nextTokenSb.Append(expSb);
                            thisChar = c; // and continue after the exponent's characters
                        }
                    }
                }
                else if (thisChar == Eof) term = true;
                // or a char that can't be in a number
                else if ((!settings.IsCharType(ctype, CharTypeBits.Digit)
                    && (thisChar != 'e') && (thisChar != 'E')
                    && (thisChar != '-') && (thisChar != '.'))
                    || ((thisChar == '+') && (seenE == 0)))
                {
                    // it's not a normal number character
                    term = true;
                }
                // or a dash not after e
                else if ((thisChar == '-') && (!((prevChar == 'e') || (prevChar == 'E')))) term = true;

                if (term)
                {
                    // we are terminating a number, or it wasn't a number
                    if (seenDigit)
                    {
                        if ((nextTokenSb.IndexOf('.') >= 0)
                            || (nextTokenSb.IndexOf('e') >= 0)
                            || (nextTokenSb.IndexOf('E') >= 0)
                            || (nextTokenSb.Length >= 19) // probably too large for Int64, use float
                            )
                        {
                            token = new FloatToken(nextTokenSb.ToString(), tokenLineNumber);
#if DEBUG
                            log.Debug("Emit FloatToken from string '{0}'", nextTokenSb);
#endif
                        }
                        else
                        {
#if DEBUG
                            log.Debug("Emit IntToken from string '{0}'", nextTokenSb);
#endif
                            token = new IntToken(nextTokenSb.ToString(), tokenLineNumber);
                        }

                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // -whatever or -.whatever
                        // didn't see any digits, must have gotten here by a leading -
                        // and no digits after it
                        // back up to -, pick next state excluding numbers
                        nextTokenSb.Append((char)thisChar);
                        backString.Append(nextTokenSb);
                        nextTokenSb.Length = 0;

                        // restart on the - and don't choose MaybeNumber state
                        // pull char from backString
                        thisChar = backString[0];
                        backString.Remove(0, 1);
                        state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                            NextTokenState.MaybeNumber);
#if DEBUG
                        log.Debug("MaybeNumber: Next state on '{0}' is {1}", (char)thisChar, state);
#endif
                    }
                }
                break;

            case NextTokenState.Eol:
                // tokenLineNumber - 1 because the newline char is on the previous line
                token = new EolToken(tokenLineNumber - 1);
                done = true;
                nextTokenSb.Length = 0;
                break;

            case NextTokenState.Eof:
                // the EofToken is handed back, but with a false return value
                token = new EofToken(tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
                return (false);

            case NextTokenState.Invalid:
            default:
                // not a good sign, some unrepresented state?
                log.Error("NextToken: Hit unrepresented state {0}", state.ToString());
                return (false);
        }

        // accumulate characters which are part of this token; after a token
        // is emitted this leaves the terminating character buffered for the
        // next call
        if (thisChar != Eof) nextTokenSb.Append((char)thisChar);
#if DEBUG
        log.Debug("After switch: state = {0}, nextTokenSb = '{1}', backString = '{2}'", state, nextTokenSb, backString);
#endif
    }

#if DEBUG
    log.Debug("Got token {0}", token.ToDebugString());
#endif
    return (true);
}
/// <summary>
/// Run the tokenizer over an entire file and hand back every token
/// it produced as an array.
/// </summary>
/// <param name="fileName">The file to tokenize.</param>
/// <returns>A Token[] holding all tokens, or null when the list-based
/// TokenizeFile overload reports failure or yields no tokens.</returns>
public Token[] TokenizeFile(string fileName)
{
    List<Token> tokens = new List<Token>();
    bool succeeded = TokenizeFile(fileName, tokens);

    // Failure and an empty result are both reported to the caller as null.
    if (!succeeded || tokens.Count == 0)
    {
        return null;
    }

    return tokens.ToArray();
}