// Registers the transition (oldState, c) -> newState in the transition table.
// Each (state, character) pair may be registered only once; a second
// registration of the same pair throws an Exception instead of overwriting
// the existing entry.
protected void AddTransition(TokenType oldState, char c, TokenType newState)
{
    var transitionKey = new TokenizerTableKey(oldState, c);

    if (!table.ContainsKey(transitionKey))
    {
        table[transitionKey] = newState;
        return;
    }

    // Duplicate registration: report which pair collided.
    var message = string.Format(
        "Tokenizer.AddTransition() : The key [{0}, {1}] already exists in the transition table",
        oldState,
        c);

    throw new Exception(message);
}
// Return the next valid token, reading one character at a time via GetChar().
//
// The method drives a table-based state machine: starting in S_Start, each
// character read selects the next state, and the characters consumed so far
// are accumulated in sbToken. The loop ends when
//   - an accepted token is delimited by white space, a newline or EOF,
//   - EOF is reached while in the start state (a T_EOF token is returned), or
//   - an error state is reached and RecoverToken() manages to fall back to a
//     previously visited accepting state.
//
// Throws TokenizerException when RecoverToken() cannot recover from an error
// state ("Lexical error ...") or when the loop exits in a state that is not in
// acceptableTokens ("Internal error ...").
// Integer and floating-point literals are converted with the invariant
// culture, so the result does not depend on the current thread culture.
protected override Token GetToken()
{
    var state_list = new List<TokenType>(); // List of states corresponding to not-yet-accepted characters
    char c;                                 // The character most recently read from GetChar()
    var start_col = col_num;                // The column of the first char in a token
    var s = TokenType.S_Start;              // Current state

    sbToken.Clear();
    state_list.Add(s);

    for (; ; ) // Loop until valid token read
    {
        char cSimplified;

        c = GetChar();

        // The transition table is looked up with a simplified character:
        // every letter is mapped to 'A' and every digit to '0'.
        if (char.IsLetter(c))
        {
            cSimplified = 'A';
        }
        else if (char.IsDigit(c))
        {
            cSimplified = '0';
        }
        else
        {
            cSimplified = c;
        }

        // Outside the start state, EOF and newline always delimit the current
        // token; other white space delimits it only when the current state is
        // not an internal string state (white space inside a string is kept).
        if (s != TokenType.S_Start &&
            (c == '\0' || c == '\n' || (char.IsWhiteSpace(c) && !dictInternalStringStateToCompletedState.ContainsKey(s))))
        {
            if (dictInternalStringStateToCompletedState.ContainsKey(s) && (c == '\0' || c == '\n')) // Newline or EOF delimits string literal
            {
                s = dictInternalStringStateToCompletedState[s];
            }

            // Only a delimiting newline is counted as consumed here; line_num
            // is updated after the token has been built (end of this method).
            if (c == '\n')
            {
                ++char_num;
            }

            if (acceptableTokens.Contains(s))
            {
                // Valid token has been delimited by white space
                break;
            }
            else
            {
                // Error - try to recover
                s = TokenType.S_Error;
            }
        }

        TokenType newState;

        if (s == TokenType.S_Start && c == '\0')
        {
            newState = TokenType.T_EOF;
        }
        else if (s == TokenType.S_Start && c == '\n')
        {
            ++line_num;
            col_num = 1;
            start_col = 1;  // Don't buffer white space
            newState = s;   // TokenType.S_Start;
        }
        else if (s == TokenType.S_Start && char.IsWhiteSpace(c))
        {
            ++start_col;    // Don't buffer white space
            newState = s;   // TokenType.S_Start;
        }
        else if (s == TokenType.S_Start && removeComments && c == cCommentDelimiter)
        {
            // Skip the rest of the line: a comment runs to the next newline or EOF.
            for (; ; )
            {
                ++char_num;
                c = GetChar();

                if (c == '\0')
                {
                    newState = TokenType.T_EOF;
                    break;
                }
                else if (c == '\n')
                {
                    ++line_num;
                    col_num = 1;
                    start_col = 1;  // Don't buffer white space
                    newState = s;   // TokenType.S_Start;
                    break;
                }
            }
        }
#if DEAD_CODE
        else if (s == TokenType.S_StrLitOpen && c != cStringDelimiter)
        {
            newState = s;
        }
#else
        // Inside a string: stay in the same state until its closing delimiter is read.
        else if (dictInternalStringStateToDelimiter.ContainsKey(s) && c != dictInternalStringStateToDelimiter[s])
        {
            newState = s;
        }
#endif
        else
        {
            var key = new TokenizerTableKey(s, cSimplified);

            if (table.ContainsKey(key))
            {
                newState = table[key];
            }
            else
            {
                newState = TokenType.S_Error;
            }
        }

        s = newState;

        if (s == TokenType.T_EOF)
        {
            break;
        }
        else if (s == TokenType.S_Error)
        {
            s = RecoverToken(state_list);

            if (s != TokenType.S_Error)
            {
                // Rewind the position counters so that they point just past
                // the characters that remain in sbToken.
                var rewindAmount = col_num - (start_col + sbToken.Length);

                char_num -= rewindAmount;
                col_num = start_col + sbToken.Length;
                break; // Valid token recovered
            }

            throw new TokenizerException(string.Format("Lexical error at line {0}, column {1}", line_num, start_col), line_num, start_col);

            // Disabled alternative kept from the original source (it appears to
            // discard the unmatched character and restart instead of throwing):
            /*
             * Console.WriteLine("Discarding unmatched '{0}' at column {1}", m_str[start_col], start_col);
             * was_err = true;
             * strlit.Clear(); ++start_col;
             * col_num = start_col;
             * token_str.Clear();
             * s = TokenType.S_Start; // Start over at next unmatched char
             * state_list.Clear();
             * state_list.Add(s);
             */
        }
        else
        {
            // The character is accepted: advance the position counters and,
            // unless still in the start state, buffer it with its state.
            ++char_num;

            if (c != '\n')
            {
                ++col_num;
            }

            if (s != TokenType.S_Start)
            {
                sbToken.Append(c);
                state_list.Add(s);
            }
        }
    }

    if (!acceptableTokens.Contains(s))
    {
        throw new TokenizerException(
            string.Format("Internal error at line {0}, column {1}: Non-token {2} accepted", line_num, start_col, s),
            line_num, start_col);
    }

    //Console.WriteLine("Token: {0} ; Line {1}; Column: {2}", sbToken.ToString(), line_num, start_col);

    var token_str = sbToken.ToString();
    object tokenValue;

    switch (s)
    {
        case TokenType.T_IntLit:
            // Source-code literals are culture-independent: parse with the
            // invariant culture rather than the current thread culture.
            tokenValue = int.Parse(token_str, System.Globalization.CultureInfo.InvariantCulture);
            break;

        case TokenType.T_FltLit:
            // With the current culture, "3.14" would be misread under cultures
            // that use '.' as a group separator (e.g. de-DE).
            tokenValue = double.Parse(token_str, System.Globalization.CultureInfo.InvariantCulture);
            break;

        case TokenType.T_StrLit:
            tokenValue = GetStrLitFromTokenStr(cStringDelimiter);
            break;

        case TokenType.T_EOF:
            tokenValue = "EOF";
            break;

        default:
            // s is passed by reference so that ExtendedGetTokenValue can change
            // the token type (the original note mentions the Prolog interpreter
            // turning single-quoted strings into identifiers).
            tokenValue = ExtendedGetTokenValue(ref s, token_str);

            if (tokenValue == null)
            {
                // No extended value supplied: fall back to the raw token text.
                tokenValue = token_str;
            }

            break;
    }

    var token = new Token(s, tokenValue, line_num, start_col);

    // The token is stamped with the line it started on; only afterwards is a
    // terminating newline accounted for.
    if (c == '\n')
    {
        ++line_num;
        col_num = 1;
    }

    return token;
}