/// <summary>
/// Tokenizes an integer literal that begins at the current index.
/// Floating point literals are not supported.
/// </summary>
/// <param name="text">The character stream to be tokenized.</param>
/// <param name="i">The current index offset; it is moved by
/// ExtractElementToToken while the digits are read.</param>
/// <exception cref="TokenizerException">If a character is met that is
/// neither a digit nor one of VALID_SYMBOLS_AFTER_NUMBER, or if the
/// digits read cannot be parsed as an int.</exception>
private void ConsumeNumber(string text, ref int i)
{
    Assert(text != null);
    Assert(i < text.Length);

    // Digits are collected; anything that is neither a digit nor an allowed
    // trailing symbol aborts tokenization inside the helper.
    string digits = ExtractElementToToken(
        text,
        ref i,
        ch => char.IsDigit(ch),
        ch => !char.IsDigit(ch) && !VALID_SYMBOLS_AFTER_NUMBER.Contains(ch));

    // Guard for floating point input. Because only digits are accepted above,
    // the extracted text cannot currently contain a '.'; the check is retained
    // for when the accepted character set is widened.
    bool looksLikeFloat = digits.Contains('.');
    if (looksLikeFloat)
    {
        throw new TokenizerException(lineNumber, charOffset, "Floating point numbers not supported.");
    }

    // The parsed value is discarded; only whether parsing succeeds matters.
    int parsedValue = 0;
    bool parsedOk = int.TryParse(digits, out parsedValue);
    if (!parsedOk)
    {
        throw new TokenizerException(lineNumber, charOffset, $"Malformed number: {digits}");
    }

    Token numberToken = new Token(TokenType.Number, digits, lineNumber, charOffset);
    tokens.Add(numberToken);
    charOffset += digits.Length;
}
/// <summary>
/// Reads a word (a run of letters) from the character stream and records
/// it as a Word token. The stream index is moved by ExtractElementToToken
/// while the word is read.
/// </summary>
/// <param name="text">All the characters the tokenizer is tokenizing.
/// </param>
/// <param name="i">The offset at which the word starts.</param>
/// <exception cref="TokenizerException">If a digit, '$' or '@' is met
/// while reading the word (like ab$d or he0p or some@).</exception>
private void ConsumeWord(string text, ref int i)
{
    Assert(text != null);
    Assert(i < text.Length);

    // Letters are collected; a digit, '$' or '@' aborts tokenization inside
    // the helper.
    string word = ExtractElementToToken(
        text,
        ref i,
        ch => char.IsLetter(ch),
        ch => char.IsDigit(ch) || ch == '$' || ch == '@');

    Token wordToken = new Token(TokenType.Word, word, lineNumber, charOffset);
    tokens.Add(wordToken);
    charOffset += word.Length;
}
/// <summary>
/// Gets an identifier and then assigns the token type based on the
/// symbol prefix ('$' gives a DollarIdentifier, '@' gives an
/// AtIdentifier).
/// </summary>
/// <param name="text">The character stream to be tokenized.</param>
/// <param name="i">The current index offset, pointing at the symbol
/// prefix on entry.</param>
/// <param name="symbolPrefix">The symbol that prefixes this
/// identifier, used in determining what kind of token it is.</param>
/// <exception cref="TokenizerException">If the prefix is the last
/// character of the text, if no identifier character follows the
/// prefix, or if the extracted text does not match IDENTIFIER_REGEX.
/// </exception>
private void ExtractIdentifier(string text, ref int i, char symbolPrefix)
{
    Assert(text != null);
    Assert(i < text.Length);
    Assert(symbolPrefix == '$' || symbolPrefix == '@');

    i++; // Jump past the symbol, we don't need it anymore.

    // The prefix may be the very last character; without this check the
    // helper below would index past the end of the text.
    if (i >= text.Length)
    {
        throw new TokenizerException(lineNumber, charOffset,
            $"Malformed identifier: found '{symbolPrefix}' at EOF.");
    }

    ValidCharDelegate validDel = c => char.IsLetter(c) || c == '_' || c == '.';

    // ExtractElementToToken requires at least one acceptable character. If
    // none follows the prefix, report it here rather than letting the helper
    // run with nothing to consume and rewind the index onto the prefix.
    if (!validDel(text[i]))
    {
        throw new TokenizerException(lineNumber, charOffset,
            $"Malformed identifier: expected a name after '{symbolPrefix}'.");
    }

    string tokenStr = ExtractElementToToken(text, ref i, validDel);

    if (!IDENTIFIER_REGEX.Match(tokenStr).Success)
    {
        throw new TokenizerException(lineNumber, charOffset, $"Malformed identifier: {tokenStr}");
    }

    TokenType type = symbolPrefix == '$' ? TokenType.DollarIdentifier : TokenType.AtIdentifier;
    tokens.Add(new Token(type, tokenStr, lineNumber, charOffset));
    charOffset += 1 + tokenStr.Length; // 1 char symbol + token length characters read.
}
/// <summary>
/// Collects consecutive characters accepted by <paramref name="ValidCharDel"/>
/// starting at index <paramref name="i"/>. Moves the loop counter but does
/// not touch charOffset/lineNumber.
/// </summary>
/// <param name="text">The character stream to tokenize (in string
/// format).</param>
/// <param name="i">The character offset (will be modified). It is moved
/// past every accepted character and then stepped back by one, so the
/// caller's own increment lands on the character that ended the token.
/// </param>
/// <param name="ValidCharDel">Decides whether a character belongs to the
/// token: true accepts it, false ends the token.</param>
/// <param name="IllegalCharDel">Optional (may be null). When supplied, it
/// is consulted for every character before ValidCharDel, and a true
/// result raises an exception.</param>
/// <returns>The extracted token.</returns>
/// <exception cref="TokenizerException">If IllegalCharDel is not null
/// and it detects an illegal character.</exception>
private string ExtractElementToToken(string text, ref int i, ValidCharDelegate ValidCharDel,
                                     IllegalCharDelegate IllegalCharDel)
{
    Assert(text != null);
    Assert(i < text.Length);
    Assert(ValidCharDel != null);

    StringBuilder collected = new StringBuilder();

    // errorOffset tracks the column of the character being inspected so an
    // illegal character is reported at its own position; the real charOffset
    // is left for the caller to update.
    for (int errorOffset = charOffset; ; errorOffset++)
    {
        char current = text[i];

        bool isIllegal = IllegalCharDel != null && IllegalCharDel(current);
        if (isIllegal)
        {
            throw new TokenizerException(lineNumber, errorOffset, $"Unexpected character: {current.ToString()}");
        }

        if (!ValidCharDel(current))
        {
            break; // Terminating character: leave the index on it for now.
        }

        collected.Append(current);
        i++; // Only advance past characters that were accepted.

        if (i >= text.Length)
        {
            break; // Ran out of input.
        }
    }

    // The caller's for-loop increments the index after we return, so step back
    // one position here; that increment then lands on the character that
    // stopped the scan (or on the end of the text).
    i--;

    Assert(collected.Length > 0);

    return collected.ToString();
}
/// <summary>
/// Will consume a quoted string with any character in the quotes
/// (control characters are not accepted, except that tab is allowed).
/// </summary>
/// <param name="text">The character stream to be tokenized.</param>
/// <param name="i">The current index offset, pointing at the opening
/// quotation mark on entry. On success it is left on the closing
/// quotation mark.</param>
/// <exception cref="TokenizerException">If the quoted string is empty,
/// the quotation mark ending is missing, or the end of the text is
/// reached before finding it.</exception>
private void ConsumeQuotedString(string text, ref int i)
{
    Assert(text != null);
    Assert(i < text.Length);

    // Because we will be skipping the first quotation mark, we want to make sure
    // 'i' is actually still valid. Otherwise if there is no next character then we
    // know it's a malformed quote.
    i++;
    if (i >= text.Length)
    {
        throw new TokenizerException(lineNumber, charOffset, "Found starting quote at EOF.");
    }

    // Accepts printable characters and tab; rejects DEL (127), other control
    // characters and the closing quotation mark.
    ValidCharDelegate del = c => c != 127 && (c >= 32 || c == '\t') && c != '"';

    // ExtractElementToToken requires at least one acceptable character. If the
    // first character after the opening quote is not acceptable (for example
    // the closing quote of ""), report the empty string here instead of calling
    // the helper with nothing to consume, which would violate its non-empty
    // assertion and rewind the index in front of the opening quote.
    if (!del(text[i]))
    {
        throw new TokenizerException(lineNumber, charOffset, "Cannot have an empty quoted string.");
    }

    string tokenStr = ExtractElementToToken(text, ref i, del);

    // The helper leaves 'i' on the last character of the string contents, so
    // step onto what should be the closing quotation mark.
    i++;
    if (i >= text.Length)
    {
        throw new TokenizerException(lineNumber, charOffset, "Quotation mark not found (EOF).");
    }

    if (text[i] != '"')
    {
        throw new TokenizerException(lineNumber, charOffset,
            $"Could not find ending quotation mark, got '{text[i]}' instead.");
    }

    tokens.Add(new Token(TokenType.QuotedString, tokenStr, lineNumber, charOffset));
    charOffset += tokenStr.Length + 2; // +2 for two quotation marks.
}
/// <summary>
/// Convenience overload of ExtractElementToToken that performs no
/// illegal-character checking: it forwards to the four-argument overload
/// with a null IllegalCharDel. Moves the loop counter but not the
/// charOffset/lineNumber.
/// </summary>
/// <param name="text">The character stream to tokenize (in string
/// format).</param>
/// <param name="i">The character offset (will be modified).</param>
/// <param name="ValidCharDel">Decides whether a character belongs to the
/// token: true accepts it, false ends the token.</param>
/// <returns>The extracted token.</returns>
private string ExtractElementToToken(string text, ref int i, ValidCharDelegate ValidCharDel)
    => ExtractElementToToken(text, ref i, ValidCharDel, null);