/// <summary>
/// Lazily splits a string into a sequence of tokens: quoted literals, numbers, symbols and words.
/// </summary>
/// <remarks>
/// Whitespace separates tokens and is never part of one, except inside a quoted literal.
/// A quoted literal begins after the opening quote character and runs up to (not including)
/// the next occurrence of that character; the closing quote is skipped.
/// A number is a run of characters for which <c>char.IsNumber</c> is true, optionally containing
/// a single decimal separator (the first character of the culture's
/// <c>NumberDecimalSeparator</c>) provided it is directly followed by another number character.
/// Each token is linked to the token yielded before it through <c>Previous</c>.
/// </remarks>
/// <param name="s">The string to tokenise</param>
/// <param name="culture">Culture supplying the decimal separator used when scanning numbers</param>
/// <param name="symbolIsOneChar">When true, every symbol character becomes its own symbol token; otherwise adjacent symbol characters are merged</param>
/// <param name="quoteCharacter">Character delimiting quoted literal tokens, or null for no quoted literals</param>
/// <param name="excludedSymbols">Characters that must not be treated as symbols; null is treated as an empty list</param>
/// <returns>The tokens in the order in which they occur in the string</returns>
public static IEnumerable<Token> Tokenise(this string s, CultureInfo culture, bool symbolIsOneChar, char? quoteCharacter, List<char> excludedSymbols)
{
    string decimalSeparator = culture.NumberFormat.NumberDecimalSeparator;
    excludedSymbols = excludedSymbols ?? new List<char>();

    int index = 0;
    Token previous = null;

    while (true)
    {
        // Skip leading whitespace, remembering whether there was any.
        int scanStart = index;
        while (index < s.Length && char.IsWhiteSpace(s[index]))
        {
            index++;
        }
        bool precededByWhitespace = index > scanStart;

        if (index >= s.Length)
        {
            yield break;
        }

        // The first character decides the kind of token.
        // NOTE(review): constructor arguments look like (start, end) offsets into s - confirm against Token.
        Token current;
        char first = s[index];
        if (quoteCharacter.HasValue && first == quoteCharacter)
        {
            // Empty span positioned just after the opening quote.
            current = new QuotedLiteralToken(index + 1, index + 1);
        }
        else if (char.IsNumber(first))
        {
            current = new NumberToken(index, index + 1);
        }
        else if (IsSymbolChar(first, excludedSymbols))
        {
            current = new SymbolToken(index, index + 1);
        }
        else
        {
            current = new WordToken(index, index + 1);
        }
        index++;

        // The runtime type of the token never changes, so test it once.
        bool isQuoted = current is QuotedLiteralToken;
        bool isNumber = current is NumberToken;
        bool isSymbol = current is SymbolToken;
        bool isWord = current is WordToken;

        bool seenDecimalPoint = false;
        while (index < s.Length)
        {
            char c = s[index];

            // Only quoted literals may contain whitespace.
            if (!isQuoted && char.IsWhiteSpace(c))
            {
                break;
            }

            bool continuesToken =
                (isNumber && char.IsNumber(c))
                || (isSymbol && IsSymbolChar(c, excludedSymbols) && !symbolIsOneChar)
                || (isWord && !char.IsNumber(c) && !IsSymbolChar(c, excludedSymbols) && !char.IsWhiteSpace(c))
                || (isQuoted && c != quoteCharacter);

            if (!continuesToken)
            {
                // A number may absorb one decimal separator, but only when a number character follows it.
                bool isDecimalPoint = isNumber
                    && c == decimalSeparator[0]
                    && index + 1 < s.Length
                    && char.IsNumber(s[index + 1])
                    && !seenDecimalPoint;
                if (!isDecimalPoint)
                {
                    break;
                }
                seenDecimalPoint = true;
            }

            current.ExtendRight();
            index++;
        }

        current.Previous = previous;
        current.SetValue(s);
        current.HasWhitespaceBefore = precededByWhitespace;

        if (isQuoted)
        {
            // Step over the closing quote.
            index++;
        }

        yield return current;
        previous = current;
    }
}
/// <summary>
/// Scans a string from left to right and yields the tokens found in it.
/// </summary>
/// <remarks>
/// The kind of each token is chosen from its first character, tested in this order:
/// the quote character (quoted literal), a number character, a symbol character, anything else (word).
/// The token is then extended while the following characters are compatible with that kind.
/// Tokens are produced lazily, one per iteration of the returned enumerable.
/// </remarks>
/// <param name="s">The string to break into tokens</param>
/// <param name="culture">The culture whose number decimal separator is recognised inside number tokens</param>
/// <param name="symbolIsOneChar">Whether each symbol character forms a separate symbol token</param>
/// <param name="quoteCharacter">The character surrounding quoted literal tokens, if any</param>
/// <param name="excludedSymbols">Characters which are not to be considered symbols (may be null)</param>
/// <returns>An enumerable of the tokens, each linked to its predecessor</returns>
public static IEnumerable<Token> Tokenise(this string s, CultureInfo culture, bool symbolIsOneChar, char? quoteCharacter, List<char> excludedSymbols)
{
    string separator = culture.NumberFormat.NumberDecimalSeparator;
    List<char> nonSymbols = excludedSymbols ?? new List<char>();
    Token priorToken = null;
    int cursor = 0;

    while (cursor < s.Length)
    {
        // Consume any whitespace in front of the next token.
        bool sawWhitespace = false;
        for (; cursor < s.Length && char.IsWhiteSpace(s[cursor]); cursor++)
        {
            sawWhitespace = true;
        }

        if (cursor >= s.Length)
        {
            // Only trailing whitespace was left.
            yield break;
        }

        Token found;
        if (quoteCharacter.HasValue && s[cursor] == quoteCharacter)
        {
            // Starts empty, immediately after the opening quote.
            found = new QuotedLiteralToken(cursor + 1, cursor + 1);
        }
        else if (char.IsNumber(s[cursor]))
        {
            found = new NumberToken(cursor, cursor + 1);
        }
        else if (IsSymbolChar(s[cursor], nonSymbols))
        {
            found = new SymbolToken(cursor, cursor + 1);
        }
        else
        {
            found = new WordToken(cursor, cursor + 1);
        }

        bool decimalSeen = false;
        for (cursor++; cursor < s.Length; cursor++)
        {
            char ch = s[cursor];

            // Whitespace ends every token except a quoted literal.
            if (!(found is QuotedLiteralToken) && char.IsWhiteSpace(ch))
            {
                break;
            }

            bool belongsToToken;
            if (found is NumberToken && char.IsNumber(ch))
            {
                belongsToToken = true;
            }
            else if (found is SymbolToken && IsSymbolChar(ch, nonSymbols) && !symbolIsOneChar)
            {
                belongsToToken = true;
            }
            else if (found is WordToken && !char.IsNumber(ch) && !IsSymbolChar(ch, nonSymbols) && !char.IsWhiteSpace(ch))
            {
                belongsToToken = true;
            }
            else if (found is QuotedLiteralToken && ch != quoteCharacter)
            {
                belongsToToken = true;
            }
            else if (found is NumberToken && ch == separator[0] && cursor + 1 < s.Length && char.IsNumber(s[cursor + 1]) && !decimalSeen)
            {
                // First decimal separator inside a number, with a number character right after it.
                decimalSeen = true;
                belongsToToken = true;
            }
            else
            {
                belongsToToken = false;
            }

            if (!belongsToToken)
            {
                break;
            }

            found.ExtendRight();
        }

        found.Previous = priorToken;
        found.SetValue(s);
        found.HasWhitespaceBefore = sawWhitespace;

        if (found is QuotedLiteralToken)
        {
            // Skip the final quote.
            cursor++;
        }

        yield return found;
        priorToken = found;
    }
}