Пример #1
0
        /// <summary>
        /// Lazily splits a string into quoted-literal, number, symbol and word tokens,
        /// scanning left to right and dropping the whitespace that separates them.
        /// </summary>
        /// <remarks>
        /// The kind of each token is decided by its first character, tested in this order:
        /// quote character, <c>char.IsNumber</c>, <c>IsSymbolChar</c>, otherwise word.
        /// A quoted literal starts after the opening quote and runs up to, but not including,
        /// the next quote character (or the end of the string); whitespace inside it is kept.
        /// A number may absorb the first character of the culture's decimal separator once,
        /// and only when a number character follows it.
        /// Every token is linked to the one yielded before it through <c>Previous</c>.
        /// Because this is an iterator, nothing runs until the result is enumerated.
        /// </remarks>
        /// <param name="s">The text to scan; it is dereferenced without a null check</param>
        /// <param name="culture">Culture supplying the decimal separator; dereferenced without a null check</param>
        /// <param name="symbolIsOneChar">When true a symbol token is never extended past its first character</param>
        /// <param name="quoteCharacter">Character delimiting quoted literals, or null to disable quoted literals</param>
        /// <param name="excludedSymbols">Characters passed to <c>IsSymbolChar</c> as exclusions; null is treated as an empty list</param>
        /// <returns>The tokens in the order they occur in <paramref name="s"/></returns>
        public static IEnumerable <Token> Tokenise(this string s, CultureInfo culture, bool symbolIsOneChar, char?quoteCharacter, List <char> excludedSymbols)
        {
            string      separator  = culture.NumberFormat.NumberDecimalSeparator;
            List <char> nonSymbols = excludedSymbols ?? new List <char>();
            Token       previous   = null;
            int         index      = 0;

            while (index < s.Length)
            {
                // Step over the whitespace run in front of the next token, remembering whether there was one.
                bool gapBefore = false;
                for (; index < s.Length && char.IsWhiteSpace(s[index]); index++)
                {
                    gapBefore = true;
                }

                // Only trailing whitespace was left: there is no further token.
                if (index >= s.Length)
                {
                    yield break;
                }

                // The first character alone determines the token kind.
                char  first = s[index];
                Token current;
                if (quoteCharacter.HasValue && first == quoteCharacter)
                {
                    // Both offsets sit just past the opening quote, so the quote itself is left out.
                    current = new QuotedLiteralToken(index + 1, index + 1);
                }
                else if (char.IsNumber(first))
                {
                    current = new NumberToken(index, index + 1);
                }
                else if (IsSymbolChar(first, nonSymbols))
                {
                    current = new SymbolToken(index, index + 1);
                }
                else
                {
                    current = new WordToken(index, index + 1);
                }
                index++;

                bool isQuoted = current is QuotedLiteralToken;
                bool isNumber = current is NumberToken;
                bool isSymbol = current is SymbolToken;
                bool isWord   = current is WordToken;

                bool separatorUsed = false;
                while (index < s.Length)
                {
                    char next = s[index];

                    // Whitespace ends every token except a quoted literal.
                    if (!isQuoted && char.IsWhiteSpace(next))
                    {
                        break;
                    }

                    bool sameKind =
                        (isNumber && char.IsNumber(next)) ||
                        (isSymbol && IsSymbolChar(next, nonSymbols) && !symbolIsOneChar) ||
                        (isWord && !char.IsNumber(next) && !IsSymbolChar(next, nonSymbols) && !char.IsWhiteSpace(next)) ||
                        (isQuoted && next != quoteCharacter);

                    if (!sameKind)
                    {
                        // A number may still continue across one decimal separator that has a number character after it.
                        bool startsFraction =
                            isNumber &&
                            next == separator[0] &&
                            index + 1 < s.Length &&
                            char.IsNumber(s[index + 1]) &&
                            !separatorUsed;

                        if (!startsFraction)
                        {
                            break;
                        }

                        separatorUsed = true;
                    }

                    current.ExtendRight();
                    index++;
                }

                current.Previous = previous;
                current.SetValue(s);
                current.HasWhitespaceBefore = gapBefore;

                // Move past the closing quote so it does not begin another token.
                if (isQuoted)
                {
                    index++;
                }

                yield return current;

                previous = current;
            }
        }
Пример #2
0
        /// <summary>
        /// Walks a string once and yields its tokens (quoted literals, numbers, symbols, words) on demand.
        /// </summary>
        /// <remarks>
        /// Whitespace separates tokens and is not part of any token except inside a quoted literal.
        /// The first character picks the token type: the quote character first, then <c>char.IsNumber</c>,
        /// then <c>IsSymbolChar</c>, and a word for anything else. The token is then extended while the
        /// following characters match its type. A number token additionally accepts, at most once, the
        /// first character of the culture's decimal separator when a number character comes right after it.
        /// The surrounding quote characters are not included in a quoted literal.
        /// The method is an iterator, so its body (including any null dereference of
        /// <paramref name="s"/> or <paramref name="culture"/>) runs during enumeration.
        /// </remarks>
        /// <param name="s">The string to break up</param>
        /// <param name="culture">The culture whose number format provides the decimal separator</param>
        /// <param name="symbolIsOneChar">Whether every symbol character becomes its own symbol token</param>
        /// <param name="quoteCharacter">Quote character surrounding quoted literal tokens; null means none</param>
        /// <param name="excludedSymbols">Characters handed to <c>IsSymbolChar</c> as exclusions; an empty list is used when null</param>
        /// <returns>Enumerable of tokens in source order, each pointing at its predecessor via <c>Previous</c></returns>
        public static IEnumerable<Token> Tokenise(this string s, CultureInfo culture, bool symbolIsOneChar, char? quoteCharacter, List<char> excludedSymbols)
        {
            string decimalSeparator = culture.NumberFormat.NumberDecimalSeparator;
            List<char> ignored = excludedSymbols != null ? excludedSymbols : new List<char>();
            Token prior = null;
            int cursor = 0;

            for (;;)
            {
                // Skip leading whitespace and note whether any was present.
                bool spaced = false;
                while (cursor < s.Length && char.IsWhiteSpace(s[cursor]))
                {
                    spaced = true;
                    cursor++;
                }

                // End of input (possibly after trailing whitespace): stop producing tokens.
                if (cursor >= s.Length)
                {
                    yield break;
                }

                // Choose the token type from the character under the cursor.
                Token token;
                if (quoteCharacter.HasValue && s[cursor] == quoteCharacter)
                {
                    // Offsets begin after the opening quote so the literal excludes it.
                    token = new QuotedLiteralToken(cursor + 1, cursor + 1);
                }
                else if (char.IsNumber(s[cursor]))
                {
                    token = new NumberToken(cursor, cursor + 1);
                }
                else if (IsSymbolChar(s[cursor], ignored))
                {
                    token = new SymbolToken(cursor, cursor + 1);
                }
                else
                {
                    token = new WordToken(cursor, cursor + 1);
                }

                bool fractionSeen = false;
                for (cursor++; cursor < s.Length; cursor++)
                {
                    char ch = s[cursor];

                    // Only quoted literals are allowed to span whitespace.
                    if (!(token is QuotedLiteralToken) && char.IsWhiteSpace(ch))
                    {
                        break;
                    }

                    // Does this character continue a token of the current type?
                    bool absorbs = token is NumberToken && char.IsNumber(ch);
                    if (!absorbs)
                    {
                        absorbs = token is SymbolToken && IsSymbolChar(ch, ignored) && !symbolIsOneChar;
                    }
                    if (!absorbs)
                    {
                        absorbs = token is WordToken && !(char.IsNumber(ch) || IsSymbolChar(ch, ignored) || char.IsWhiteSpace(ch));
                    }
                    if (!absorbs)
                    {
                        absorbs = token is QuotedLiteralToken && ch != quoteCharacter;
                    }

                    if (!absorbs)
                    {
                        // Last chance: a single decimal separator inside a number, followed by a number character.
                        if (!(token is NumberToken)
                            || ch != decimalSeparator[0]
                            || cursor + 1 >= s.Length
                            || !char.IsNumber(s[cursor + 1])
                            || fractionSeen)
                        {
                            break;
                        }

                        fractionSeen = true;
                    }

                    token.ExtendRight();
                }

                token.Previous = prior;
                token.SetValue(s);
                token.HasWhitespaceBefore = spaced;

                // Step over the closing quote of a quoted literal.
                if (token is QuotedLiteralToken)
                {
                    cursor++;
                }

                yield return token;
                prior = token;
            }
        }