/// <summary>
/// Splits <paramref name="text"/> into tokens. Each character is first passed through
/// <c>TokenizeRules.Normalize</c>; consecutive normalized characters accepted by
/// <c>TokenizeRules.IsTokenChar</c> form one token. Tokens shorter than 3 characters
/// are discarded.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <returns>The collected tokens (already normalized), in the order they occur in the text.</returns>
/// <exception cref="InvalidOperationException">
/// A run of token characters is longer than 255 characters.
/// </exception>
private IEnumerable <string> Tokenize(string text)
        {
            // Capacity of the scratch buffer, i.e. the longest token that can be collected.
            const int MaxTokenLength = 255;

            IList <string> tokens   = new List <string>();
            var            buf      = new char[MaxTokenLength];
            int            tokenLen = 0;
            int            len      = text.Length;

            // The loop deliberately runs one step past the last character (i == len) so that
            // a token touching the end of the text is flushed by the "not a token char" branch.
            for (int i = 0; i <= len; i++)
            {
                char c           = '\x00';
                bool isTokenChar = false;
                if (i != len)
                {
                    c           = TokenizeRules.Normalize(text[i]);
                    isTokenChar = TokenizeRules.IsTokenChar(c);
                }

                if (isTokenChar)
                {
                    // Check the capacity BEFORE writing: checking after the write let the
                    // 256th character index past the end of the buffer, which surfaced as an
                    // IndexOutOfRangeException instead of the intended error below.
                    if (tokenLen == MaxTokenLength)
                    {
                        throw new InvalidOperationException("Token longer than 255 characters.");
                    }
                    buf[tokenLen] = c;
                    tokenLen++;
                }
                else
                {
                    // we're not interested in tokens shorter than 3 chars
                    if (tokenLen >= 3)
                    {
                        tokens.Add(new string(buf, 0, tokenLen));
                    }
                    tokenLen = 0;
                }
            }
            return(tokens);
        }
예제 #2
0
 /// <summary>
 /// Normalizes a single character by delegating to <c>TokenizeRules.Normalize</c>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the earlier comment described this as a lower-case conversion
 /// (<see cref="char.ToLower(char)" />); the actual rule lives in TokenizeRules and
 /// is not visible here - confirm against that class.
 /// </remarks>
 /// <param name="c">The character to normalize.</param>
 /// <returns>Whatever <c>TokenizeRules.Normalize</c> returns for <paramref name="c"/>.</returns>
 protected override char Normalize(char c) => TokenizeRules.Normalize(c);
예제 #3
0
 /// <summary>
 /// Decides whether a character belongs to a token by delegating to
 /// <c>TokenizeRules.IsTokenChar</c>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the earlier comment stated that only characters satisfying
 /// <see cref="char.IsLetter(char)" /> are collected; the actual predicate lives in
 /// TokenizeRules and is not visible here - confirm against that class.
 /// </remarks>
 /// <param name="c">The character to test.</param>
 /// <returns>Whatever <c>TokenizeRules.IsTokenChar</c> returns for <paramref name="c"/>.</returns>
 protected override bool IsTokenChar(char c) => TokenizeRules.IsTokenChar(c);