/// <summary>
/// Splits <paramref name="text"/> into tokens. Characters are normalized via
/// <see cref="TokenizeRules.Normalize(char)"/>; a run of characters accepted by
/// <see cref="TokenizeRules.IsTokenChar(char)"/> forms one token. Tokens shorter
/// than <c>MinTokenLength</c> characters are discarded.
/// </summary>
/// <param name="text">The input text to tokenize; must not be null.</param>
/// <returns>The collected tokens, in order of appearance.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when a token exceeds <c>MaxTokenLength</c> characters.
/// </exception>
private IEnumerable<string> Tokenize(string text)
{
    const int MaxTokenLength = 255; // capacity of the reusable token buffer
    const int MinTokenLength = 3;   // we're not interested in tokens shorter than 3 chars

    IList<string> tokens = new List<string>();
    var buf = new char[MaxTokenLength];
    int tokenLen = 0;
    int len = text.Length;

    // Iterate one index past the end so the final token is flushed by the
    // else-branch on the sentinel iteration (i == len).
    for (int i = 0; i <= len; i++)
    {
        char c = '\x00';
        if (i != len)
        {
            c = TokenizeRules.Normalize(text[i]);
        }

        if (i != len && TokenizeRules.IsTokenChar(c))
        {
            // Guard BEFORE writing: buf has exactly MaxTokenLength slots, so
            // checking after the write would hit IndexOutOfRangeException first
            // and the intended exception would never be thrown.
            if (tokenLen == MaxTokenLength)
            {
                throw new InvalidOperationException("Token longer than 255 characters.");
            }
            buf[tokenLen] = c;
            tokenLen++;
        }
        else
        {
            if (tokenLen >= MinTokenLength)
            {
                tokens.Add(new string(buf, 0, tokenLen));
            }
            tokenLen = 0;
        }
    }

    return tokens;
}
/// <summary>
/// Normalizes a single character by delegating to
/// <see cref="TokenizeRules.Normalize(char)"/> (lower-casing, per
/// <see cref="char.ToLower(char)"/>).
/// </summary>
/// <param name="c">The character to normalize.</param>
/// <returns>The normalized character.</returns>
protected override char Normalize(char c) => TokenizeRules.Normalize(c);
/// <summary>
/// Determines whether a character belongs in a token by delegating to
/// <see cref="TokenizeRules.IsTokenChar(char)"/> (letters only, per
/// <see cref="char.IsLetter(char)"/>).
/// </summary>
/// <param name="c">The character to test.</param>
/// <returns><c>true</c> if the character is part of a token; otherwise <c>false</c>.</returns>
protected override bool IsTokenChar(char c) => TokenizeRules.IsTokenChar(c);