/// <summary>
/// Splits <paramref name="text"/> into tokens, returned in the order in which
/// they occur in the text.
/// </summary>
/// <remarks>
/// The text is walked one "letter" at a time, as produced by
/// <c>UnicodeUtils.Letters</c>. Each letter is handled as follows:
/// <list type="bullet">
/// <item><description>A letter for which <c>UnicodeUtils.IsEmoji</c> is true ends the
/// token being built and is then emitted as a token of its own.</description></item>
/// <item><description>A letter that matches <c>config.TokenRegex</c>, or that is null or
/// whitespace, ends the token being built and is itself discarded.</description></item>
/// <item><description>Any other letter is appended to the token being built.</description></item>
/// </list>
/// </remarks>
/// <param name="text">The text to tokenize; passed unchanged to <c>UnicodeUtils.Letters</c>.</param>
/// <returns>The list of tokens; empty when no token was found.</returns>
private IList<string> Tokenize(string text)
{
    var tokens = new List<string>();
    var token = string.Empty;

    foreach (var letter in UnicodeUtils.Letters(text))
    {
        // Emoji are broken out as individual tokens rather than being glued
        // onto neighbouring letters.
        var isEmoji = UnicodeUtils.IsEmoji(letter);

        if (!isEmoji && !(config.TokenRegex.IsMatch(letter) || string.IsNullOrWhiteSpace(letter)))
        {
            // Ordinary letter: keep growing the current token.
            token += letter;
            continue;
        }

        // Either an emoji or a separator: both end the token collected so far.
        // The pending token is flushed BEFORE the emoji is added so that the
        // result keeps the order of the input.
        if (!string.IsNullOrWhiteSpace(token))
        {
            tokens.Add(token);
            token = string.Empty;
        }

        if (isEmoji)
        {
            tokens.Add(letter);
        }
    }

    // Flush the token that was still being built when the input ended.
    if (!string.IsNullOrWhiteSpace(token))
    {
        tokens.Add(token);
    }

    return tokens;
}
/// <summary>
/// Run-length encodes <paramref name="bytes"/> into a string assembled from the
/// values returned by <c>UnicodeUtils.EncodeByte</c>.
/// </summary>
/// <remarks>
/// The input is scanned left to right and each position produces one of three forms:
/// <list type="bullet">
/// <item><description>A byte that differs from its successor (or is the last byte):
/// <c>EncodeByte(b)</c>.</description></item>
/// <item><description>A run of 2 to 79 equal bytes: <c>EncodeByte((byte)(b + 32))</c>
/// followed by <c>EncodeByte(count)</c>.</description></item>
/// <item><description>A run of 80 to 16384 equal bytes: <c>EncodeByte(64)</c>,
/// <c>EncodeByte(b)</c>, then the run length as three 6-bit groups, least
/// significant group first.</description></item>
/// </list>
/// Runs longer than 16384 bytes are split into several consecutive runs.
/// </remarks>
/// <param name="bytes">The bytes to compress. Must not be null.</param>
/// <returns>The encoded string; empty when <paramref name="bytes"/> is empty.</returns>
public static string CompressRle(byte[] bytes)
{
    // Longest run emitted as a single unit.
    const int MaxRunLength = 16384;
    // Runs shorter than this use the two-value short form.
    const int ShortRunLimit = 80;

    var output = new StringBuilder();
    int i = 0;

    while (i < bytes.Length)
    {
        byte b = bytes[i];

        // Not the start of a run: emit the byte on its own.
        if (i >= bytes.Length - 1 || b != bytes[i + 1])
        {
            output.Append(UnicodeUtils.EncodeByte(b));
            ++i;
            continue;
        }

        // Measure the run. The explicit bounds check stops the scan at the end
        // of the array; without it a run that reaches the last byte would index
        // past the array and throw IndexOutOfRangeException.
        int count = 0;
        while (count < MaxRunLength && i + count < bytes.Length && bytes[i + count] == b)
        {
            ++count;
        }

        i += count;

        if (count < ShortRunLimit)
        {
            output.Append(UnicodeUtils.EncodeByte((byte)(b + 32)));
            output.Append(UnicodeUtils.EncodeByte((byte)count));
        }
        else
        {
            // Long form: the literal value 64, the byte, then the run length
            // split into three 6-bit groups (low group first).
            output.Append(UnicodeUtils.EncodeByte((byte)64));
            output.Append(UnicodeUtils.EncodeByte(b));
            for (int j = 0; j < 3; ++j)
            {
                output.Append(UnicodeUtils.EncodeByte((byte)(count & 0x3F)));
                count /= 0x40;
            }
        }
    }

    return output.ToString();
}