Exemplo n.º 1
0
        /// <summary>
        /// Splits <paramref name="text"/> into tokens, preserving the order in which
        /// they occur in the text.
        /// </summary>
        /// <remarks>
        /// The text is walked one "letter" at a time, as produced by
        /// <c>UnicodeUtils.Letters</c>. Each letter falls into one of three categories:
        /// <list type="bullet">
        /// <item><description>An emoji (per <c>UnicodeUtils.IsEmoji</c>): ends the token being
        /// built and is emitted as a token of its own.</description></item>
        /// <item><description>A separator (matches <c>config.TokenRegex</c>, or is whitespace):
        /// ends the token being built and is itself discarded.</description></item>
        /// <item><description>Anything else: appended to the token being built.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="text">The text to tokenize.</param>
        /// <returns>The tokens found in <paramref name="text"/>, in text order.</returns>
        private IList<string> Tokenize(string text)
        {
            var tokens = new List<string>();
            var token = string.Empty;

            foreach (var letter in UnicodeUtils.Letters(text))
            {
                var isEmoji = UnicodeUtils.IsEmoji(letter);

                // Short-circuit keeps the separator test from running on emoji,
                // exactly as the original if / else-if chain did.
                var isSeparator = !isEmoji
                                  && (config.TokenRegex.IsMatch(letter) || string.IsNullOrWhiteSpace(letter));

                if (!isEmoji && !isSeparator)
                {
                    // Ordinary letter: keep growing the current token.
                    token = token + letter;
                    continue;
                }

                // Both emoji and separators terminate the token in progress. The pending
                // token must be flushed BEFORE an emoji is added; otherwise the emoji
                // would appear ahead of the word that precedes it in the text.
                if (!string.IsNullOrWhiteSpace(token))
                {
                    tokens.Add(token);
                    token = string.Empty;
                }

                if (isEmoji)
                {
                    // Each emoji is broken out as its own token.
                    tokens.Add(letter);
                }
            }

            // Flush whatever was still being accumulated when the text ended.
            if (!string.IsNullOrWhiteSpace(token))
            {
                tokens.Add(token);
            }

            return tokens;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Run-length encodes <paramref name="bytes"/> into a string built from
        /// <c>UnicodeUtils.EncodeByte</c> symbols.
        /// </summary>
        /// <remarks>
        /// Output layout, one entry per input run:
        /// <list type="bullet">
        /// <item><description>A byte not followed by an equal byte: <c>EncodeByte(b)</c>.</description></item>
        /// <item><description>A run of 2..79 equal bytes: <c>EncodeByte(b + 32)</c> followed by
        /// <c>EncodeByte(count)</c>.</description></item>
        /// <item><description>A run of 80..16384 equal bytes: <c>EncodeByte(64)</c>,
        /// <c>EncodeByte(b)</c>, then the run length as three 6-bit digits, least
        /// significant digit first.</description></item>
        /// </list>
        /// Runs longer than 16384 bytes are split into several entries.
        /// </remarks>
        /// <param name="bytes">The data to compress.</param>
        /// <returns>The encoded string; empty when <paramref name="bytes"/> is empty.</returns>
        public static string CompressRle(byte[] bytes)
        {
            // Longest run emitted as a single entry; longer runs are split.
            const int MaxRunLength = 16384;
            // Runs shorter than this use the compact two-symbol form.
            const int LongRunThreshold = 80;

            StringBuilder sbr = new StringBuilder();

            for (int i = 0; i < bytes.Length;)
            {
                byte b = bytes[i];
                if (i < bytes.Length - 1 && b == bytes[i + 1])
                {
                    // Measure the run starting at i. The explicit bounds check is required:
                    // without it, a run that extends to the end of the array reads
                    // bytes[bytes.Length] and throws IndexOutOfRangeException.
                    int count = 0;
                    while (count < MaxRunLength
                           && i + count < bytes.Length
                           && bytes[i + count] == b)
                    {
                        ++count;
                    }
                    i += count;

                    if (count < LongRunThreshold)
                    {
                        // NOTE(review): in an unchecked context (b + 32) wraps for b >= 224;
                        // confirm against the matching decoder whether such values can occur.
                        sbr.Append(UnicodeUtils.EncodeByte((byte)(b + 32)));
                        sbr.Append(UnicodeUtils.EncodeByte((byte)count));
                    }
                    else
                    {
                        sbr.Append(UnicodeUtils.EncodeByte((byte)64));
                        sbr.Append(UnicodeUtils.EncodeByte(b));
                        // Emit the run length as three base-64 digits, low digit first.
                        for (int j = 0; j < 3; ++j)
                        {
                            sbr.Append(UnicodeUtils.EncodeByte((byte)(count & 0x3F)));
                            count /= 0x40;
                        }
                    }
                }
                else
                {
                    // Byte not followed by an equal byte: emit it as-is.
                    sbr.Append(UnicodeUtils.EncodeByte(bytes[i++]));
                }
            }

            return sbr.ToString();
        }