コード例 #1
0
        /// <summary>
        /// Normalizer for Vietnamese.
        /// Rewrites every "alphabet + diacritical mark (U+03xx)" pair matched by
        /// <c>ALPHABET_WITH_DMARK</c> into the single character (U+1Exx) looked up in
        /// <c>NORMALIZED_VI_CHARS</c>.
        /// </summary>
        /// <param name="text">Text to normalize.</param>
        /// <returns>
        /// The normalized text. Everything that is not part of a rewritten match is
        /// copied through unchanged.
        /// </returns>
        public static string normalize_vi(string text)
        {
            MatchCollection matches = ALPHABET_WITH_DMARK.Matches(text);
            StringBuilder   sb      = new StringBuilder(text.Length);
            int             last    = 0; // index of the first character not yet copied to sb

            foreach (Match m in matches)
            {
                if (m.Value.Trim().Length == 0)
                {
                    // Nothing to normalize. Leave `last` untouched so this match and the
                    // text before it are copied verbatim by a later Append instead of
                    // being silently dropped.
                    continue;
                }

                // NOTE(review): assumes group 1 is the base letter and group 2 the
                // combining mark, one char each - confirm against the
                // ALPHABET_WITH_DMARK definition.
                int alphabet = TO_NORMALIZE_VI_CHARS.IndexOf(m.Groups[1].Value[0]);
                int dmark    = DMARK_CLASS.IndexOf(m.Groups[2].Value[0]); // Diacritical Mark

                // Copy the unmatched text that precedes this match, then the
                // replacement character selected by (mark row, letter column).
                sb.Append(text, last, m.Index - last);
                sb.Append(NORMALIZED_VI_CHARS[dmark][alphabet]);

                last = m.Index + m.Length;
            }

            // Copy whatever follows the last rewritten match.
            sb.Append(text, last, text.Length - last);

            return sb.ToString();
        }
コード例 #2
0
ファイル: NGram.cs プロジェクト: weselow/LanguageDetection
        /// <summary>
        /// Normalizer for Vietnamese.
        /// Rewrites every "alphabet + diacritical mark (U+03xx)" pair matched by
        /// <c>ALPHABET_WITH_DMARK</c> into the single character (U+1Exx) looked up in
        /// <c>NORMALIZED_VI_CHARS</c>.
        /// </summary>
        /// <param name="text">Text to normalize.</param>
        /// <returns>
        /// The normalized text, or <paramref name="text"/> itself when the pattern
        /// does not match anywhere.
        /// </returns>
        public static string normalize_vi(string text)
        {
            var matches = ALPHABET_WITH_DMARK.Matches(text);
            if (matches.Count == 0)
            {
                // Nothing to rewrite: hand back the original instance.
                return text;
            }

            StringBuilder buf  = new StringBuilder(text.Length);
            int           last = 0; // index of the first character not yet copied to buf

            foreach (Match m in matches)
            {
                // Use the char overloads of IndexOf: they compare ordinally, whereas
                // IndexOf(string) is culture-sensitive and can treat a lone combining
                // mark as ignorable, yielding a wrong index.
                // NOTE(review): assumes group 1 is the base letter and group 2 the
                // combining mark, one char each - confirm against the
                // ALPHABET_WITH_DMARK definition.
                int alphabet = TO_NORMALIZE_VI_CHARS.IndexOf(m.Groups[1].Value[0]);
                int dmark    = DMARK_CLASS.IndexOf(m.Groups[2].Value[0]); // Diacritical Mark

                // Equivalent of Java's Matcher.appendReplacement: copy the text between
                // the previous match and this one, then the replacement character.
                buf.Append(text, last, m.Index - last);
                buf.Append(NORMALIZED_VI_CHARS[dmark][alphabet]);

                last = m.Index + m.Length;
            }

            // Equivalent of Java's Matcher.appendTail: copy the text after the last match.
            buf.Append(text, last, text.Length - last);

            return buf.ToString();
        }