/// <summary>
/// Normalizer for Vietnamese.
/// Normalize Alphabet + Diacritical Mark(U+03xx) into U+1Exx.
/// </summary>
/// <remarks>
/// Every match of <c>ALPHABET_WITH_DMARK</c> is looked up through
/// <c>TO_NORMALIZE_VI_CHARS</c> (capture group 1) and <c>DMARK_CLASS</c>
/// (capture group 2) and replaced by the corresponding single character of
/// <c>NORMALIZED_VI_CHARS</c>. Text outside the matches is copied unchanged,
/// and a match that cannot be mapped is copied verbatim instead of being dropped.
/// </remarks>
/// <param name="text">Text to normalize; a null or empty value is returned as is.</param>
/// <returns>normalized text</returns>
public static string normalize_vi(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    MatchCollection matches = ALPHABET_WITH_DMARK.Matches(text);
    if (matches.Count == 0)
    {
        // Nothing to rewrite: hand back the original instance without copying.
        return text;
    }

    StringBuilder sb = new StringBuilder(text.Length);
    int last = 0; // index of the first character of text not yet copied into sb

    foreach (Match m in matches)
    {
        // Always copy the untouched text between the previous match and this one,
        // even when the match itself ends up being left as is.
        sb.Append(text, last, m.Index - last);

        string letter = m.Groups[1].Value;
        string mark = m.Groups[2].Value; // Diacritical Mark

        // IndexOf(char) is an ordinal search; -1 means "no table entry".
        int alphabet = letter.Length > 0 ? TO_NORMALIZE_VI_CHARS.IndexOf(letter[0]) : -1;
        int dmark = mark.Length > 0 ? DMARK_CLASS.IndexOf(mark[0]) : -1;

        if (m.Value.Trim().Length > 0 && alphabet >= 0 && dmark >= 0)
        {
            // The replacement is a single literal character, so it is appended
            // directly rather than being parsed as a replacement pattern.
            sb.Append(NORMALIZED_VI_CHARS[dmark].Substring(alphabet, 1));
        }
        else
        {
            // Unmappable match: keep the original characters.
            sb.Append(m.Value);
        }

        last = m.Index + m.Length;
    }

    // Tail after the final match.
    sb.Append(text, last, text.Length - last);
    return sb.ToString();
}
/// <summary>
/// Normalizer for Vietnamese.
/// Normalize Alphabet + Diacritical Mark(U+03xx) into U+1Exx.
/// </summary>
/// <remarks>
/// Each match of <c>ALPHABET_WITH_DMARK</c> is replaced by the single character
/// found in <c>NORMALIZED_VI_CHARS</c> at row <c>DMARK_CLASS.IndexOf(group 2)</c>
/// and column <c>TO_NORMALIZE_VI_CHARS.IndexOf(group 1)</c>. The text between
/// matches and after the last match is copied through unchanged.
/// </remarks>
/// <param name="text">Text to normalize; a null or empty value is returned as is.</param>
/// <returns>normalized text</returns>
public static string normalize_vi(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    var matches = ALPHABET_WITH_DMARK.Matches(text);
    if (matches.Count == 0)
    {
        // No base letter + diacritical mark pair found: return the input untouched.
        return text;
    }

    StringBuilder buf = new StringBuilder(text.Length);
    int appendPosition = 0; // start of the part of text not yet written to buf

    foreach (Match m in matches)
    {
        // Equivalent of the "append replacement" step: first the unmatched text
        // preceding this match, then the replacement for the match.
        buf.Append(text, appendPosition, m.Index - appendPosition);

        string letter = m.Groups[1].Value;
        string mark = m.Groups[2].Value; // Diacritical Mark

        // Use the char overload of IndexOf: it compares ordinally, whereas
        // IndexOf(string) is culture-sensitive and unreliable for combining marks.
        int alphabet = letter.Length > 0 ? TO_NORMALIZE_VI_CHARS.IndexOf(letter[0]) : -1;
        int dmark = mark.Length > 0 ? DMARK_CLASS.IndexOf(mark[0]) : -1;

        if (alphabet >= 0 && dmark >= 0)
        {
            buf.Append(NORMALIZED_VI_CHARS[dmark].Substring(alphabet, 1));
        }
        else
        {
            // No table entry for this pair: keep the matched text verbatim.
            buf.Append(m.Value);
        }

        appendPosition = m.Index + m.Length;
    }

    // Equivalent of the "append tail" step: whatever follows the last match.
    buf.Append(text, appendPosition, text.Length - appendPosition);
    return buf.ToString();
}