/// <summary>
/// Rewrites the text of <c>field</c> so that it only contains text elements
/// made of a single <c>char</c>.
/// </summary>
/// <remarks>
/// Every text element longer than one <c>char</c> is dropped. That covers
/// surrogate pairs, but also any other multi-char element the enumerator
/// reports (for example a base character followed by combining marks).
/// Does nothing when <c>field</c> is null or its text is empty.
/// </remarks>
public void OnValueChange()
{
    if (field == null)
    {
        return;
    }

    string current = field.text;
    if (current.Length == 0)
    {
        return;
    }

    System.Text.StringBuilder kept = new System.Text.StringBuilder();
    System.Globalization.TextElementEnumerator elements =
        System.Globalization.StringInfo.GetTextElementEnumerator(current);
    elements.Reset();

    while (elements.MoveNext())
    {
        // One text element (one user-perceived character).
        string element = elements.GetTextElement();

        // Elements spanning two or more chars are left out of the result.
        if (element.Length <= 1)
        {
            kept.Append(element);
        }
    }

    // Hand the filtered text back to the input field.
    field.text = kept.ToString();
}
/// <summary>
/// Creates a new Tokenizer object that will read from the given input.
/// </summary>
/// <param name="input">Where to read input from.</param>
/// <param name="name">The name of the input, used for debugging.</param>
/// <exception cref="System.ArgumentNullException">If input is null.</exception>
public Tokenizer(TextElementEnumerator input, string name)
{
    if (input == null)
    {
        // Report the actual parameter name so ParamName matches the signature.
        throw new ArgumentNullException("input");
    }

    this.peek = new Stack<Token>();
    this.input = input;
    this.Name = name;
    this.Position = 1;
    this.Line = 1;

    // Advance once so the enumerator is positioned on the first text element.
    input.MoveNext();
}
/// <summary>
/// Demonstrates converting text stored in a legacy code page (737, DOS Greek)
/// to Unicode: writes four code-page bytes to disk, decodes them, re-saves the
/// text as UTF-8, and prints both the original byte values and the resulting
/// Unicode code points.
/// </summary>
static void Main()
{
    // Create a file that contains the Greek word ψυχή (psyche) when interpreted
    // by using code page 737 ((DOS) Greek).
    System.IO.File.WriteAllBytes(@"greek.txt", new byte[] { 0xAF, 0xAC, 0xAE, 0x9E });

    // The code page must be named explicitly for the byte values to be
    // interpreted correctly.
    // NOTE(review): on .NET Core / .NET 5+ code page 737 is only available after
    // registering CodePagesEncodingProvider — confirm the target runtime.
    Encoding greekDos = Encoding.GetEncoding(737);
    byte[] rawBytes = System.IO.File.ReadAllBytes(@"greek.txt");

    // The same content, now held as a UTF-16 string.
    string decoded = greekDos.GetString(rawBytes);

    // Show that the text survived the conversion
    // (requires a reference to System.Windows.Forms.dll).
    System.Windows.Forms.MessageBox.Show(decoded);

    // Persist the same content "ψυχή" as UTF-8.
    System.IO.File.WriteAllText(@"greek_unicode.txt", decoded);

    // Dump the original 8-bit values.
    Console.WriteLine("8-bit encoding byte values:");
    for (int k = 0; k < rawBytes.Length; k++)
    {
        Console.Write("{0:X}-", rawBytes[k]);
    }
    Console.WriteLine();

    // Dump the code point at the start of each text element read back from disk.
    Console.WriteLine("Unicode values:");
    string roundTripped = System.IO.File.ReadAllText("greek_unicode.txt");
    System.Globalization.TextElementEnumerator elements =
        System.Globalization.StringInfo.GetTextElementEnumerator(roundTripped);
    while (elements.MoveNext())
    {
        Console.Write("{0:X}-", Char.ConvertToUtf32(elements.GetTextElement(), 0));
    }
    Console.WriteLine();

    // Keep the console window open in debug mode.
    Console.Write("Press any key to exit.");
    Console.ReadKey();
}
/// <summary>Returns the indexes of each base character, high surrogate, or control character within the specified string.</summary>
/// <returns>An array of integers that contains the zero-based indexes of each base character, high surrogate, or control character within the specified string.</returns>
/// <param name="str">The string to search. </param>
/// <exception cref="T:System.ArgumentNullException">
/// <paramref name="str" /> is null. </exception>
public static int[] ParseCombiningCharacters(string str)
{
    if (str == null)
    {
        // The single-string constructor expects the parameter name, not a message.
        throw new ArgumentNullException("str");
    }

    // A string never has more text elements than chars, so str.Length is a safe capacity.
    System.Collections.Generic.List<int> indexes =
        new System.Collections.Generic.List<int>(str.Length);

    // A freshly created enumerator is already positioned before the first element.
    TextElementEnumerator elements = StringInfo.GetTextElementEnumerator(str);
    while (elements.MoveNext())
    {
        // Record where each text element starts in the original string.
        indexes.Add(elements.ElementIndex);
    }

    return indexes.ToArray();
}
//     INITIALIZATION
//_________________________________________________________________________________________

/// <summary>
/// Initializes a string iterator that walks <paramref name="iteratedString"/>
/// one text element at a time.
/// </summary>
/// <param name="prototype"> The next object in the prototype chain; passed to the base constructor. </param>
/// <param name="iteratedString"> The string whose text elements are enumerated. </param>
internal StringIterator(ObjectInstance prototype, string iteratedString)
    : base(prototype)
{
    // The enumerator is created once here and kept for later use by the iterator.
    enumerator = StringInfo.GetTextElementEnumerator(iteratedString);
}
/// <summary>
/// Compares word lengths measured in <c>Char</c> objects with word lengths
/// measured in text elements for a fixed French sentence, and prints both
/// counts side by side for every word.
/// </summary>
public static void Main()
{
    // <Snippet6>
    // First sentence of The Mystery of the Yellow Room, by Leroux.
    string sentence = "Ce n'est pas sans une certaine émotion que " +
                      "je commence à raconter ici les aventures " +
                      "extraordinaires de Joseph Rouletabille.";

    // Length of the word currently being scanned. The same counter is used by
    // both passes; each delimiter resets it to zero.
    int runLength = 0;

    // Per-word lengths: counted in Char objects, and counted in text elements.
    List<int> charCounts = new List<int>();
    List<int> elementCounts = new List<int>();

    // Pass 1: walk the string one Char at a time.
    foreach (char c in sentence)
    {
        // Skip the ' character.
        if (c == '\u0027')
        {
            continue;
        }

        bool isDelimiter = Char.IsWhiteSpace(c) || Char.IsPunctuation(c);
        if (!isDelimiter)
        {
            runLength++;
            continue;
        }

        // A delimiter closes the current word.
        charCounts.Add(runLength);
        runLength = 0;
    }

    // Pass 2: walk the string one text element at a time.
    System.Globalization.TextElementEnumerator elementWalker =
        System.Globalization.StringInfo.GetTextElementEnumerator(sentence);
    while (elementWalker.MoveNext())
    {
        string element = elementWalker.GetTextElement();

        // Skip the ' character.
        if (element == "\u0027")
        {
            continue;
        }

        // A delimiter is an all-whitespace element or a single punctuation char.
        bool isBlank = String.IsNullOrEmpty(element.Trim());
        bool isPunctuation = element.Length == 1 && Char.IsPunctuation(element[0]);
        if (!(isBlank || isPunctuation))
        {
            runLength++;
            continue;
        }

        elementCounts.Add(runLength);
        runLength = 0;
    }

    // Display character counts.
    Console.WriteLine("{0,6} {1,20} {2,20}",
                      "Word #", "Char Objects", "Characters");
    for (int word = 0; word < charCounts.Count; word++)
    {
        Console.WriteLine("{0,6} {1,20} {2,20}",
                          word, charCounts[word], elementCounts[word]);
    }
    // The example displays the following output:
    //       Word #         Char Objects           Characters
    //            0                    2                    2
    //            1                    4                    4
    //            2                    3                    3
    //            3                    4                    4
    //            4                    3                    3
    //            5                    8                    8
    //            6                    8                    7
    //            7                    3                    3
    //            8                    2                    2
    //            9                    8                    8
    //           10                    2                    1
    //           11                    8                    8
    //           12                    3                    3
    //           13                    3                    3
    //           14                    9                    9
    //           15                   15                   15
    //           16                    2                    2
    //           17                    6                    6
    //           18                   12                   12
    // </Snippet6>
}
/// <summary>
/// Takes the editor text (the selection when the "selecting" macro variable is 1,
/// otherwise the whole text), optionally normalizes it, and round-trips it through
/// the encoding identified by <paramref name="code"/>.
/// </summary>
/// <param name="code">Code page number passed to <c>Encoding.GetEncoding</c>.</param>
/// <param name="nNormalizeForm">
/// Normalization mode: 2 normalizes the whole text with <c>String.Normalize()</c>;
/// 1 normalizes only text elements made of more than one <c>char</c>;
/// any other value leaves the text unchanged.
/// </param>
/// <returns>
/// The text after encoding to bytes and decoding back with the requested encoding,
/// or, if any stage throws, a string made of the exception type, its message and a
/// hint for the user.
/// </returns>
public static String ToEncode(IntPtr code, IntPtr nNormalizeForm)
{
    // Stage 1: fetch the source text.
    String text = "";
    try
    {
        bool hasSelection = (Int32)(dynamic)Hm.Macro.Var["selecting"] == 1;
        text = hasSelection ? Hm.Edit.SelectedText : Hm.Edit.TotalText;

        // Text with fewer than two line feeds gets two more appended.
        if (text.Count(c => c == '\n') < 2)
        {
            text += "\n\n";
        }
    }
    catch (Exception ex0)
    {
        return ex0.GetType() + ":" + ex0.Message + "\n元のファイルにバイナリが混在していないかを確認してください。";
    }

    // Stage 2: apply the requested normalization.
    String normalized = "";
    try
    {
        switch (nNormalizeForm.ToInt32())
        {
            case 2:
                normalized = text.Normalize();
                break;

            case 1:
                // Reserve roughly 500k chars up front as the default capacity.
                StringBuilder buffer = new StringBuilder(500 * 1024);

                // Walk the text one text element at a time, starting from the beginning.
                System.Globalization.TextElementEnumerator elements =
                    System.Globalization.StringInfo.GetTextElementEnumerator(text);
                elements.Reset();
                while (elements.MoveNext())
                {
                    string element = elements.GetTextElement();

                    // An element made of two or more chars is treated as a surrogate
                    // pair or a combining sequence and is normalized before being
                    // appended; single-char elements are appended untouched.
                    buffer.Append(element.Length > 1 ? element.Normalize() : element);
                }
                normalized = buffer.ToString();
                break;

            default:
                normalized = text;
                break;
        }
    }
    catch (Exception ex1)
    {
        return ex1.GetType() + ":" + ex1.Message + "\n元のファイルにバイナリが混在していないかを確認してください。";
    }

    // Stage 3: round-trip through the target encoding.
    try
    {
        // Obtain the encoding with the project's custom encoder fallback.
        var target = Encoding.GetEncoding(code.ToInt32(), new HmEncoderScalarValueFallback(), DecoderFallback.ReplacementFallback);
        byte[] encoded = target.GetBytes(normalized);

        // Decode back to a string so the caller sees what the encoding produced.
        return target.GetString(encoded);
    }
    catch (Exception ex2)
    {
        return ex2.GetType() + ":" + ex2.Message + "\n指定の「#ToEncodeCodePage」の値等が正しいか、よく確認してください。";
    }
}