/// <summary>
/// Examines one UnicodeData record and, when its character name denotes the
/// given Latin letter (optionally decorated, e.g. "... WITH MACRON"), registers
/// a mapping from that code point to the plain letter's byte value.
/// </summary>
/// <param name="cleanLetter">
/// The plain letter to map to. Its upper-case form is appended to
/// <paramref name="filterText"/> to build the name to search for, while the
/// letter itself (case preserved) is the byte stored in the map.
/// </param>
/// <param name="filterText">
/// Name prefix that precedes the letter, e.g. "LATIN CAPITAL LETTER ".
/// </param>
/// <param name="lx">
/// One split record: <c>lx[0]</c> is the hexadecimal code point and
/// <c>lx[1]</c> is the character name.
/// </param>
static void ProcessLine(char cleanLetter, string filterText, string[] lx) {
    // A usable record needs both the code point field and the name field.
    if (lx.Length < 2) {
        return;
    }

    // Upper-case with the invariant culture: under a Turkish/Azeri current
    // culture Char.ToUpper('i') yields U+0130, which would never match the
    // ASCII names in the Unicode database.
    string letterName = filterText + Char.ToUpperInvariant(cleanLetter);

    // Match either "<prefix><LETTER> WITH ..." anywhere in the name, or a name
    // that ends with "<prefix><LETTER>". String.Contains(string) is already an
    // ordinal comparison; EndsWith needs it requested explicitly.
    bool nameMatches = lx[1].Contains(letterName + " WITH")
        || lx[1].EndsWith(letterName, StringComparison.Ordinal);
    if (!nameMatches) {
        return;
    }

    bool isOk = Int32.TryParse(lx[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture, out int point);
    System.Diagnostics.Debug.Assert(isOk);
    if (!isOk) {
        // Debug.Assert is compiled out of release builds; do not continue
        // with the default value of 'point' for a malformed record.
        return;
    }

    string utf16Char = Char.ConvertFromUtf32(point);
    byte[] utf16Bytes = Encoding.Unicode.GetBytes(utf16Char);
    byte[] cp1252Byte = Encoding.Convert(Encoding.Unicode, cp1252, utf16Bytes);

    // Only characters whose conversion produced no bytes are added.
    // NOTE(review): presumably cp1252 is configured with an empty replacement
    // fallback so that unmappable characters convert to nothing -- confirm
    // against its declaration.
    if (cp1252Byte.Length == 0) {
        GenMap1252.Map1252Add(point, (byte)cleanLetter, lx[1]);
    }
}
/// <summary>
/// Generator entry point. Loads "UnicodeData.txt" into <c>udb</c>, records the
/// code-page-1252 bytes whose decoded code point differs from the byte value,
/// adds mappings for decorated Latin letters via <c>ProcessLine</c>, adds a
/// few hand-picked punctuation remaps, and writes the resulting table to
/// standard output as hexadecimal literals with trailing comments.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args) {
    // See comment at top for usage.

    // Load every non-empty record of the Unicode database, split on ';'.
    using (var db = new StreamReader("UnicodeData.txt")) {
        string line;
        while ((line = db.ReadLine()) != null) {
            if (line.Length == 0) {
                // A blank line carries no code point or name field.
                continue;
            }
            udb.Add(line.Split(';'));
        }
    }

    // Exact maps: every byte 0x00..0xFF that cp1252 decodes to a code point
    // other than the byte's own value.
    var b1252 = new byte[1];
    for (int octet = 0; octet <= 0xFF; ++octet) {
        b1252[0] = (byte)octet;
        string utf16 = cp1252.GetString(b1252);
        int p32 = Char.ConvertToUtf32(utf16, 0);
        if (p32 == octet) {
            continue;
        }

        // Look the name up only when it is actually needed; the scan parses
        // the hex field of each record it visits.
        var row = udb.First(xx => Int32.Parse(xx[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture) == p32);

        // The "**" suffix tags the description of these exact entries.
        GenMap1252.Map1252Add(p32, b1252[0], row[1] + "**");
    }

    var totalExactMaps = GenMap1252.map1252.Count;

    // Scrubbed maps: try every Latin record against each letter A-Z / a-z.
    foreach (var lx in udb) {
        if (lx.Length < 2 || !lx[1].Contains("LATIN")) {
            continue;
        }

        for (var ch = 'A'; ch <= 'Z'; ++ch) {
            ProcessLine(ch, "LATIN CAPITAL LETTER ", lx);
        }

        for (var ch = 'a'; ch <= 'z'; ++ch) {
            ProcessLine(ch, "LATIN SMALL LETTER ", lx);
        }
    }

    // There are many more potential custom remaps like these:
    Map1252Add(Char.ConvertToUtf32("⁓", 0), (byte)'~');
    Map1252Add(Char.ConvertToUtf32("‒", 0), (byte)'-');
    Map1252Add(Char.ConvertToUtf32("―", 0), (byte)'-');

    Console.WriteLine("// Generated from v8.0 of www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt");
    Console.WriteLine("// Total = " + map1252.Count + ", Scrubbed = " + (map1252.Count - totalExactMaps));

    // Each entry is written as 0x<6 hex digits of code point><2 hex digits of
    // byte>; every entry except the last is followed by a comma.
    var countdown = map1252.Count;
    foreach (var kv in map1252) {
        char delim = --countdown == 0 ? ' ' : ',';
        Console.WriteLine("0x" + kv.Key.ToString("X6") + kv.Value.Octet.ToString("X2") + delim + " // " + kv.Value.Desc);
    }

    /* Output:
     *
     * 0x00010041, // LATIN CAPITAL LETTER A WITH MACRON
     * 0x00010161, // LATIN SMALL LETTER A WITH MACRON
     * .
     * .
     * .
     * 0x0E007979, // TAG LATIN SMALL LETTER Y
     * 0x0E007A7A // TAG LATIN SMALL LETTER Z
     *
     */
}