// Fills map [i] for a code point that has no entry yet, deriving the
// entry from another code point:
//  - when checkUpper is set and the uppercase form of i is in
//    category 0xF, the entry copies the uppercase form's Level1/Level2;
//  - otherwise the first value of i's decomposition supplies Category
//    and Level1, and the remaining decomposition values (which must all
//    be category 1 entries) are folded into Level2.
// Source code points are resolved recursively first. Nothing is written
// when i is already defined, has no decomposition (index 0), or has a
// trailing decomposition value that is not a category 1 entry.
//
// greekRemap: when set, a trailing value whose Level2 is 0xC contributes
// 3 instead of 0xC.
private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
{
	if (map [i].Defined)
		return;

	int upper = (int) ti.ToUpper ((char) i);
	if (checkUpper && map [upper].Category == 0xF) {
		// i is its own uppercase form: there is nothing to copy from.
		if (upper == i)
			return;
		FillLetterNFKD (upper, checkUpper, greekRemap);
		map [i] = new CharMapEntry (0xF,
			map [upper].Level1,
			map [upper].Level2);
		return;
	}

	int decompStart = decompIndex [i];
	if (decompStart == 0)
		return;

	// Resolve the leading decomposition value before reading its entry.
	int primary = decompValues [decompStart];
	FillLetterNFKD (primary, checkUpper, greekRemap);
	CharMapEntry baseEntry = map [primary];

	int lv2 = baseEntry.Level2;
	byte off = 0;
	int decompEnd = decompStart + decompLength [i];
	for (int pos = decompStart + 1; pos < decompEnd; pos++) {
		int mark = decompValues [pos];
		// Only category 1 entries may follow the primary value.
		if (map [mark].Category != 1)
			return;
		bool remapped = greekRemap && map [mark].Level2 == 0xC;
		if (remapped)
			off += 3;
		else
			off += map [mark].Level2;
	}

	// A non-zero accumulated offset is added on top of the primary's
	// Level2, which is first raised to 2 if it was 0.
	if (off != 0)
		lv2 = (lv2 == 0 ? 2 : lv2) + off;

	// An explicit diacritical weight for i overrides the computed one.
	if (diacritical [i] != 0)
		lv2 = diacritical [i];

	map [i] = new CharMapEntry (
		baseEntry.Category,
		baseEntry.Level1,
		(byte) lv2);
}
// Registers a map entry for c in the given category, using the
// category's current fillIndex value, then advances that fillIndex by
// increment.
//
// For category 1 the fillIndex value is stored as Level2 and alt as
// Level1; for every other category the fillIndex value is stored as
// Level1 and alt as Level2.
//
// Returns false, changing nothing, when c is ignorable or already has
// a defined entry; returns true otherwise.
private bool AddCharMap (char c, byte category, byte increment, byte alt)
{
	int cp = (int) c;
	if (IsIgnorable (cp))
		return false; // do nothing
	if (map [cp].Defined)
		return false; // do nothing

	if (category == 1)
		map [cp] = new CharMapEntry (category, alt, fillIndex [category]);
	else
		map [cp] = new CharMapEntry (category, fillIndex [category], alt);

	fillIndex [category] += increment;
	return true;
}
void GenerateCore () { UnicodeCategory uc; #region Specially ignored // 01 // This will raise "Defined" flag up. // FIXME: Check If it is really fine. Actually for // Japanese voice marks this code does remapping. foreach (char c in specialIgnore) map [(int) c] = new CharMapEntry (0, 0, 0); #endregion #region Extenders (FF FF) fillIndex [0xFF] = 0xFF; char [] specialBiggest = new char [] { '\u3005', '\u3031', '\u3032', '\u309D', '\u309E', '\u30FC', '\u30FD', '\u30FE', '\uFE7C', '\uFE7D', '\uFF70'}; foreach (char c in specialBiggest) AddCharMap (c, 0xFF, 0); #endregion #region Variable weights // Controls : 06 03 - 06 3D fillIndex [0x6] = 3; for (int i = 0; i < 65536; i++) { if (IsIgnorable (i)) continue; char c = (char) i; uc = Char.GetUnicodeCategory (c); // NEL is whitespace but not ignored here. if (uc == UnicodeCategory.Control && !Char.IsWhiteSpace (c) || c == '\u0085') AddCharMap (c, 6, 1); } // Apostrophe 06 80 fillIndex [0x6] = 0x80; AddCharMap ('\'', 6, 0); AddCharMap ('\uFF07', 6, 1); AddCharMap ('\uFE63', 6, 1); // SPECIAL CASE: fill FE32 here in prior to be added // at 2013. Windows does not always respect NFKD. map [0xFE32] = new CharMapEntry (6, 0x90, 0); // Hyphen/Dash : 06 81 - 06 90 for (int i = 0; i < char.MaxValue; i++) { if (!IsIgnorable (i) && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.DashPunctuation) { AddCharMapGroup2 ((char) i, 6, 1, 0); if (i == 0x2011) { // SPECIAL: add 2027 and 2043 // Maybe they are regarded the // same hyphens in "central" // position. 
AddCharMap ('\u2027', 6, 1); AddCharMap ('\u2043', 6, 1); } } } // They are regarded as primarily equivalent to '-' map [0x208B] = new CharMapEntry (6, 0x82, 0); map [0x207B] = new CharMapEntry (6, 0x82, 0); map [0xFF0D] = new CharMapEntry (6, 0x82, 0); // Arabic variable weight chars 06 A0 - fillIndex [6] = 0xA0; // vowels for (int i = 0x64B; i <= 0x650; i++) AddArabicCharMap ((char) i, 6, 1, 0); // sukun AddCharMapGroup ('\u0652', 6, 1, 0); // shadda AddCharMapGroup ('\u0651', 6, 1, 0); #endregion #region Nonspacing marks // 01 // FIXME: 01 03 - 01 B6 ... annoyance :( // Combining diacritical marks: 01 DC - fillIndex [0x1] = 0x41; for (int i = 0x030E; i <= 0x0326; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x0329; i <= 0x0334; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1]++; for (int i = 0x0339; i <= 0x0341; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0x74; for (int i = 0x0346; i <= 0x0348; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02BE; i <= 0x02BF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02C1; i <= 0x02C5; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); for (int i = 0x02CE; i <= 0x02CF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1]++; for (int i = 0x02D1; i <= 0x02D3; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); AddCharMap ('\u02DE', 0x1, 1); for (int i = 0x02E4; i <= 0x02E9; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); // FIXME: needs more love here (it should eliminate // all the hacky code above). 
for (int i = 0x0300; i < 0x0370; i++) if (!IsIgnorable (i) && diacritical [i] != 0 && !map [i].Defined) map [i] = new CharMapEntry ( 0x1, 0x1, diacritical [i]); // Cyrillic and Armenian nonspacing mark fillIndex [0x1] = 0x94; for (int i = 0x400; i < 0x580; i++) if (!IsIgnorable (i) && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); fillIndex [0x1] = 0x8D; // syriac dotted nonspacing marks (1) AddCharMap ('\u0740', 0x1, 1); AddCharMap ('\u0741', 0x1, 1); AddCharMap ('\u0742', 0x1, 1); // syriac oblique nonspacing marks AddCharMap ('\u0747', 0x1, 1); AddCharMap ('\u0748', 0x1, 1); // syriac dotted nonspacing marks (2) fillIndex [0x1] = 0x94; // this reset is mandatory AddCharMap ('\u0732', 0x1, 1); AddCharMap ('\u0735', 0x1, 1); AddCharMap ('\u0738', 0x1, 1); AddCharMap ('\u0739', 0x1, 1); AddCharMap ('\u073C', 0x1, 1); // SPECIAL CASES: superscripts AddCharMap ('\u073F', 0x1, 1); AddCharMap ('\u0711', 0x1, 1); // syriac "DOTS" for (int i = 0x0743; i <= 0x0746; i++) AddCharMap ((char) i, 0x1, 1); for (int i = 0x0730; i <= 0x0780; i++) if (!map [i].Defined && Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 0x1, 1); // LAMESPEC: It should not stop at '\u20E1'. There are // a few more characters (that however results in // overflow of level 2 unless we start before 0xDD). 
fillIndex [0x1] = 0xDD; for (int i = 0x20D0; i <= 0x20DC; i++) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0xEC; for (int i = 0x20DD; i <= 0x20E1; i++) AddCharMap ((char) i, 0x1, 1); fillIndex [0x1] = 0x4; AddCharMap ('\u0CD5', 0x1, 1); AddCharMap ('\u0CD6', 0x1, 1); AddCharMap ('\u093C', 0x1, 1); for (int i = 0x302A; i <= 0x302D; i++) AddCharMap ((char) i, 0x1, 1); AddCharMap ('\u0C55', 0x1, 1); AddCharMap ('\u0C56', 0x1, 1); fillIndex [0x1] = 0x50; // I wonder how they are sorted for (int i = 0x02D4; i <= 0x02D7; i++) AddCharMap ((char) i, 0x1, 1); // They are not part of Nonspacing marks, but have // only diacritical weight. for (int i = 0x3099; i <= 0x309C; i++) map [i] = new CharMapEntry (1, 1, 1); map [0xFF9E] = new CharMapEntry (1, 1, 1); map [0xFF9F] = new CharMapEntry (1, 1, 2); map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1); map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1); for (int i = 0x30FC; i <= 0x30FE; i++) map [i] = new CharMapEntry (0xFF, 0xFF, 1); fillIndex [0x1] = 0xA; for (int i = 0x0951; i <= 0x0954; i++) AddCharMap ((char) i, 0x1, 2); #endregion #region Whitespaces // 07 03 - fillIndex [0x7] = 0x2; AddCharMap (' ', 0x7, 2); AddCharMap ('\u00A0', 0x7, 1); for (int i = 9; i <= 0xD; i++) AddCharMap ((char) i, 0x7, 1); for (int i = 0x2000; i <= 0x200B; i++) AddCharMap ((char) i, 0x7, 1); fillIndex [0x7] = 0x17; AddCharMapGroup ('\u2028', 0x7, 1, 0); AddCharMapGroup ('\u2029', 0x7, 1, 0); // Characters which used to represent layout control. // LAMESPEC: Windows developers seem to have thought // that those characters are kind of whitespaces, // while they aren't. 
AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol AddCharMap ('\u2423', 0x7, 1, 0); // open box #endregion // category 09 - continued symbols from 08 fillIndex [0x9] = 2; // misc tech mark for (int cp = 0x2300; cp <= 0x237A; cp++) AddCharMap ((char) cp, 0x9, 1, 0); // arrows byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; foreach (DictionaryEntry de in arrowValues) { int idx = (int) de.Value; int cp = (int) de.Key; if (map [cp].Defined) continue; fillIndex [0x9] = (byte) (0xD8 + idx); AddCharMapGroup ((char) cp, 0x9, 0, arrowLv2 [idx]); arrowLv2 [idx]++; } // boxes byte [] boxLv2 = new byte [128]; // 0-63 will be used for those offsets are positive, // and 64-127 are for negative ones. for (int i = 0; i < boxLv2.Length; i++) boxLv2 [i] = 3; foreach (DictionaryEntry de in boxValues) { int cp = (int) de.Key; int off = (int) de.Value; if (map [cp].Defined) continue; if (off < 0) { fillIndex [0x9] = (byte) (0xE5 + off); AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++); } else { fillIndex [0x9] = (byte) (0xE5 + off); AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [off]++); } } // Some special characters (slanted) fillIndex [0x9] = 0xF4; AddCharMap ('\u2571', 0x9, 3); AddCharMap ('\u2572', 0x9, 3); AddCharMap ('\u2573', 0x9, 3); // FIXME: implement 0A #region Symbols fillIndex [0xA] = 2; // byte currency symbols for (int cp = 0; cp < 0x100; cp++) { uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.CurrencySymbol && cp != '$') AddCharMapGroup ((char) cp, 0xA, 1, 0); } // byte other symbols for (int cp = 0; cp < 0x100; cp++) { if (cp == 0xA6) continue; // SPECIAL: skip FIXME: why? 
uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.OtherSymbol || cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7') AddCharMapGroup ((char) cp, 0xA, 1, 0); } // U+30FB here AddCharMapGroup ('\u30FB', 0xA, 1, 0); for (int cp = 0x2020; cp <= 0x2031; cp++) if (Char.IsPunctuation ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); // SPECIAL CASES: why? AddCharMap ('\u203B', 0xA, 1, 0); AddCharMap ('\u2040', 0xA, 1, 0); AddCharMap ('\u2041', 0xA, 1, 0); AddCharMap ('\u2042', 0xA, 1, 0); for (int cp = 0x20A0; cp <= 0x20AB; cp++) AddCharMap ((char) cp, 0xA, 1, 0); // 3004 is skipped at first... for (int cp = 0x3010; cp <= 0x3040; cp++) if (Char.IsSymbol ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); // SPECIAL CASES: added here AddCharMap ('\u3004', 0xA, 1, 0); AddCharMap ('\u327F', 0xA, 1, 0); for (int cp = 0x2600; cp <= 0x2613; cp++) AddCharMap ((char) cp, 0xA, 1, 0); // Dingbats for (int cp = 0x2620; cp <= 0x2770; cp++) if (Char.IsSymbol ((char) cp)) AddCharMap ((char) cp, 0xA, 1, 0); // OCR for (int i = 0x2440; i < 0x2460; i++) AddCharMap ((char) i, 0xA, 1, 0); // SPECIAL CASES: why? 
AddCharMap ('\u0E3F', 0xA, 1, 0); AddCharMap ('\u2117', 0xA, 1, 0); AddCharMap ('\u20AC', 0xA, 1, 0); #endregion #region Numbers // 0C 02 - 0C E1 fillIndex [0xC] = 2; // 9F8 : Bengali "one less than the denominator" AddCharMap ('\u09F8', 0xC, 1, 0x3C); ArrayList numbers = new ArrayList (); for (int i = 0; i < 65536; i++) if (!IsIgnorable (i) && Char.IsNumber ((char) i) && (i < 0x3190 || 0x32C0 < i)) // they are CJK characters numbers.Add (i); ArrayList numberValues = new ArrayList (); foreach (int i in numbers) numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i])); // SPECIAL CASE: Cyrillic Thousand sign numberValues.Add (new DictionaryEntry (0x0482, 1000m)); numberValues.Sort (DecimalDictionaryValueComparer.Instance); //foreach (DictionaryEntry de in numberValues) //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]); // FIXME: fillIndex adjustment lines are too // complicated. It must be simpler. decimal prevValue = -1; foreach (DictionaryEntry de in numberValues) { int cp = (int) de.Key; decimal currValue = (decimal) de.Value; bool addnew = false; if (prevValue < currValue && prevValue - (int) prevValue == 0 && prevValue >= 1) { addnew = true; // Process Hangzhou and Roman numbers // There are some SPECIAL cases. 
if (currValue != 4) // no increment for 4 fillIndex [0xC]++; int xcp; if (currValue <= 13) { if (currValue == 4) fillIndex [0xC]++; // SPECIAL CASE if (currValue == 11) AddCharMap ('\u0BF0', 0xC, 1); xcp = (int) prevValue + 0x2160 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = (int) prevValue + 0x2170 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC]++; } if (currValue < 12) fillIndex [0xC]++; if (currValue <= 10) { xcp = (int) prevValue + 0x3021 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC]++; } } if (prevValue < currValue) prevValue = currValue; if (map [cp].Defined) continue; // HangZhou and Roman are add later // (code is above) if (0x3021 <= cp && cp < 0x302A || 0x2160 <= cp && cp < 0x216C || 0x2170 <= cp && cp < 0x217C) continue; if (cp == 0x215B) // FIXME: why? fillIndex [0xC] += 2; else if (cp == 0x3021) // FIXME: why? fillIndex [0xC]++; if (addnew || cp <= '9') { int mod = (int) currValue - 1; int xcp; if (1 <= currValue && currValue <= 11) { xcp = mod + 0x2776; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x2780; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x278A; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } if (1 <= currValue && currValue <= 20) { xcp = mod + 0x2460; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x2474; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x2488; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } } if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9) fillIndex [0xC]++; AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true); switch (cp) { // Maybe Bengali digit numbers do not increase // indexes, but 0x09E6 does. 
case 0x09E7: case 0x09E8: case 0x09E9: case 0x09EA: // SPECIAL CASES case 0x0BF0: case 0x2180: case 0x2181: break; // SPECIAL CASE case 0x0BF1: fillIndex [0xC]++; break; default: if (currValue < 11 || currValue == 1000) fillIndex [0xC]++; break; } // Add special cases that are not regarded as // numbers in UnicodeCategory speak. if (cp == '5') { // TONE FIVE AddCharMapGroup ('\u01BD', 0xC, 0, 0); AddCharMapGroup ('\u01BC', 0xC, 1, 0); } else if (cp == '2' || cp == '6') // FIXME: why? fillIndex [0xC]++; } // 221E: infinity fillIndex [0xC] = 0xFF; AddCharMap ('\u221E', 0xC, 1); #endregion #region Letters and NonSpacing Marks (general) // ASCII Latin alphabets for (int i = 0; i < alphabets.Length; i++) AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]); // non-ASCII Latin alphabets // FIXME: there is no such characters that are placed // *after* "alphabets" array items. This is nothing // more than a hack that creates dummy weight for // primary characters. for (int i = 0x0080; i < 0x0300; i++) { if (!Char.IsLetter ((char) i)) continue; // For those Latin Letters which has NFKD are // not added as independent primary character. if (decompIndex [i] != 0) continue; // SPECIAL CASES: // 1.some alphabets have primarily // equivalent ASCII alphabets. // 2.some have independent primary weights, // but inside a-to-z range. // 3.there are some expanded characters that // are not part of Unicode Standard NFKD. // 4. some characters are letter in IsLetter // but not in sortkeys (maybe unicode version // difference caused it). switch (i) { // 1. skipping them does not make sense // case 0xD0: case 0xF0: case 0x131: case 0x138: // case 0x184: case 0x185: case 0x186: case 0x189: // case 0x18D: case 0x18E: case 0x18F: case 0x190: // case 0x194: case 0x195: case 0x196: case 0x19A: // case 0x19B: case 0x19C: // 2. skipping them does not make sense // case 0x14A: // Ng // case 0x14B: // ng // 3. 
case 0xC6: // AE case 0xE6: // ae case 0xDE: // Icelandic Thorn case 0xFE: // Icelandic Thorn case 0xDF: // German ss case 0xFF: // German ss // 4. case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3: // not classified yet // case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9: // case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8: // case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF: // case 0x1DD: continue; } AddCharMapGroup ((char) i, 0xE, 1, 0); } // IPA extensions // FIXME: this results in not equivalent values to // Windows, but is safer for comparison. char [] ipaArray = new char [0x300 - 0x250 + 0x20]; for (int i = 0x40; i < 0x60; i++) if (Char.IsLetter ((char) i)) ipaArray [i - 0x40] = (char) (i); for (int i = 0x250; i < 0x300; i++) if (Char.IsLetter ((char) i)) ipaArray [i - 0x250 + 0x20] = (char) i; Array.Sort (ipaArray, UCAComparer.Instance); int targetASCII = 0; byte latinDiacritical = 0x7B; foreach (char c in ipaArray) { if (c <= 'Z') { targetASCII = c; latinDiacritical = 0x7B; } else map [(int) c] = new CharMapEntry ( 0xE, map [targetASCII].Level1, latinDiacritical++); } // Greek and Coptic // FIXME: this is (mysterious and) incomplete. for (int i = 0x0380; i < 0x0400; i++) if (diacritical [i] == 0 && decompLength [i] == 1 && decompType [i] == DecompositionCompat) diacritical [i] = 3; fillIndex [0xF] = 2; for (int i = 0x0391; i < 0x03AA; i++) if (i != 0x03A2) AddCharMap ((char) i, 0xF, 1, diacritical [i]); fillIndex [0xF] = 2; for (int i = 0x03B1; i < 0x03CA; i++) if (i != 0x03C2) AddCharMap ((char) i, 0xF, 1, diacritical [i]); // Final Sigma map [0x03C2] = new CharMapEntry (0xF, map [0x03C3].Level1, map [0x03C3].Level2); fillIndex [0xF] = 0x40; for (int i = 0x03DA; i < 0x03F0; i++) AddCharMap ((char) i, 0xF, (byte) (i % 2 == 0 ? 0 : 2), diacritical [i]); // NFKD for (int i = 0x0386; i <= 0x0400; i++) FillLetterNFKD (i, true, true); // Cyrillic. // Cyrillic letters are sorted like Latin letters i.e. 
// containing culture-specific letters between the // standard Cyrillic sequence. // // We can't use UCA here; it has different sorting. char [] orderedCyrillic = new char [] { '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0452', // DJE for Serbocroatian '\u0435', '\u0454', // IE for Ukrainian '\u0436', '\u0437', '\u0455', // DZE '\u0438', '\u0456', // Byelorussian-Ukrainian I '\u0457', // YI '\u0439', '\u0458', // JE '\u043A', '\u043B', '\u0459', // LJE '\u043C', '\u043D', '\u045A', // NJE '\u043E', // 4E9 goes here. '\u043F', '\u0440', '\u0441', '\u0442', '\u045B', // TSHE for Serbocroatian '\u0443', '\u045E', // Short U for Byelorussian '\u04B1', // Straight U w/ stroke (diacritical!) '\u0444', '\u0445', '\u0446', '\u0447', '\u045F', // DZHE '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F'}; // For some characters here is a map to basic cyrillic // letters. See UnicodeData.txt character names for // the sources. Here I simply declare an equiv. array. // The content characters are map from U+490(,491), // skipping small letters. char [] cymap_src = new char [] { '\u0433', '\u0433', '\u0433', '\u0436', '\u0437', '\u043A', '\u043A', '\u043A', '\u043A', '\u043D', '\u043D', '\u043F', '\u0445', '\u0441', '\u0442', '\u0443', '\u0443', '\u0445', '\u0446', '\u0447', '\u0447', '\u0432', '\u0435', '\u0435', '\u0406', '\u0436', '\u043A', '\u043D', '\u0447', '\u0435'}; fillIndex [0x10] = 0x8D; for (int i = 0x0460; i < 0x0481; i++) { if (Char.IsLetter ((char) i)) { if (i == 0x0476) // U+476/477 have the same // primary weight as U+474/475. 
fillIndex [0x10] -= 3; AddLetterMap ((char) i, 0x10, 3); } } fillIndex [0x10] = 0x6; for (int i = 0; i < orderedCyrillic.Length; i++) { char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture); if (!IsIgnorable ((int) c) && Char.IsLetter (c) && !map [c].Defined) { AddLetterMap (c, 0x10, 0); fillIndex [0x10] += 3; } } // NFKD for (int i = 0x0401; i <= 0x045F; i++) FillLetterNFKD (i, false, false); for (int i = 0; i < cymap_src.Length; i++) { char c = cymap_src [i]; fillIndex [0x10] = map [c].Level1; int c2 = 0x0490 + i * 2; AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false); } // Armenian fillIndex [0x11] = 0x3; fillIndex [0x1] = 0x98; for (int i = 0x0531; i < 0x0586; i++) { if (i == 0x0559 || i == 0x55A) AddCharMap ((char) i, 1, 1); if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x11, 1); } // Hebrew // -Letters fillIndex [0x12] = 0x2; for (int i = 0x05D0; i < 0x05FF; i++) if (Char.IsLetter ((char) i)) { if (isUppercase [i]) { fillIndex [0x12]--; AddLetterMap ((char) i, 0x12, 2); } else AddLetterMap ((char) i, 0x12, 1); } // -Accents fillIndex [0x1] = 0x3; for (int i = 0x0591; i <= 0x05C2; i++) { if (i == 0x05A3 || i == 0x05BB) fillIndex [0x1]++; if (i != 0x05BE) AddCharMap ((char) i, 0x1, 1); } // Arabic fillIndex [0x1] = 0x8E; fillIndex [0x13] = 0x3; for (int i = 0x0621; i <= 0x064A; i++) { // Abjad if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.OtherLetter) { // FIXME: arabic nonspacing marks are // in different order. 
AddCharMap ((char) i, 0x1, 1); continue; } // map [i] = new CharMapEntry (0x13, // (byte) arabicLetterPrimaryValues [i], 1); fillIndex [0x13] = (byte) arabicLetterPrimaryValues [i]; byte formDiacritical = 8; // default // SPECIAL CASES: switch (i) { case 0x0622: formDiacritical = 9; break; case 0x0623: formDiacritical = 0xA; break; case 0x0624: formDiacritical = 5; break; case 0x0625: formDiacritical = 0xB; break; case 0x0626: formDiacritical = 7; break; case 0x0649: formDiacritical = 5; break; case 0x064A: formDiacritical = 7; break; } // AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false); AddArabicCharMap ((char) i, 0x13, 1, formDiacritical); } for (int i = 0x0670; i < 0x0673; i++) map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670)); fillIndex [0x13] = 0x84; for (int i = 0x0674; i < 0x06D6; i++) if (Char.IsLetter ((char) i)) AddLetterMapCore ((char) i, 0x13, 1, 0, false); // Devanagari // FIXME: this could be fixed in more decent way for (int i = 0x0958; i <= 0x095F; i++) diacritical [i] = 8; // FIXME: it does seem straight codepoint mapping. 
fillIndex [0x14] = 04; for (int i = 0x0901; i < 0x0905; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); fillIndex [0x14] = 0xB; for (int i = 0x0905; i < 0x093A; i++) { if (i == 0x0928) AddCharMap ('\u0929', 0x14, 0, 8); if (i == 0x0930) AddCharMap ('\u0931', 0x14, 0, 8); if (i == 0x0933) AddCharMap ('\u0934', 0x14, 0, 8); if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x14, 4); if (i == 0x090B) AddCharMap ('\u0960', 0x14, 4); if (i == 0x090C) AddCharMap ('\u0961', 0x14, 4); } fillIndex [0x14] = 0xDA; for (int i = 0x093E; i < 0x0945; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); fillIndex [0x14] = 0xEC; for (int i = 0x0945; i < 0x094F; i++) if (!IsIgnorable (i)) AddLetterMap ((char) i, 0x14, 2); // Bengali // -Letters fillIndex [0x15] = 02; for (int i = 0x0980; i < 0x9FF; i++) { if (IsIgnorable (i)) continue; if (i == 0x09E0) fillIndex [0x15] = 0x3B; switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: case UnicodeCategory.OtherNumber: continue; } AddLetterMap ((char) i, 0x15, 1); } // -Signs fillIndex [0x1] = 0x3; for (int i = 0x0981; i < 0x0A00; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 0x1, 1); // Gurmukhi. orderedGurmukhi is from UCA // FIXME: it does not look equivalent to UCA. fillIndex [0x16] = 04; fillIndex [0x1] = 3; for (int i = 0; i < orderedGurmukhi.Length; i++) { char c = orderedGurmukhi [i]; if (IsIgnorable ((int) c)) continue; if (IsIgnorableNonSpacing (c)) { AddLetterMap (c, 0x1, 1); continue; } if (c == '\u0A3C' || c == '\u0A4D' || '\u0A66' <= c && c <= '\u0A71') continue; // SPECIAL CASES byte shift = 4; switch (c) { case '\u0A33': case '\u0A36': case '\u0A16': case '\u0A17': case '\u0A5B': case '\u0A5E': shift = 0; break; } if (c == '\u0A3E') // Skip fillIndex [0x16] = 0xC0; AddLetterMap (c, 0x16, shift); } // Gujarati. 
orderedGujarati is from UCA fillIndex [0x17] = 0x4; // nonspacing marks map [0x0A4D] = new CharMapEntry (1, 0, 0x3); map [0x0ABD] = new CharMapEntry (1, 0, 0x3); map [0x0A3C] = new CharMapEntry (1, 0, 0x4); map [0x0A71] = new CharMapEntry (1, 0, 0x6); map [0x0ABC] = new CharMapEntry (1, 0, 0xB); map [0x0A70] = new CharMapEntry (1, 0, 0xE); // letters go first. for (int i = 0; i < orderedGujarati.Length; i++) { // SPECIAL CASE char c = orderedGujarati [i]; if (Char.IsLetter (c)) { // SPECIAL CASES if (c == '\u0AB3' || c == '\u0A32') continue; if (c == '\u0A33') { AddCharMap ('\u0A32', 0x17, 0); AddCharMap ('\u0A33', 0x17, 4, 4); continue; } if (c == '\u0A8B') AddCharMap ('\u0AE0', 0x17, 0, 5); AddCharMap (c, 0x17, 4); if (c == '\u0AB9') AddCharMap ('\u0AB3', 0x17, 6); } } // non-letters byte gujaratiShift = 4; fillIndex [0x17] = 0xC0; for (int i = 0; i < orderedGujarati.Length; i++) { char c = orderedGujarati [i]; if (fillIndex [0x17] == 0xCC) gujaratiShift = 3; if (!Char.IsLetter (c)) { // SPECIAL CASES if (c == '\u0A82') AddCharMap ('\u0A81', 0x17, 2); if (c == '\u0AC2') fillIndex [0x17]++; AddLetterMap (c, 0x17, gujaratiShift); } } // Oriya fillIndex [0x1] = 03; fillIndex [0x18] = 02; for (int i = 0x0B00; i < 0x0B7F; i++) { switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.NonSpacingMark: case UnicodeCategory.DecimalDigitNumber: AddLetterMap ((char) i, 0x1, 1); continue; } AddLetterMapCore ((char) i, 0x18, 1, 0, true); } // Tamil fillIndex [0x19] = 2; AddCharMap ('\u0BD7', 0x19, 0); fillIndex [0x19] = 0xA; // vowels for (int i = 0x0B82; i <= 0x0B94; i++) if (!IsIgnorable ((char) i)) AddCharMap ((char) i, 0x19, 2); // special vowel fillIndex [0x19] = 0x28; // The array for Tamil consonants is a constant. // Windows have almost similar sequence to TAM from // tamilnet but a bit different in Grantha. 
for (int i = 0; i < orderedTamilConsonants.Length; i++) AddLetterMap (orderedTamilConsonants [i], 0x19, 4); // combining marks fillIndex [0x19] = 0x82; for (int i = 0x0BBE; i < 0x0BCD; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.SpacingCombiningMark || i == 0x0BC0) AddLetterMap ((char) i, 0x19, 2); // Telugu fillIndex [0x1A] = 0x4; for (int i = 0x0C00; i < 0x0C62; i++) { if (i == 0x0C55 || i == 0x0C56) continue; // skip AddCharMap ((char) i, 0x1A, 3); char supp = (i == 0x0C0B) ? '\u0C60': i == 0x0C0C ? '\u0C61' : char.MinValue; if (supp == char.MinValue) continue; AddCharMap (supp, 0x1A, 3); } // Kannada fillIndex [0x1B] = 4; for (int i = 0x0C80; i < 0x0CE5; i++) { if (i == 0x0CD5 || i == 0x0CD6) continue; // ignore if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE) continue; // shift after 0xCB9 AddCharMap ((char) i, 0x1B, 3); if (i == 0x0CB9) { // SPECIAL CASES: but why? AddCharMap ('\u0CB1', 0x1B, 3); // RRA AddCharMap ('\u0CB3', 0x1B, 3); // LLA AddCharMap ('\u0CDE', 0x1B, 3); // FA } if (i == 0x0CB2) AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL } // Malayalam fillIndex [0x1C] = 2; fillIndex [0x1] = 3; for (int i = 0x0D02; i < 0x0D61; i++) { // FIXME: I avoided MSCompatUnicodeTable usage // here (it results in recursion). So check if // using NonSpacingMark makes sense or not. if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) // if (!MSCompatUnicodeTable.IsIgnorable ((char) i)) AddCharMap ((char) i, 0x1C, 1); else if (!IsIgnorable ((char) i)) AddCharMap ((char) i, 1, 1); } // Thai ... note that it breaks 0x1E wall after E2B! // Also, all Thai characters have level 2 value 3. 
fillIndex [0x1E] = 2; fillIndex [0x1] = 3; for (int i = 0xE40; i <= 0xE44; i++) AddCharMap ((char) i, 0x1E, 1, 3); for (int i = 0xE01; i < 0xE2B; i++) AddCharMap ((char) i, 0x1E, 6, 3); fillIndex [0x1F] = 5; for (int i = 0xE2B; i < 0xE30; i++) AddCharMap ((char) i, 0x1F, 6, 3); fillIndex [0x1F] = 0x1E; for (int i = 0xE30; i < 0xE3B; i++) AddCharMap ((char) i, 0x1F, 1, 3); // some Thai characters remains. char [] specialThai = new char [] {'\u0E45', '\u0E46', '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'}; foreach (char c in specialThai) AddCharMap (c, 0x1F, 1, 3); for (int i = 0xE00; i < 0xE80; i++) if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); // Lao fillIndex [0x1F] = 2; fillIndex [0x1] = 3; for (int i = 0xE80; i < 0xEDF; i++) { if (IsIgnorable ((char) i)) continue; else if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x1F, 1); else if (Char.GetUnicodeCategory ((char) i) == UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 1, 1); } // Georgian. orderedGeorgian is from UCA DUCET. fillIndex [0x21] = 5; for (int i = 0; i < orderedGeorgian.Length; i++) { char c = orderedGeorgian [i]; if (map [(int) c].Defined) continue; AddCharMap (c, 0x21, 0); if (c < '\u10F6') AddCharMap ((char) (c - 0x30), 0x21, 0); fillIndex [0x21] += 5; } // Japanese Kana. fillIndex [0x22] = 2; int kanaOffset = 0x3041; byte [] kanaLines = new byte [] {2, 2, 2, 2, 1, 3, 1, 2, 1}; for (int gyo = 0; gyo < 9; gyo++) { for (int dan = 0; dan < 5; dan++) { if (gyo == 7 && dan % 2 == 1) { // 'ya'-gyo fillIndex [0x22]++; kanaOffset -= 2; // There is no space for yi and ye. 
continue; } int cp = kanaOffset + dan * kanaLines [gyo]; // small lines (a-gyo, ya-gyo) if (gyo == 0 || gyo == 7) { AddKanaMap (cp, 1); // small AddKanaMap (cp + 1, 1); } else AddKanaMap (cp, kanaLines [gyo]); fillIndex [0x22]++; if (cp == 0x30AB) { // add small 'ka' (before normal one) AddKanaMap (0x30F5, 1); kanaOffset++; } if (cp == 0x30B1) { // add small 'ke' (before normal one) AddKanaMap (0x30F6, 1); kanaOffset++; } if (cp == 0x3061) { // add small 'Tsu' (before normal one) AddKanaMap (0x3063, 1); kanaOffset++; } } fillIndex [0x22] += 3; kanaOffset += 5 * kanaLines [gyo]; } // Wa-gyo is almost special, so I just manually add. AddLetterMap ((char) 0x308E, 0x22, 0); AddLetterMap ((char) (0x308E + 0x60), 0x22, 0); AddLetterMap ((char) 0x308F, 0x22, 0); AddLetterMap ((char) (0x308F + 0x60), 0x22, 0); fillIndex [0x22]++; AddLetterMap ((char) 0x3090, 0x22, 0); AddLetterMap ((char) (0x3090 + 0x60), 0x22, 0); fillIndex [0x22] += 2; // no "Wu" in Japanese. AddLetterMap ((char) 0x3091, 0x22, 0); AddLetterMap ((char) (0x3091 + 0x60), 0x22, 0); fillIndex [0x22]++; AddLetterMap ((char) 0x3092, 0x22, 0); AddLetterMap ((char) (0x3092 + 0x60), 0x22, 0); // Nn fillIndex [0x22] = 0x80; AddLetterMap ((char) 0x3093, 0x22, 0); AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0); map [0x3094] = new CharMapEntry (map [0x30A6].Category, map [0x30A6].Level1, 3);// voiced hiragana U map [0x30F4] = new CharMapEntry (map [0x30A6].Category, map [0x30A6].Level1, 3);// voiced katakana U map [0x30F5] = new CharMapEntry (map [0x30AB].Category, map [0x30AB].Level1, 0);// small katakana Ka map [0x30F6] = new CharMapEntry (map [0x30B1].Category, map [0x30B1].Level1, 0);// small katakana Ke // voiced Wa lines for (int i = 0x30F7; i < 0x30FB; i++) map [i] = new CharMapEntry (map [i - 8].Category, map [i - 8].Level1, 3); // JIS Japanese square chars. 
fillIndex [0x22] = 0x97; jisJapanese.Sort (JISComparer.Instance); foreach (JISCharacter j in jisJapanese) if (0x3300 <= j.CP && j.CP <= 0x3357) AddCharMap ((char) j.CP, 0x22, 1); // non-JIS Japanese square chars. nonJisJapanese.Sort (NonJISComparer.Instance); foreach (NonJISCharacter j in nonJisJapanese) AddCharMap ((char) j.CP, 0x22, 1); // Bopomofo fillIndex [0x23] = 0x02; for (int i = 0x3105; i <= 0x312C; i++) AddCharMap ((char) i, 0x23, 1); // Estrangela: ancient Syriac fillIndex [0x24] = 0x0B; // FIXME: is 0x71E really alternative form? ArrayList syriacAlternatives = new ArrayList ( new int [] {0x714, 0x716, 0x71C, 0x71E, 0x724, 0x727}); for (int i = 0x0710; i <= 0x072C; i++) { if (i == 0x0711) // NonSpacingMark continue; if (syriacAlternatives.Contains (i)) continue; AddCharMap ((char) i, 0x24, 4); // FIXME: why? if (i == 0x721) fillIndex [0x24]++; } foreach (int cp in syriacAlternatives) map [cp] = new CharMapEntry (0x24, (byte) (map [cp - 1].Level1 + 2), 0); // FIXME: Syriac NonSpacingMark should go here. // Thaana // FIXME: it turned out that it does not look like UCA fillIndex [0x24] = 0x6E; fillIndex [0x1] = 0xAC; for (int i = 0; i < orderedThaana.Length; i++) { char c = orderedThaana [i]; if (IsIgnorableNonSpacing ((int) c)) AddCharMap (c, 1, 1); AddCharMap (c, 0x24, 2); if (c == '\u0782') // SPECIAL CASE: why? fillIndex [0x24] += 2; } #endregion // FIXME: Add more culture-specific letters (that are // not supported in Windows collation) here. // Surrogate ... they are computed. #region Hangul // Hangul. // // Unlike UCA Windows Hangul sequence mixes Jongseong // with Choseong sequence as well as Jungseong, // adjusted to have the same primary weight for the // same base character. So it is impossible to compute // those sort keys. // // Here I introduce an ordered sequence of mixed // 'commands' and 'characters' that is similar to // LDML text: // - ',' increases primary weight. 
// - [A B] means a range, increasing index // - {A B} means a range, without increasing index // - '=' is no operation (it means the characters // of both sides have the same weight). // - '>' inserts a Hangul Syllable block that // contains 0x251 characters. // - '<' decreases the index // - '0'-'9' means skip count // - whitespaces are ignored // string hangulSequence = "\u1100=\u11A8 > \u1101=\u11A9 >" + "\u11C3, \u11AA, \u11C4, \u1102=\u11AB >" + "<{\u1113 \u1116}, \u3165," + "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8," + "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >" + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >" + "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1," + "[\u11D1 \u11D2], \u11B2," + "[\u11D3 \u11D5], \u11B3," + "[\u11D6 \u11D7], \u11B4, \u11B5," + "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >" + "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >" + "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >" + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, " + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178," + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>" + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C " + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >" + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB," + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >" + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, " + "\u11F1,, \u11F2,,," + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >" + "<\u114D, \u110D,, >" + "<{\u114E \u1151},, \u110E=\u11BE,, >" + "<{\u1152 \u1155},,, \u110F=\u11BF >" + "\u1110=\u11C0 > \u1111=\u11C1 >" + "<\u1156=\u1157, \u11F3, \u11F4, \u1112=\u11C2 >" + "<\u1158=\u1159=\u115F, \u3185, \u11F9," + "[\u11F5 \u11F8]" ; byte hangulCat = 0x52; fillIndex [hangulCat] = 0x2; int syllableBlock = 0; for (int n = 0; n < hangulSequence.Length; n++) { char c = hangulSequence [n]; int start, end; if (Char.IsWhiteSpace (c)) continue; switch (c) { case '=': break; // NOP case ',': IncrementSequentialIndex (ref hangulCat); break; case '<': if (fillIndex 
[hangulCat] == 2) throw new Exception ("FIXME: handle it correctly (yes it is hacky, it is really unfortunate)."); fillIndex [hangulCat]--; break; case '>': IncrementSequentialIndex (ref hangulCat); for (int l = 0; l < 0x15; l++) for (int v = 0; v < 0x1C; v++) { AddCharMap ( (char) (0xAC00 + syllableBlock * 0x1C * 0x15 + l * 0x1C + v), hangulCat, 0); IncrementSequentialIndex (ref hangulCat); } syllableBlock++; break; case '[': start = hangulSequence [n + 1]; end = hangulSequence [n + 3]; for (int i = start; i <= end; i++) { AddCharMap ((char) i, hangulCat, 0); if (end > i) IncrementSequentialIndex (ref hangulCat); } n += 4; // consumes 5 characters for this operation break; case '{': start = hangulSequence [n + 1]; end = hangulSequence [n + 3]; for (int i = start; i <= end; i++) AddCharMap ((char) i, hangulCat, 0); n += 4; // consumes 5 characters for this operation break; default: AddCharMap (c, hangulCat, 0); break; } } // Some Jamo NFKD. for (int i = 0x3200; i < 0x3300; i++) { if (IsIgnorable (i) || map [i].Defined) continue; int ch = 0; // w/ bracket if (decompLength [i] == 4 && decompValues [decompIndex [i]] == '(') ch = decompIndex [i] + 1; // circled else if (decompLength [i] == 2 && decompValues [decompIndex [i] + 1] == '\u1161') ch = decompIndex [i]; else if (decompLength [i] == 1) ch = decompIndex [i]; else continue; ch = decompValues [ch]; if (ch < 0x1100 || 0x1200 < ch && ch < 0xAC00 || 0xD800 < ch) continue; // SPECIAL CASE ? int offset = i < 0x3260 ? 
1 : 0; if (0x326E <= i && i <= 0x3273) offset = 1; map [i] = new CharMapEntry (map [ch].Category, (byte) (map [ch].Level1 + offset), map [ch].Level2); // Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]); } #endregion // Letterlike characters and CJK compatibility square sortableCharNames.Sort (StringDictionaryValueComparer.Instance); int [] counts = new int ['Z' - 'A' + 1]; char [] namedChars = new char [sortableCharNames.Count]; int nCharNames = 0; foreach (DictionaryEntry de in sortableCharNames) { counts [((string) de.Value) [0] - 'A']++; namedChars [nCharNames++] = (char) ((int) de.Key); } nCharNames = 0; // reset for (int a = 0; a < counts.Length; a++) { fillIndex [0xE] = (byte) (alphaWeights [a + 1] - counts [a]); for (int i = 0; i < counts [a]; i++) //Console.Error.WriteLine ("---- {0:X04} : {1:x02} / {2} {3}", (int) namedChars [nCharNames], fillIndex [0xE], ((DictionaryEntry) sortableCharNames [nCharNames]).Value, Char.GetUnicodeCategory (namedChars [nCharNames])); AddCharMap (namedChars [nCharNames++], 0xE, 1); } // CJK unified ideograph. byte cjkCat = 0x9E; fillIndex [cjkCat] = 0x2; for (int cp = 0x4E00; cp <= 0x9FBB; cp++) if (!IsIgnorable (cp)) AddCharMapGroupCJK ((char) cp, ref cjkCat); // CJK Extensions goes here. // LAMESPEC: With this Windows style CJK layout, it is // impossible to add more CJK ideograph i.e. 0x9FA6- // 0x9FBB can never be added w/o breaking compat. for (int cp = 0xF900; cp <= 0xFA2D; cp++) if (!IsIgnorable (cp)) AddCharMapGroupCJK ((char) cp, ref cjkCat); // PrivateUse ... computed. // remaining Surrogate ... computed. #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07 // non-alphanumeric ASCII except for: + - < = > ' for (int i = 0x21; i < 0x7F; i++) { // SPECIAL CASE: 02C6 looks regarded as // equivalent to '^', which does not conform // to Unicode standard character database. 
if (i == 0x005B) AddCharMap ('\u2045', 0x7, 0, 0x1C); if (i == 0x005D) AddCharMap ('\u2046', 0x7, 0, 0x1C); if (i == 0x005E) AddCharMap ('\u02C6', 0x7, 0, 3); if (i == 0x0060) AddCharMap ('\u02CB', 0x7, 0, 3); if (Char.IsLetterOrDigit ((char) i) || "+-<=>'".IndexOf ((char) i) >= 0) continue; // they are not added here. AddCharMapGroup2 ((char) i, 0x7, 1, 0); // Insert 3001 after ',' and 3002 after '.' if (i == 0x2C) AddCharMapGroup2 ('\u3001', 0x7, 1, 0); else if (i == 0x2E) AddCharMapGroup2 ('\u3002', 0x7, 1, 0); else if (i == 0x3A) AddCharMap ('\uFE30', 0x7, 1, 0); } #endregion #region 07 - Punctuations and something else for (int i = 0xA0; i < char.MaxValue; i++) { if (IsIgnorable (i)) continue; // FIXME: actually those reset should not be // done but here I put for easy goal. if (i == 0x05C3) fillIndex [0x7]++; if (i == 0x0700) fillIndex [0x7] = 0xE2; if (i == 0x2016) fillIndex [0x7] = 0x77; if (i == 0x3008) fillIndex [0x7] = 0x93; if (0x02C8 <= i && i <= 0x02CD) continue; // nonspacing marks // SPECIAL CASE: maybe they could be allocated // dummy NFKD mapping and no special processing // would be required here. 
if (i == 0x00AF) AddCharMap ('\u02C9', 0x7, 0, 3); if (i == 0x00B4) AddCharMap ('\u02CA', 0x7, 0, 3); if (i == 0x02C7) AddCharMap ('\u02D8', 0x7, 0, 3); // SPECIAL CASES: switch (i) { case 0xAB: // 08 case 0xB7: // 0A case 0xBB: // 08 case 0x02B9: // 01 case 0x02BA: // 01 case 0x2329: // 09 case 0x232A: // 09 continue; } switch (Char.GetUnicodeCategory ((char) i)) { case UnicodeCategory.OtherPunctuation: case UnicodeCategory.ClosePunctuation: case UnicodeCategory.OpenPunctuation: case UnicodeCategory.ConnectorPunctuation: case UnicodeCategory.InitialQuotePunctuation: case UnicodeCategory.FinalQuotePunctuation: case UnicodeCategory.ModifierSymbol: // SPECIAL CASES: // 0xA if (0x2020 <= i && i <= 0x2031) continue; if (i == 0x3003) // added later continue; AddCharMapGroup2 ((char) i, 0x7, 1, 0); break; default: if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why? goto case UnicodeCategory.OtherPunctuation; break; } } // Control pictures // FIXME: it should not need to reset level 1, but // it's for easy goal. fillIndex [0x7] = 0xB6; for (int i = 0x2400; i <= 0x2424; i++) AddCharMap ((char) i, 0x7, 1, 0); // FIXME: what are they? AddCharMap ('\u3003', 0x7, 1); AddCharMap ('\u3006', 0x7, 1); AddCharMap ('\u02D0', 0x7, 1); AddCharMap ('\u10FB', 0x7, 1); AddCharMap ('\u0950', 0x7, 1); AddCharMap ('\u093D', 0x7, 1); AddCharMap ('\u0964', 0x7, 1); AddCharMap ('\u0965', 0x7, 1); AddCharMap ('\u0970', 0x7, 1); #endregion #region category 08 - symbols fillIndex [0x8] = 2; // Here Windows mapping is not straightforward. It is // not based on computation but seems manual sorting. 
AddCharMapGroup ('+', 0x8, 1, 0); // plus AddCharMapGroup ('\u2212', 0x8, 1); // minus AddCharMapGroup ('\u229D', 0x8, 1); // minus AddCharMapGroup ('\u2297', 0x8, 1); // mul AddCharMapGroup ('\u2044', 0x8, 1); // div AddCharMapGroup ('\u2215', 0x8, 0); // div AddCharMapGroup ('\u2298', 0x8, 1); // div slash AddCharMapGroup ('\u2217', 0x8, 0); // mul AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper AddCharMapGroup ('\u2218', 0x8, 0); // ring AddCharMapGroup ('\u229A', 0x8, 1); // ring AddCharMapGroup ('\u2219', 0x8, 0); // bullet AddCharMapGroup ('\u2299', 0x8, 1); // dot oper AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus AddCharMapGroup ('\u003C', 0x8, 1); // < AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation for (int cp = 0; cp < 0x2300; cp++) { if (cp == 0xAC) // SPECIAL CASE: skip continue; if (cp == 0x200) { cp = 0x2200; // skip to 2200 fillIndex [0x8] = 0x21; } if (cp == 0x2295) fillIndex [0x8] = 0x3; if (cp == 0x22A2) fillIndex [0x8] = 0xAB; if (cp == 0x22B2) fillIndex [0x8] = 0xB9; if (!map [cp].Defined && // Char.GetUnicodeCategory ((char) cp) == // UnicodeCategory.MathSymbol) Char.IsSymbol ((char) cp)) AddCharMapGroup ((char) cp, 0x8, 1); // SPECIAL CASES: no idea why Windows sorts as such switch (cp) { case 0x3E: AddCharMap ('\u227B', 0x8, 1, 0); AddCharMap ('\u22B1', 0x8, 1, 0); break; case 0xB1: AddCharMapGroup ('\u00AB', 0x8, 1); AddCharMapGroup ('\u226A', 0x8, 1); AddCharMapGroup ('\u00BB', 0x8, 1); AddCharMapGroup ('\u226B', 0x8, 1); break; case 0xF7: AddCharMap ('\u01C0', 0x8, 1, 0); AddCharMap ('\u01C1', 0x8, 1, 0); AddCharMap ('\u01C2', 0x8, 1, 0); break; } } #endregion #region Hack! 
// Characters w/ diacritical marks (NFKD) for (int i = 0; i <= char.MaxValue; i++) { if (map [i].Defined || IsIgnorable (i)) continue; if (decompIndex [i] == 0) continue; int start = decompIndex [i]; int primaryChar = decompValues [start]; int secondary = diacritical [i]; bool skip = false; int length = decompLength [i]; // special processing for parenthesized ones. if (length == 3 && decompValues [start] == '(' && decompValues [start + 2] == ')') { primaryChar = decompValues [start + 1]; length = 1; } if (map [primaryChar].Level1 == 0) continue; for (int l = 1; l < length; l++) { int c = decompValues [start + l]; if (map [c].Level1 != 0) skip = true; secondary += diacritical [c]; } if (skip) continue; map [i] = new CharMapEntry ( map [primaryChar].Category, map [primaryChar].Level1, (byte) secondary); } // Diacritical weight adjustment // Arabic Hamzah diacritical [0x624] = 0x5; diacritical [0x626] = 0x7; diacritical [0x622] = 0x9; diacritical [0x623] = 0xA; diacritical [0x625] = 0xB; diacritical [0x649] = 0x5; // 'alif maqs.uurah diacritical [0x64A] = 0x7; // Yaa' for (int i = 0; i < char.MaxValue; i++) { byte mod = 0; byte cat = map [i].Category; switch (cat) { case 0xE: // Latin diacritics case 0x22: // Japanese: circled characters mod = diacritical [i]; break; case 0x13: // Arabic if (i == 0x0621) break; // 0 if (diacritical [i] == 0 && decompLength [i] != 0) diacritical [i] = map [decompValues [decompIndex [i]]].Level2; if (diacritical [i] == 0 && i >= 0xFE8D) mod = 0x8; // default for arabic break; } if (0x52 <= cat && cat <= 0x7F) // Hangul mod = diacritical [i]; if (mod > 0) map [i] = new CharMapEntry ( cat, map [i].Level1, mod); } // FIXME: this is halfly hack but those NonSpacingMark // characters and still undefined are likely to // be nonspacing. for (int i = 0; i < char.MaxValue; i++) { if (map [i].Defined || IsIgnorable (i)) continue; switch (i) { // SPECIAL CASES. 
case 0x02B9: case 0x02BA: break; default: if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) continue; break; } if (diacritical [i] != 0) map [i] = new CharMapEntry (1, 1, diacritical [i]); else AddCharMap ((char) i, 1, 1); } #endregion }