SCount = LCount * NCount; // 11172

/// <summary>
/// For use in an applet: just load a minimal set of data.
/// </summary>
/// <remarks>
/// Fills the tables from two hard-coded arrays instead of the Unicode
/// Character Database. <c>decomposeData</c> is a flat list of triples
/// (character, decomposition, "K" for compatibility or "" for canonical) and
/// <c>classData</c> is a flat list of (code point, canonical class) pairs.
/// </remarks>
/// <param name="canonicalClass">Receives the code point to canonical class entries from <c>classData</c>.</param>
/// <param name="decompose">Receives every character to decomposition mapping.</param>
/// <param name="compose">Receives the reverse mapping (packed pair to composite) for canonical entries only.</param>
/// <param name="isCompatibility">Gets a bit set for every character whose entry is flagged "K".</param>
/// <param name="isExcluded">Not consulted by this method; kept so the signature parallels <c>BuildDecompositionTables</c>.</param>
private static void SetMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
    LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
{
    String[] decomposeData = {
        "\u005E", "\u0020\u0302", "K",
        "\u005F", "\u0020\u0332", "K",
        "\u0060", "\u0020\u0300", "K",
        "\u00A0", "\u0020", "K",
        "\u00A8", "\u0020\u0308", "K",
        "\u00AA", "\u0061", "K",
        "\u00AF", "\u0020\u0304", "K",
        "\u00B2", "\u0032", "K",
        "\u00B3", "\u0033", "K",
        "\u00B4", "\u0020\u0301", "K",
        "\u00B5", "\u03BC", "K",
        "\u00B8", "\u0020\u0327", "K",
        "\u00B9", "\u0031", "K",
        "\u00BA", "\u006F", "K",
        "\u00BC", "\u0031\u2044\u0034", "K",
        "\u00BD", "\u0031\u2044\u0032", "K",
        "\u00BE", "\u0033\u2044\u0034", "K",
        "\u00C0", "\u0041\u0300", "",
        "\u00C1", "\u0041\u0301", "",
        "\u00C2", "\u0041\u0302", "",
        "\u00C3", "\u0041\u0303", "",
        "\u00C4", "\u0041\u0308", "",
        "\u00C5", "\u0041\u030A", "",
        "\u00C7", "\u0043\u0327", "",
        "\u00C8", "\u0045\u0300", "",
        "\u00C9", "\u0045\u0301", "",
        "\u00CA", "\u0045\u0302", "",
        "\u00CB", "\u0045\u0308", "",
        "\u00CC", "\u0049\u0300", "",
        "\u00CD", "\u0049\u0301", "",
        "\u00CE", "\u0049\u0302", "",
        "\u00CF", "\u0049\u0308", "",
        "\u00D1", "\u004E\u0303", "",
        "\u00D2", "\u004F\u0300", "",
        "\u00D3", "\u004F\u0301", "",
        "\u00D4", "\u004F\u0302", "",
        "\u00D5", "\u004F\u0303", "",
        "\u00D6", "\u004F\u0308", "",
        "\u00D9", "\u0055\u0300", "",
        "\u00DA", "\u0055\u0301", "",
        "\u00DB", "\u0055\u0302", "",
        "\u00DC", "\u0055\u0308", "",
        "\u00DD", "\u0059\u0301", "",
        "\u00E0", "\u0061\u0300", "",
        "\u00E1", "\u0061\u0301", "",
        "\u00E2", "\u0061\u0302", "",
        "\u00E3", "\u0061\u0303", "",
        "\u00E4", "\u0061\u0308", "",
        "\u00E5", "\u0061\u030A", "",
        "\u00E7", "\u0063\u0327", "",
        "\u00E8", "\u0065\u0300", "",
        "\u00E9", "\u0065\u0301", "",
        "\u00EA", "\u0065\u0302", "",
        "\u00EB", "\u0065\u0308", "",
        "\u00EC", "\u0069\u0300", "",
        "\u00ED", "\u0069\u0301", "",
        "\u00EE", "\u0069\u0302", "",
        "\u00EF", "\u0069\u0308", "",
        "\u00F1", "\u006E\u0303", "",
        "\u00F2", "\u006F\u0300", "",
        "\u00F3", "\u006F\u0301", "",
        "\u00F4", "\u006F\u0302", "",
        "\u00F5", "\u006F\u0303", "",
        "\u00F6", "\u006F\u0308", "",
        "\u00F9", "\u0075\u0300", "",
        "\u00FA", "\u0075\u0301", "",
        "\u00FB", "\u0075\u0302", "",
        "\u00FC", "\u0075\u0308", "",
        "\u00FD", "\u0079\u0301", "",
        // EXTRAS, outside of Latin 1
        "\u1EA4", "\u00C2\u0301", "",
        "\u1EA5", "\u00E2\u0301", "",
        "\u1EA6", "\u00C2\u0300", "",
        "\u1EA7", "\u00E2\u0300", "",
    };

    int[] classData = {
        0x0300, 230, 0x0301, 230, 0x0302, 230, 0x0303, 230,
        0x0304, 230, 0x0305, 230, 0x0306, 230, 0x0307, 230,
        0x0308, 230, 0x0309, 230, 0x030A, 230, 0x030B, 230,
        0x030C, 230, 0x030D, 230, 0x030E, 230, 0x030F, 230,
        0x0310, 230, 0x0311, 230, 0x0312, 230, 0x0313, 230,
        0x0314, 230, 0x0315, 232, 0x0316, 220, 0x0317, 220,
        0x0318, 220, 0x0319, 220, 0x031A, 232, 0x031B, 216,
        0x031C, 220, 0x031D, 220, 0x031E, 220, 0x031F, 220,
        0x0320, 220, 0x0321, 202, 0x0322, 202, 0x0323, 220,
        0x0324, 220, 0x0325, 220, 0x0326, 220, 0x0327, 202,
        0x0328, 202, 0x0329, 220, 0x032A, 220, 0x032B, 220,
        0x032C, 220, 0x032D, 220, 0x032E, 220, 0x032F, 220,
        0x0330, 220, 0x0331, 220, 0x0332, 220, 0x0333, 220,
        0x0334, 1, 0x0335, 1, 0x0336, 1, 0x0337, 1,
        0x0338, 1, 0x0339, 220, 0x033A, 220, 0x033B, 220,
        0x033C, 220, 0x033D, 230, 0x033E, 230, 0x033F, 230,
        0x0340, 230, 0x0341, 230, 0x0342, 230, 0x0343, 230,
        0x0344, 230, 0x0345, 240, 0x0360, 234, 0x0361, 234
    };

    // build the same tables we would otherwise get from the
    // Unicode Character Database, just with limited data
    for (int i = 0; i < decomposeData.Length; i += 3)
    {
        char value = decomposeData[i][0];
        String decomp = decomposeData[i + 1];
        bool compat = "K".Equals(decomposeData[i + 2], StringComparison.Ordinal);

        if (compat)
        {
            isCompatibility.Set(value);
        }
        decompose.Put(value, decomp);

        if (compat)
        {
            // only canonical decompositions take part in composition
            continue;
        }

        // A singleton decomposition is stored as the pair (U+0000, cp).
        int first = '\u0000';
        int second = UTF16Util.NextCodePoint(decomp, 0);
        if (decomp.Length > 1)
        {
            first = second;
            second = UTF16Util.NextCodePoint(decomp, UTF16Util.CodePointLength(first));
        }

        // Pack the pair the same way BuildDecompositionTables does (first in
        // the high 32 bits, second in the low 32 bits) so both code paths
        // produce identical keys in the compose table. The shift must happen
        // in 64-bit arithmetic; the previous "(first << 16) | second" was
        // evaluated as an int and used a different key layout.
        long pair = ((long)first << 32) | (uint)second;
        compose.Put(pair, value);
    }

    // classData is a flat list of (code point, canonical class) pairs
    for (int i = 0; i < classData.Length; i += 2)
    {
        canonicalClass.Put(classData[i], classData[i + 1]);
    }
}
/// <summary>
/// Builds a decomposition table from a UnicodeData file.
/// </summary>
/// <remarks>
/// Reads the resource <c>unicode.UnicodeData.txt</c> line by line. For each
/// record it stores the canonical class, the decomposition (if any), whether
/// that decomposition is a compatibility one, and, for canonical
/// decompositions that are not excluded, the reverse pair-to-composite
/// mapping. Afterwards the algorithmic Hangul decompositions are added.
/// If the data file cannot be opened, an error is written to standard error
/// and the process exits with code 1.
/// </remarks>
/// <param name="canonicalClass">Receives code point to canonical class for every record read.</param>
/// <param name="decompose">Receives code point to decomposition string.</param>
/// <param name="compose">Receives packed (first, second) pair to composite code point.</param>
/// <param name="isCompatibility">Gets a bit set for every code point whose decomposition field starts with '&lt;'.</param>
/// <param name="isExcluded">Code points whose canonical decomposition must not be added to <paramref name="compose"/>.</param>
private static void BuildDecompositionTables(
    IntHashtable canonicalClass, IntStringHashtable decompose,
    LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
{
    if (DEBUG)
    {
        Console.Out.WriteLine("Reading Unicode Character Database");
    }

    //BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64*1024);
    TextReader @in = null;
    try
    {
        @in = TestUtil.GetDataReader("unicode.UnicodeData.txt");
    }
    catch (Exception)
    {
        Console.Error.WriteLine("Failed to read UnicodeData.txt");
        Environment.Exit(1);
    }

    int counter = 0;
    try
    {
        while (true)
        {
            // read a line, discarding comments and blank lines
            String line = @in.ReadLine();
            if (line == null)
            {
                break;
            }
            int comment = line.IndexOf('#'); // strip comments
            if (comment != -1)
            {
                line = line.Substring(0, comment);
            }
            if (line.Length == 0)
            {
                continue;
            }
            if (DEBUG)
            {
                counter++;
                if ((counter & 0xFF) == 0)
                {
                    Console.Out.WriteLine("At: " + line);
                }
            }

            // find the values of the particular fields that we need
            // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
            int start = 0;
            int end = line.IndexOf(';'); // code
            int value = int.Parse(line.Substring(start, end - start),
                NumberStyles.HexNumber, CultureInfo.InvariantCulture);

            end = line.IndexOf(';', start = end + 1); // name
            /*String name = line.substring(start,end);*/
            end = line.IndexOf(';', start = end + 1); // general category
            end = line.IndexOf(';', start = end + 1); // canonical class

            // check consistency: canonical classes must be from 0 to 255
            int cc = int.Parse(line.Substring(start, end - start), CultureInfo.InvariantCulture);
            if (cc != (cc & 0xFF))
            {
                Console.Error.WriteLine("Bad canonical class at: " + line);
            }
            canonicalClass.Put(value, cc);

            end = line.IndexOf(';', start = end + 1); // BIDI
            end = line.IndexOf(';', start = end + 1); // decomp

            // decomp requires more processing.
            // store whether it is canonical or compatibility.
            // store the decomp in one table, and the reverse mapping (from pairs) in another
            if (start == end)
            {
                // empty decomposition field: nothing more to record for this line
                continue;
            }

            String segment = line.Substring(start, end - start);
            bool compat = segment[0] == '<';
            if (compat)
            {
                isCompatibility.Set(value);
            }
            String decomp = fromHex(segment);

            // a small snippet of code to generate the Applet data
            //if (GENERATING) {
            //    if (value < 0xFF) {
            //        Console.Out.WriteLine(
            //            "\"\\u" + hex((char)value) + "\", "
            //            + "\"\\u" + hex(decomp, "\\u") + "\", "
            //            + (compat ? "\"K\"," : "\"\",")
            //            + "// " + name);
            //    }
            //}

            // check consistency: all canon decomps must be singles or pairs!
            int decompLen = UTF16Util.CountCodePoint(decomp);
            if (decompLen < 1 || (decompLen > 2 && !compat))
            {
                Console.Error.WriteLine("Bad decomp at: " + line);
            }
            decompose.Put(value, decomp);

            // only compositions are canonical pairs
            // skip if script exclusion
            if (!compat && !isExcluded.Get(value))
            {
                // A singleton decomposition is stored as the pair (U+0000, cp).
                int first = '\u0000';
                int second = UTF16Util.NextCodePoint(decomp, 0);
                if (decompLen > 1)
                {
                    first = second;
                    second = UTF16Util.NextCodePoint(decomp, UTF16Util.CodePointLength(first));
                }

                // store composition pair in a single 64-bit integer:
                // first in the high 32 bits, second in the low 32 bits
                long pair = ((long)first << 32) | (uint)second;
                if (DEBUG && value == '\u00C0')
                {
                    Console.Out.WriteLine("debug2: " + line);
                }
                compose.Put(pair, value);
            }
            else if (DEBUG)
            {
                Console.Out.WriteLine("Excluding: " + decomp);
            }
        }
    }
    finally
    {
        // Release the reader even when a malformed line makes parsing throw.
        if (@in != null)
        {
            @in.Dispose();
        }
    }

    if (DEBUG)
    {
        Console.Out.WriteLine("Done reading Unicode Character Database");
    }

    // add algorithmic Hangul decompositions
    // this is more compact if done at runtime, but for simplicity we
    // do it this way.
    if (DEBUG)
    {
        Console.Out.WriteLine("Adding Hangul");
    }
    AddHangulDecompositions(decompose, compose);
    if (DEBUG)
    {
        Console.Out.WriteLine("Done adding Hangul");
    }
}

/// <summary>
/// Adds one decomposition and one composition entry for each of the
/// <c>SCount</c> syllable indices, computed arithmetically from the Hangul
/// base/count constants rather than read from the data file.
/// </summary>
/// <param name="decompose">Receives syllable to two-character decomposition.</param>
/// <param name="compose">Receives packed (first, second) pair to syllable.</param>
private static void AddHangulDecompositions(IntStringHashtable decompose, LongHashtable compose)
{
    for (int sIndex = 0; sIndex < SCount; ++sIndex)
    {
        int tIndex = sIndex % TCount;
        char first, second;
        if (tIndex != 0)
        {
            // triple: decomposes into the syllable without its trailing part, plus that trailing part
            first = (char)(SBase + sIndex - tIndex);
            second = (char)(TBase + tIndex);
        }
        else
        {
            first = (char)(LBase + sIndex / NCount);
            second = (char)(VBase + (sIndex % NCount) / TCount);
        }

        long pair = ((long)first << 32) | second;
        int value = sIndex + SBase;
        decompose.Put(value, new string(new char[] { first, second }));
        compose.Put(pair, value);
    }
}