/**
 * Gets the recursive decomposition of a character from the
 * Unicode Character Database.
 * @param canonical If true, only the recursive canonical
 * decomposition is selected; otherwise the recursive
 * compatibility and canonical decomposition is selected.
 * @param ch the source character
 * @param buffer buffer to be filled with the decomposition
 */
public void GetRecursiveDecomposition(bool canonical, int ch, StringBuffer buffer)
{
    string decomp = decompose.Get(ch);
    // Stop recursing when there is no mapping, or when the mapping is a
    // compatibility decomposition but only canonical ones were requested.
    if (decomp == null || (canonical && isCompatibility.SafeGet(ch)))
    {
        UTF16Util.AppendCodePoint(buffer, ch);
        return;
    }
    int pos = 0;
    while (pos < decomp.Length)
    {
        int cp = UTF16Util.NextCodePoint(decomp, pos);
        pos += UTF16Util.CodePointLength(cp);
        GetRecursiveDecomposition(canonical, cp, buffer);
    }
}
/**
 * Decomposes text, either canonically or for compatibility,
 * replacing the contents of the target buffer.
 * The decomposition kind comes from the 'form' field: if the
 * COMPATIBILITY_MASK bit is set there, the recursive
 * compatibility decomposition is selected, otherwise the
 * recursive canonical decomposition.
 * @param source the original text, unnormalized
 * @param target the resulting normalized text
 */
private void internalDecompose(String source, StringBuffer target)
{
    StringBuffer buffer = new StringBuffer();
    // canonical-only when the compatibility bit of 'form' is clear
    bool canonical = (form & COMPATIBILITY_MASK) == 0;
    int ch;
    for (int i = 0; i < source.Length;)
    {
        // reuse the scratch buffer for each source code point
        buffer.Length = (0);
        ch = UTF16Util.NextCodePoint(source, i);
        i += UTF16Util.CodePointLength(ch);
        data.GetRecursiveDecomposition(canonical, ch, buffer);

        // add all of the characters in the decomposition.
        // (may be just the original character, if there was
        // no decomposition mapping)
        for (int j = 0; j < buffer.Length;)
        {
            ch = UTF16Util.NextCodePoint(buffer, j);
            j += UTF16Util.CodePointLength(ch);
            int chClass = data.GetCanonicalClass(ch);
            int k = target.Length; // insertion point
            if (chClass != 0)
            {
                // bubble-sort combining marks as necessary:
                // walk backwards past marks with a strictly higher
                // canonical class so canonical ordering is preserved
                int ch2;
                for (; k > 0; k -= UTF16Util.CodePointLength(ch2))
                {
                    ch2 = UTF16Util.PrevCodePoint(target, k);
                    if (data.GetCanonicalClass(ch2) <= chClass)
                    {
                        break;
                    }
                }
            }
            UTF16Util.InsertCodePoint(target, k, ch);
        }
    }
}
/**
 * Composes text in place. Target must already
 * have been decomposed (and be in canonical order).
 * @param target input: decomposed text.
 * output: the resulting normalized text.
 */
private void internalCompose(StringBuffer target)
{
    int starterPos = 0; // position of the current starter in target
    int starterCh = UTF16Util.NextCodePoint(target, 0);
    int compPos = UTF16Util.CodePointLength(starterCh); // write position of composed output
    int lastClass = data.GetCanonicalClass(starterCh);
    if (lastClass != 0)
    {
        lastClass = 256; // fix for irregular combining sequence
    }

    // Loop on the decomposed characters, combining where possible
    for (int decompPos = UTF16Util.CodePointLength(starterCh); decompPos < target.Length;)
    {
        int ch = UTF16Util.NextCodePoint(target, decompPos);
        decompPos += UTF16Util.CodePointLength(ch);
        int chClass = data.GetCanonicalClass(ch);
        int composite = data.GetPairwiseComposition(starterCh, ch);
        if (composite != NormalizerData.NOT_COMPOSITE && (lastClass < chClass || lastClass == 0))
        {
            // ch is not blocked from the starter: replace the starter
            // with the composite and drop ch from the output
            UTF16Util.SetCodePointAt(target, starterPos, composite);
            starterCh = composite;
        }
        else
        {
            if (chClass == 0)
            {
                // ch is itself a starter; it becomes the new composition base
                starterPos = compPos;
                starterCh = ch;
            }
            lastClass = chClass;
            // NOTE(review): SetCodePointAt appears to return the change in
            // string length caused by the overwrite, used to keep decompPos
            // in sync — TODO confirm against UTF16Util
            decompPos += UTF16Util.SetCodePointAt(target, compPos, ch);
            compPos += UTF16Util.CodePointLength(ch);
        }
    }
    // truncate to the composed length
    target.Length = (compPos);
}
SCount = LCount * NCount; // 11172

/**
 * For use in an applet: just load a minimal set of data instead of
 * parsing the full Unicode Character Database. Fills the same tables
 * as BuildDecompositionTables, limited to a subset of Latin-1 plus a
 * few extra characters.
 * @param canonicalClass map: code point -> canonical combining class
 * @param decompose map: code point -> decomposition string
 * @param compose map: composition pair key -> composite code point
 * @param isCompatibility set of code points whose mapping is a compatibility decomposition
 * @param isExcluded composition exclusions (unused for this minimal data set)
 */
private static void SetMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
    LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
{
    // Triples: {source character, decomposition, "K" = compatibility / "" = canonical}
    String[] decomposeData = {
        "\u005E", "\u0020\u0302", "K",
        "\u005F", "\u0020\u0332", "K",
        "\u0060", "\u0020\u0300", "K",
        "\u00A0", "\u0020", "K",
        "\u00A8", "\u0020\u0308", "K",
        "\u00AA", "\u0061", "K",
        "\u00AF", "\u0020\u0304", "K",
        "\u00B2", "\u0032", "K",
        "\u00B3", "\u0033", "K",
        "\u00B4", "\u0020\u0301", "K",
        "\u00B5", "\u03BC", "K",
        "\u00B8", "\u0020\u0327", "K",
        "\u00B9", "\u0031", "K",
        "\u00BA", "\u006F", "K",
        "\u00BC", "\u0031\u2044\u0034", "K",
        "\u00BD", "\u0031\u2044\u0032", "K",
        "\u00BE", "\u0033\u2044\u0034", "K",
        "\u00C0", "\u0041\u0300", "",
        "\u00C1", "\u0041\u0301", "",
        "\u00C2", "\u0041\u0302", "",
        "\u00C3", "\u0041\u0303", "",
        "\u00C4", "\u0041\u0308", "",
        "\u00C5", "\u0041\u030A", "",
        "\u00C7", "\u0043\u0327", "",
        "\u00C8", "\u0045\u0300", "",
        "\u00C9", "\u0045\u0301", "",
        "\u00CA", "\u0045\u0302", "",
        "\u00CB", "\u0045\u0308", "",
        "\u00CC", "\u0049\u0300", "",
        "\u00CD", "\u0049\u0301", "",
        "\u00CE", "\u0049\u0302", "",
        "\u00CF", "\u0049\u0308", "",
        "\u00D1", "\u004E\u0303", "",
        "\u00D2", "\u004F\u0300", "",
        "\u00D3", "\u004F\u0301", "",
        "\u00D4", "\u004F\u0302", "",
        "\u00D5", "\u004F\u0303", "",
        "\u00D6", "\u004F\u0308", "",
        "\u00D9", "\u0055\u0300", "",
        "\u00DA", "\u0055\u0301", "",
        "\u00DB", "\u0055\u0302", "",
        "\u00DC", "\u0055\u0308", "",
        "\u00DD", "\u0059\u0301", "",
        "\u00E0", "\u0061\u0300", "",
        "\u00E1", "\u0061\u0301", "",
        "\u00E2", "\u0061\u0302", "",
        "\u00E3", "\u0061\u0303", "",
        "\u00E4", "\u0061\u0308", "",
        "\u00E5", "\u0061\u030A", "",
        "\u00E7", "\u0063\u0327", "",
        "\u00E8", "\u0065\u0300", "",
        "\u00E9", "\u0065\u0301", "",
        "\u00EA", "\u0065\u0302", "",
        "\u00EB", "\u0065\u0308", "",
        "\u00EC", "\u0069\u0300", "",
        "\u00ED", "\u0069\u0301", "",
        "\u00EE", "\u0069\u0302", "",
        "\u00EF", "\u0069\u0308", "",
        "\u00F1", "\u006E\u0303", "",
        "\u00F2", "\u006F\u0300", "",
        "\u00F3", "\u006F\u0301", "",
        "\u00F4", "\u006F\u0302", "",
        "\u00F5", "\u006F\u0303", "",
        "\u00F6", "\u006F\u0308", "",
        "\u00F9", "\u0075\u0300", "",
        "\u00FA", "\u0075\u0301", "",
        "\u00FB", "\u0075\u0302", "",
        "\u00FC", "\u0075\u0308", "",
        "\u00FD", "\u0079\u0301", "",
        // EXTRAS, outside of Latin 1
        "\u1EA4", "\u00C2\u0301", "",
        "\u1EA5", "\u00E2\u0301", "",
        "\u1EA6", "\u00C2\u0300", "",
        "\u1EA7", "\u00E2\u0300", "",
    };

    // Pairs: {code point, canonical combining class}
    int[] classData = {
        0x0300, 230, 0x0301, 230, 0x0302, 230, 0x0303, 230,
        0x0304, 230, 0x0305, 230, 0x0306, 230, 0x0307, 230,
        0x0308, 230, 0x0309, 230, 0x030A, 230, 0x030B, 230,
        0x030C, 230, 0x030D, 230, 0x030E, 230, 0x030F, 230,
        0x0310, 230, 0x0311, 230, 0x0312, 230, 0x0313, 230,
        0x0314, 230, 0x0315, 232, 0x0316, 220, 0x0317, 220,
        0x0318, 220, 0x0319, 220, 0x031A, 232, 0x031B, 216,
        0x031C, 220, 0x031D, 220, 0x031E, 220, 0x031F, 220,
        0x0320, 220, 0x0321, 202, 0x0322, 202, 0x0323, 220,
        0x0324, 220, 0x0325, 220, 0x0326, 220, 0x0327, 202,
        0x0328, 202, 0x0329, 220, 0x032A, 220, 0x032B, 220,
        0x032C, 220, 0x032D, 220, 0x032E, 220, 0x032F, 220,
        0x0330, 220, 0x0331, 220, 0x0332, 220, 0x0333, 220,
        0x0334, 1, 0x0335, 1, 0x0336, 1, 0x0337, 1,
        0x0338, 1, 0x0339, 220, 0x033A, 220, 0x033B, 220,
        0x033C, 220, 0x033D, 230, 0x033E, 230, 0x033F, 230,
        0x0340, 230, 0x0341, 230, 0x0342, 230, 0x0343, 230,
        0x0344, 230, 0x0345, 240, 0x0360, 234, 0x0361, 234
    };

    // build the same tables we would otherwise get from the
    // Unicode Character Database, just with limited data
    for (int i = 0; i < decomposeData.Length; i += 3)
    {
        char value = decomposeData[i][0];
        String decomp = decomposeData[i + 1];
        bool compat = decomposeData[i + 2].Equals("K");
        if (compat)
        {
            isCompatibility.Set(value);
        }
        decompose.Put(value, decomp);
        if (!compat)
        {
            int first = '\u0000';
            int second = UTF16Util.NextCodePoint(decomp, 0);
            if (decomp.Length > 1)
            {
                first = second;
                second = UTF16Util.NextCodePoint(decomp, UTF16Util.CodePointLength(first));
            }
            // BUG FIX: the composition key must use the same layout as
            // BuildDecompositionTables and the Hangul loop, i.e.
            // ((long)first << 32) | second. The previous
            // (first << 16) | second encoding produced keys that the
            // pairwise-composition lookup could never match.
            long pair = ((long)first << 32) | (uint)second;
            compose.Put(pair, value);
        }
    }
    for (int i = 0; i < classData.Length;)
    {
        canonicalClass.Put(classData[i++], classData[i++]);
    }
}
/**
 * Builds the decomposition tables from a UnicodeData file.
 * @param canonicalClass map: code point -> canonical combining class
 * @param decompose map: code point -> decomposition string
 * @param compose map: composition pair key ((long)first &lt;&lt; 32 | second) -> composite code point
 * @param isCompatibility set of code points whose mapping is a compatibility decomposition
 * @param isExcluded composition exclusions to skip when filling the compose table
 */
private static void BuildDecompositionTables(
    IntHashtable canonicalClass, IntStringHashtable decompose,
    LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
{
    if (DEBUG)
    {
        Console.Out.WriteLine("Reading Unicode Character Database");
    }
    TextReader @in = null;
    try
    {
        @in = TestUtil.GetDataReader("unicode.UnicodeData.txt");
    }
    catch (Exception)
    {
        Console.Error.WriteLine("Failed to read UnicodeData.txt");
        Environment.Exit(1);
    }
    int value;
    long pair;
    int counter = 0;
    // BUG FIX: ensure the reader is disposed even if parsing throws
    try
    {
        while (true)
        {
            // read a line, discarding comments and blank lines
            String line = @in.ReadLine();
            if (line == null)
            {
                break;
            }
            int comment = line.IndexOf('#'); // strip comments
            if (comment != -1)
            {
                line = line.Substring(0, comment);
            }
            if (line.Length == 0)
            {
                continue;
            }
            if (DEBUG)
            {
                counter++;
                if ((counter & 0xFF) == 0)
                {
                    Console.Out.WriteLine("At: " + line);
                }
            }

            // find the values of the particular fields that we need
            // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
            int start = 0;
            int end = line.IndexOf(';'); // code
            value = int.Parse(line.Substring(start, end - start), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
            // BUG FIX: debug probe was guarded by 'true &&' and thus always
            // evaluated; guard it with DEBUG like the 'debug2' probe below
            if (DEBUG && value == '\u00c0')
            {
                //Console.Out.WriteLine("debug: " + line);
            }
            end = line.IndexOf(';', start = end + 1); // name
            /*String name = line.substring(start,end);*/
            end = line.IndexOf(';', start = end + 1); // general category
            end = line.IndexOf(';', start = end + 1); // canonical class

            // check consistency: canonical classes must be from 0 to 255
            int cc = int.Parse(line.Substring(start, end - start), CultureInfo.InvariantCulture);
            if (cc != (cc & 0xFF))
            {
                Console.Error.WriteLine("Bad canonical class at: " + line);
            }
            canonicalClass.Put(value, cc);
            end = line.IndexOf(';', start = end + 1); // BIDI
            end = line.IndexOf(';', start = end + 1); // decomp

            // decomp requires more processing.
            // store whether it is canonical or compatibility.
            // store the decomp in one table, and the reverse mapping (from pairs) in another
            if (start != end)
            {
                String segment = line.Substring(start, end - start);
                bool compat = segment[0] == '<'; // e.g. "<compat> 0020 0302"
                if (compat)
                {
                    isCompatibility.Set(value);
                }
                String decomp = fromHex(segment);

                // check consistency: all canon decomps must be singles or pairs!
                int decompLen = UTF16Util.CountCodePoint(decomp);
                if (decompLen < 1 || decompLen > 2 && !compat)
                {
                    Console.Error.WriteLine("Bad decomp at: " + line);
                }
                decompose.Put(value, decomp);

                // only compositions are canonical pairs
                // skip if script exclusion
                if (!compat && !isExcluded.Get(value))
                {
                    int first = '\u0000';
                    int second = UTF16Util.NextCodePoint(decomp, 0);
                    if (decompLen > 1)
                    {
                        first = second;
                        second = UTF16Util.NextCodePoint(decomp, UTF16Util.CodePointLength(first));
                    }
                    // store composition pair in single integer
                    pair = ((long)first << 32) | (uint)second;
                    if (DEBUG && value == '\u00C0')
                    {
                        Console.Out.WriteLine("debug2: " + line);
                    }
                    compose.Put(pair, value);
                }
                else if (DEBUG)
                {
                    Console.Out.WriteLine("Excluding: " + decomp);
                }
            }
        }
    }
    finally
    {
        @in.Dispose();
    }
    if (DEBUG)
    {
        Console.Out.WriteLine("Done reading Unicode Character Database");
    }

    // add algorithmic Hangul decompositions
    // this is more compact if done at runtime, but for simplicity we
    // do it this way.
    if (DEBUG)
    {
        Console.Out.WriteLine("Adding Hangul");
    }
    for (int SIndex = 0; SIndex < SCount; ++SIndex)
    {
        int TIndex = SIndex % TCount;
        char first, second;
        if (TIndex != 0)
        {
            // triple: (LV syllable, trailing consonant)
            first = (char)(SBase + SIndex - TIndex);
            second = (char)(TBase + TIndex);
        }
        else
        {
            // pair: (leading consonant, vowel)
            first = (char)(LBase + SIndex / NCount);
            second = (char)(VBase + (SIndex % NCount) / TCount);
        }
        pair = ((long)first << 32) | second;
        value = SIndex + SBase;
        decompose.Put(value, Convert.ToString(first, CultureInfo.InvariantCulture) + second);
        compose.Put(pair, value);
    }
    if (DEBUG)
    {
        Console.Out.WriteLine("Done adding Hangul");
    }
}