Example #1
0
        /**
         * Appends the fully (recursively) decomposed form of a character to a buffer,
         * using the decomposition mappings held by this object.
         * @param   canonical    If true, only canonical mappings are expanded: a character
         *                  whose mapping is flagged as a compatibility mapping is appended
         *                  unchanged. If false, both compatibility and canonical
         *                  mappings are expanded.
         * @param   ch      the source character (code point)
         * @param   buffer  buffer that receives the decomposition; code points are appended to it
         */
        public void GetRecursiveDecomposition(bool canonical, int ch, StringBuffer buffer)
        {
            string mapping = decompose.Get(ch);

            // Expand only when a mapping exists and it is of a kind the caller asked for.
            bool expand = mapping != null && (!canonical || !isCompatibility.SafeGet(ch));
            if (!expand)
            {
                // no usable decomposition: the character stands for itself
                UTF16Util.AppendCodePoint(buffer, ch);
                return;
            }

            // Walk the mapping one code point at a time; each piece may itself decompose further.
            int offset = 0;
            while (offset < mapping.Length)
            {
                int piece = UTF16Util.NextCodePoint(mapping, offset);
                GetRecursiveDecomposition(canonical, piece, buffer);
                offset += UTF16Util.CodePointLength(piece);
            }
        }
Example #2
0
        /**
         * Decomposes text, either canonically or by compatibility, and writes the result
         * into the target buffer with combining marks ordered by canonical class.
         * The kind of decomposition comes from this object's form field: if the
         * COMPATIBILITY_MASK bit is on, the recursive compatibility decomposition is
         * selected, otherwise the recursive canonical decomposition.
         * NOTE(review): target is never cleared here; new characters are inserted at its
         * end, or among its trailing combining marks when reordering requires it.
         * @param   source      the original text, unnormalized
         * @param   target      the buffer that receives the decomposed text
         */
        private void internalDecompose(String source, StringBuffer target)
        {
            bool         canonicalOnly = (form & COMPATIBILITY_MASK) == 0;
            StringBuffer scratch       = new StringBuffer();

            int srcPos = 0;
            while (srcPos < source.Length)
            {
                // decompose the next source code point into the scratch buffer
                scratch.Length = 0;
                int srcCh = UTF16Util.NextCodePoint(source, srcPos);
                srcPos += UTF16Util.CodePointLength(srcCh);
                data.GetRecursiveDecomposition(canonicalOnly, srcCh, scratch);

                // Move every code point of the decomposition into the target
                // (this is just the original character when it has no mapping).
                int scratchPos = 0;
                while (scratchPos < scratch.Length)
                {
                    int cp = UTF16Util.NextCodePoint(scratch, scratchPos);
                    scratchPos += UTF16Util.CodePointLength(cp);

                    int cpClass  = data.GetCanonicalClass(cp);
                    int insertAt = target.Length; // default: append at the end

                    if (cpClass != 0)
                    {
                        // Combining mark: step backwards over preceding characters whose
                        // canonical class is greater, so the mark lands in sorted position.
                        while (insertAt > 0)
                        {
                            int before = UTF16Util.PrevCodePoint(target, insertAt);
                            if (data.GetCanonicalClass(before) <= cpClass)
                            {
                                break;
                            }
                            insertAt -= UTF16Util.CodePointLength(before);
                        }
                    }
                    UTF16Util.InsertCodePoint(target, insertAt, cp);
                }
            }
        }
Example #3
0
        /**
         * Composes text in place. Target must already
         * have been decomposed.
         * An empty target is left untouched.
         * @param   target      input: decomposed text.
         *                      output: the resulting normalized text.
         */
        private void internalCompose(StringBuffer target)
        {
            // Nothing to compose. Without this guard the code below would read a
            // code point at index 0 of an empty buffer.
            if (target.Length == 0)
            {
                return;
            }

            int starterPos = 0;                                        // offset of the current starter in target
            int starterCh  = UTF16Util.NextCodePoint(target, 0);       // current starter (possibly already composed)
            int compPos    = UTF16Util.CodePointLength(starterCh);     // write offset: end of the composed output so far
            int lastClass  = data.GetCanonicalClass(starterCh);        // class of the last character kept uncomposed

            if (lastClass != 0)
            {
                // fix for irregular combining sequence: the text starts with a non-starter.
                // 256 is not below any class in 0..255, so nothing composes with it
                // until a real starter (class 0) is seen.
                lastClass = 256;
            }

            // Loop on the decomposed characters, combining where possible.
            // decompPos is the read offset; it always stays at or ahead of compPos.

            for (int decompPos = UTF16Util.CodePointLength(starterCh); decompPos < target.Length;)
            {
                int ch = UTF16Util.NextCodePoint(target, decompPos);
                decompPos += UTF16Util.CodePointLength(ch);
                int chClass   = data.GetCanonicalClass(ch);
                int composite = data.GetPairwiseComposition(starterCh, ch);
                if (composite != NormalizerData.NOT_COMPOSITE &&
                    (lastClass < chClass || lastClass == 0))
                {
                    // ch combines with the starter and is not blocked by an intervening
                    // mark of equal or higher class: fold it into the starter.
                    UTF16Util.SetCodePointAt(target, starterPos, composite);
                    starterCh = composite;
                }
                else
                {
                    // ch is kept as-is; a class-0 character becomes the new starter.
                    if (chClass == 0)
                    {
                        starterPos = compPos;
                        starterCh  = ch;
                    }
                    lastClass  = chClass;
                    // NOTE(review): the value returned by SetCodePointAt is added to the read
                    // offset; presumably it is the change in buffer length caused by the
                    // replacement - confirm against UTF16Util.
                    decompPos += UTF16Util.SetCodePointAt(target, compPos, ch);
                    compPos   += UTF16Util.CodePointLength(ch);
                }
            }
            // drop the leftover tail behind the composed output
            target.Length = (compPos);
        }
Example #4
0
            SCount = LCount * NCount;   // 11172

        /**
         * For use in an applet: just load a minimal set of data.
         * Fills the tables from two built-in lists: decomposition mappings (Latin-1 plus
         * a few extras) and canonical classes of combining marks.
         * @param   canonicalClass   receives code point -> canonical class
         * @param   decompose        receives code point -> decomposition string
         * @param   compose          receives (first << 32 | second) -> composite, for canonical mappings only
         * @param   isCompatibility  receives the code points whose mapping is marked "K" (compatibility)
         * @param   isExcluded       not used by this method
         */
        private static void SetMinimalDecomp(IntHashtable canonicalClass, IntStringHashtable decompose,
                                             LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
        {
            // Triples: character, its decomposition, and "K" for a compatibility
            // mapping or "" for a canonical one.
            String[] decomposeData =
            {
                "\u005E", "\u0020\u0302",       "K",
                "\u005F", "\u0020\u0332",       "K",
                "\u0060", "\u0020\u0300",       "K",
                "\u00A0", "\u0020",             "K",
                "\u00A8", "\u0020\u0308",       "K",
                "\u00AA", "\u0061",             "K",
                "\u00AF", "\u0020\u0304",       "K",
                "\u00B2", "\u0032",             "K",
                "\u00B3", "\u0033",             "K",
                "\u00B4", "\u0020\u0301",       "K",
                "\u00B5", "\u03BC",             "K",
                "\u00B8", "\u0020\u0327",       "K",
                "\u00B9", "\u0031",             "K",
                "\u00BA", "\u006F",             "K",
                "\u00BC", "\u0031\u2044\u0034", "K",
                "\u00BD", "\u0031\u2044\u0032", "K",
                "\u00BE", "\u0033\u2044\u0034", "K",
                "\u00C0", "\u0041\u0300",       "",
                "\u00C1", "\u0041\u0301",       "",
                "\u00C2", "\u0041\u0302",       "",
                "\u00C3", "\u0041\u0303",       "",
                "\u00C4", "\u0041\u0308",       "",
                "\u00C5", "\u0041\u030A",       "",
                "\u00C7", "\u0043\u0327",       "",
                "\u00C8", "\u0045\u0300",       "",
                "\u00C9", "\u0045\u0301",       "",
                "\u00CA", "\u0045\u0302",       "",
                "\u00CB", "\u0045\u0308",       "",
                "\u00CC", "\u0049\u0300",       "",
                "\u00CD", "\u0049\u0301",       "",
                "\u00CE", "\u0049\u0302",       "",
                "\u00CF", "\u0049\u0308",       "",
                "\u00D1", "\u004E\u0303",       "",
                "\u00D2", "\u004F\u0300",       "",
                "\u00D3", "\u004F\u0301",       "",
                "\u00D4", "\u004F\u0302",       "",
                "\u00D5", "\u004F\u0303",       "",
                "\u00D6", "\u004F\u0308",       "",
                "\u00D9", "\u0055\u0300",       "",
                "\u00DA", "\u0055\u0301",       "",
                "\u00DB", "\u0055\u0302",       "",
                "\u00DC", "\u0055\u0308",       "",
                "\u00DD", "\u0059\u0301",       "",
                "\u00E0", "\u0061\u0300",       "",
                "\u00E1", "\u0061\u0301",       "",
                "\u00E2", "\u0061\u0302",       "",
                "\u00E3", "\u0061\u0303",       "",
                "\u00E4", "\u0061\u0308",       "",
                "\u00E5", "\u0061\u030A",       "",
                "\u00E7", "\u0063\u0327",       "",
                "\u00E8", "\u0065\u0300",       "",
                "\u00E9", "\u0065\u0301",       "",
                "\u00EA", "\u0065\u0302",       "",
                "\u00EB", "\u0065\u0308",       "",
                "\u00EC", "\u0069\u0300",       "",
                "\u00ED", "\u0069\u0301",       "",
                "\u00EE", "\u0069\u0302",       "",
                "\u00EF", "\u0069\u0308",       "",
                "\u00F1", "\u006E\u0303",       "",
                "\u00F2", "\u006F\u0300",       "",
                "\u00F3", "\u006F\u0301",       "",
                "\u00F4", "\u006F\u0302",       "",
                "\u00F5", "\u006F\u0303",       "",
                "\u00F6", "\u006F\u0308",       "",
                "\u00F9", "\u0075\u0300",       "",
                "\u00FA", "\u0075\u0301",       "",
                "\u00FB", "\u0075\u0302",       "",
                "\u00FC", "\u0075\u0308",       "",
                "\u00FD", "\u0079\u0301",       "",
// EXTRAS, outside of Latin 1
                "\u1EA4", "\u00C2\u0301",       "",
                "\u1EA5", "\u00E2\u0301",       "",
                "\u1EA6", "\u00C2\u0300",       "",
                "\u1EA7", "\u00E2\u0300",       "",
            };

            // Pairs: code point, canonical class.
            int[] classData =
            {
                0x0300, 230,
                0x0301, 230,
                0x0302, 230,
                0x0303, 230,
                0x0304, 230,
                0x0305, 230,
                0x0306, 230,
                0x0307, 230,
                0x0308, 230,
                0x0309, 230,
                0x030A, 230,
                0x030B, 230,
                0x030C, 230,
                0x030D, 230,
                0x030E, 230,
                0x030F, 230,
                0x0310, 230,
                0x0311, 230,
                0x0312, 230,
                0x0313, 230,
                0x0314, 230,
                0x0315, 232,
                0x0316, 220,
                0x0317, 220,
                0x0318, 220,
                0x0319, 220,
                0x031A, 232,
                0x031B, 216,
                0x031C, 220,
                0x031D, 220,
                0x031E, 220,
                0x031F, 220,
                0x0320, 220,
                0x0321, 202,
                0x0322, 202,
                0x0323, 220,
                0x0324, 220,
                0x0325, 220,
                0x0326, 220,
                0x0327, 202,
                0x0328, 202,
                0x0329, 220,
                0x032A, 220,
                0x032B, 220,
                0x032C, 220,
                0x032D, 220,
                0x032E, 220,
                0x032F, 220,
                0x0330, 220,
                0x0331, 220,
                0x0332, 220,
                0x0333, 220,
                0x0334,   1,
                0x0335,   1,
                0x0336,   1,
                0x0337,   1,
                0x0338,   1,
                0x0339, 220,
                0x033A, 220,
                0x033B, 220,
                0x033C, 220,
                0x033D, 230,
                0x033E, 230,
                0x033F, 230,
                0x0340, 230,
                0x0341, 230,
                0x0342, 230,
                0x0343, 230,
                0x0344, 230,
                0x0345, 240,
                0x0360, 234,
                0x0361, 234
            };

            // build the same tables we would otherwise get from the
            // Unicode Character Database, just with limited data

            for (int i = 0; i < decomposeData.Length; i += 3)
            {
                char   value  = decomposeData[i][0];
                String decomp = decomposeData[i + 1];
                bool   compat = decomposeData[i + 2].Equals("K");
                if (compat)
                {
                    isCompatibility.Set(value);
                }
                decompose.Put(value, decomp);
                if (!compat)
                {
                    // canonical mapping: also record the reverse (pair -> composite);
                    // a single-character mapping is stored with first == 0
                    int first  = '\u0000';
                    int second = UTF16Util.NextCodePoint(decomp, 0);
                    if (decomp.Length > 1)
                    {
                        first  = second;
                        second = UTF16Util.NextCodePoint(decomp,
                                                         UTF16Util.CodePointLength(first));
                    }
                    // Pack the pair the same way BuildDecompositionTables does (first in the
                    // upper 32 bits, second in the lower 32), in 64-bit arithmetic, so both
                    // loaders produce compatible keys for the same compose table.
                    long pair = ((long)first << 32) | (uint)second;
                    compose.Put(pair, value);
                }
            }

            for (int i = 0; i < classData.Length; i += 2)
            {
                canonicalClass.Put(classData[i], classData[i + 1]);
            }
        }
Example #5
0
        /**
         * Builds the normalization tables from the UnicodeData.txt resource and then adds
         * the algorithmic Hangul syllable decompositions.
         * For every data line the canonical class is recorded. When the line has a
         * decomposition field, the mapping is stored in decompose, a mapping that starts
         * with '<' is flagged in isCompatibility, and a canonical mapping that is not
         * excluded is also stored in reverse in compose.
         * If the data reader cannot be opened, the failure is reported on standard error
         * and the process exits with status 1.
         * @param   canonicalClass   receives code point -> canonical class
         * @param   decompose        receives code point -> decomposition string
         * @param   compose          receives (first << 32 | second) -> composite code point
         * @param   isCompatibility  receives the code points that have a compatibility mapping
         * @param   isExcluded       code points that must not get an entry in compose (only read here)
         */
        private static void BuildDecompositionTables(
            IntHashtable canonicalClass, IntStringHashtable decompose,
            LongHashtable compose, BitSet isCompatibility, BitSet isExcluded)
        {
            if (DEBUG)
            {
                Console.Out.WriteLine("Reading Unicode Character Database");
            }
            TextReader @in = null;

            try
            {
                @in = TestUtil.GetDataReader("unicode.UnicodeData.txt");
            }
            catch (Exception e)
            {
                Console.Error.WriteLine("Failed to read UnicodeData.txt");
                Console.Error.WriteLine(e); // keep the cause visible before exiting
                Environment.Exit(1);
            }

            int  value;
            long pair;
            int  counter = 0;

            try
            {
                while (true)
                {
                    // read a line, discarding comments and blank lines

                    String line = @in.ReadLine();
                    if (line == null)
                    {
                        break;
                    }
                    int comment = line.IndexOf('#');                    // strip comments
                    if (comment != -1)
                    {
                        line = line.Substring(0, comment - 0);                // ICU4N: Checked 2nd substring parameter
                    }
                    if (line.Length == 0)
                    {
                        continue;
                    }
                    if (DEBUG)
                    {
                        // progress output every 256 data lines
                        counter++;
                        if ((counter & 0xFF) == 0)
                        {
                            Console.Out.WriteLine("At: " + line);
                        }
                    }

                    // find the values of the particular fields that we need
                    // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN ... GRAVE;;;00E0;
                    // start/end always delimit the field most recently located.

                    int start = 0;
                    int end   = line.IndexOf(';');                                                                               // code
                    value = int.Parse(line.Substring(start, end - start), NumberStyles.HexNumber, CultureInfo.InvariantCulture); // ICU4N: Corrected 2nd substring parameter
                    end = line.IndexOf(';', start = end + 1); // name (not used)
                    end = line.IndexOf(';', start = end + 1); // general category
                    end = line.IndexOf(';', start = end + 1); // canonical class

                    // check consistency: canonical classes must be from 0 to 255

                    int cc = int.Parse(line.Substring(start, end - start), CultureInfo.InvariantCulture); // ICU4N: Corrected 2nd substring parameter
                    if (cc != (cc & 0xFF))
                    {
                        Console.Error.WriteLine("Bad canonical class at: " + line);
                    }
                    canonicalClass.Put(value, cc);
                    end = line.IndexOf(';', start = end + 1); // BIDI
                    end = line.IndexOf(';', start = end + 1); // decomp

                    // decomp requires more processing.
                    // store whether it is canonical or compatibility.
                    // store the decomp in one table, and the reverse mapping (from pairs) in another

                    if (start != end)
                    {
                        String segment = line.Substring(start, end - start); // ICU4N: Corrected 2nd parameter
                        bool   compat  = segment[0] == '<';
                        if (compat)
                        {
                            isCompatibility.Set(value);
                        }
                        String decomp = fromHex(segment);

                        // A small snippet of code to generate the applet data (kept for
                        // reference; it needs the name field, which is not extracted above):
                        //
                        // if (GENERATING) {
                        //     if (value < 0xFF) {
                        //         Console.Out.WriteLine(
                        //             "\"\\u" + hex((char)value) + "\", "
                        //             + "\"\\u" + hex(decomp, "\\u") + "\", "
                        //             + (compat ? "\"K\"," : "\"\",")
                        //             + "// " + name);
                        //     }
                        // }

                        // check consistency: all canon decomps must be singles or pairs!
                        int decompLen = UTF16Util.CountCodePoint(decomp);
                        if (decompLen < 1 || (decompLen > 2 && !compat))
                        {
                            Console.Error.WriteLine("Bad decomp at: " + line);
                        }
                        decompose.Put(value, decomp);

                        // only compositions are canonical pairs
                        // skip if script exclusion

                        if (!compat && !isExcluded.Get(value))
                        {
                            // a single-character mapping is stored with first == 0
                            int first  = '\u0000';
                            int second = UTF16Util.NextCodePoint(decomp, 0);
                            if (decompLen > 1)
                            {
                                first  = second;
                                second = UTF16Util.NextCodePoint(decomp,
                                                                 UTF16Util.CodePointLength(first));
                            }

                            // store composition pair in a single 64-bit integer

                            pair = ((long)first << 32) | (uint)second;
                            if (DEBUG && value == '\u00C0')
                            {
                                Console.Out.WriteLine("debug2: " + line);
                            }
                            compose.Put(pair, value);
                        }
                        else if (DEBUG)
                        {
                            Console.Out.WriteLine("Excluding: " + decomp);
                        }
                    }
                }
            }
            finally
            {
                // release the reader even when a malformed line makes parsing throw
                if (@in != null)
                {
                    @in.Dispose();
                }
            }
            if (DEBUG)
            {
                Console.Out.WriteLine("Done reading Unicode Character Database");
            }

            // add algorithmic Hangul decompositions
            // this is more compact if done at runtime, but for simplicity we
            // do it this way.

            if (DEBUG)
            {
                Console.Out.WriteLine("Adding Hangul");
            }

            for (int SIndex = 0; SIndex < SCount; ++SIndex)
            {
                int  TIndex = SIndex % TCount;
                char first, second;
                if (TIndex != 0)
                { // triple: the syllable without its trailing part, then the trailing part
                    first  = (char)(SBase + SIndex - TIndex);
                    second = (char)(TBase + TIndex);
                }
                else
                { // pair: leading part, then vowel part
                    first  = (char)(LBase + SIndex / NCount);
                    second = (char)(VBase + (SIndex % NCount) / TCount);
                }
                pair  = ((long)first << 32) | second;
                value = SIndex + SBase;
                decompose.Put(value, Convert.ToString(first, CultureInfo.InvariantCulture) + second);
                compose.Put(pair, value);
            }
            if (DEBUG)
            {
                Console.Out.WriteLine("Done adding Hangul");
            }
        }