Пример #1
0
        /// <summary>
        /// Tests whether the current position is followed by U+0307 (combining dot above)
        /// with no cc==230 character in between.
        /// </summary>
        /// <param name="iter">Context iterator; scanned in direction 1. May be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> if U+0307 is reached before any character whose dot type is not
        /// <c>OTHER_ACCENT</c>; <c>false</c> otherwise, including when <paramref name="iter"/> is <c>null</c>.
        /// </returns>
        private bool IsFollowedByDotAbove(IContextIterator iter)
        {
            if (iter == null)
            {
                return false;
            }

            iter.Reset(1);
            for (int codePoint = iter.Next(); codePoint >= 0; codePoint = iter.Next())
            {
                if (codePoint == 0x307)
                {
                    return true;
                }

                // Anything that is not an "other accent" ends the search:
                // it is either the next base character or a cc==230 character in between.
                if (GetDotType(codePoint) != OTHER_ACCENT)
                {
                    return false;
                }
            }

            // Iterator exhausted without finding a dot above.
            return false;
        }
Пример #2
0
        /// <summary>
        /// Tests whether the current position is followed by at least one cc==230 character.
        /// </summary>
        /// <param name="iter">Context iterator; scanned in direction 1. May be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> if a character with dot type <c>ABOVE</c> is found while skipping only
        /// <c>OTHER_ACCENT</c> characters; <c>false</c> otherwise, including when
        /// <paramref name="iter"/> is <c>null</c>.
        /// </returns>
        private bool IsFollowedByMoreAbove(IContextIterator iter)
        {
            if (iter == null)
            {
                return false;
            }

            iter.Reset(1);
            for (int codePoint = iter.Next(); codePoint >= 0; codePoint = iter.Next())
            {
                int kind = GetDotType(codePoint);

                if (kind == ABOVE)
                {
                    // At least one cc==230 follows.
                    return true;
                }

                if (kind != OTHER_ACCENT)
                {
                    // Reached the next base character: no more cc==230 following.
                    return false;
                }
            }

            // Iterator exhausted: no more cc==230 following.
            return false;
        }
Пример #3
0
        /*
         * See Jitterbug 2344:
         * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
         * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
         * we made those releases compatible with Unicode 3.2 which had not fixed
         * a related bug in SpecialCasing.txt.
         *
         * From the Jitterbug 2344 text:
         * ... this bug is listed as a Unicode erratum
         * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
         * <quote>
         * There are two errors in SpecialCasing.txt.
         * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
         * 2. An incorrect context definition. Correct as follows:
         * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
         * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
         * ---
         * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
         * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
         * where the context After_I is defined as:
         * The last preceding base character was an uppercase I, and there is no
         * intervening combining character class 230 (ABOVE).
         * </quote>
         *
         * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
         *
         * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
         * # This matches the behavior of the canonically equivalent I-dot_above
         *
         * See also the description in this place in older versions of uchar.c (revision 1.100).
         *
         * Markus W. Scherer 2003-feb-15
         */

        /// <summary>
        /// Tests whether the current position is preceded by the base character 'I' (U+0049)
        /// with no intervening cc==230 character.
        /// </summary>
        /// <param name="iter">Context iterator; scanned in direction -1. May be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> if U+0049 is reached while skipping only <c>OTHER_ACCENT</c> characters;
        /// <c>false</c> otherwise, including when <paramref name="iter"/> is <c>null</c>.
        /// </returns>
        private bool IsPrecededBy_I(IContextIterator iter)
        {
            if (iter == null)
            {
                return false;
            }

            iter.Reset(-1);
            for (int codePoint = iter.Next(); codePoint >= 0; codePoint = iter.Next())
            {
                if (codePoint == 0x49)
                {
                    // Preceded by I.
                    return true;
                }

                // A different base character (not I) or an intervening cc==230 ends the search.
                if (GetDotType(codePoint) != OTHER_ACCENT)
                {
                    return false;
                }
            }

            // Iterator exhausted: not preceded by I.
            return false;
        }
Пример #4
0
        /// <summary>
        /// Tests whether the current position is preceded by a Soft_Dotted character
        /// with no intervening cc==230 character.
        /// </summary>
        /// <param name="iter">Context iterator; scanned in direction -1. May be <c>null</c>.</param>
        /// <returns>
        /// <c>true</c> if a character with dot type <c>SOFT_DOTTED</c> is reached while skipping
        /// only <c>OTHER_ACCENT</c> characters; <c>false</c> otherwise, including when
        /// <paramref name="iter"/> is <c>null</c>.
        /// </returns>
        private bool IsPrecededBySoftDotted(IContextIterator iter)
        {
            if (iter == null)
            {
                return false;
            }

            iter.Reset(-1);
            for (int codePoint = iter.Next(); codePoint >= 0; codePoint = iter.Next())
            {
                int kind = GetDotType(codePoint);

                if (kind == SOFT_DOTTED)
                {
                    // Preceded by a soft-dotted character.
                    return true;
                }

                if (kind != OTHER_ACCENT)
                {
                    // A different base character (not soft-dotted) or an intervening cc==230.
                    return false;
                }
            }

            // Iterator exhausted: not preceded by a soft-dotted character.
            return false;
        }
Пример #5
0
        /// <summary>
        /// Tests for the pattern {case-ignorable}* cased, looking in the direction given by
        /// <paramref name="dir"/>.
        /// </summary>
        /// <param name="iter">Context iterator to scan. May be <c>null</c>.</param>
        /// <param name="dir">Direction passed to the iterator's <c>Reset</c>; callers in this file use 1 for "followed by" and -1 for "preceded by".</param>
        /// <returns>
        /// <c>true</c> if the first character that is not case-ignorable has a type other than
        /// <c>NONE</c>; <c>false</c> otherwise, including when <paramref name="iter"/> is <c>null</c>.
        /// </returns>
        private bool IsFollowedByCasedLetter(IContextIterator iter, int dir)
        {
            if (iter == null)
            {
                return false;
            }

            iter.Reset(dir);
            for (int codePoint = iter.Next(); codePoint >= 0; codePoint = iter.Next())
            {
                int type = GetTypeOrIgnorable(codePoint);

                // Bit value 4 marks a case-ignorable character; those are skipped.
                bool caseIgnorable = (type & 4) != 0;
                if (!caseIgnorable)
                {
                    // First character that is not case-ignorable decides the answer:
                    // cased (type != NONE) => true, uncased => false.
                    return type != NONE;
                }
            }

            // Iterator exhausted: no cased letter found.
            return false;
        }
Пример #6
0
 /// <summary>
 /// Gets the full titlecase mapping for <paramref name="c"/>.
 /// Delegates to <c>ToUpperOrTitle</c> with the upper-not-title flag set to <c>false</c>.
 /// </summary>
 /// <param name="c">Character to be mapped.</param>
 /// <param name="iter">Context iterator, forwarded unchanged.</param>
 /// <param name="output">Receives the mapping result when it is a string.</param>
 /// <param name="caseLocale">Case locale value, forwarded unchanged.</param>
 /// <returns>The value returned by <c>ToUpperOrTitle</c>.</returns>
 internal int ToFullTitle(int c, IContextIterator iter,
                          IAppendable output,
                          int caseLocale)
 {
     // false => titlecase rather than uppercase
     return ToUpperOrTitle(c, iter, output, caseLocale, false);
 }
Пример #7
0
 /// <summary>
 /// Gets the full titlecase mapping for <paramref name="c"/>.
 /// Delegates to <c>ToUpperOrTitle</c> with the upper-not-title flag set to <c>false</c>.
 /// </summary>
 /// <param name="c">Character to be mapped.</param>
 /// <param name="iter">Context iterator, forwarded unchanged.</param>
 /// <param name="output">Receives the mapping result when it is a string.</param>
 /// <param name="caseLocale">Case locale value, forwarded unchanged.</param>
 /// <returns>The value returned by <c>ToUpperOrTitle</c>.</returns>
 public int ToFullTitle(int c, IContextIterator iter,
                        StringBuilder output,
                        int caseLocale)
 {
     // false => titlecase rather than uppercase
     return ToUpperOrTitle(c, iter, output, caseLocale, false);
 }
Пример #8
0
        /// <summary>
        /// Shared implementation of the full uppercase and full titlecase mappings
        /// for <paramref name="c"/>.
        /// </summary>
        /// <param name="c">Code point to be mapped.</param>
        /// <param name="iter">Context iterator; only consulted for the Lithuanian After_Soft_Dotted condition.</param>
        /// <param name="output">Receives the mapping result when it is a string.</param>
        /// <param name="loc">Case locale value; compared against <c>LOC_TURKISH</c> and <c>LOC_LITHUANIAN</c>.</param>
        /// <param name="upperNotTitle"><c>true</c> selects the uppercase mapping, <c>false</c> the titlecase mapping.</param>
        /// <returns>
        /// <c>~c</c> if the mapping leaves <paramref name="c"/> unchanged; the mapped code point for a
        /// single-code-point result; the length of the string appended to <paramref name="output"/>
        /// for a string result; or 0 when the character is removed.
        /// </returns>
        private int ToUpperOrTitle(int c, IContextIterator iter,
                                   IAppendable output,
                                   int loc,
                                   bool upperNotTitle)
        {
            int props = trie.Get(c);

            if (!PropsHasException(props))
            {
                // Simple case: only characters of type LOWER carry a delta to apply.
                int simple = c;
                if (GetTypeFromProps(props) == LOWER)
                {
                    simple += GetDelta(props);
                }
                return (simple == c) ? ~simple : simple;
            }

            int cursor = GetExceptionsOffset(props);
            int excWord = exceptions[cursor++];

            // Position right after the exception word; simple slot lookups start here.
            int slotBase = cursor;

            if ((excWord & EXC_CONDITIONAL_SPECIAL) != 0)
            {
                /* use hardcoded conditions and mappings */
                if (loc == LOC_TURKISH && c == 0x69)
                {
                    /*
                     # Turkish and Azeri
                     #
                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                     # The following rules handle those cases.
                     #
                     # When uppercasing, i turns into a dotted capital I
                     #
                     #  0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
                     #  0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
                     */
                    return 0x130;
                }

                if (loc == LOC_LITHUANIAN && c == 0x307 && IsPrecededBySoftDotted(iter))
                {
                    /*
                     # Lithuanian
                     #
                     # Lithuanian retains the dot in a lowercase i when followed by accents.
                     #
                     # Remove DOT ABOVE after "i" with upper or titlecase
                     #
                     #  0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
                     */
                    return 0; /* remove the dot (continue without output) */
                }

                /* no known conditional special case mapping, fall through to a normal mapping */
            }
            else if (HasSlot(excWord, EXC_FULL_MAPPINGS))
            {
                long slot = GetSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, cursor);

                // Low 16 bits: four 4-bit string lengths, in the order
                // lowercase, case-folding, uppercase, titlecase.
                int lengths = (int)slot & 0xffff;

                /* start of full case mapping strings */
                cursor = (int)(slot >> 32) + 1;

                /* skip the lowercase and case-folding result strings */
                cursor += lengths & FULL_LOWER;
                cursor += (lengths >> 4) & 0xf;

                int upperLength = (lengths >> 8) & 0xf;
                int count;
                if (upperNotTitle)
                {
                    count = upperLength;
                }
                else
                {
                    /* skip the uppercase result string */
                    cursor += upperLength;
                    count = (lengths >> 12) & 0xf;
                }

                if (count != 0)
                {
                    try
                    {
                        // append the result string
                        // NOTE(review): the third argument is cursor + count here (an end index),
                        // whereas ToFullLower passes a count to StringBuilder.Append -- confirm
                        // that IAppendable.Append expects (start, end).
                        output.Append(exceptions, cursor, cursor + count);

                        /* return the string length */
                        return count;
                    }
                    catch (IOException e)
                    {
                        throw new ICUUncheckedIOException(e);
                    }
                }
            }

            // No string result: use a simple slot. Prefer the title slot for titlecasing,
            // otherwise the upper slot; with neither there is no mapping.
            int slotIndex;
            if (!upperNotTitle && HasSlot(excWord, EXC_TITLE))
            {
                slotIndex = EXC_TITLE;
            }
            else if (HasSlot(excWord, EXC_UPPER))
            {
                /* here, titlecase is same as uppercase */
                slotIndex = EXC_UPPER;
            }
            else
            {
                return ~c;
            }

            int mapped = GetSlotValue(excWord, slotIndex, slotBase);
            return (mapped == c) ? ~mapped : mapped;
        }
Пример #9
0
        /// <summary>
        /// Get the full lowercase mapping for <paramref name="c"/>.
        /// </summary>
        /// <remarks>
        /// Hardcoded conditional mappings (Lithuanian, Turkish/Azeri, U+0130, final sigma) are tested first,
        /// then the full-mapping string stored in the exceptions data, then the simple lowercase slot.
        /// </remarks>
        /// <param name="c">Character to be mapped.</param>
        /// <param name="iter">
        /// Character iterator, used for context-sensitive mappings.
        /// See <see cref="IContextIterator"/> for details.
        /// If iter==null then a context-independent result is returned.
        /// </param>
        /// <param name="output">If the mapping result is a string, then it is appended to <paramref name="output"/>.</param>
        /// <param name="caseLocale">Case locale value from ucase_getCaseLocale().</param>
        /// <returns>
        /// Output code point or string length, see <see cref="MAX_STRING_LENGTH"/>.
        /// If the mapping leaves <paramref name="c"/> unchanged, the bitwise complement <c>~c</c> is returned;
        /// 0 is returned when the character is removed.
        /// </returns>
        /// <seealso cref="IContextIterator"/>
        /// <seealso cref="MAX_STRING_LENGTH"/>
        /// <internal/>
        public int ToFullLower(int c, IContextIterator iter, StringBuilder output, int caseLocale)
        {
            int result, props;

            // Start from "no change"; the final return turns an unchanged result into ~c.
            result = c;
            props  = trie.Get(c);
            if (!PropsHasException(props))
            {
                // Simple case: apply the delta only for types at or above UPPER.
                if (GetTypeFromProps(props) >= UPPER)
                {
                    result = c + GetDelta(props);
                }
            }
            else
            {
                int excOffset = GetExceptionsOffset(props), excOffset2;
                int excWord = exceptions[excOffset++];
                int full;

                // Remember the position right after the exception word: excOffset may be
                // overwritten below, but the simple-slot lookup at the end starts from here.
                excOffset2 = excOffset;

                if ((excWord & EXC_CONDITIONAL_SPECIAL) != 0)
                {
                    /* use hardcoded conditions and mappings */

                    /*
                     * Test for conditional mappings first
                     *   (otherwise the unconditional default mappings are always taken),
                     * then test for characters that have unconditional mappings in SpecialCasing.txt,
                     * then get the UnicodeData.txt mappings.
                     */
                    if (caseLocale == LOC_LITHUANIAN &&
                        /* base characters, find accents above */
                        (((c == 0x49 || c == 0x4a || c == 0x12e) &&
                          IsFollowedByMoreAbove(iter)) ||
                         /* precomposed with accent above, no need to find one */
                         (c == 0xcc || c == 0xcd || c == 0x128))
                        )
                    {
                        /*
                         # Lithuanian
                         #
                         # Lithuanian retains the dot in a lowercase i when followed by accents.
                         #
                         # Introduce an explicit dot above when lowercasing capital I's and J's
                         # whenever there are more accents above.
                         # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
                         #
                         #  0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                         #  004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                         #  012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                         #  00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                         #  00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                         #  0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
                         */
                        try
                        {
                            // Each case appends a predefined string and returns its length
                            // (2 or 3 chars, per the SpecialCasing.txt lines quoted above).
                            switch (c)
                            {
                            case 0x49:      /* LATIN CAPITAL LETTER I */
                                output.Append(iDot);
                                return(2);

                            case 0x4a:      /* LATIN CAPITAL LETTER J */
                                output.Append(jDot);
                                return(2);

                            case 0x12e:     /* LATIN CAPITAL LETTER I WITH OGONEK */
                                output.Append(iOgonekDot);
                                return(2);

                            case 0xcc:      /* LATIN CAPITAL LETTER I WITH GRAVE */
                                output.Append(iDotGrave);
                                return(3);

                            case 0xcd:      /* LATIN CAPITAL LETTER I WITH ACUTE */
                                output.Append(iDotAcute);
                                return(3);

                            case 0x128:     /* LATIN CAPITAL LETTER I WITH TILDE */
                                output.Append(iDotTilde);
                                return(3);

                            default:
                                return(0);    /* will not occur */
                            }
                        }
                        catch (IOException e)
                        {
                            throw new ICUUncheckedIOException(e);
                        }
                        /* # Turkish and Azeri */
                    }
                    else if (caseLocale == LOC_TURKISH && c == 0x130)
                    {
                        /*
                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                         # The following rules handle those cases.
                         #
                         #  0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                         #  0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                         */
                        return(0x69);
                    }
                    else if (caseLocale == LOC_TURKISH && c == 0x307 && IsPrecededBy_I(iter))
                    {
                        /*
                         # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                         # This matches the behavior of the canonically equivalent I-dot_above
                         #
                         #  0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                         #  0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                         */
                        return(0); /* remove the dot (continue without output) */
                    }
                    else if (caseLocale == LOC_TURKISH && c == 0x49 && !IsFollowedByDotAbove(iter))
                    {
                        /*
                         # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
                         #
                         #  0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                         #  0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                         */
                        return(0x131);
                    }
                    else if (c == 0x130)
                    {
                        /*
                         # Preserve canonical equivalence for I with dot. Turkic is handled below.
                         #
                         #  0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
                         */
                        // NOTE(review): in this method the Turkic U+0130 case is tested in an earlier
                        // branch (above), not below; the "handled below" wording comes from the quoted data file.
                        try
                        {
                            output.Append(iDot);
                            return(2);
                        }
                        catch (IOException e)
                        {
                            throw new ICUUncheckedIOException(e);
                        }
                    }
                    else if (c == 0x3a3 &&
                             !IsFollowedByCasedLetter(iter, 1) &&
                             IsFollowedByCasedLetter(iter, -1)  /* -1=preceded */
                             )
                    {
                        /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */

                        /*
                         # Special case for final form of sigma
                         #
                         #  03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
                         */
                        return(0x3c2); /* greek small final sigma */
                    }
                    else
                    {
                        /* no known conditional special case mapping, use a normal mapping */
                    }
                }
                else if (HasSlot(excWord, EXC_FULL_MAPPINGS))
                {
                    long value = GetSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
                    // The bits selected by FULL_LOWER give the length of the lowercase mapping string.
                    full = (int)value & FULL_LOWER;
                    if (full != 0)
                    {
                        /* start of full case mapping strings */
                        excOffset = (int)(value >> 32) + 1;

                        try
                        {
                            // append the lowercase mapping
                            output.Append(exceptions, excOffset, full); // ICU4N: (excOffset + full) - excOffset == full

                            /* return the string length */
                            return(full);
                        }
                        catch (IOException e)
                        {
                            throw new ICUUncheckedIOException(e);
                        }
                    }
                }

                // No string result was produced: fall back to the simple lowercase slot, if present.
                if (HasSlot(excWord, EXC_LOWER))
                {
                    result = GetSlotValue(excWord, EXC_LOWER, excOffset2);
                }
            }

            // An unchanged code point is reported as its bitwise complement.
            return((result == c) ? ~result : result);
        }