Beispiel #1
0
        // set of property starts for UnicodeSet ------------------------------- ***

        /// <summary>
        /// Adds to <paramref name="set"/> the code points at which the values stored in
        /// this object's trie, mirroring table and Joining_Group array may change.
        /// </summary>
        /// <param name="set">the set that receives the start code points</param>
        public void AddPropertyStarts(UnicodeSet set)
        {
            // Trie: the first code point of every same-value range.
            TrieIterator trieRanges = new TrieIterator(trie);
            RangeValueIterator_Constants.Element range = new RangeValueIterator_Constants.Element();
            while (trieRanges.Next(range))
            {
                set.Add(range.start);
            }

            // Bidi mirroring table: each entry's code point together with the one after it.
            int mirrorCount = indexes[IX_MIRROR_LENGTH];
            for (int m = 0; m < mirrorCount; ++m)
            {
                int mirrored = GetMirrorCodePoint(mirrors[m]);
                set.Add(mirrored, mirrored + 1);
            }

            // Joining_Group array: every code point whose value differs from its predecessor's.
            int codePoint = indexes[IX_JG_START];
            int jgLimit   = indexes[IX_JG_LIMIT];
            int jgCount   = jgLimit - codePoint;
            sbyte previous = 0;
            for (int k = 0; k < jgCount; ++k, ++codePoint)
            {
                sbyte current = jgArray[k];
                if (current != previous)
                {
                    set.Add(codePoint);
                    previous = current;
                }
            }

            // A non-zero last value ends at the limit, so the limit starts a new range too.
            if (previous != 0)
            {
                set.Add(jgLimit);
            }

            // No code points with hardcoded properties need to be added right now.
        }
 /// <summary>
 /// Copies everything the iterator yields (code point ranges and strings) into
 /// <paramref name="result"/>.
 /// </summary>
 /// <param name="source">iterator that is advanced range by range until exhausted</param>
 /// <param name="result">set that receives the ranges and strings</param>
 public static void AddAll(UnicodeSetIterator source, UnicodeSet result)
 {
     while (source.NextRange())
     {
         bool isString = source.codepoint == IBM.ICU.Text.UnicodeSetIterator.IS_STRING;
         if (!isString)
         {
             // Ordinary range of code points.
             result.Add(source.codepoint, source.codepointEnd);
             continue;
         }

         // The iterator is positioned on a string element.
         result.Add(source.str0);
     }
 }
Beispiel #3
0
        /// <summary>
        /// Temporary manual test driver: prints the case variants of a few sample strings,
        /// then computes and prints the set of code points whose only case variant is
        /// the code point itself, and compares that set with <c>[:^lc:]</c>.
        /// </summary>
        /// <param name="args">unused</param>
        public static void Main(String[] args)
        {
            String[]     samples  = { "fiss", "h\u03a3" };
            CaseIterator variants = new CaseIterator();

            foreach (String sample in samples)
            {
                System.Console.Out.WriteLine();
                System.Console.Out.WriteLine("Testing: " + toName.Transliterate(sample));
                System.Console.Out.WriteLine();
                variants.Reset(sample);

                int    printed = 0;
                String variant;
                while ((variant = variants.Next()) != null)
                {
                    System.Console.Out.WriteLine(toName.Transliterate(variant));
                    printed++;
                }
                System.Console.Out.WriteLine("Total: " + printed);
            }

            // Collect every code point for which the iterator yields exactly one
            // string and that string is the code point itself.
            UnicodeSet caseless = new UnicodeSet();

            for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint)
            {
                String asString = IBM.ICU.Text.UTF16.ValueOf(codePoint);
                variants.Reset(asString);

                int    seen = 0;
                String last = null;
                String next;
                while ((next = variants.Next()) != null)
                {
                    last = next;
                    if (++seen > 1)
                    {
                        // A second variant already disqualifies this code point.
                        break;
                    }
                }

                if (seen == 1 && last.Equals(asString))
                {
                    caseless.Add(codePoint);
                }
            }

            System.Console.Out.WriteLine("caseless = " + caseless.ToPattern(true));

            UnicodeSet not_lc  = new UnicodeSet("[:^lc:]");
            UnicodeSet scratch = new UnicodeSet();

            // Difference in one direction ...
            scratch.Set(not_lc);
            scratch.RemoveAll(caseless);
            System.Console.Out.WriteLine("[:^lc:] - caseless = " + scratch.ToPattern(true));

            // ... and in the other.
            scratch.Set(caseless);
            scratch.RemoveAll(not_lc);
            System.Console.Out.WriteLine("caseless - [:^lc:] = " + scratch.ToPattern(true));
        }
Beispiel #4
0
 // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
 //  Also, ICU4C requires handleCE32() to be public because it is used by the callback
 //  function (enumTailoredRange()). This is not necessary for Java implementation.
 private void HandleCE32(int start, int end, int ce32)
 {
     Debug.Assert(ce32 != Collation.FALLBACK_CE32);

     // Resolve a special CE32 first; nothing to do if it resolves to the fallback value.
     if (Collation.IsSpecialCE32(ce32))
     {
         ce32 = data.GetIndirectCE32(ce32);
         if (ce32 == Collation.FALLBACK_CE32)
         {
             return;
         }
     }

     // The body runs for 'start' unconditionally, then for each following code point up to 'end'.
     int codePoint = start;
     do
     {
         int baseCE32 = baseData.GetFinalCE32(baseData.GetCE32(codePoint));

         // Equal CE32 values alone prove nothing unless both are self-contained:
         // contractions and expansions in different data objects normally differ
         // even if they have the same data offsets.
         bool bothSelfContained = Collation.IsSelfContainedCE32(ce32) &&
                                  Collation.IsSelfContainedCE32(baseCE32);
         if (!bothSelfContained)
         {
             Compare(codePoint, ce32, baseCE32);
         }
         else if (ce32 != baseCE32)
         {
             // fastpath: a plain value comparison is sufficient
             tailored.Add(codePoint);
         }

         codePoint++;
     } while (codePoint <= end);
 }
Beispiel #5
0
 /// <summary>
 /// Records the code points <paramref name="start"/>..<paramref name="end"/> as expansions:
 /// directly as a range when there is neither a prefix nor a suffix, otherwise via
 /// <c>AddStrings</c>.
 /// </summary>
 internal void AddExpansions(int start, int end)
 {
     bool plainRange = unreversedPrefix.Length == 0 && suffix == null;
     if (!plainRange)
     {
         // Prefix and/or suffix present: AddStrings handles a null target itself.
         AddStrings(start, end, expansions);
         return;
     }

     if (expansions != null)
     {
         expansions.Add(start, end);
     }
 }
Beispiel #6
0
            /*
             * @see com.ibm.icu.dev.test.TestBoilerplate#_createTestObject()
             */
            protected internal override bool _addTestObject(IList list)
            {
                // Refuse to add more objects once the list holds more than 32 entries.
                if (list.Count > 32)
                {
                    return false;
                }

                // Build a set from 50 random values below 100 (duplicates simply collapse).
                UnicodeSet sample = new UnicodeSet();
                int remaining = 50;
                while (remaining-- > 0)
                {
                    sample.Add(IBM.ICU.Charset.TestUtilities.random.Next(100));
                }

                // The list receives the set's string form, not the set itself.
                ILOG.J2CsMapping.Collections.Generics.Collections.Add(list, sample.ToString());
                return true;
            }
Beispiel #7
0
        /// <summary>
        /// Collects the keys of <paramref name="m"/> whose mapped value equals
        /// <paramref name="value"/>.
        /// </summary>
        /// <param name="m">map from integer keys to values</param>
        /// <param name="value">the value to look for; may be null</param>
        /// <returns>a new set containing the <c>Value</c> of every matching key</returns>
        public static UnicodeSet GetSet(IDictionary <Integer, T> m, T value)
        {
            UnicodeSet result = new UnicodeSet();

            foreach (var key in m.Keys)
            {
                T val = m.Get(key);

                // The static Equals is null-safe: a null stored value no longer throws
                // NullReferenceException, and it matches only a null 'value'.
                if (object.Equals(val, value))
                {
                    result.Add(key.Value);
                }
            }
            return result;
        }
        /// <summary>
        /// Collects the keys of <paramref name="m"/> whose mapped value equals
        /// <paramref name="value_ren"/>.
        /// </summary>
        /// <param name="m">map whose keys are boxed <see cref="Int32"/> values</param>
        /// <param name="value_ren">the value to look for; may be null</param>
        /// <returns>a new set containing every matching key</returns>
        public static UnicodeSet GetSet(IDictionary m, Object value_ren)
        {
            UnicodeSet result = new UnicodeSet();

            foreach (Object key in m.Keys)
            {
                Object val = ILOG.J2CsMapping.Collections.Collections.Get(m, key);

                // The static Equals is null-safe: a null stored value no longer throws
                // NullReferenceException, and it matches only a null 'value_ren'.
                if (object.Equals(val, value_ren))
                {
                    result.Add((Int32)key);
                }
            }
            return result;
        }
Beispiel #9
0
        /// <summary>
        /// Collects the keys of <paramref name="m"/> whose mapped value equals
        /// <paramref name="value"/>.
        /// </summary>
        /// <param name="m">map from integer keys to values</param>
        /// <param name="value">the value to look for; may be null</param>
        /// <returns>a new set containing every matching key</returns>
        public static UnicodeSet GetSet(IDictionary <int, T> m, T value)
        {
            UnicodeSet result = new UnicodeSet();

            // ICU4N: Optimized by looping over the pair instead of doing a separate
            // value lookup on each loop.
            foreach (var pair in m)
            {
                // The static Equals is null-safe: a null stored value no longer throws
                // NullReferenceException, and it matches only a null 'value'.
                if (object.Equals(pair.Value, value))
                {
                    result.Add(pair.Key);
                }
            }
            return result;
        }
Beispiel #10
0
        /// <summary>
        /// Unions into <paramref name="toUnionTo"/> the set of all characters that may be
        /// output by this object.
        /// </summary>
        ///
        /// <param name="toUnionTo">the set into which to union the output characters</param>
        public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
        {
            int index = 0;

            while (index < output.Length)
            {
                int codePoint = IBM.ICU.Text.UTF16.CharAt(output, index);
                UnicodeReplacer replacer = data.LookupReplacer(codePoint);

                if (replacer != null)
                {
                    // The code point stands for a nested replacer: let it contribute its own set.
                    replacer.AddReplacementSetTo(toUnionTo);
                }
                else
                {
                    // No replacer registered, so the code point itself is output.
                    toUnionTo.Add(codePoint);
                }

                // Step over one or two UTF-16 units, depending on the code point.
                index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);
            }
        }
Beispiel #11
0
        /// <summary>
        /// Implementation of the UnicodeMatcher API. Unions into <paramref name="toUnionTo"/>
        /// the set of all characters that may be matched by this object.
        /// </summary>
        ///
        /// <param name="toUnionTo">the set into which to union the source characters</param>
        public virtual void AddMatchSetTo(UnicodeSet toUnionTo)
        {
            int index = 0;

            while (index < pattern.Length)
            {
                int codePoint = IBM.ICU.Text.UTF16.CharAt(pattern, index);
                UnicodeMatcher nested = data.LookupMatcher(codePoint);

                if (nested != null)
                {
                    // The code point stands for a nested matcher: let it contribute its own set.
                    nested.AddMatchSetTo(toUnionTo);
                }
                else
                {
                    // No matcher registered, so the code point is matched as itself.
                    toUnionTo.Add(codePoint);
                }

                // Step over one or two UTF-16 units, depending on the code point.
                index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);
            }
        }
Beispiel #12
0
        /// <summary>
        /// For each code point from <paramref name="start"/> through <paramref name="end"/>,
        /// adds to <paramref name="set"/> the string formed by the unreversed prefix, that
        /// code point and (if present) the suffix. Does nothing when <paramref name="set"/> is null.
        /// </summary>
        internal void AddStrings(int start, int end, UnicodeSet set)
        {
            if (set == null)
            {
                return;
            }

            int           prefixLength = unreversedPrefix.Length;
            StringBuilder buffer       = new StringBuilder(unreversedPrefix.ToString());
            int           codePoint    = start;

            // The body runs for 'start' unconditionally, then up to and including 'end'.
            do
            {
                buffer.AppendCodePoint(codePoint);
                if (suffix != null)
                {
                    buffer.Append(suffix);
                }
                set.Add(buffer);

                // Truncate back to the bare prefix for the next code point.
                buffer.Length = prefixLength;
            } while (++codePoint <= end);
        }
        /// <summary>
        /// Unions into <paramref name="toUnionTo"/> the set of all characters that may be
        /// modified by this rule, i.e. those in the key portion of the pattern.
        /// </summary>
        ///
        /// <param name="toUnionTo">the set into which to union the source characters</param>
        internal void AddSourceSetTo(UnicodeSet toUnionTo)
        {
            // The key occupies [anteContextLength, anteContextLength + keyLength) in the pattern.
            int keyEnd = anteContextLength + keyLength;
            int codePoint;

            for (int index = anteContextLength; index < keyEnd; index += IBM.ICU.Text.UTF16.GetCharCount(codePoint))
            {
                codePoint = IBM.ICU.Text.UTF16.CharAt(pattern, index);
                UnicodeMatcher nested = data.LookupMatcher(codePoint);

                if (nested != null)
                {
                    // A stand-in for a matcher contributes that matcher's whole match set.
                    nested.AddMatchSetTo(toUnionTo);
                }
                else
                {
                    toUnionTo.Add(codePoint);
                }
            }
        }
Beispiel #14
0
        // set of property starts for UnicodeSet ------------------------------- ***

        /// <summary>
        /// Adds to <paramref name="set"/> the start code point of each same-value range of
        /// the trie, stopping at the first range flagged as a lead surrogate range.
        /// </summary>
        /// <param name="set">the set that receives the start code points</param>
        public void AddPropertyStarts(UnicodeSet set)
        {
            using (var ranges = trie.GetEnumerator())
            {
                while (ranges.MoveNext())
                {
                    Trie2.Range current = ranges.Current;
                    if (current.LeadSurrogate)
                    {
                        // Only the ranges before the lead-surrogate entries are wanted.
                        break;
                    }
                    set.Add(current.StartCodePoint);
                }
            }

            /*
             * No code points with hardcoded properties are added: those with hardcoded
             * specialcasing properties are omitted because no property UnicodeSets are
             * built for them right now.
             */
        }
        /// <summary>
        /// Flattens the strings of a Unicode set into their individual characters,
        /// e.g. [abc{da}] => [abcd]. The argument is modified in place (only when it
        /// actually contained a string) and returned for chaining.
        /// </summary>
        ///
        /// <param name="exemplar1">the set to flatten</param>
        /// <returns>the same <paramref name="exemplar1"/> instance</returns>
        public static UnicodeSet Flatten(UnicodeSet exemplar1)
        {
            UnicodeSet flattened = new UnicodeSet();
            bool       sawString = false;

            UnicodeSetIterator ranges = new UnicodeSetIterator(exemplar1);
            while (ranges.NextRange())
            {
                if (ranges.codepoint != IBM.ICU.Text.UnicodeSetIterator.IS_STRING)
                {
                    // Plain code point range: copy as is.
                    flattened.Add(ranges.codepoint, ranges.codepointEnd);
                    continue;
                }

                // String element: add its characters instead of the string.
                flattened.AddAll(ranges.str0);
                sawString = true;
            }

            // Without strings the flattened copy equals the input, so leave it untouched.
            if (sawString)
            {
                exemplar1.Set(flattened);
            }
            return exemplar1;
        }
Beispiel #16
0
        // set of property starts for UnicodeSet ------------------------------- ***

        /// <summary>
        /// Adds to <paramref name="set"/> the code points at which the values stored in
        /// this object's trie, mirroring table and the two Joining_Group arrays may change.
        /// </summary>
        /// <param name="set">the set that receives the start code points</param>
        public void AddPropertyStarts(UnicodeSet set)
        {
            /* add the start code point of each same-value range of the trie */
            using (var trieIterator = trie.GetEnumerator())
            {
                Trie2Range range;
                while (trieIterator.MoveNext() && !(range = trieIterator.Current).IsLeadSurrogate)
                {
                    set.Add(range.StartCodePoint);
                }
            }

            /* add the code points from the bidi mirroring table */
            int mirrorLength = indexes[IX_MIRROR_LENGTH];
            for (int i = 0; i < mirrorLength; ++i)
            {
                int c = GetMirrorCodePoint(mirrors[i]);
                set.Add(c, c + 1);
            }

            /*
             * Add the code points from both Joining_Group arrays where the value changes.
             * The two ranges are visited with an explicit pass counter rather than by
             * comparing the current limit with indexes[IX_JG_LIMIT]: that comparison
             * never becomes false when both ranges have the same limit, which would
             * make the loop run forever.
             */
            for (int pass = 0; pass < 2; ++pass)
            {
                int    start;
                int    limit;
                byte[] jga;

                if (pass == 0)
                {
                    start = indexes[IX_JG_START];
                    limit = indexes[IX_JG_LIMIT];
                    jga   = jgArray;
                }
                else
                {
                    /* the second Joining_Group range */
                    start = indexes[IX_JG_START2];
                    limit = indexes[IX_JG_LIMIT2];
                    jga   = jgArray2;
                }

                int  length = limit - start;
                byte prev   = 0;
                for (int i = 0; i < length; ++i)
                {
                    byte jg = jga[i];
                    if (jg != prev)
                    {
                        set.Add(start);
                        prev = jg;
                    }
                    ++start;
                }
                if (prev != 0)
                {
                    /* add the limit code point if the last value was not 0 (it is now start==limit) */
                    set.Add(limit);
                }
            }

            /* add code points with hardcoded properties, plus the ones following them */

            /* (none right now) */
        }
        /// <summary>
        /// Quote a literal string, using the available settings. Thus syntax
        /// characters, quote characters, and ignorable characters will be put into
        /// quotes.
        /// </summary>
        /// <remarks>
        /// Characters contained in <c>escapeCharacters</c> are always written through
        /// <c>AppendEscaped</c>. Characters that need quoting are backslash-escaped when
        /// <c>usingSlash</c> is set, otherwise wrapped in single quotes when <c>usingQuote</c>
        /// is set, and otherwise written through <c>AppendEscaped</c> as well.
        /// </remarks>
        ///
        /// <param name="str0">the literal text to quote</param>
        /// <returns>the quoted form of <paramref name="str0"/></returns>
        public String QuoteLiteral(String str0)
        {
            // Lazily build (and cache) the set of characters that must be quoted.
            if (needingQuoteCharacters == null)
            {
                needingQuoteCharacters = new UnicodeSet().AddAll(syntaxCharacters)
                                         .AddAll(ignorableCharacters).AddAll(extraQuotingCharacters); // .addAll(quoteCharacters)
                if (usingSlash)
                {
                    needingQuoteCharacters.Add(BACK_SLASH);
                }
                if (usingQuote)
                {
                    needingQuoteCharacters.Add(SINGLE_QUOTE);
                }
            }
            StringBuilder result     = new StringBuilder();
            // IN_QUOTE while an opening single quote has been written but not yet closed.
            int           quotedChar = NO_QUOTE;
            int           cp;

            // Walk the input one code point at a time.
            for (int i = 0; i < str0.Length; i += IBM.ICU.Text.UTF16.GetCharCount(cp))
            {
                cp = IBM.ICU.Text.UTF16.CharAt(str0, i);
                if (escapeCharacters.Contains(cp))
                {
                    // we may have to fix up previous characters
                    if (quotedChar == IN_QUOTE)
                    {
                        result.Append(SINGLE_QUOTE);
                        quotedChar = NO_QUOTE;
                    }
                    AppendEscaped(result, cp);
                    continue;
                }

                if (needingQuoteCharacters.Contains(cp))
                {
                    // if we have already started a quote
                    if (quotedChar == IN_QUOTE)
                    {
                        IBM.ICU.Text.UTF16.Append(result, cp);
                        if (usingQuote && cp == SINGLE_QUOTE)       // double it
                        {
                            result.Append(SINGLE_QUOTE);
                        }
                        continue;
                    }
                    // otherwise not already in quote
                    if (usingSlash)
                    {
                        result.Append(BACK_SLASH);
                        IBM.ICU.Text.UTF16.Append(result, cp);
                        continue;
                    }
                    if (usingQuote)
                    {
                        if (cp == SINGLE_QUOTE)       // double it and continue
                        {
                            result.Append(SINGLE_QUOTE);
                            result.Append(SINGLE_QUOTE);
                            continue;
                        }
                        // open a quoted run; it is closed by one of the fix-ups below/above
                        result.Append(SINGLE_QUOTE);
                        IBM.ICU.Text.UTF16.Append(result, cp);
                        quotedChar = IN_QUOTE;
                        continue;
                    }
                    // we have no choice but to use \\u or \\U
                    AppendEscaped(result, cp);
                    continue;
                }
                // otherwise cp doesn't need quoting
                // we may have to fix up previous characters
                if (quotedChar == IN_QUOTE)
                {
                    result.Append(SINGLE_QUOTE);
                    quotedChar = NO_QUOTE;
                }
                IBM.ICU.Text.UTF16.Append(result, cp);
            }
            // all done.
            // we may have to fix up previous characters
            if (quotedChar == IN_QUOTE)
            {
                result.Append(SINGLE_QUOTE);
            }
            return(result.ToString());
        }
Beispiel #18
0
        /// <summary>
        /// Adds all simple case mappings and the full case folding for <paramref name="c"/> to sa,
        /// and also adds special case closure mappings.
        /// </summary>
        /// <remarks>
        /// <paramref name="c"/> itself is not added.
        /// For example, the mappings
        /// <list type="bullet">
        ///     <item><description>for s include long s</description></item>
        ///     <item><description>for sharp s include ss</description></item>
        ///     <item><description>for k include the Kelvin sign</description></item>
        /// </list>
        /// </remarks>
        /// <param name="c">the code point whose case closure is requested</param>
        /// <param name="set">the set that receives the related code points and strings</param>
        public void AddCaseClosure(int c, UnicodeSet set)
        {
            /*
             * Hardcode the case closure of i and its relatives and ignore the
             * data file data for these characters.
             * The Turkic dotless i and dotted I with their case mapping conditions
             * and case folding option make the related characters behave specially.
             * This code matches their closure behavior to their case folding behavior.
             */

            switch (c)
            {
            case 0x49:
                /* regular i and I are in one equivalence class */
                set.Add(0x69);
                return;

            case 0x69:
                set.Add(0x49);
                return;

            case 0x130:
                /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
                set.Add(iDot);
                return;

            case 0x131:
                /* dotless i is in a class by itself */
                return;

            default:
                /* otherwise use the data file data */
                break;
            }

            int props = trie.Get(c);

            if (!PropsHasException(props))
            {
                if (GetTypeFromProps(props) != NONE)
                {
                    /* add the one simple case mapping, no matter what type it is */
                    int delta = GetDelta(props);
                    if (delta != 0)
                    {
                        set.Add(c + delta);
                    }
                }
            }
            else
            {
                /*
                 * c has exceptions, so there may be multiple simple and/or
                 * full case mappings. Add them all.
                 */
                int excOffset0, excOffset = GetExceptionsOffset(props);
                int closureOffset;
                int excWord = exceptions[excOffset++];
                int index, closureLength, fullLength, length;

                /* remember the offset just behind the exception word; every slot lookup restarts from it */
                excOffset0 = excOffset;

                /* add all simple case mappings */
                for (index = EXC_LOWER; index <= EXC_TITLE; ++index)
                {
                    if (HasSlot(excWord, index))
                    {
                        excOffset = excOffset0;
                        /* note: c is overwritten here and no longer holds the input code point */
                        c         = GetSlotValue(excWord, index, excOffset);
                        set.Add(c);
                    }
                }

                /* get the closure string pointer & length */
                if (HasSlot(excWord, EXC_CLOSURE))
                {
                    excOffset = excOffset0;
                    /* the returned long is unpacked below: low 32 bits = slot value, high 32 bits = an offset */
                    long value = GetSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
                    closureLength = (int)value & CLOSURE_MAX_LENGTH; /* higher bits are reserved */
                    closureOffset = (int)(value >> 32) + 1;          /* behind this slot, unless there are full case mappings */
                }
                else
                {
                    closureLength = 0;
                    closureOffset = 0;
                }

                /* add the full case folding */
                if (HasSlot(excWord, EXC_FULL_MAPPINGS))
                {
                    excOffset = excOffset0;
                    long value = GetSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
                    fullLength = (int)value;

                    /* start of full case mapping strings */
                    excOffset = (int)(value >> 32) + 1;

                    fullLength &= 0xffff; /* bits 16 and higher are reserved */

                    /* skip the lowercase result string */
                    excOffset   += fullLength & FULL_LOWER;
                    fullLength >>= 4;

                    /* add the full case folding string */
                    length = fullLength & 0xf;
                    if (length != 0)
                    {
                        set.Add(exceptions.Substring(excOffset, length)); // ICU4N: excOffset + length - excOffset == length
                        excOffset += length;
                    }

                    /* skip the uppercase and titlecase strings */
                    fullLength >>= 4;
                    excOffset   += fullLength & 0xf;
                    fullLength >>= 4;
                    excOffset   += fullLength;

                    closureOffset = excOffset; /* behind full case mappings */
                }

                /* add each code point in the closure string */
                int limit = closureOffset + closureLength;
                for (index = closureOffset; index < limit; index += UTF16.GetCharCount(c))
                {
                    c = exceptions.CodePointAt(index);
                    set.Add(c);
                }
            }
        }
Beispiel #19
0
        /// <summary>
        /// Maps the string to single code points and adds the associated case closure
        /// mappings.
        /// </summary>
        /// <remarks>
        /// The string is looked up in the reverse case folding ("unfold") table: if it is the
        /// full case folding string of some code points, those code points and their case
        /// closures are added to <paramref name="set"/>. As part of those closures the
        /// string itself is added as well.
        /// </remarks>
        /// <param name="s">the string to look up; may be null</param>
        /// <param name="set">the set that receives the code points and their closures</param>
        /// <returns>true if the string was found.</returns>
        public bool AddStringCaseClosure(string s, UnicodeSet set)
        {
            /* no reverse case folding data, or no string */
            if (unfold == null || s == null)
            {
                return false;
            }

            /*
             * A string of at most one UTF-16 unit is too short to find any match.
             * Checking for "more than one code point" would be more precise, but makes
             * no practical difference: a single supplementary code point would just
             * not be found.
             */
            int sLength = s.Length;
            if (sLength <= 1)
            {
                return false;
            }

            int rowCount    = unfold[UNFOLD_ROWS];
            int rowWidth    = unfold[UNFOLD_ROW_WIDTH];
            int stringWidth = unfold[UNFOLD_STRING_WIDTH];

            /* the string is too long to find any match */
            if (sLength > stringWidth)
            {
                return false;
            }

            /* binary search over the table rows for the string */
            int low  = 0;
            int high = rowCount;
            while (low < high)
            {
                int mid       = (low + high) / 2;
                int rowOffset = (mid + 1) * rowWidth; // +1 to skip the header values above
                int order     = StrCmpMax(s, rowOffset, stringWidth);

                if (order < 0)
                {
                    high = mid;
                    continue;
                }
                if (order > 0)
                {
                    low = mid + 1;
                    continue;
                }

                /*
                 * Found the string: the code points follow the string columns of the row
                 * and end at the row end or at the first 0 unit. Add each one and its closure.
                 */
                int column = stringWidth;
                while (column < rowWidth && unfold[rowOffset + column] != 0)
                {
                    int c = UTF16.CharAt(unfold, rowOffset, unfold.Length, column);
                    set.Add(c);
                    AddCaseClosure(c, set);
                    column += UTF16.GetCharCount(c);
                }
                return true;
            }

            return false; /* string not found */
        }