Example #1
0
        /// <summary>
        /// Returns the primary weight that precedes <paramref name="p"/>.
        /// <paramref name="p"/> must be greater than the first root primary.
        /// </summary>
        /// <param name="p">The primary weight whose predecessor is requested.</param>
        /// <param name="isCompressible">Forwarded unchanged to the
        /// <c>Collation.DecTwoBytePrimaryByOneStep</c> / <c>Collation.DecThreeBytePrimaryByOneStep</c>
        /// helpers when the predecessor lies inside a primary range.</param>
        /// <returns>Either the masked value (<c>&amp; 0xffffff00</c>) of the closest earlier element
        /// without the sec/ter delta flag, or <paramref name="p"/> decremented by one range step.</returns>
        internal long GetPrimaryBefore(long p, bool isCompressible)
        {
            int  pos   = FindPrimary(p);
            long found = elements[pos];
            int  rangeStep;

            if (p != (found & 0xffffff00L))
            {
                // p sits inside a range but not at its start:
                // the step is taken from the element that follows.
                long rangeEnd = elements[pos + 1];
                Debug.Assert(IsEndOfPrimaryRange(rangeEnd));
                rangeStep = (int)rangeEnd & PRIMARY_STEP_MASK;
            }
            else
            {
                // Exact hit on p. A non-zero step means p terminates a range.
                rangeStep = (int)found & PRIMARY_STEP_MASK;
                if (rangeStep == 0)
                {
                    // Not a range end: walk backwards over sec/ter delta elements
                    // until the previous primary element is reached.
                    long candidate;
                    do
                    {
                        candidate = elements[--pos];
                    } while ((candidate & SEC_TER_DELTA_FLAG) != 0);
                    return candidate & 0xffffff00L;
                }
            }

            // Step back by one inside the range; the low 16 bits being zero selects
            // the two-byte variant, otherwise the three-byte variant.
            return (p & 0xffff) == 0
                ? Collation.DecTwoBytePrimaryByOneStep(p, isCompressible, rangeStep)
                : Collation.DecThreeBytePrimaryByOneStep(p, isCompressible, rangeStep);
        }
        /// <summary>
        /// Records the contraction entries for a contraction <paramref name="ce32"/> and then sets
        /// the <c>ce0</c>/<c>ce1</c> fields to a marker CE that refers to those entries.
        /// </summary>
        /// <remarks>
        /// The first entry added is for the default mapping (no suffix match), keyed with
        /// <c>CollationFastLatin.CONTR_CHAR_MASK</c>. Then one entry is added per distinct first
        /// suffix character that <c>CollationFastLatin.GetCharIndex</c> accepts (non-negative result).
        /// Entries for which no usable CEs are available are added with <c>Collation.NoCE</c>.
        /// This method relies on <c>GetCEsFromCE32</c> writing its results into <c>ce0</c>/<c>ce1</c>.
        /// </remarks>
        /// <param name="data">Collation data providing the contexts trie and CE32 lookups.</param>
        /// <param name="ce32">A CE32 whose index (via <c>Collation.IndexFromCE32</c>) locates the contraction context data.</param>
        /// <returns>Always <c>true</c>.</returns>
        private bool GetCEsFromContractionCE32(CollationData data, int ce32)
        {
            int trieIndex = Collation.IndexFromCE32(ce32);

            ce32 = data.GetCE32FromContexts(trieIndex);  // Default if no suffix match.
                                                         // Since the original ce32 is not a prefix mapping,
                                                         // the default ce32 must not be another contraction.
            Debug.Assert(!Collation.IsContractionCE32(ce32));
            // Remember where this contraction's entries start; it is encoded into ce0 at the end.
            int contractionIndex = contractionCEs.Count;

            if (GetCEsFromCE32(data, Collation.SentinelCodePoint, ce32))
            {
                AddContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
            }
            else
            {
                // Bail out for c-without-contraction.
                AddContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NoCE, 0);
            }
            // Handle an encodable contraction unless the next contraction is too long
            // and starts with the same character.
            // prevX: char index of the first character of the last processed suffix group.
            // addContraction: true while ce0/ce1 hold a not-yet-added entry for prevX.
            int  prevX          = -1;
            bool addContraction = false;

            // The suffix trie starts two units after trieIndex.
            using (CharsTrieEnumerator suffixes = CharsTrie.GetEnumerator(data.contexts, trieIndex + 2, 0))
            {
                while (suffixes.MoveNext())
                {
                    CharsTrieEntry entry  = suffixes.Current;
                    ICharSequence  suffix = entry.Chars;
                    int            x      = CollationFastLatin.GetCharIndex(suffix[0]);
                    if (x < 0)
                    {
                        continue;
                    }                         // ignore anything but fast Latin text
                    if (x == prevX)
                    {
                        // Another suffix starting with the same character as the previous one.
                        if (addContraction)
                        {
                            // Bail out for all contractions starting with this character.
                            AddContractionEntry(x, Collation.NoCE, 0);
                            addContraction = false;
                        }
                        continue;
                    }
                    // New first character: flush the pending entry for the previous one, if any.
                    // This must happen before GetCEsFromCE32 below overwrites ce0/ce1.
                    if (addContraction)
                    {
                        AddContractionEntry(prevX, ce0, ce1);
                    }
                    ce32 = entry.Value;
                    // Only single-character suffixes with usable CEs are kept pending;
                    // the short-circuit keeps ce0/ce1 untouched for longer suffixes.
                    if (suffix.Length == 1 && GetCEsFromCE32(data, Collation.SentinelCodePoint, ce32))
                    {
                        addContraction = true;
                    }
                    else
                    {
                        AddContractionEntry(x, Collation.NoCE, 0);
                        addContraction = false;
                    }
                    prevX = x;
                }
            }
            // Flush the entry still pending after the last suffix.
            if (addContraction)
            {
                AddContractionEntry(prevX, ce0, ce1);
            }
            // Note: There might not be any fast Latin contractions, but
            // we need to enter contraction handling anyway so that we can bail out
            // when there is a non-fast-Latin character following.
            // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
            // following umlaut and bail out, rather than return the difference of Y vs. u.
            ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | (uint)contractionIndex;
            ce1 = 0;
            return(true);
        }
        /// <summary>
        /// Resolves <paramref name="ce32"/> into at most two CEs, stored in the <c>ce0</c> and
        /// <c>ce1</c> fields, and reports whether that result passes the restrictions checked below.
        /// </summary>
        /// <remarks>
        /// <c>ce1</c> is always reset to 0 first. Depending on the CE32 tag, the CEs come from the
        /// CE32 itself, from <c>data.ce32s</c> / <c>data.ces</c> (expansions of length 1 or 2 only),
        /// from <c>data.GetCEFromOffsetCE32</c>, or the call is delegated to
        /// <c>GetCEsFromContractionCE32</c>. Any other special tag, and expansions longer than two,
        /// yield <c>false</c>.
        /// </remarks>
        /// <param name="data">Collation data used to finalize <paramref name="ce32"/> and to look up expansion CEs.</param>
        /// <param name="c">The code point being mapped; asserted to be non-negative for the
        /// contraction and offset tags, and passed to <c>GetCEFromOffsetCE32</c> for the offset tag.</param>
        /// <param name="ce32">The CE32 to resolve.</param>
        /// <returns><c>true</c> if <c>ce0</c>/<c>ce1</c> were set and pass all checks; otherwise <c>false</c>.</returns>
        private bool GetCEsFromCE32(CollationData data, int c, int ce32)
        {
            ce32 = data.GetFinalCE32(ce32);
            ce1  = 0;
            if (Collation.IsSimpleOrLongCE32(ce32))
            {
                ce0 = Collation.CeFromCE32(ce32);
            }
            else
            {
                switch (Collation.TagFromCE32(ce32))
                {
                case Collation.LATIN_EXPANSION_TAG:
                    ce0 = Collation.LatinCE0FromCE32(ce32);
                    ce1 = Collation.LatinCE1FromCE32(ce32);
                    break;

                case Collation.EXPANSION32_TAG:
                {
                    int index  = Collation.IndexFromCE32(ce32);
                    int length = Collation.LengthFromCE32(ce32);
                    // Only expansions of up to two CEs fit into ce0/ce1.
                    if (length > 2)
                    {
                        return false;
                    }
                    ce0 = Collation.CeFromCE32(data.ce32s[index]);
                    if (length == 2)
                    {
                        ce1 = Collation.CeFromCE32(data.ce32s[index + 1]);
                    }
                    break;
                }

                case Collation.EXPANSION_TAG:
                {
                    int index  = Collation.IndexFromCE32(ce32);
                    int length = Collation.LengthFromCE32(ce32);
                    // Only expansions of up to two CEs fit into ce0/ce1.
                    if (length > 2)
                    {
                        return false;
                    }
                    ce0 = data.ces[index];
                    if (length == 2)
                    {
                        ce1 = data.ces[index + 1];
                    }
                    break;
                }

                // Note: We could support PREFIX_TAG (assert c>=0)
                // by recursing on its default CE32 and checking that none of the prefixes starts
                // with a fast Latin character.
                // However, currently (2013) there are only the L-before-middle-dot
                // prefix mappings in the Latin range, and those would be rejected anyway.
                case Collation.CONTRACTION_TAG:
                    Debug.Assert(c >= 0);
                    return GetCEsFromContractionCE32(data, ce32);

                case Collation.OFFSET_TAG:
                    Debug.Assert(c >= 0);
                    ce0 = data.GetCEFromOffsetCE32(c, ce32);
                    break;

                default:
                    return false;
                }
            }
            // A mapping can be completely ignorable.
            if (ce0 == 0)
            {
                return ce1 == 0;
            }
            // We do not support an ignorable ce0 unless it is completely ignorable.
            long p0 = ce0.TripleShift(32);

            if (p0 == 0)
            {
                return false;
            }
            // We only support primaries up to the Latin script.
            if (p0 > lastLatinPrimary)
            {
                return false;
            }
            // We support non-common secondary and case weights only together with short primaries.
            int lower32_0 = (int)ce0;

            if (p0 < firstShortPrimary)
            {
                int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK;
                if (sc0 != Collation.COMMON_SECONDARY_CE)
                {
                    return false;
                }
            }
            // No below-common tertiary weights.
            if ((lower32_0 & Collation.OnlyTertiaryMask) < Collation.CommonWeight16)
            {
                return false;
            }
            if (ce1 != 0)
            {
                // Both primaries must be in the same group,
                // or both must get short mini primaries,
                // or a short-primary CE is followed by a secondary CE.
                // This is so that we can test the first primary and use the same mask for both,
                // and determine for both whether they are variable.
                long p1 = ce1.TripleShift(32);
                if (p1 == 0 ? p0 < firstShortPrimary : !InSameGroup(p0, p1))
                {
                    return false;
                }
                int lower32_1 = (int)ce1;
                // No tertiary CEs.
                if ((lower32_1.TripleShift(16)) == 0)
                {
                    return false;
                }
                // We support non-common secondary and case weights
                // only for secondary CEs or together with short primaries.
                if (p1 != 0 && p1 < firstShortPrimary)
                {
                    int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK;
                    if (sc1 != Collation.COMMON_SECONDARY_CE)
                    {
                        return false;
                    }
                }
                // No below-common tertiary weights in the second CE either.
                // This must test lower32_1: the same test on lower32_0 already passed above,
                // so repeating it here would leave ce1's tertiary weight unchecked.
                if ((lower32_1 & Collation.OnlyTertiaryMask) < Collation.CommonWeight16)
                {
                    return false;
                }
            }
            // No quaternary weights.
            if (((ce0 | ce1) & Collation.QuaternaryMask) != 0)
            {
                return false;
            }
            return true;
        }
Example #4
0
        /// <summary>
        /// Compares the tailoring mapping <paramref name="ce32"/> of code point <paramref name="c"/>
        /// with the base mapping <paramref name="baseCE32"/> and calls <c>Add(c)</c> when they differ.
        /// </summary>
        /// <remarks>
        /// Prefix and contraction contexts are handled first (via <c>ComparePrefixes</c>/<c>AddPrefixes</c>
        /// and <c>CompareContractions</c>/<c>AddContractions</c>), each time replacing the CE32 with the
        /// context default. The remaining non-contextual CE32s are then compared by tag, by expansion
        /// contents, by Hangul jamo membership in <c>tailored</c>, or by plain value.
        /// </remarks>
        /// <param name="c">The code point whose mappings are compared.</param>
        /// <param name="ce32">The CE32 looked up through <c>data</c>.</param>
        /// <param name="baseCE32">The CE32 looked up through <c>baseData</c>.</param>
        private void Compare(int c, int ce32, int baseCE32)
        {
            // --- Prefix contexts ---
            if (Collation.IsPrefixCE32(ce32))
            {
                int tailoredPrefixIndex = Collation.IndexFromCE32(ce32);
                ce32 = data.GetFinalCE32(data.GetCE32FromContexts(tailoredPrefixIndex));
                if (!Collation.IsPrefixCE32(baseCE32))
                {
                    // Only the tailoring has prefixes.
                    AddPrefixes(data, c, data.contexts, tailoredPrefixIndex + 2);
                }
                else
                {
                    int basePrefixIndex = Collation.IndexFromCE32(baseCE32);
                    baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(basePrefixIndex));
                    ComparePrefixes(c, data.contexts, tailoredPrefixIndex + 2, baseData.contexts, basePrefixIndex + 2);
                }
            }
            else if (Collation.IsPrefixCE32(baseCE32))
            {
                // Only the base has prefixes.
                int basePrefixIndex = Collation.IndexFromCE32(baseCE32);
                baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(basePrefixIndex));
                AddPrefixes(baseData, c, baseData.contexts, basePrefixIndex + 2);
            }

            // --- Contraction contexts ---
            if (Collation.IsContractionCE32(ce32))
            {
                int tailoredContrIndex = Collation.IndexFromCE32(ce32);
                if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0)
                {
                    ce32 = data.GetFinalCE32(data.GetCE32FromContexts(tailoredContrIndex));
                }
                else
                {
                    ce32 = Collation.NO_CE32;
                }

                if (!Collation.IsContractionCE32(baseCE32))
                {
                    // Only the tailoring has contractions.
                    AddContractions(c, data.contexts, tailoredContrIndex + 2);
                }
                else
                {
                    int baseContrIndex = Collation.IndexFromCE32(baseCE32);
                    if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0)
                    {
                        baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseContrIndex));
                    }
                    else
                    {
                        baseCE32 = Collation.NO_CE32;
                    }
                    CompareContractions(c, data.contexts, tailoredContrIndex + 2, baseData.contexts, baseContrIndex + 2);
                }
            }
            else if (Collation.IsContractionCE32(baseCE32))
            {
                // Only the base has contractions.
                int baseContrIndex = Collation.IndexFromCE32(baseCE32);
                baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseContrIndex));
                AddContractions(c, baseData.contexts, baseContrIndex + 2);
            }

            // --- Tags of the remaining, non-contextual CE32s (-1 means "not special") ---
            int tag = -1;
            if (Collation.IsSpecialCE32(ce32))
            {
                tag = Collation.TagFromCE32(ce32);
                Debug.Assert(tag != Collation.PREFIX_TAG);
                Debug.Assert(tag != Collation.CONTRACTION_TAG);
                // Currently, the tailoring data builder does not write offset tags.
                // They might be useful for saving space,
                // but they would complicate the builder,
                // and in tailorings we assume that performance of tailored characters is more important.
                Debug.Assert(tag != Collation.OFFSET_TAG);
            }

            int baseTag = -1;
            if (Collation.IsSpecialCE32(baseCE32))
            {
                baseTag = Collation.TagFromCE32(baseCE32);
                Debug.Assert(baseTag != Collation.PREFIX_TAG);
                Debug.Assert(baseTag != Collation.CONTRACTION_TAG);
            }

            // Non-contextual mappings, expansions, etc.
            if (baseTag == Collation.OFFSET_TAG)
            {
                // We might be comparing a tailoring CE which is a copy of
                // a base offset-tag CE, via the [optimize [set]] syntax
                // or when a single-character mapping was copied for tailored contractions.
                // Offset tags always result in long-primary CEs,
                // with common secondary/tertiary weights.
                if (!Collation.IsLongPrimaryCE32(ce32))
                {
                    Add(c);
                    return;
                }
                long offsetData     = baseData.ces[Collation.IndexFromCE32(baseCE32)];
                long offsetPrimary  = Collation.GetThreeBytePrimaryForOffsetData(c, offsetData);
                if (Collation.PrimaryFromLongPrimaryCE32(ce32) != offsetPrimary)
                {
                    Add(c);
                    return;
                }
            }

            if (tag != baseTag)
            {
                Add(c);
                return;
            }

            switch (tag)
            {
            case Collation.EXPANSION32_TAG:
            {
                int count = Collation.LengthFromCE32(ce32);
                if (count != Collation.LengthFromCE32(baseCE32))
                {
                    Add(c);
                    return;
                }

                int tailoredStart = Collation.IndexFromCE32(ce32);
                int baseStart     = Collation.IndexFromCE32(baseCE32);

                // Scan to the first differing CE32, if any.
                int k = 0;
                while (k < count && data.ce32s[tailoredStart + k] == baseData.ce32s[baseStart + k])
                {
                    ++k;
                }
                if (k < count)
                {
                    Add(c);
                }
                break;
            }

            case Collation.EXPANSION_TAG:
            {
                int count = Collation.LengthFromCE32(ce32);
                if (count != Collation.LengthFromCE32(baseCE32))
                {
                    Add(c);
                    return;
                }

                int tailoredStart = Collation.IndexFromCE32(ce32);
                int baseStart     = Collation.IndexFromCE32(baseCE32);

                // Scan to the first differing CE, if any.
                int k = 0;
                while (k < count && data.ces[tailoredStart + k] == baseData.ces[baseStart + k])
                {
                    ++k;
                }
                if (k < count)
                {
                    Add(c);
                }
                break;
            }

            case Collation.HANGUL_TAG:
            {
                // c is added when any of its decomposed jamos is in the tailored set.
                StringBuilder jamos     = new StringBuilder();
                int           jamoCount = Hangul.Decompose(c, jamos);
                if (tailored.Contains(jamos[0]) || tailored.Contains(jamos[1]) ||
                    (jamoCount == 3 && tailored.Contains(jamos[2])))
                {
                    Add(c);
                }
                break;
            }

            default:
                if (ce32 != baseCE32)
                {
                    Add(c);
                }
                break;
            }
        }
Example #5
0
 /// <summary>
 /// Tests whether <paramref name="c"/> is treated as a digit.
 /// Below 0x660 only 0x30..0x39 qualify; from 0x660 upwards the CE32 of
 /// <paramref name="c"/> must carry <c>Collation.DIGIT_TAG</c>.
 /// </summary>
 /// <param name="c">The code point to test.</param>
 /// <returns><c>true</c> if <paramref name="c"/> is a digit by the rule above.</returns>
 internal bool IsDigit(int c)
 {
     if (c >= 0x660)
     {
         // Decided by the tag of the code point's CE32.
         return Collation.HasCE32Tag(GetCE32(c), Collation.DIGIT_TAG);
     }
     // Fast path without a CE32 lookup.
     return 0x30 <= c && c <= 0x39;
 }
Example #6
0
        /// <summary>
        /// Returns the single CE that <paramref name="c"/> maps to.
        /// </summary>
        /// <param name="c">The code point to look up.</param>
        /// <returns>The one collation element for <paramref name="c"/>.</returns>
        /// <exception cref="NotSupportedException">
        /// <paramref name="c"/> does not map to exactly one CE (contextual, Hangul, lead-surrogate,
        /// Latin-expansion or builder-data mappings, or an expansion whose length is not 1).
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// The CE32 carries <c>Collation.FALLBACK_TAG</c> or <c>Collation.RESERVED_TAG_3</c>.
        /// </exception>
        internal long GetSingleCE(int c)
        {
            int           ce32   = GetCE32(c);
            CollationData source = this;

            // A fallback CE32 redirects the lookup (and all later table reads) to the base data.
            if (ce32 == Collation.FALLBACK_CE32)
            {
                source = base_;
                ce32   = base_.GetCE32(c);
            }

            for (;;)
            {
                if (!Collation.IsSpecialCE32(ce32))
                {
                    return Collation.CeFromSimpleCE32(ce32);
                }

                switch (Collation.TagFromCE32(ce32))
                {
                // Tags that resolve directly to a CE.
                case Collation.LONG_PRIMARY_TAG:
                    return Collation.CeFromLongPrimaryCE32(ce32);

                case Collation.LONG_SECONDARY_TAG:
                    return Collation.CeFromLongSecondaryCE32(ce32);

                case Collation.OFFSET_TAG:
                    return source.GetCEFromOffsetCE32(c, ce32);

                case Collation.IMPLICIT_TAG:
                    return Collation.UnassignedCEFromCodePoint(c);

                // Expansions are acceptable only when they hold exactly one element.
                case Collation.EXPANSION32_TAG:
                    if (Collation.LengthFromCE32(ce32) != 1)
                    {
                        throw new NotSupportedException(string.Format(
                                                            "there is not exactly one collation element for U+{0:X4} (CE32 0x{1:x8})",
                                                            c, ce32));
                    }
                    // Re-examine the stored CE32 on the next loop iteration.
                    ce32 = source.ce32s[Collation.IndexFromCE32(ce32)];
                    break;

                case Collation.EXPANSION_TAG:
                    if (Collation.LengthFromCE32(ce32) != 1)
                    {
                        throw new NotSupportedException(string.Format(
                                                            "there is not exactly one collation element for U+{0:X4} (CE32 0x{1:x8})",
                                                            c, ce32));
                    }
                    return source.ces[Collation.IndexFromCE32(ce32)];

                // Tags that point at another CE32 which is then re-examined.
                case Collation.DIGIT_TAG:
                    // Fetch the non-numeric-collation CE32 and continue.
                    ce32 = source.ce32s[Collation.IndexFromCE32(ce32)];
                    break;

                case Collation.U0000_TAG:
                    Debug.Assert(c == 0);
                    // Fetch the normal ce32 for U+0000 and continue.
                    ce32 = source.ce32s[0];
                    break;

                // Tags that never yield exactly one CE.
                case Collation.LATIN_EXPANSION_TAG:
                case Collation.BUILDER_DATA_TAG:
                case Collation.PREFIX_TAG:
                case Collation.CONTRACTION_TAG:
                case Collation.HANGUL_TAG:
                case Collation.LEAD_SURROGATE_TAG:
                    throw new NotSupportedException(string.Format(
                                                        "there is not exactly one collation element for U+{0:X4} (CE32 0x{1:x8})",
                                                        c, ce32));

                // Tags that are not expected at this point.
                case Collation.FALLBACK_TAG:
                case Collation.RESERVED_TAG_3:
                    throw new InvalidOperationException(string.Format(
                                                            "unexpected CE32 tag for U+{0:X4} (CE32 0x{1:x8})", c, ce32));
                }
            }
        }
Example #7
0
        /// <summary>
        /// Computes a CE from <paramref name="c"/>'s <paramref name="ce32"/> which has the <see cref="Collation.OFFSET_TAG"/>.
        /// </summary>
        /// <param name="c">The code point the CE is computed for.</param>
        /// <param name="ce32">The CE32 whose index selects the offset data entry in <c>ces</c>.</param>
        /// <returns>The CE built by <c>Collation.MakeCE</c> from the three-byte primary
        /// derived from <paramref name="c"/> and that offset data entry.</returns>
        internal long GetCEFromOffsetCE32(int c, int ce32)
        {
            int  offsetIndex = Collation.IndexFromCE32(ce32);
            long offsetData  = ces[offsetIndex];

            return Collation.MakeCE(
                Collation.GetThreeBytePrimaryForOffsetData(c, offsetData));
        }
Example #8
0
        /// <summary>
        /// Processes the mapping <paramref name="ce32"/> shared by the code point range
        /// [<paramref name="start"/>..<paramref name="end"/>]: reports its CE(s) to <c>sink</c>
        /// (when <c>sink</c> is not null) and records expansions via <c>AddExpansions</c>.
        /// </summary>
        /// <remarks>
        /// The method loops because some tags (<c>DIGIT_TAG</c>, <c>U0000_TAG</c>) only redirect to
        /// another CE32 stored in <c>data.ce32s</c>, which is then dispatched again. All other
        /// tags return from the method. Prefix and contraction tags are delegated to
        /// <c>HandlePrefixes</c> and <c>HandleContractions</c>.
        /// </remarks>
        /// <param name="start">First code point of the range.</param>
        /// <param name="end">Last code point of the range (inclusive, as used by the Hangul loop).</param>
        /// <param name="ce32">The CE32 value for the range.</param>
        /// <exception cref="InvalidOperationException">
        /// The CE32 carries <c>RESERVED_TAG_3</c>, <c>BUILDER_DATA_TAG</c> or <c>LEAD_SURROGATE_TAG</c>.
        /// </exception>
        private void HandleCE32(int start, int end, int ce32)
        {
            for (; ;)
            {
                if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE)
                {
                    // !isSpecialCE32()
                    // A simple CE32 converts directly to one CE.
                    if (sink != null)
                    {
                        sink.HandleCE(Collation.CeFromSimpleCE32(ce32));
                    }
                    return;
                }
                switch (Collation.TagFromCE32(ce32))
                {
                case Collation.FALLBACK_TAG:
                    // Nothing is reported for fallback mappings here.
                    return;

                case Collation.RESERVED_TAG_3:
                case Collation.BUILDER_DATA_TAG:
                case Collation.LEAD_SURROGATE_TAG:
                    // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
                    throw new InvalidOperationException(
                              string.Format("Unexpected CE32 tag type {0} for ce32=0x{1:x8}",
                                            Collation.TagFromCE32(ce32), ce32));

                case Collation.LONG_PRIMARY_TAG:
                    if (sink != null)
                    {
                        sink.HandleCE(Collation.CeFromLongPrimaryCE32(ce32));
                    }
                    return;

                case Collation.LONG_SECONDARY_TAG:
                    if (sink != null)
                    {
                        sink.HandleCE(Collation.CeFromLongSecondaryCE32(ce32));
                    }
                    return;

                case Collation.LATIN_EXPANSION_TAG:
                    if (sink != null)
                    {
                        // Both CEs are unpacked from the CE32 itself into the scratch array.
                        ces[0] = Collation.LatinCE0FromCE32(ce32);
                        ces[1] = Collation.LatinCE1FromCE32(ce32);
                        sink.HandleExpansion(ces, 0, 2);
                    }
                    // Optimization: If we have a prefix,
                    // then the relevant strings have been added already.
                    if (unreversedPrefix.Length == 0)
                    {
                        AddExpansions(start, end);
                    }
                    return;

                case Collation.EXPANSION32_TAG:
                    if (sink != null)
                    {
                        int idx    = Collation.IndexFromCE32(ce32);
                        int length = Collation.LengthFromCE32(ce32);
                        // Convert each stored CE32 to a CE in the scratch array before reporting.
                        for (int i = 0; i < length; ++i)
                        {
                            ces[i] = Collation.CeFromCE32(data.ce32s[idx + i]);
                        }
                        sink.HandleExpansion(ces, 0, length);
                    }
                    // Optimization: If we have a prefix,
                    // then the relevant strings have been added already.
                    if (unreversedPrefix.Length == 0)
                    {
                        AddExpansions(start, end);
                    }
                    return;

                case Collation.EXPANSION_TAG:
                    if (sink != null)
                    {
                        int idx    = Collation.IndexFromCE32(ce32);
                        int length = Collation.LengthFromCE32(ce32);
                        // The CEs are reported straight from data.ces without copying.
                        sink.HandleExpansion(data.ces, idx, length);
                    }
                    // Optimization: If we have a prefix,
                    // then the relevant strings have been added already.
                    if (unreversedPrefix.Length == 0)
                    {
                        AddExpansions(start, end);
                    }
                    return;

                case Collation.PREFIX_TAG:
                    HandlePrefixes(start, end, ce32);
                    return;

                case Collation.CONTRACTION_TAG:
                    HandleContractions(start, end, ce32);
                    return;

                case Collation.DIGIT_TAG:
                    // Fetch the non-numeric-collation CE32 and continue.
                    ce32 = data.ce32s[Collation.IndexFromCE32(ce32)];
                    break;

                case Collation.U0000_TAG:
                    Debug.Assert(start == 0 && end == 0);
                    // Fetch the normal ce32 for U+0000 and continue.
                    ce32 = data.ce32s[0];
                    break;

                case Collation.HANGUL_TAG:
                    if (sink != null)
                    {
                        // TODO: This should be optimized,
                        // especially if [start..end] is the complete Hangul range. (assert that)
                        // Each code point in the range is run through a collation iterator
                        // and the fetched CEs are reported as one expansion.
                        UTF16CollationIterator    iter   = new UTF16CollationIterator(data);
                        StringBuilderCharSequence hangul = new StringBuilderCharSequence(new StringBuilder(1));
                        for (int c = start; c <= end; ++c)
                        {
                            // Reuse the same buffer: clear it, then append the current code point.
                            hangul.StringBuilder.Length = 0;
                            hangul.StringBuilder.AppendCodePoint(c);
                            iter.SetText(false, hangul, 0);
                            int length = iter.FetchCEs();
                            // Ignore the terminating non-CE.
                            Debug.Assert(length >= 2 && iter.GetCE(length - 1) == Collation.NO_CE);
                            sink.HandleExpansion(iter.GetCEs(), 0, length - 1);
                        }
                    }
                    // Optimization: If we have a prefix,
                    // then the relevant strings have been added already.
                    if (unreversedPrefix.Length == 0)
                    {
                        AddExpansions(start, end);
                    }
                    return;

                case Collation.OFFSET_TAG:
                    // Currently no need to send offset CEs to the sink.
                    return;

                case Collation.IMPLICIT_TAG:
                    // Currently no need to send implicit CEs to the sink.
                    return;
                }
            }
        }