/// <summary>
/// Looks up the primary weight that comes directly before <paramref name="p"/>.
/// <paramref name="p"/> must be greater than the first root primary.
/// </summary>
/// <param name="p">The primary weight whose predecessor is wanted.</param>
/// <param name="isCompressible">Forwarded unchanged to the <c>Collation</c> decrement helpers
/// when the predecessor has to be computed by stepping back inside a range.</param>
/// <returns>The primary weight before <paramref name="p"/>.</returns>
internal long GetPrimaryBefore(long p, bool isCompressible)
{
    int index = FindPrimary(p);
    long element = elements[index];
    int step;

    if (p != (element & 0xffffff00L))
    {
        // p sits inside a range, past its start: the step is stored on the range-end element.
        long rangeEnd = elements[index + 1];
        Debug.Assert(IsEndOfPrimaryRange(rangeEnd));
        step = (int)rangeEnd & PRIMARY_STEP_MASK;
    }
    else
    {
        // The table holds p itself. A non-zero step means p ends a preceding range.
        step = (int)element & PRIMARY_STEP_MASK;
        if (step == 0)
        {
            // Not the end of a range: walk backwards over secondary/tertiary deltas
            // until the previous primary element is reached.
            long previous;
            do
            {
                previous = elements[--index];
            }
            while ((previous & SEC_TER_DELTA_FLAG) != 0);

            return previous & 0xffffff00L;
        }
    }

    // Step back by one range step; the helper depends on whether the low 16 bits of p are zero.
    return (p & 0xffff) == 0
        ? Collation.DecTwoBytePrimaryByOneStep(p, isCompressible, step)
        : Collation.DecThreeBytePrimaryByOneStep(p, isCompressible, step);
}
/// <summary>
/// Records contraction entries for the contraction described by <paramref name="ce32"/>
/// and sets <c>ce0</c>/<c>ce1</c> to a contraction marker that refers to the first recorded entry.
/// </summary>
/// <param name="data">The collation data that owns the contraction contexts.</param>
/// <param name="ce32">A contraction CE32 (not a prefix mapping).</param>
/// <returns>Always <c>true</c>.</returns>
private bool GetCEsFromContractionCE32(CollationData data, int ce32)
{
    int contextIndex = Collation.IndexFromCE32(ce32);

    // Default mapping, used when no suffix matches.
    // Since the original ce32 is not a prefix mapping,
    // the default ce32 must not be another contraction.
    ce32 = data.GetCE32FromContexts(contextIndex);
    Debug.Assert(!Collation.IsContractionCE32(ce32));

    int contractionIndex = contractionCEs.Count;
    bool defaultIsEncodable = GetCEsFromCE32(data, Collation.SentinelCodePoint, ce32);
    if (!defaultIsEncodable)
    {
        // Bail out for c-without-contraction.
        AddContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NoCE, 0);
    }
    else
    {
        AddContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
    }

    // Handle an encodable contraction unless the next contraction is too long
    // and starts with the same character.
    int previousCharIndex = -1;
    bool entryPending = false;
    using (CharsTrieEnumerator suffixes = CharsTrie.GetEnumerator(data.contexts, contextIndex + 2, 0))
    {
        while (suffixes.MoveNext())
        {
            CharsTrieEntry entry = suffixes.Current;
            ICharSequence suffix = entry.Chars;
            int charIndex = CollationFastLatin.GetCharIndex(suffix[0]);
            if (charIndex < 0)
            {
                // Ignore anything but fast Latin text.
                continue;
            }

            if (charIndex != previousCharIndex)
            {
                // A new first character: flush the pending entry before ce0/ce1 are overwritten.
                if (entryPending)
                {
                    AddContractionEntry(previousCharIndex, ce0, ce1);
                }

                ce32 = entry.Value;
                entryPending = suffix.Length == 1
                    && GetCEsFromCE32(data, Collation.SentinelCodePoint, ce32);
                if (!entryPending)
                {
                    AddContractionEntry(charIndex, Collation.NoCE, 0);
                }
                previousCharIndex = charIndex;
            }
            else if (entryPending)
            {
                // Bail out for all contractions starting with this character.
                AddContractionEntry(charIndex, Collation.NoCE, 0);
                entryPending = false;
            }
        }
    }

    if (entryPending)
    {
        AddContractionEntry(previousCharIndex, ce0, ce1);
    }

    // Note: There might not be any fast Latin contractions, but
    // we need to enter contraction handling anyway so that we can bail out
    // when there is a non-fast-Latin character following.
    // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
    // following umlaut and bail out, rather than return the difference of Y vs. u.
    ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | (uint)contractionIndex;
    ce1 = 0;
    return true;
}
/// <summary>
/// Resolves <paramref name="ce32"/> into at most two CEs, stored in the <c>ce0</c> and <c>ce1</c>
/// fields, and reports whether the resulting mapping passes all of the checks below.
/// </summary>
/// <param name="data">The collation data used to resolve <paramref name="ce32"/>.</param>
/// <param name="c">The code point being mapped; asserted to be non-negative for contraction
/// and offset CE32s.</param>
/// <param name="ce32">The CE32 to resolve.</param>
/// <returns>
/// <c>true</c> if the mapping was resolved into <c>ce0</c>/<c>ce1</c> and passed every check
/// (or was delegated to <c>GetCEsFromContractionCE32</c>); otherwise <c>false</c>.
/// </returns>
private bool GetCEsFromCE32(CollationData data, int c, int ce32)
{
    ce32 = data.GetFinalCE32(ce32);
    ce1 = 0;
    if (Collation.IsSimpleOrLongCE32(ce32))
    {
        ce0 = Collation.CeFromCE32(ce32);
    }
    else
    {
        switch (Collation.TagFromCE32(ce32))
        {
            case Collation.LATIN_EXPANSION_TAG:
                ce0 = Collation.LatinCE0FromCE32(ce32);
                ce1 = Collation.LatinCE1FromCE32(ce32);
                break;
            case Collation.EXPANSION32_TAG:
                {
                    int index = Collation.IndexFromCE32(ce32);
                    int length = Collation.LengthFromCE32(ce32);
                    if (length <= 2)
                    {
                        ce0 = Collation.CeFromCE32(data.ce32s[index]);
                        if (length == 2)
                        {
                            ce1 = Collation.CeFromCE32(data.ce32s[index + 1]);
                        }
                        break;
                    }
                    else
                    {
                        // Expansions longer than two CEs are not supported here.
                        return false;
                    }
                }
            case Collation.EXPANSION_TAG:
                {
                    int index = Collation.IndexFromCE32(ce32);
                    int length = Collation.LengthFromCE32(ce32);
                    if (length <= 2)
                    {
                        ce0 = data.ces[index];
                        if (length == 2)
                        {
                            ce1 = data.ces[index + 1];
                        }
                        break;
                    }
                    else
                    {
                        // Expansions longer than two CEs are not supported here.
                        return false;
                    }
                }
            // Note: We could support PREFIX_TAG (assert c>=0)
            // by recursing on its default CE32 and checking that none of the prefixes starts
            // with a fast Latin character.
            // However, currently (2013) there are only the L-before-middle-dot
            // prefix mappings in the Latin range, and those would be rejected anyway.
            case Collation.CONTRACTION_TAG:
                Debug.Assert(c >= 0);
                return GetCEsFromContractionCE32(data, ce32);
            case Collation.OFFSET_TAG:
                Debug.Assert(c >= 0);
                ce0 = data.GetCEFromOffsetCE32(c, ce32);
                break;
            default:
                return false;
        }
    }

    // A mapping can be completely ignorable.
    if (ce0 == 0)
    {
        return ce1 == 0;
    }

    // We do not support an ignorable ce0 unless it is completely ignorable.
    long p0 = ce0.TripleShift(32);
    if (p0 == 0)
    {
        return false;
    }

    // We only support primaries up to the Latin script.
    if (p0 > lastLatinPrimary)
    {
        return false;
    }

    // We support non-common secondary and case weights only together with short primaries.
    int lower32_0 = (int)ce0;
    if (p0 < firstShortPrimary)
    {
        int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK;
        if (sc0 != Collation.COMMON_SECONDARY_CE)
        {
            return false;
        }
    }

    // No below-common tertiary weights.
    if ((lower32_0 & Collation.OnlyTertiaryMask) < Collation.CommonWeight16)
    {
        return false;
    }

    if (ce1 != 0)
    {
        // Both primaries must be in the same group,
        // or both must get short mini primaries,
        // or a short-primary CE is followed by a secondary CE.
        // This is so that we can test the first primary and use the same mask for both,
        // and determine for both whether they are variable.
        long p1 = ce1.TripleShift(32);
        if (p1 == 0 ? p0 < firstShortPrimary : !InSameGroup(p0, p1))
        {
            return false;
        }

        int lower32_1 = (int)ce1;

        // No tertiary CEs.
        if (lower32_1.TripleShift(16) == 0)
        {
            return false;
        }

        // We support non-common secondary and case weights
        // only for secondary CEs or together with short primaries.
        if (p1 != 0 && p1 < firstShortPrimary)
        {
            int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK;
            if (sc1 != Collation.COMMON_SECONDARY_CE)
            {
                return false;
            }
        }

        // No below-common tertiary weights in the second CE either.
        // This must test lower32_1: lower32_0 was already validated above, so re-testing it
        // here would be a no-op and would leave ce1's tertiary weight unchecked.
        if ((lower32_1 & Collation.OnlyTertiaryMask) < Collation.CommonWeight16)
        {
            return false;
        }
    }

    // No quaternary weights.
    if (((ce0 | ce1) & Collation.QuaternaryMask) != 0)
    {
        return false;
    }

    return true;
}
/// <summary>
/// Compares the mapping of <paramref name="c"/> in <c>data</c> with its mapping in
/// <c>baseData</c> and calls <c>Add(c)</c> when the two are found to differ.
/// Prefix and contraction contexts are handed to the corresponding compare/add helpers first.
/// </summary>
/// <param name="c">The code point whose mappings are compared.</param>
/// <param name="ce32">The CE32 for <paramref name="c"/> from <c>data</c>.</param>
/// <param name="baseCE32">The CE32 for <paramref name="c"/> from <c>baseData</c>.</param>
private void Compare(int c, int ce32, int baseCE32)
{
    // Prefix contexts: unwrap to the default CE32 on whichever side has them.
    if (Collation.IsPrefixCE32(ce32))
    {
        int tailoredIndex = Collation.IndexFromCE32(ce32);
        ce32 = data.GetFinalCE32(data.GetCE32FromContexts(tailoredIndex));
        if (!Collation.IsPrefixCE32(baseCE32))
        {
            AddPrefixes(data, c, data.contexts, tailoredIndex + 2);
        }
        else
        {
            int rootIndex = Collation.IndexFromCE32(baseCE32);
            baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(rootIndex));
            ComparePrefixes(c, data.contexts, tailoredIndex + 2, baseData.contexts, rootIndex + 2);
        }
    }
    else if (Collation.IsPrefixCE32(baseCE32))
    {
        int rootIndex = Collation.IndexFromCE32(baseCE32);
        baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(rootIndex));
        AddPrefixes(baseData, c, baseData.contexts, rootIndex + 2);
    }

    // Contraction contexts: same unwrapping, one level further in.
    if (Collation.IsContractionCE32(ce32))
    {
        int tailoredIndex = Collation.IndexFromCE32(ce32);
        ce32 = (ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0
            ? Collation.NO_CE32
            : data.GetFinalCE32(data.GetCE32FromContexts(tailoredIndex));

        if (!Collation.IsContractionCE32(baseCE32))
        {
            AddContractions(c, data.contexts, tailoredIndex + 2);
        }
        else
        {
            int rootIndex = Collation.IndexFromCE32(baseCE32);
            baseCE32 = (baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0
                ? Collation.NO_CE32
                : baseData.GetFinalCE32(baseData.GetCE32FromContexts(rootIndex));
            CompareContractions(c, data.contexts, tailoredIndex + 2, baseData.contexts, rootIndex + 2);
        }
    }
    else if (Collation.IsContractionCE32(baseCE32))
    {
        int rootIndex = Collation.IndexFromCE32(baseCE32);
        baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(rootIndex));
        AddContractions(c, baseData.contexts, rootIndex + 2);
    }

    // Tags of the remaining, non-contextual mappings; -1 stands for "not a special CE32".
    int tag = -1;
    if (Collation.IsSpecialCE32(ce32))
    {
        tag = Collation.TagFromCE32(ce32);
        Debug.Assert(tag != Collation.PREFIX_TAG);
        Debug.Assert(tag != Collation.CONTRACTION_TAG);
        // Currently, the tailoring data builder does not write offset tags.
        // They might be useful for saving space,
        // but they would complicate the builder,
        // and in tailorings we assume that performance of tailored characters is more important.
        Debug.Assert(tag != Collation.OFFSET_TAG);
    }

    int baseTag = -1;
    if (Collation.IsSpecialCE32(baseCE32))
    {
        baseTag = Collation.TagFromCE32(baseCE32);
        Debug.Assert(baseTag != Collation.PREFIX_TAG);
        Debug.Assert(baseTag != Collation.CONTRACTION_TAG);
    }

    // Non-contextual mappings, expansions, etc.
    if (baseTag == Collation.OFFSET_TAG)
    {
        // We might be comparing a tailoring CE which is a copy of
        // a base offset-tag CE, via the [optimize [set]] syntax
        // or when a single-character mapping was copied for tailored contractions.
        // Offset tags always result in long-primary CEs,
        // with common secondary/tertiary weights.
        if (!Collation.IsLongPrimaryCE32(ce32))
        {
            Add(c);
            return;
        }

        long dataCE = baseData.ces[Collation.IndexFromCE32(baseCE32)];
        long p = Collation.GetThreeBytePrimaryForOffsetData(c, dataCE);
        if (Collation.PrimaryFromLongPrimaryCE32(ce32) != p)
        {
            Add(c);
            return;
        }
    }

    if (tag != baseTag)
    {
        Add(c);
        return;
    }

    switch (tag)
    {
        case Collation.EXPANSION32_TAG:
            {
                int length = Collation.LengthFromCE32(ce32);
                int baseLength = Collation.LengthFromCE32(baseCE32);
                if (length != baseLength)
                {
                    Add(c);
                    return;
                }

                int tailoredStart = Collation.IndexFromCE32(ce32);
                int baseStart = Collation.IndexFromCE32(baseCE32);
                for (int i = 0; i < length; ++i)
                {
                    if (data.ce32s[tailoredStart + i] != baseData.ce32s[baseStart + i])
                    {
                        // First differing element is enough; nothing else follows.
                        Add(c);
                        return;
                    }
                }
                break;
            }
        case Collation.EXPANSION_TAG:
            {
                int length = Collation.LengthFromCE32(ce32);
                int baseLength = Collation.LengthFromCE32(baseCE32);
                if (length != baseLength)
                {
                    Add(c);
                    return;
                }

                int tailoredStart = Collation.IndexFromCE32(ce32);
                int baseStart = Collation.IndexFromCE32(baseCE32);
                for (int i = 0; i < length; ++i)
                {
                    if (data.ces[tailoredStart + i] != baseData.ces[baseStart + i])
                    {
                        // First differing element is enough; nothing else follows.
                        Add(c);
                        return;
                    }
                }
                break;
            }
        case Collation.HANGUL_TAG:
            {
                // The syllable counts as tailored if any of its two or three jamos is tailored.
                StringBuilder jamos = new StringBuilder();
                int length = Hangul.Decompose(c, jamos);
                bool anyJamoTailored = tailored.Contains(jamos[0])
                    || tailored.Contains(jamos[1])
                    || (length == 3 && tailored.Contains(jamos[2]));
                if (anyJamoTailored)
                {
                    Add(c);
                }
                break;
            }
        default:
            if (ce32 != baseCE32)
            {
                Add(c);
            }
            break;
    }
}
/// <summary>
/// Tells whether <paramref name="c"/> is treated as a digit.
/// Code points below U+0660 are checked against the range 0x30..0x39 directly;
/// all others are checked for a CE32 carrying <see cref="Collation.DIGIT_TAG"/>.
/// </summary>
/// <param name="c">The code point to test.</param>
/// <returns><c>true</c> if <paramref name="c"/> is a digit by the rules above.</returns>
internal bool IsDigit(int c)
{
    if (c >= 0x660)
    {
        return Collation.HasCE32Tag(GetCE32(c), Collation.DIGIT_TAG);
    }

    return 0x30 <= c && c <= 0x39;
}
/// <summary>
/// Returns the single CE that <paramref name="c"/> maps to.
/// When this data's CE32 for <paramref name="c"/> is the fallback CE32,
/// the lookup is redirected to <c>base_</c>.
/// </summary>
/// <param name="c">The code point to look up.</param>
/// <returns>The one collation element for <paramref name="c"/>.</returns>
/// <exception cref="NotSupportedException">
/// <paramref name="c"/> does not map to exactly one CE.
/// </exception>
/// <exception cref="InvalidOperationException">
/// The CE32 carries a fallback or reserved tag at a point where neither is expected.
/// </exception>
internal long GetSingleCE(int c)
{
    int ce32 = GetCE32(c);
    CollationData source = this;
    if (ce32 == Collation.FALLBACK_CE32)
    {
        source = base_;
        ce32 = base_.GetCE32(c);
    }

    // Keep resolving indirections until a non-special CE32 is left or a case returns/throws.
    while (Collation.IsSpecialCE32(ce32))
    {
        switch (Collation.TagFromCE32(ce32))
        {
            case Collation.LATIN_EXPANSION_TAG:
            case Collation.BUILDER_DATA_TAG:
            case Collation.PREFIX_TAG:
            case Collation.CONTRACTION_TAG:
            case Collation.HANGUL_TAG:
            case Collation.LEAD_SURROGATE_TAG:
                throw new NotSupportedException(string.Format(
                    "there is not exactly one collation element for U+{0:X4} (CE32 0x{1:x8})",
                    c, ce32));

            case Collation.FALLBACK_TAG:
            case Collation.RESERVED_TAG_3:
                throw new InvalidOperationException(string.Format(
                    "unexpected CE32 tag for U+{0:X4} (CE32 0x{1:x8})",
                    c, ce32));

            case Collation.LONG_PRIMARY_TAG:
                return Collation.CeFromLongPrimaryCE32(ce32);

            case Collation.LONG_SECONDARY_TAG:
                return Collation.CeFromLongSecondaryCE32(ce32);

            case Collation.EXPANSION32_TAG:
                if (Collation.LengthFromCE32(ce32) != 1)
                {
                    throw new NotSupportedException(string.Format(
                        "there is not exactly one collation element for U+{0:X4} (CE32 0x{1:x8})",
                        c, ce32));
                }
                // A one-element expansion: continue with the stored CE32.
                ce32 = source.ce32s[Collation.IndexFromCE32(ce32)];
                break;

            case Collation.EXPANSION_TAG:
                if (Collation.LengthFromCE32(ce32) != 1)
                {
                    throw new NotSupportedException(string.Format(
                        "there is not exactly one collation element for U+{0:X4} (CE32 0x{1:x8})",
                        c, ce32));
                }
                return source.ces[Collation.IndexFromCE32(ce32)];

            case Collation.DIGIT_TAG:
                // Fetch the non-numeric-collation CE32 and continue.
                ce32 = source.ce32s[Collation.IndexFromCE32(ce32)];
                break;

            case Collation.U0000_TAG:
                Debug.Assert(c == 0);
                // Fetch the normal ce32 for U+0000 and continue.
                ce32 = source.ce32s[0];
                break;

            case Collation.OFFSET_TAG:
                return source.GetCEFromOffsetCE32(c, ce32);

            case Collation.IMPLICIT_TAG:
                return Collation.UnassignedCEFromCodePoint(c);
        }
    }

    return Collation.CeFromSimpleCE32(ce32);
}
/// <summary>
/// Builds the CE for <paramref name="c"/> from a <paramref name="ce32"/>
/// that carries the <see cref="Collation.OFFSET_TAG"/>.
/// </summary>
/// <param name="c">The code point being mapped.</param>
/// <param name="ce32">The offset-tagged CE32; its index selects the entry in <c>ces</c>.</param>
/// <returns>The CE made from the three-byte primary computed for <paramref name="c"/>.</returns>
internal long GetCEFromOffsetCE32(int c, int ce32)
{
    int index = Collation.IndexFromCE32(ce32);
    long dataCE = ces[index];
    var primary = Collation.GetThreeBytePrimaryForOffsetData(c, dataCE);
    return Collation.MakeCE(primary);
}
/// <summary>
/// Processes the CE32 shared by the code point range
/// [<paramref name="start"/>..<paramref name="end"/>]: reports CEs and expansions to
/// <c>sink</c> (when one is set), records expansions, and dispatches prefix and contraction
/// CE32s to their handlers. Digit and U+0000 indirections are resolved in a loop.
/// </summary>
/// <param name="start">First code point of the range.</param>
/// <param name="end">Last code point of the range (inclusive).</param>
/// <param name="ce32">The CE32 that applies to the range.</param>
/// <exception cref="InvalidOperationException">
/// The CE32 carries a reserved, builder-data, or lead-surrogate tag.
/// </exception>
private void HandleCE32(int start, int end, int ce32)
{
    while (true)
    {
        if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE)
        {
            // !isSpecialCE32()
            if (sink != null)
            {
                sink.HandleCE(Collation.CeFromSimpleCE32(ce32));
            }
            return;
        }

        // Cases that fall out of the switch via "break" are expansions and share the tail below;
        // "continue" re-runs the loop with the current ce32.
        switch (Collation.TagFromCE32(ce32))
        {
            case Collation.FALLBACK_TAG:
                return;

            case Collation.RESERVED_TAG_3:
            case Collation.BUILDER_DATA_TAG:
            case Collation.LEAD_SURROGATE_TAG:
                // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
                throw new InvalidOperationException(
                    string.Format("Unexpected CE32 tag type {0} for ce32=0x{1:x8}",
                        Collation.TagFromCE32(ce32), ce32));

            case Collation.LONG_PRIMARY_TAG:
                if (sink != null)
                {
                    sink.HandleCE(Collation.CeFromLongPrimaryCE32(ce32));
                }
                return;

            case Collation.LONG_SECONDARY_TAG:
                if (sink != null)
                {
                    sink.HandleCE(Collation.CeFromLongSecondaryCE32(ce32));
                }
                return;

            case Collation.LATIN_EXPANSION_TAG:
                if (sink != null)
                {
                    ces[0] = Collation.LatinCE0FromCE32(ce32);
                    ces[1] = Collation.LatinCE1FromCE32(ce32);
                    sink.HandleExpansion(ces, 0, 2);
                }
                break;

            case Collation.EXPANSION32_TAG:
                if (sink != null)
                {
                    int first = Collation.IndexFromCE32(ce32);
                    int count = Collation.LengthFromCE32(ce32);
                    for (int i = 0; i < count; ++i)
                    {
                        ces[i] = Collation.CeFromCE32(data.ce32s[first + i]);
                    }
                    sink.HandleExpansion(ces, 0, count);
                }
                break;

            case Collation.EXPANSION_TAG:
                if (sink != null)
                {
                    int first = Collation.IndexFromCE32(ce32);
                    int count = Collation.LengthFromCE32(ce32);
                    sink.HandleExpansion(data.ces, first, count);
                }
                break;

            case Collation.PREFIX_TAG:
                HandlePrefixes(start, end, ce32);
                return;

            case Collation.CONTRACTION_TAG:
                HandleContractions(start, end, ce32);
                return;

            case Collation.DIGIT_TAG:
                // Fetch the non-numeric-collation CE32 and continue.
                ce32 = data.ce32s[Collation.IndexFromCE32(ce32)];
                continue;

            case Collation.U0000_TAG:
                Debug.Assert(start == 0 && end == 0);
                // Fetch the normal ce32 for U+0000 and continue.
                ce32 = data.ce32s[0];
                continue;

            case Collation.HANGUL_TAG:
                if (sink != null)
                {
                    // TODO: This should be optimized,
                    // especially if [start..end] is the complete Hangul range. (assert that)
                    UTF16CollationIterator iter = new UTF16CollationIterator(data);
                    StringBuilderCharSequence hangul = new StringBuilderCharSequence(new StringBuilder(1));
                    for (int syllable = start; syllable <= end; ++syllable)
                    {
                        hangul.StringBuilder.Length = 0;
                        hangul.StringBuilder.AppendCodePoint(syllable);
                        iter.SetText(false, hangul, 0);
                        int fetched = iter.FetchCEs();
                        // Ignore the terminating non-CE.
                        Debug.Assert(fetched >= 2 && iter.GetCE(fetched - 1) == Collation.NO_CE);
                        sink.HandleExpansion(iter.GetCEs(), 0, fetched - 1);
                    }
                }
                break;

            case Collation.OFFSET_TAG:
                // Currently no need to send offset CEs to the sink.
                return;

            case Collation.IMPLICIT_TAG:
                // Currently no need to send implicit CEs to the sink.
                return;

            default:
                // No case matched: go around again with the same ce32.
                continue;
        }

        // Shared tail for the expansion-like cases.
        // Optimization: If we have a prefix,
        // then the relevant strings have been added already.
        if (unreversedPrefix.Length == 0)
        {
            AddExpansions(start, end);
        }
        return;
    }
}