/// <summary>
/// Enumerates the code point ranges of <paramref name="d"/> and, when it has base data,
/// also those of the base, handing each range to <c>EnumCnERange</c>.
/// </summary>
/// <param name="d">The collation data to walk; may or may not have a <c>Base</c>.</param>
public void ForData(CollationData d)
{
    // First pass: everything from the data itself (tailoring or base).
    if (d.Base != null)
    {
        checkTailored = -1;
    }
    data = d;
    foreach (Trie2.Range entry in data.trie)
    {
        // Enumeration ends at the first lead-surrogate range.
        if (entry.LeadSurrogate)
        {
            break;
        }
        EnumCnERange(entry.StartCodePoint, entry.EndCodePoint, entry.Value, this);
    }

    if (d.Base != null)
    {
        // Second pass: the base data, restricted to code points that were not tailored.
        tailored.Freeze();
        checkTailored = 1;
        data = d.Base;
        foreach (Trie2.Range entry in data.trie)
        {
            if (entry.LeadSurrogate)
            {
                break;
            }
            EnumCnERange(entry.StartCodePoint, entry.EndCodePoint, entry.Value, this);
        }
    }
}
/// <summary>
/// Writes the first header unit plus one placeholder per special group into <c>result</c>,
/// and caches the group boundary primaries used by the rest of the builder.
/// </summary>
/// <param name="data">Collation data queried for the group primaries.</param>
/// <returns><c>false</c> when a required primary is reported as 0 (missing data); otherwise <c>true</c>.</returns>
private bool LoadGroups(CollationData data)
{
    headerLength = 1 + NUM_SPECIAL_GROUPS;
    char headerUnit = (char)((CollationFastLatin.Version << 8) | headerLength);
    result.Append(headerUnit);

    // The leading reordering groups are expected to be the special groups
    // (space, punct, ..., digit), followed by Latn, then Grek and other scripts.
    for (int group = 0; group < NUM_SPECIAL_GROUPS; ++group)
    {
        lastSpecialPrimaries[group] = data.GetLastPrimaryForGroup(ReorderCodes.First + group);
        if (lastSpecialPrimaries[group] == 0)
        {
            // missing data
            return false;
        }
        // ICU4N: a char (not an int) is appended so the ambient culture has no effect.
        // Reserves one slot for this group.
        result.Append('0');
    }

    firstDigitPrimary = data.GetFirstPrimaryForGroup(ReorderCodes.Digit);
    firstLatinPrimary = data.GetFirstPrimaryForGroup(UScript.Latin);
    lastLatinPrimary = data.GetLastPrimaryForGroup(UScript.Latin);

    // A zero first primary for digits or Latin means missing data.
    return firstDigitPrimary != 0 && firstLatinPrimary != 0;
}
/// <summary>
/// Starts the fast Latin header in <c>result</c> (version/length unit followed by one
/// reserved slot per special group) and records the digit and Latin boundary primaries.
/// </summary>
/// <param name="data">Collation data that supplies the first/last primaries per group.</param>
/// <returns><c>true</c> if all needed primaries were non-zero; <c>false</c> on missing data.</returns>
private bool LoadGroups(CollationData data)
{
    headerLength = 1 + NUM_SPECIAL_GROUPS;
    int versionAndLength = (CollationFastLatin.VERSION << 8) | headerLength;
    result.Append((char)versionAndLength);

    // The first few reordering groups should be special groups
    // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
    int slot = 0;
    while (slot < NUM_SPECIAL_GROUPS)
    {
        lastSpecialPrimaries[slot] = data.GetLastPrimaryForGroup(ReorderCodes.First + slot);
        if (lastSpecialPrimaries[slot] == 0)
        {
            return false; // missing data
        }
        // ICU4N TODO: Check this (not sure about char data type)
        result.Append((char)0); // reserve a slot for this group
        ++slot;
    }

    firstDigitPrimary = data.GetFirstPrimaryForGroup(ReorderCodes.Digit);
    firstLatinPrimary = data.GetFirstPrimaryForGroup(UScript.Latin);
    lastLatinPrimary = data.GetLastPrimaryForGroup(UScript.Latin);

    if (firstDigitPrimary == 0)
    {
        return false; // missing data
    }
    return firstLatinPrimary != 0;
}
/// <summary>
/// Creates the iterator over <paramref name="ui"/>, starting in the <c>IterCheckFwd</c> state.
/// </summary>
/// <param name="data">Collation data; also the source of the normalizer implementation.</param>
/// <param name="numeric">Forwarded to the base constructor.</param>
/// <param name="ui">Character iterator, forwarded to the base constructor.</param>
/// <param name="startIndex">Value stored as the iterator's <c>start</c>.</param>
public FCDIterCollationIterator(CollationData data, bool numeric, UCharacterIterator ui, int startIndex)
    : base(data, numeric, ui)
{
    this.nfcImpl = data.nfcImpl;
    this.start = startIndex;
    this.state = State.IterCheckFwd;
}
/// <summary>
/// Creates an iterator over the whole of <paramref name="s"/>, positioned at <paramref name="p"/>.
/// </summary>
/// <param name="d">Collation data, forwarded to the base constructor.</param>
/// <param name="numeric">Forwarded to the base constructor.</param>
/// <param name="s">The text; its full length becomes the limit.</param>
/// <param name="p">Initial position within <paramref name="s"/>.</param>
public UTF16CollationIterator(CollationData d, bool numeric, ICharSequence s, int p)
    : base(d, numeric)
{
    this.seq = s;
    this.limit = s.Length;
    this.start = 0;
    this.pos = p;
}
/// <summary>
/// Makes <c>data</c> refer to <c>ownedData</c>, creating <c>ownedData</c> from the
/// NFC normalizer implementation on first use.
/// </summary>
internal void EnsureOwnedData()
{
    if (ownedData != null)
    {
        // Already created earlier; just point data at it.
        data = ownedData;
        return;
    }

    ownedData = new CollationData(Norm2AllModes.GetNFCInstance().Impl);
    data = ownedData;
}
/// <summary>
/// Creates the FCD-checking iterator over <paramref name="s"/>, beginning at <paramref name="p"/>
/// with the check direction set to 1.
/// </summary>
/// <param name="data">Collation data; also the source of the normalizer implementation.</param>
/// <param name="numeric">Forwarded to the base constructor.</param>
/// <param name="s">The raw text; kept as <c>rawSeq</c>, its length as <c>rawLimit</c>.</param>
/// <param name="p">Start position; stored as <c>segmentStart</c>.</param>
public FCDUTF16CollationIterator(CollationData data, bool numeric, ICharSequence s, int p)
    : base(data, numeric, s, p)
{
    this.nfcImpl = data.NfcImpl;
    this.rawSeq = s;
    this.rawLimit = s.Length;
    this.segmentStart = p;
    this.checkDir = 1;
}
/// <summary>
/// Walks every entry of the <c>CharsTrie</c> located at <paramref name="pidx"/> in
/// <paramref name="p"/> and passes it to <c>AddPrefix</c>.
/// </summary>
/// <param name="d">Collation data forwarded to <c>AddPrefix</c>.</param>
/// <param name="c">Code point forwarded to <c>AddPrefix</c>.</param>
/// <param name="p">Text holding the serialized trie (ICU4N: a string rather than ICharSequence).</param>
/// <param name="pidx">Index in <paramref name="p"/> where the trie starts.</param>
private void AddPrefixes(CollationData d, int c, string p, int pidx) // ICU4N specific - changed p from ICharSequence to string
{
    foreach (var prefixEntry in new CharsTrie(p, pidx))
    {
        AddPrefix(d, prefixEntry.Chars, c, prefixEntry.Value);
    }
}
/// <summary>
/// Adopts a precomputed reordering (<paramref name="table"/> plus the ranges stored after the
/// codes in <paramref name="codesAndRanges"/>) when it passes basic consistency checks;
/// otherwise falls back to <c>SetReordering(data, codes)</c> to regenerate it.
/// </summary>
/// <param name="data">Collation data, only used by the <c>SetReordering</c> fallback.</param>
/// <param name="codesAndRanges">The reorder codes followed by zero or more range entries.</param>
/// <param name="codesLength">Number of leading entries of <paramref name="codesAndRanges"/> that are codes.</param>
/// <param name="table">Lead byte table to alias, or <c>null</c> to force regeneration.</param>
internal void AliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table)
{
    int[] codes;
    if (codesLength == codesAndRanges.Length)
    {
        // No trailing ranges: the input array can be used as the codes array directly.
        codes = codesAndRanges;
    }
    else
    {
        // TODO: Java 6: Arrays.copyOf(codes, codesLength);
        // Copy just the leading codes into their own array.
        codes = new int[codesLength];
        System.Array.Copy(codesAndRanges, 0, codes, 0, codesLength);
    }
    int rangesStart = codesLength;
    int rangesLimit = codesAndRanges.Length;
    int rangesLength = rangesLimit - rangesStart;
    // The table is only usable if it is present and consistent with the ranges:
    // with no ranges it must not contain split bytes; with ranges there must be at least two.
    if (table != null &&
            (rangesLength == 0 ?
                    !ReorderTableHasSplitBytes(table) :
                    rangesLength >= 2 &&
                    // The first offset must be 0. The last offset must not be 0.
                    (codesAndRanges[rangesStart] & 0xffff) == 0 &&
                    (codesAndRanges[rangesLimit - 1] & 0xffff) != 0))
    {
        reorderTable = table;
        reorderCodes = codes;
        // Drop ranges before the first split byte. They are reordered by the table.
        // This then speeds up reordering of the remaining ranges.
        int firstSplitByteRangeIndex = rangesStart;
        while (firstSplitByteRangeIndex < rangesLimit &&
                (codesAndRanges[firstSplitByteRangeIndex] & 0xff0000) == 0)
        {
            // The second byte of the primary limit is 0.
            ++firstSplitByteRangeIndex;
        }
        if (firstSplitByteRangeIndex == rangesLimit)
        {
            // No range has a split byte, so no ranges are kept.
            Debug.Assert(!ReorderTableHasSplitBytes(table));
            minHighNoReorder = 0;
            reorderRanges = null;
        }
        else
        {
            // The table entry for the first split lead byte is expected to be 0.
            Debug.Assert(table[codesAndRanges[firstSplitByteRangeIndex].TripleShift(24)] == 0);
            // Upper 16 bits of the last range entry.
            minHighNoReorder = codesAndRanges[rangesLimit - 1] & 0xffff0000L;
            SetReorderRanges(codesAndRanges, firstSplitByteRangeIndex,
                    rangesLimit - firstSplitByteRangeIndex);
        }
        return;
    }
    // Regenerate missing data.
    SetReordering(data, codes);
}
/// <summary>
/// Looks up the CE32 for <paramref name="c"/>, switching to the base data when the value
/// is the fallback marker, then passes the result to <c>HandleCE32</c>.
/// </summary>
/// <param name="d">Collation data to start the lookup in.</param>
/// <param name="c">The code point to process.</param>
public void ForCodePoint(CollationData d, int c)
{
    CollationData source = d;
    int ce32 = source.GetCE32(c);
    if (ce32 == Collation.FALLBACK_CE32)
    {
        // Not mapped here: use the base data's mapping instead.
        source = source.Base;
        ce32 = source.GetCE32(c);
    }

    data = source;
    HandleCE32(c, c, ce32);
}
/// <summary>
/// Records one prefix mapping: sets the prefix, adds any contractions behind the final CE32,
/// adds the prefix-plus-code-point string to <c>tailored</c>, then resets the prefix.
/// </summary>
/// <param name="d">Collation data used to resolve the final CE32 and the contexts.</param>
/// <param name="pfx">The prefix characters.</param>
/// <param name="c">The code point the prefix applies to.</param>
/// <param name="ce32">The CE32 associated with this prefix.</param>
private void AddPrefix(CollationData d, ICharSequence pfx, int c, int ce32)
{
    SetPrefix(pfx);

    int finalCE32 = d.GetFinalCE32(ce32);
    if (Collation.IsContractionCE32(finalCE32))
    {
        AddContractions(c, d.contexts, Collation.IndexFromCE32(finalCE32) + 2);
    }

    string prefixedText = unreversedPrefix.AppendCodePoint(c).ToString();
    tailored.Add(prefixedText);

    ResetPrefix();
}
/// <summary>
/// Fills <c>charCEs</c> with up to two CEs per character for the two character ranges
/// [0, <c>LatinLimit</c>) and [<c>PUNCT_START</c>, <c>PUNCT_LIMIT</c>), and registers each
/// supported CE via <c>AddUniqueCE</c>.
/// </summary>
/// <param name="data">Collation data; lookups fall back to <c>data.Base</c> for unmapped characters.</param>
private void GetCEs(CollationData data)
{
    // i is the row in charCEs; it keeps counting up while c jumps between the two ranges.
    int i = 0;
    for (char c = (char)0; ; ++i, ++c)
    {
        if (c == CollationFastLatin.LatinLimit)
        {
            // End of the first range: continue with the punctuation range.
            c = (char)CollationFastLatin.PUNCT_START;
        }
        else if (c == CollationFastLatin.PUNCT_LIMIT)
        {
            break;
        }
        CollationData d;
        int ce32 = data.GetCE32(c);
        if (ce32 == Collation.FALLBACK_CE32)
        {
            d = data.Base;
            ce32 = d.GetCE32(c);
        }
        else
        {
            d = data;
        }
        // GetCEsFromCE32 reports its result through the ce0/ce1 fields.
        if (GetCEsFromCE32(d, c, ce32))
        {
            charCEs[i][0] = ce0;
            charCEs[i][1] = ce1;
            AddUniqueCE(ce0);
            AddUniqueCE(ce1);
        }
        else
        {
            // bail out for c
            charCEs[i][0] = ce0 = Collation.NoCE;
            charCEs[i][1] = ce1 = 0;
        }
        if (c == 0 && !IsContractionCharCE(ce0))
        {
            // Always map U+0000 to a contraction.
            // Write a contraction list with only a default value if there is no real contraction.
            Debug.Assert(contractionCEs.Count == 0);
            AddContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
            charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
            charCEs[0][1] = 0;
        }
    }
    // Terminate the last contraction list.
    contractionCEs.Add(CollationFastLatin.CONTR_CHAR_MASK);
}
/// <summary>
/// Enumerates the code point ranges of the tailoring <paramref name="d"/> and passes each
/// one to <c>EnumTailoredRange</c>.
/// </summary>
/// <param name="d">Tailoring data; its <c>Base</c> is asserted to be non-null.</param>
public void ForData(CollationData d)
{
    data = d;
    baseData = d.Base;
    Debug.Assert(baseData != null);

    // utrie2_enum(data->trie, NULL, enumTailoredRange, this);
    foreach (Trie2.Range entry in data.trie)
    {
        // Enumeration ends at the first lead-surrogate range.
        if (entry.LeadSurrogate)
        {
            break;
        }
        EnumTailoredRange(entry.StartCodePoint, entry.EndCodePoint, entry.Value, this);
    }
}
/// <summary>
/// Builds the fast Latin table for <paramref name="data"/> into <c>result</c>.
/// </summary>
/// <param name="data">The collation data to build from.</param>
/// <returns>
/// <c>false</c> if the groups could not be loaded or the short primaries still overflow
/// after the retry; otherwise <c>true</c>.
/// </returns>
/// <exception cref="InvalidOperationException">If <c>result</c> is not empty (builder reuse).</exception>
internal bool ForData(CollationData data)
{
    // This builder is not reusable.
    if (result.Length != 0)
    {
        throw new InvalidOperationException("attempt to reuse a CollationFastLatinBuilder");
    }
    if (!LoadGroups(data))
    {
        return false;
    }

    // First attempt: fast handling of digits.
    firstShortPrimary = firstDigitPrimary;
    GetCEs(data);
    EncodeUniqueCEs();

    if (shortPrimaryOverflow)
    {
        // Second attempt: give digits long mini primaries,
        // so that there are more short primaries for letters.
        firstShortPrimary = firstLatinPrimary;
        ResetCEs();
        GetCEs(data);
        EncodeUniqueCEs();
    }

    // Note: If we still have a short-primary overflow but not a long-primary overflow,
    // then we could calculate how many more long primaries would fit,
    // and set the firstShortPrimary to that many after the current firstShortPrimary,
    // and try again.
    // However, this might only benefit the en_US_POSIX tailoring,
    // and it is simpler to suppress building fast Latin data for it in genrb,
    // or by returning false here if shortPrimaryOverflow.
    bool overflowed = shortPrimaryOverflow;
    if (!overflowed)
    {
        EncodeCharCEs();
        EncodeContractions();
    }

    // might reduce heap memory usage
    contractionCEs.Clear();
    uniqueCEs.Clear();
    return !overflowed;
}
/// <summary>
/// Partial constructor, see <see cref="CollationIterator.CollationIterator(CollationData)"/>.
/// It only forwards <paramref name="d"/> to the base constructor; none of this class's
/// own fields are assigned here.
/// </summary>
/// <param name="d">Collation data passed through to the base constructor.</param>
public UTF16CollationIterator(CollationData d) : base(d)
{
}
/// <summary>
/// Creates a collation iterator that reads its text from <paramref name="ui"/>.
/// </summary>
/// <param name="d">Collation data, forwarded to the base constructor.</param>
/// <param name="numeric">Forwarded to the base constructor.</param>
/// <param name="ui">The character iterator stored as <c>iter</c>.</param>
public IterCollationIterator(CollationData d, bool numeric, UCharacterIterator ui)
    : base(d, numeric)
{
    this.iter = ui;
}
/// <summary>
/// Computes the options value for the compare functions
/// and writes the precomputed primary weights.
/// Returns -1 if the Latin fastpath is not supported for the data and settings.
/// The capacity must be <see cref="LatinLimit"/>.
/// </summary>
/// <param name="data">Collation data; its fast Latin header/table and group primaries are read.</param>
/// <param name="settings">Settings whose options, max-variable and reordering are consulted.</param>
/// <param name="primaries">Output array of exactly <see cref="LatinLimit"/> entries.</param>
/// <returns>
/// <c>(miniVarTop &lt;&lt; 16) | settings.Options</c>, or -1 when there is no fast Latin header,
/// the array length is wrong, the variable top index is outside the header, or the
/// reordering moves groups up to Latin out of order.
/// </returns>
public static int GetOptions(CollationData data, CollationSettings settings,
    char[] primaries)
{
    char[] header = data.fastLatinTableHeader;
    if (header == null)
    {
        return(-1);
    }
    // The upper byte of the first header unit carries the table version.
    Debug.Assert((header[0] >> 8) == Version);
    if (primaries.Length != LatinLimit)
    {
        Debug.Assert(false);
        return(-1);
    }

    int miniVarTop;
    if ((settings.Options & CollationSettings.AlternateMask) == 0)
    {
        // No mini primaries are variable, set a variableTop just below the
        // lowest long mini primary.
        miniVarTop = MIN_LONG - 1;
    }
    else
    {
        // The lower byte of the first header unit is the header length.
        int headerLength = header[0] & 0xff;
        int i = 1 + settings.MaxVariable;
        if (i >= headerLength)
        {
            return(-1);  // variableTop >= digits, should not occur
        }
        miniVarTop = header[i];
    }

    bool digitsAreReordered = false;
    if (settings.HasReordering)
    {
        long prevStart = 0;
        long beforeDigitStart = 0;
        long digitStart = 0;
        long afterDigitStart = 0;
        // Walk the special reorder groups and compare their reordered start primaries.
        for (int group = ReorderCodes.First;
                group < ReorderCodes.First + CollationData.MAX_NUM_SPECIAL_REORDER_CODES;
                ++group)
        {
            long start = data.GetFirstPrimaryForGroup(group);
            start = settings.Reorder(start);
            if (group == ReorderCodes.Digit)
            {
                beforeDigitStart = prevStart;
                digitStart = start;
            }
            else if (start != 0)
            {
                if (start < prevStart)
                {
                    // The permutation affects the groups up to Latin.
                    return(-1);
                }
                // In the future, there might be a special group between digits & Latin.
                if (digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart)
                {
                    afterDigitStart = start;
                }
                prevStart = start;
            }
        }
        long latinStart = data.GetFirstPrimaryForGroup(UScript.Latin);
        latinStart = settings.Reorder(latinStart);
        if (latinStart < prevStart)
        {
            return(-1);
        }
        if (afterDigitStart == 0)
        {
            afterDigitStart = latinStart;
        }
        // Digits count as reordered unless they sit strictly between their neighbors.
        if (!(beforeDigitStart < digitStart && digitStart < afterDigitStart))
        {
            digitsAreReordered = true;
        }
    }

    char[] table = data.FastLatinTable;  // skip the header
    for (int c = 0; c < LatinLimit; ++c)
    {
        int p = table[c];
        if (p >= MIN_SHORT)
        {
            p &= SHORT_PRIMARY_MASK;
        }
        else if (p > miniVarTop)
        {
            p &= LONG_PRIMARY_MASK;
        }
        else
        {
            // At or below miniVarTop: store 0.
            p = 0;
        }
        primaries[c] = (char)p;
    }
    if (digitsAreReordered || (settings.Options & CollationSettings.Numeric) != 0)
    {
        // Bail out for digits.
        for (int c = 0x30; c <= 0x39; ++c)
        {
            primaries[c] = (char)0;
        }
    }

    // Shift the miniVarTop above other options.
    return((miniVarTop << 16) | settings.Options);
}
/// <summary>
/// Appends a contraction list for <paramref name="ce32"/> to <c>contractionCEs</c>
/// (a default entry followed by entries for suffixes that start with a fast Latin character)
/// and sets <c>ce0</c>/<c>ce1</c> to a contraction marker that carries the list's start index.
/// </summary>
/// <param name="data">Collation data that holds the contexts for this contraction.</param>
/// <param name="ce32">The contraction CE32.</param>
/// <returns>Always <c>true</c>.</returns>
private bool GetCEsFromContractionCE32(CollationData data, int ce32)
{
    int trieIndex = Collation.IndexFromCE32(ce32);
    ce32 = data.GetCE32FromContexts(trieIndex);  // Default if no suffix match.
    // Since the original ce32 is not a prefix mapping,
    // the default ce32 must not be another contraction.
    Debug.Assert(!Collation.IsContractionCE32(ce32));
    // Remember where this contraction's list starts; it is encoded into ce0 at the end.
    int contractionIndex = contractionCEs.Count;
    if (GetCEsFromCE32(data, Collation.SentinelCodePoint, ce32))
    {
        AddContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
    }
    else
    {
        // Bail out for c-without-contraction.
        AddContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NoCE, 0);
    }
    // Handle an encodable contraction unless the next contraction is too long
    // and starts with the same character.
    int prevX = -1;
    // True while an entry for prevX is pending, with its CEs held in ce0/ce1.
    bool addContraction = false;
    using (CharsTrieEnumerator suffixes = CharsTrie.GetEnumerator(data.contexts, trieIndex + 2, 0))
    {
        while (suffixes.MoveNext())
        {
            CharsTrieEntry entry = suffixes.Current;
            ICharSequence suffix = entry.Chars;
            int x = CollationFastLatin.GetCharIndex(suffix[0]);
            if (x < 0) { continue; }  // ignore anything but fast Latin text
            if (x == prevX)
            {
                if (addContraction)
                {
                    // Bail out for all contractions starting with this character.
                    AddContractionEntry(x, Collation.NoCE, 0);
                    addContraction = false;
                }
                continue;
            }
            if (addContraction)
            {
                // A new first character: flush the pending entry for the previous one.
                AddContractionEntry(prevX, ce0, ce1);
            }
            ce32 = entry.Value;
            if (suffix.Length == 1 && GetCEsFromCE32(data, Collation.SentinelCodePoint, ce32))
            {
                addContraction = true;
            }
            else
            {
                AddContractionEntry(x, Collation.NoCE, 0);
                addContraction = false;
            }
            prevX = x;
        }
    }
    if (addContraction)
    {
        AddContractionEntry(prevX, ce0, ce1);
    }
    // Note: There might not be any fast Latin contractions, but
    // we need to enter contraction handling anyway so that we can bail out
    // when there is a non-fast-Latin character following.
    // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
    // following umlaut and bail out, rather than return the difference of Y vs. u.
    ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | (uint)contractionIndex;
    ce1 = 0;
    return(true);
}
/// <summary>
/// Resolves <paramref name="ce32"/> into at most two CEs, stored in the <c>ce0</c> and
/// <c>ce1</c> fields, and checks whether they can be represented in the fast Latin table.
/// </summary>
/// <param name="data">Collation data used to resolve the CE32 (final CE32, expansions, offsets).</param>
/// <param name="c">
/// The code point being mapped; asserted to be non-negative for contraction and offset CE32s.
/// </param>
/// <param name="ce32">The CE32 to resolve.</param>
/// <returns>
/// <c>true</c> if the mapping is supported (including a completely ignorable mapping);
/// <c>false</c> if the caller has to bail out for this mapping.
/// </returns>
private bool GetCEsFromCE32(CollationData data, int c, int ce32)
{
    ce32 = data.GetFinalCE32(ce32);
    ce1 = 0;
    if (Collation.IsSimpleOrLongCE32(ce32))
    {
        ce0 = Collation.CeFromCE32(ce32);
    }
    else
    {
        switch (Collation.TagFromCE32(ce32))
        {
            case Collation.LATIN_EXPANSION_TAG:
                ce0 = Collation.LatinCE0FromCE32(ce32);
                ce1 = Collation.LatinCE1FromCE32(ce32);
                break;
            case Collation.EXPANSION32_TAG:
                {
                    int index = Collation.IndexFromCE32(ce32);
                    int length = Collation.LengthFromCE32(ce32);
                    // Only expansions of up to two CEs are supported.
                    if (length > 2)
                    {
                        return false;
                    }
                    ce0 = Collation.CeFromCE32(data.ce32s[index]);
                    if (length == 2)
                    {
                        ce1 = Collation.CeFromCE32(data.ce32s[index + 1]);
                    }
                    break;
                }
            case Collation.EXPANSION_TAG:
                {
                    int index = Collation.IndexFromCE32(ce32);
                    int length = Collation.LengthFromCE32(ce32);
                    // Only expansions of up to two CEs are supported.
                    if (length > 2)
                    {
                        return false;
                    }
                    ce0 = data.ces[index];
                    if (length == 2)
                    {
                        ce1 = data.ces[index + 1];
                    }
                    break;
                }
            // Note: We could support PREFIX_TAG (assert c>=0)
            // by recursing on its default CE32 and checking that none of the prefixes starts
            // with a fast Latin character.
            // However, currently (2013) there are only the L-before-middle-dot
            // prefix mappings in the Latin range, and those would be rejected anyway.
            case Collation.CONTRACTION_TAG:
                Debug.Assert(c >= 0);
                return GetCEsFromContractionCE32(data, ce32);
            case Collation.OFFSET_TAG:
                Debug.Assert(c >= 0);
                ce0 = data.GetCEFromOffsetCE32(c, ce32);
                break;
            default:
                return false;
        }
    }

    // A mapping can be completely ignorable.
    if (ce0 == 0)
    {
        return ce1 == 0;
    }
    // We do not support an ignorable ce0 unless it is completely ignorable.
    long p0 = ce0.TripleShift(32);
    if (p0 == 0)
    {
        return false;
    }
    // We only support primaries up to the Latin script.
    if (p0 > lastLatinPrimary)
    {
        return false;
    }
    // We support non-common secondary and case weights only together with short primaries.
    int lower32_0 = (int)ce0;
    if (p0 < firstShortPrimary)
    {
        int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK;
        if (sc0 != Collation.COMMON_SECONDARY_CE)
        {
            return false;
        }
    }
    // No below-common tertiary weights.
    if ((lower32_0 & Collation.OnlyTertiaryMask) < Collation.CommonWeight16)
    {
        return false;
    }

    if (ce1 != 0)
    {
        // Both primaries must be in the same group,
        // or both must get short mini primaries,
        // or a short-primary CE is followed by a secondary CE.
        // This is so that we can test the first primary and use the same mask for both,
        // and determine for both whether they are variable.
        long p1 = ce1.TripleShift(32);
        if (p1 == 0 ? p0 < firstShortPrimary : !InSameGroup(p0, p1))
        {
            return false;
        }
        int lower32_1 = (int)ce1;
        // No tertiary CEs.
        if (lower32_1.TripleShift(16) == 0)
        {
            return false;
        }
        // We support non-common secondary and case weights
        // only for secondary CEs or together with short primaries.
        if (p1 != 0 && p1 < firstShortPrimary)
        {
            int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK;
            if (sc1 != Collation.COMMON_SECONDARY_CE)
            {
                return false;
            }
        }
        // No below-common tertiary weights.
        // This must test the second CE: the first CE's tertiary weight was already
        // validated above, so re-testing lower32_0 here would never reject anything.
        if ((lower32_1 & Collation.OnlyTertiaryMask) < Collation.CommonWeight16)
        {
            return false;
        }
    }

    // No quaternary weights.
    if (((ce0 | ce1) & Collation.QuaternaryMask) != 0)
    {
        return false;
    }
    return true;
}
/// <summary>
/// Computes the reordering for <paramref name="codes"/>: asks <paramref name="data"/> for the
/// reorder ranges, builds a 256-entry lead byte table from them, and stores the result via
/// <c>SetReorderArrays</c>. An empty or "none" code list, or an empty range list, resets the reordering.
/// </summary>
/// <param name="data">Collation data used to compute the reorder ranges.</param>
/// <param name="codes">The reorder codes; must not be <c>null</c> (its <c>Length</c> is read).</param>
public void SetReordering(CollationData data, int[] codes)
{
    if (codes.Length == 0 || (codes.Length == 1 && codes[0] == Text.ReorderCodes.None))
    {
        ResetReordering();
        return;
    }
    List<int> ranges = new List<int>();
    data.MakeReorderRanges(codes, ranges);
    int rangesLength = ranges.Count;
    if (rangesLength == 0)
    {
        ResetReordering();
        return;
    }
    // ranges[] contains at least two (limit, offset) pairs.
    // The first offset must be 0. The last offset must not be 0.
    // Separators (at the low end) and trailing weights (at the high end)
    // are never reordered.
    Debug.Assert(rangesLength >= 2);
    Debug.Assert((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);
    // Upper 16 bits of the last range entry.
    minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000L;

    // Write the lead byte permutation table.
    // Set a 0 for each lead byte that has a range boundary in the middle.
    byte[] table = new byte[256];
    int b = 0;
    int firstSplitByteRangeIndex = -1;
    for (int i = 0; i < rangesLength; ++i)
    {
        int pair = ranges[i];
        // The top byte of the pair is the lead byte of this range's limit.
        int limit1 = pair.TripleShift(24);
        while (b < limit1)
        {
            // The byte cast keeps only the low 8 bits of (b + pair).
            table[b] = (byte)(b + pair);
            ++b;
        }
        // Check the second byte of the limit.
        if ((pair & 0xff0000) != 0)
        {
            table[limit1] = 0;
            b = limit1 + 1;
            if (firstSplitByteRangeIndex < 0)
            {
                firstSplitByteRangeIndex = i;
            }
        }
    }
    // Lead bytes above the last range map to themselves.
    while (b <= 0xff)
    {
        table[b] = (byte)b;
        ++b;
    }
    int rangesStart;
    if (firstSplitByteRangeIndex < 0)
    {
        // The lead byte permutation table alone suffices for reordering.
        rangesStart = rangesLength = 0;
    }
    else
    {
        // Remove the ranges below the first split byte.
        rangesStart = firstSplitByteRangeIndex;
        rangesLength -= firstSplitByteRangeIndex;
    }
    SetReorderArrays(codes, ranges, rangesStart, rangesLength, table);
}
/// <summary>
/// Partial constructor, see <see cref="CollationIterator.CollationIterator(CollationData)"/>.
/// Besides forwarding to the base constructor it only captures the normalizer
/// implementation from <paramref name="d"/>.
/// </summary>
/// <param name="d">Collation data; forwarded to the base and read for <c>NfcImpl</c>.</param>
public FCDUTF16CollationIterator(CollationData d)
    : base(d)
{
    this.nfcImpl = d.NfcImpl;
}
/// <summary>
/// Reads serialized collation data from <paramref name="inBytes"/> into
/// <paramref name="tailoring"/>: the header, the indexes array, then each data part in the
/// order of the index slots, and finally the settings derived from the options.
/// Parts absent from the input are aliased from the base data where the code provides for that.
/// </summary>
/// <param name="base">The base (root) tailoring, or <c>null</c> when reading the base data itself.</param>
/// <param name="inBytes">The input buffer; it is consumed sequentially.</param>
/// <param name="tailoring">The tailoring object that receives the data and settings.</param>
/// <exception cref="ICUException">
/// If the input is too short, internally inconsistent, or inconsistent with the base data.
/// </exception>
internal static void Read(CollationTailoring @base, ByteBuffer inBytes,
                 CollationTailoring tailoring)
{
    tailoring.Version = ICUBinary.ReadHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
    if (@base != null && @base.GetUCAVersion() != tailoring.GetUCAVersion())
    {
        throw new ICUException("Tailoring UCA version differs from base data UCA version");
    }

    int inLength = inBytes.Remaining;
    if (inLength < 8)
    {
        throw new ICUException("not enough bytes");
    }
    int indexesLength = inBytes.GetInt32();  // inIndexes[IX_INDEXES_LENGTH]
    if (indexesLength < 2 || inLength < indexesLength * 4)
    {
        throw new ICUException("not enough indexes");
    }
    int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
    inIndexes[0] = indexesLength;
    for (int i = 1; i < indexesLength && i < inIndexes.Length; ++i)
    {
        inIndexes[i] = inBytes.GetInt32();
    }
    // Index slots that the input does not provide are set to -1.
    for (int i = indexesLength; i < inIndexes.Length; ++i)
    {
        inIndexes[i] = -1;
    }
    // Skip any additional indexes beyond the ones this reader stores.
    if (indexesLength > inIndexes.Length)
    {
        ICUBinary.SkipBytes(inBytes, (indexesLength - inIndexes.Length) * 4);
    }

    // Assume that the tailoring data is in initial state,
    // with null pointers and 0 lengths.

    // Set pointers to non-empty data parts.
    // Do this in order of their byte offsets. (Should help porting to Java.)

    int index;  // one of the indexes[] slots
    int offset;  // byte offset for the index part
    int length;  // number of bytes in the index part

    if (indexesLength > IX_TOTAL_SIZE)
    {
        length = inIndexes[IX_TOTAL_SIZE];
    }
    else if (indexesLength > IX_REORDER_CODES_OFFSET)
    {
        length = inIndexes[indexesLength - 1];
    }
    else
    {
        length = 0;  // only indexes, and inLength was already checked for them
    }
    if (inLength < length)
    {
        throw new ICUException("not enough bytes");
    }

    CollationData baseData = @base == null ? null : @base.Data;
    int[] reorderCodes;
    int reorderCodesLength;
    index = IX_REORDER_CODES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4)
    {
        if (baseData == null)
        {
            // We assume for collation settings that
            // the base data does not have a reordering.
            throw new ICUException("Collation base data must not reorder scripts");
        }
        reorderCodesLength = length / 4;
        reorderCodes = ICUBinary.GetInts(inBytes, reorderCodesLength, length & 3);

        // The reorderRanges (if any) are the trailing reorderCodes entries.
        // Split the array at the boundary.
        // Script or reorder codes do not exceed 16-bit values.
        // Range limits are stored in the upper 16 bits, and are never 0.
        int reorderRangesLength = 0;
        while (reorderRangesLength < reorderCodesLength &&
                (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0)
        {
            ++reorderRangesLength;
        }
        Debug.Assert(reorderRangesLength < reorderCodesLength);
        reorderCodesLength -= reorderRangesLength;
    }
    else
    {
        reorderCodes = new int[0];
        reorderCodesLength = 0;
        ICUBinary.SkipBytes(inBytes, length);
    }

    // There should be a reorder table only if there are reorder codes.
    // However, when there are reorder codes the reorder table may be omitted to reduce
    // the data size.
    byte[] reorderTable = null;
    index = IX_REORDER_TABLE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 256)
    {
        if (reorderCodesLength == 0)
        {
            throw new ICUException("Reordering table without reordering codes");
        }
        reorderTable = new byte[256];
        inBytes.Get(reorderTable);
        length -= 256;
    }
    else
    {
        // If we have reorder codes, then build the reorderTable at the end,
        // when the CollationData is otherwise complete.
    }
    ICUBinary.SkipBytes(inBytes, length);

    // The top byte of the options word is compared against the base data's numeric primary.
    if (baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L))
    {
        throw new ICUException("Tailoring numeric primary weight differs from base data");
    }
    CollationData data = null;  // Remains null if there are no mappings.

    index = IX_TRIE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 8)
    {
        tailoring.EnsureOwnedData();
        data = tailoring.OwnedData;
        data.Base = baseData;
        data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
        data.trie = tailoring.Trie = Trie2_32.CreateFromSerialized(inBytes);
        int trieLength = data.trie.GetSerializedLength();
        if (trieLength > length)
        {
            throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
        }
        length -= trieLength;
    }
    else if (baseData != null)
    {
        // Use the base data. Only the settings are tailored.
        tailoring.Data = baseData;
    }
    else
    {
        throw new ICUException("Missing collation data mappings");  // No mappings.
    }
    ICUBinary.SkipBytes(inBytes, length);

    // Reserved part: its bytes are skipped.
    index = IX_RESERVED8_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.SkipBytes(inBytes, length);

    index = IX_CES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 8)
    {
        if (data == null)
        {
            throw new ICUException("Tailored ces without tailored trie");
        }
        data.ces = ICUBinary.GetLongs(inBytes, length / 8, length & 7);
    }
    else
    {
        ICUBinary.SkipBytes(inBytes, length);
    }

    // Reserved part: its bytes are skipped.
    index = IX_RESERVED10_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.SkipBytes(inBytes, length);

    index = IX_CE32S_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4)
    {
        if (data == null)
        {
            throw new ICUException("Tailored ce32s without tailored trie");
        }
        data.ce32s = ICUBinary.GetInts(inBytes, length / 4, length & 3);
    }
    else
    {
        ICUBinary.SkipBytes(inBytes, length);
    }

    int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
    if (jamoCE32sStart >= 0)
    {
        if (data == null || data.ce32s == null)
        {
            throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
        }
        data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
        // ICU4N specific - added extension method to IList<T> to handle "copy to"
        data.ce32s.CopyTo(jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
    }
    else if (data == null)
    {
        // Nothing to do.
    }
    else if (baseData != null)
    {
        data.jamoCE32s = baseData.jamoCE32s;
    }
    else
    {
        throw new ICUException("Missing Jamo CE32s for Hangul processing");
    }

    index = IX_ROOT_ELEMENTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 4)
    {
        int rootElementsLength = length / 4;
        if (data == null)
        {
            throw new ICUException("Root elements but no mappings");
        }
        if (rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES)
        {
            throw new ICUException("Root elements array too short");
        }
        data.rootElements = new long[rootElementsLength];
        for (int i = 0; i < rootElementsLength; ++i)
        {
            data.rootElements[i] = inBytes.GetInt32() & 0xffffffffL;  // unsigned int -> long
        }
        long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
        if (commonSecTer != Collation.COMMON_SEC_AND_TER_CE)
        {
            throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
        }
        long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
        if ((secTerBoundaries.TripleShift(24)) < CollationKeys.SEC_COMMON_HIGH)
        {
            // [fixed last secondary common byte] is too low,
            // and secondary weights would collide with compressed common secondaries.
            throw new ICUException("[fixed last secondary common byte] is too low");
        }
        // Only the bytes left over after the whole 4-byte elements remain to be skipped.
        length &= 3;
    }
    ICUBinary.SkipBytes(inBytes, length);

    index = IX_CONTEXTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2)
    {
        if (data == null)
        {
            throw new ICUException("Tailored contexts without tailored trie");
        }
        data.contexts = ICUBinary.GetString(inBytes, length / 2, length & 1);
    }
    else
    {
        ICUBinary.SkipBytes(inBytes, length);
    }

    index = IX_UNSAFE_BWD_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2)
    {
        if (data == null)
        {
            throw new ICUException("Unsafe-backward-set but no mappings");
        }
        if (baseData == null)
        {
            // Create the unsafe-backward set for the root collator.
            // Include all non-zero combining marks and trail surrogates.
            // We do this at load time, rather than at build time,
            // to simplify Unicode version bootstrapping:
            // The root data builder only needs the new FractionalUCA.txt data,
            // but it need not be built with a version of ICU already updated to
            // the corresponding new Unicode Character Database.
            //
            // The following is an optimized version of
            // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
            // It is faster and requires fewer code dependencies.
            tailoring.UnsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
            data.nfcImpl.AddLcccChars(tailoring.UnsafeBackwardSet);
        }
        else
        {
            // Clone the root collator's set contents.
            tailoring.UnsafeBackwardSet = baseData.unsafeBackwardSet.CloneAsThawed();
        }
        // Add the ranges from the data file to the unsafe-backward set.
        USerializedSet sset = new USerializedSet();
        char[] unsafeData = ICUBinary.GetChars(inBytes, length / 2, length & 1);
        // Setting length to 0 makes the SkipBytes call after this if/else chain skip nothing.
        length = 0;
        sset.GetSet(unsafeData, 0);
        int count = sset.CountRanges();
        int[] range = new int[2];
        for (int i = 0; i < count; ++i)
        {
            sset.GetRange(i, range);
            tailoring.UnsafeBackwardSet.Add(range[0], range[1]);
        }
        // Mark each lead surrogate as "unsafe"
        // if any of its 1024 associated supplementary code points is "unsafe".
        int c = 0x10000;
        for (int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400)
        {
            if (!tailoring.UnsafeBackwardSet.ContainsNone(c, c + 0x3ff))
            {
                tailoring.UnsafeBackwardSet.Add(lead);
            }
        }
        tailoring.UnsafeBackwardSet.Freeze();
        data.unsafeBackwardSet = tailoring.UnsafeBackwardSet;
    }
    else if (data == null)
    {
        // Nothing to do.
    }
    else if (baseData != null)
    {
        // No tailoring-specific data: Alias the root collator's set.
        data.unsafeBackwardSet = baseData.unsafeBackwardSet;
    }
    else
    {
        throw new ICUException("Missing unsafe-backward-set");
    }
    ICUBinary.SkipBytes(inBytes, length);

    // If the fast Latin format version is different,
    // or the version is set to 0 for "no fast Latin table",
    // then just always use the normal string comparison path.
    index = IX_FAST_LATIN_TABLE_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (data != null)
    {
        data.fastLatinTable = null;
        data.fastLatinTableHeader = null;
        // Bits 16..23 of the options word hold the fast Latin version of the data.
        if (((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION)
        {
            if (length >= 2)
            {
                char header0 = inBytes.GetChar();
                int headerLength = header0 & 0xff;
                // NOTE(review): a headerLength of 0 would make the [0] store below fail;
                // presumably valid data always has headerLength >= 1 - confirm.
                data.fastLatinTableHeader = new char[headerLength];
                data.fastLatinTableHeader[0] = header0;
                for (int i = 1; i < headerLength; ++i)
                {
                    data.fastLatinTableHeader[i] = inBytes.GetChar();
                }
                int tableLength = length / 2 - headerLength;
                data.fastLatinTable = ICUBinary.GetChars(inBytes, tableLength, length & 1);
                // Setting length to 0 makes the SkipBytes call after this block skip nothing.
                length = 0;
                if ((header0 >> 8) != CollationFastLatin.VERSION)
                {
                    throw new ICUException("Fast-Latin table version differs from version in data header");
                }
            }
            else if (baseData != null)
            {
                data.fastLatinTable = baseData.fastLatinTable;
                data.fastLatinTableHeader = baseData.fastLatinTableHeader;
            }
        }
    }
    ICUBinary.SkipBytes(inBytes, length);

    index = IX_SCRIPTS_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 2)
    {
        if (data == null)
        {
            throw new ICUException("Script order data but no mappings");
        }
        int scriptsLength = length / 2;
        // NOTE(review): the full part length is still passed to SkipBytes(inBytes, length)
        // below, so reading through this char view presumably does not advance inBytes - confirm.
        CharBuffer inChars = inBytes.AsCharBuffer();
        data.numScripts = inChars.Get();
        // There must be enough entries for both arrays, including more than two range starts.
        int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
        if (scriptStartsLength <= 2)
        {
            throw new ICUException("Script order data too short");
        }
        inChars.Get(data.scriptsIndex = new char[data.numScripts + 16]);
        inChars.Get(data.scriptStarts = new char[scriptStartsLength]);
        if (!(data.scriptStarts[0] == 0 &&
                data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
                data.scriptStarts[scriptStartsLength - 1] ==
                        (Collation.TRAIL_WEIGHT_BYTE << 8)))
        {
            throw new ICUException("Script order data not valid");
        }
    }
    else if (data == null)
    {
        // Nothing to do.
    }
    else if (baseData != null)
    {
        data.numScripts = baseData.numScripts;
        data.scriptsIndex = baseData.scriptsIndex;
        data.scriptStarts = baseData.scriptStarts;
    }
    ICUBinary.SkipBytes(inBytes, length);

    index = IX_COMPRESSIBLE_BYTES_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    if (length >= 256)
    {
        if (data == null)
        {
            throw new ICUException("Data for compressible primary lead bytes but no mappings");
        }
        data.compressibleBytes = new bool[256];
        for (int i = 0; i < 256; ++i)
        {
            data.compressibleBytes[i] = inBytes.Get() != 0;
        }
        length -= 256;
    }
    else if (data == null)
    {
        // Nothing to do.
    }
    else if (baseData != null)
    {
        data.compressibleBytes = baseData.compressibleBytes;
    }
    else
    {
        throw new ICUException("Missing data for compressible primary lead bytes");
    }
    ICUBinary.SkipBytes(inBytes, length);

    // Reserved part: its bytes are skipped.
    index = IX_RESERVED18_OFFSET;
    offset = inIndexes[index];
    length = inIndexes[index + 1] - offset;
    ICUBinary.SkipBytes(inBytes, length);

    // If the current read-only settings already match what the data implies,
    // return without creating a writable copy.
    CollationSettings ts = tailoring.Settings.ReadOnly;
    int options = inIndexes[IX_OPTIONS] & 0xffff;
    char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
    int fastLatinOptions = CollationFastLatin.GetOptions(
            tailoring.Data, ts, fastLatinPrimaries);
    if (options == ts.Options && ts.VariableTop != 0 &&
            Arrays.Equals(reorderCodes, ts.ReorderCodes) &&
            fastLatinOptions == ts.FastLatinOptions &&
            (fastLatinOptions < 0 ||
                    Arrays.Equals(fastLatinPrimaries, ts.FastLatinPrimaries)))
    {
        return;
    }

    CollationSettings settings = tailoring.Settings.CopyOnWrite();
    settings.Options = options;
    // Set variableTop from options and scripts data.
    settings.VariableTop = tailoring.Data.GetLastPrimaryForGroup(
            ReorderCodes.First + settings.MaxVariable);
    if (settings.VariableTop == 0)
    {
        throw new ICUException("The maxVariable could not be mapped to a variableTop");
    }

    if (reorderCodesLength != 0)
    {
        settings.AliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
    }

    settings.FastLatinOptions = CollationFastLatin.GetOptions(
        tailoring.Data, settings,
        settings.FastLatinPrimaries);
}
/// <summary>
/// Constructor.
/// The <see cref="ISink"/> must be set before parsing.
/// The <see cref="IImporter"/> can be set, otherwise [import locale] syntax is not supported.
/// </summary>
/// <param name="baseData">The collation data stored in the parser's <c>baseData</c> field.</param>
internal CollationRuleParser(CollationData baseData)
{
    this.baseData = baseData;
}