/// <summary>
/// Emits sort key bytes for every level from <paramref name="minLevel"/> up to the strength
/// configured in <paramref name="settings"/>, plus the case level when that option is set.
/// Levels beyond the primary are written only while <c>callback.NeedToWrite(level)</c>
/// returns true. Each of those levels is preceded by the LEVEL_SEPARATOR_BYTE;
/// no TERMINATOR_BYTE is written.
/// </summary>
/// <param name="iter">Source of collation elements; read via <c>NextCE()</c> until NO_CE.</param>
/// <param name="compressibleBytes">Indexed by the lead byte of the un-reordered primary;
/// true when that lead byte takes part in primary compression.</param>
/// <param name="settings">Supplies the options, the variable top and primary reordering.</param>
/// <param name="sink">Receives the primary bytes directly and the other levels at the end.</param>
/// <param name="minLevel">Lowest level to produce; lower levels are masked out.</param>
/// <param name="callback">Consulted before each beyond-primary level is appended.</param>
/// <param name="preflight">When false, the method returns as soon as the sink reports an
/// overflow after writing a primary weight.</param>
public static void WriteSortKeyUpToQuaternary(CollationIterator iter, bool[] compressibleBytes,
    CollationSettings settings, SortKeyByteSink sink, int minLevel, LevelCallback callback,
    bool preflight)
{
    int opts = settings.Options;

    // Levels implied by the strength, plus the optional case level,
    // minus everything below minLevel.
    int activeLevels = levelMasks[(int)CollationSettings.GetStrength(opts)];
    if ((opts & CollationSettings.CASE_LEVEL) != 0)
    {
        activeLevels |= Collation.CASE_LEVEL_FLAG;
    }
    activeLevels &= ~((1 << minLevel) - 1);
    if (activeLevels == 0)
    {
        return;
    }

    // Neither the level set nor the options change below, so decode the flags once.
    bool emitPrimary = (activeLevels & Collation.PRIMARY_LEVEL_FLAG) != 0;
    bool emitSecondary = (activeLevels & Collation.SECONDARY_LEVEL_FLAG) != 0;
    bool emitCase = (activeLevels & Collation.CASE_LEVEL_FLAG) != 0;
    bool emitTertiary = (activeLevels & Collation.TERTIARY_LEVEL_FLAG) != 0;
    bool emitQuaternary = (activeLevels & Collation.QUATERNARY_LEVEL_FLAG) != 0;
    bool shiftVariables = (opts & CollationSettings.ALTERNATE_MASK) != 0;
    bool backwardSecondary = (opts & CollationSettings.BACKWARD_SECONDARY) != 0;
    bool upperFirst = (opts & CollationSettings.UPPER_FIRST) != 0;

    // Exclusive bound for variable primaries: the +1 allows a plain "<" test and makes
    // primary ignorables fail it early. 0 means no CE is treated as variable.
    long variableLimit = shiftVariables ? settings.VariableTop + 1 : 0;

    int terMask = CollationSettings.GetTertiaryMask(opts);
    bool tertiaryHasCaseBits = (terMask & 0x8000) != 0;

    byte[] primaryTail = new byte[3];
    SortKeyLevel caseLevel = GetSortKeyLevel(activeLevels, Collation.CASE_LEVEL_FLAG);
    SortKeyLevel secLevel = GetSortKeyLevel(activeLevels, Collation.SECONDARY_LEVEL_FLAG);
    SortKeyLevel terLevel = GetSortKeyLevel(activeLevels, Collation.TERTIARY_LEVEL_FLAG);
    SortKeyLevel quatLevel = GetSortKeyLevel(activeLevels, Collation.QUATERNARY_LEVEL_FLAG);

    long compressedPrimary = 0; // 0 == no primary compression in progress
    int pendingCases = 0;       // runs of common weights not yet written, per level
    int pendingSecs = 0;
    int pendingTers = 0;
    int pendingQuats = 0;
    int lastSecondary = 0;
    int segmentStart = 0;       // start of the current backward-secondary segment

    while (true)
    {
        // A sort key never looks back, so the iterator may drop consumed CEs.
        iter.ClearCEsIfNoneRemaining();
        long ce = iter.NextCE();
        long primary = ce.TripleShift(32);

        if (primary < variableLimit && primary > Collation.MERGE_SEPARATOR_PRIMARY)
        {
            // Variable CE: move it to the quaternary level, skip the primary ignorables
            // that follow it, and keep shifting while further variable CEs arrive.
            if (pendingQuats != 0)
            {
                --pendingQuats;
                for (; pendingQuats >= QUAT_COMMON_MAX_COUNT; pendingQuats -= QUAT_COMMON_MAX_COUNT)
                {
                    quatLevel.AppendByte(QUAT_COMMON_MIDDLE);
                }
                // A shifted primary always sorts below the common weight.
                quatLevel.AppendByte(QUAT_COMMON_LOW + pendingQuats);
                pendingQuats = 0;
            }
            do
            {
                if (emitQuaternary)
                {
                    if (settings.HasReordering)
                    {
                        primary = settings.Reorder(primary);
                    }
                    if (((int)primary.TripleShift(24)) >= QUAT_SHIFTED_LIMIT_BYTE)
                    {
                        // Keep shifted lead bytes out of the common compression range.
                        quatLevel.AppendByte(QUAT_SHIFTED_LIMIT_BYTE);
                    }
                    quatLevel.AppendWeight32(primary);
                }
                do
                {
                    ce = iter.NextCE();
                    primary = ce.TripleShift(32);
                }
                while (primary == 0);
            }
            while (primary < variableLimit && primary > Collation.MERGE_SEPARATOR_PRIMARY);
        }

        // At this point ce is a primary ignorable, NO_CE, the merge separator or a regular
        // primary CE, never a variable one. For NO_CE nothing is written on the primary
        // level, but the lower levels still terminate their compression before the loop ends.
        if (primary > Collation.NO_CE_PRIMARY && emitPrimary)
        {
            // Compressibility is looked up with the primary as it was before reordering.
            bool wasCompressible = compressibleBytes[(int)primary.TripleShift(24)];
            if (settings.HasReordering)
            {
                primary = settings.Reorder(primary);
            }
            int leadByte = (int)primary.TripleShift(24);
            if (!wasCompressible || leadByte != ((int)compressedPrimary.TripleShift(24)))
            {
                if (compressedPrimary != 0)
                {
                    if (primary >= compressedPrimary)
                    {
                        sink.Append(Collation.PRIMARY_COMPRESSION_HIGH_BYTE);
                    }
                    else if (leadByte > Collation.MERGE_SEPARATOR_BYTE)
                    {
                        // The low terminator is omitted at the end of the level
                        // or of a merged segment.
                        sink.Append(Collation.PRIMARY_COMPRESSION_LOW_BYTE);
                    }
                }
                sink.Append(leadByte);
                compressedPrimary = wasCompressible ? primary : 0;
            }

            byte secondByte = (byte)(primary.TripleShift(16));
            if (secondByte != 0)
            {
                primaryTail[0] = secondByte;
                primaryTail[1] = (byte)(primary.TripleShift(8));
                primaryTail[2] = (byte)primary;

                // Write the tail only up to its first zero byte.
                int tailLength;
                if (primaryTail[1] == 0)
                {
                    tailLength = 1;
                }
                else if (primaryTail[2] == 0)
                {
                    tailLength = 2;
                }
                else
                {
                    tailLength = 3;
                }
                sink.Append(primaryTail, tailLength);
            }

            // When not preflighting, the total length is not needed, so an overflow
            // on the primary level ends the work right away.
            if (!preflight && sink.Overflowed)
            {
                return;
            }
        }

        int low32 = (int)ce;
        if (low32 == 0)
        {
            // Completely ignorable: contributes to no lower level.
            continue;
        }

        if (emitSecondary)
        {
            int sec = low32.TripleShift(16); // 16 bits
            if (sec != 0) // otherwise secondary ignorable
            {
                if (sec == Collation.COMMON_WEIGHT16
                    && (!backwardSecondary || primary != Collation.MERGE_SEPARATOR_PRIMARY))
                {
                    // Common secondary weight, and either backwards-secondary is off
                    // or this CE is not the merge separator.
                    ++pendingSecs;
                }
                else if (!backwardSecondary)
                {
                    if (pendingSecs != 0)
                    {
                        --pendingSecs;
                        for (; pendingSecs >= SEC_COMMON_MAX_COUNT; pendingSecs -= SEC_COMMON_MAX_COUNT)
                        {
                            secLevel.AppendByte(SEC_COMMON_MIDDLE);
                        }
                        int marker = sec < Collation.COMMON_WEIGHT16
                            ? SEC_COMMON_LOW + pendingSecs
                            : SEC_COMMON_HIGH - pendingSecs;
                        secLevel.AppendByte(marker);
                        pendingSecs = 0;
                    }
                    secLevel.AppendWeight16(sec);
                }
                else
                {
                    if (pendingSecs != 0)
                    {
                        --pendingSecs;
                        // Weights go in reversed here; the segment is re-reversed later.
                        int remainder = pendingSecs % SEC_COMMON_MAX_COUNT;
                        int marker = lastSecondary < Collation.COMMON_WEIGHT16
                            ? SEC_COMMON_LOW + remainder
                            : SEC_COMMON_HIGH - remainder;
                        secLevel.AppendByte(marker);
                        pendingSecs -= remainder;
                        // What is left is a multiple of SEC_COMMON_MAX_COUNT,
                        // so "> 0" is the same test as ">= SEC_COMMON_MAX_COUNT".
                        for (; pendingSecs > 0; pendingSecs -= SEC_COMMON_MAX_COUNT)
                        {
                            secLevel.AppendByte(SEC_COMMON_MIDDLE);
                        }
                    }

                    if (0 < primary && primary <= Collation.MERGE_SEPARATOR_PRIMARY)
                    {
                        // Backward secondaries are compared backwards within each segment
                        // delimited by the merge separator (U+FFFE): reverse this segment in place.
                        byte[] secBytes = secLevel.Data();
                        for (int lo = segmentStart, hi = secLevel.Length - 1; lo < hi; ++lo, --hi)
                        {
                            byte swap = secBytes[lo];
                            secBytes[lo] = secBytes[hi];
                            secBytes[hi] = swap;
                        }
                        secLevel.AppendByte(primary == Collation.NO_CE_PRIMARY ? Collation.LEVEL_SEPARATOR_BYTE : Collation.MERGE_SEPARATOR_BYTE);
                        lastSecondary = 0;
                        segmentStart = secLevel.Length;
                    }
                    else
                    {
                        secLevel.AppendReverseWeight16(sec);
                        lastSecondary = sec;
                    }
                }
            }
        }

        if (emitCase)
        {
            // Primary strength + case level: skip case weights of primary ignorables.
            // Any other strength: skip case weights of secondary ignorables.
            bool skipCaseWeight =
                (CollationSettings.GetStrength(opts) == (int)CollationStrength.Primary)
                    ? (primary == 0)
                    : (low32.TripleShift(16) == 0);
            if (!skipCaseWeight)
            {
                int caseByte = low32.TripleShift(8) & 0xff; // case bits & tertiary lead byte
                Debug.Assert((caseByte & 0xc0) != 0xc0);
                bool aboveSeparator = caseByte > Collation.LEVEL_SEPARATOR_BYTE;
                if ((caseByte & 0xc0) == 0 && aboveSeparator)
                {
                    ++pendingCases;
                }
                else
                {
                    if (!upperFirst)
                    {
                        // lowerFirst: common weights compress to nibbles 1..7..13,
                        // mixed=14, upper=15. A level holding only common (=lowest)
                        // weights is left empty; length differences were already
                        // settled on the next-higher level.
                        if (pendingCases != 0 && (aboveSeparator || !caseLevel.IsEmpty))
                        {
                            --pendingCases;
                            for (; pendingCases >= CASE_LOWER_FIRST_COMMON_MAX_COUNT; pendingCases -= CASE_LOWER_FIRST_COMMON_MAX_COUNT)
                            {
                                caseLevel.AppendByte(CASE_LOWER_FIRST_COMMON_MIDDLE << 4);
                            }
                            int nibble = aboveSeparator
                                ? CASE_LOWER_FIRST_COMMON_HIGH - pendingCases
                                : CASE_LOWER_FIRST_COMMON_LOW + pendingCases;
                            caseLevel.AppendByte(nibble << 4);
                            pendingCases = 0;
                        }
                        if (aboveSeparator)
                        {
                            caseByte = (CASE_LOWER_FIRST_COMMON_HIGH + caseByte.TripleShift(6)) << 4; // 14 or 15
                        }
                    }
                    else
                    {
                        // upperFirst: common weights compress to nibbles 3..15,
                        // mixed=2, upper=1. The common weight is the highest one here,
                        // so compressed runs only count upwards from the "low" value.
                        if (pendingCases != 0)
                        {
                            --pendingCases;
                            for (; pendingCases >= CASE_UPPER_FIRST_COMMON_MAX_COUNT; pendingCases -= CASE_UPPER_FIRST_COMMON_MAX_COUNT)
                            {
                                caseLevel.AppendByte(CASE_UPPER_FIRST_COMMON_LOW << 4);
                            }
                            caseLevel.AppendByte((CASE_UPPER_FIRST_COMMON_LOW + pendingCases) << 4);
                            pendingCases = 0;
                        }
                        if (aboveSeparator)
                        {
                            caseByte = (CASE_UPPER_FIRST_COMMON_LOW - caseByte.TripleShift(6)) << 4; // 2 or 1
                        }
                    }
                    // caseByte is now either the separator byte 01
                    // or a left-shifted nibble 0x10, 0x20, ... 0xf0.
                    caseLevel.AppendByte(caseByte);
                }
            }
        }

        if (emitTertiary)
        {
            int ter = low32 & terMask;
            Debug.Assert((low32 & 0xc000) != 0xc000);
            if (ter == Collation.COMMON_WEIGHT16)
            {
                ++pendingTers;
            }
            else if (!tertiaryHasCaseBits)
            {
                // Tertiary weights without case bits.
                // Lead bytes 06..3F move to C6..FF, leaving a large common-weight range.
                if (pendingTers != 0)
                {
                    --pendingTers;
                    for (; pendingTers >= TER_ONLY_COMMON_MAX_COUNT; pendingTers -= TER_ONLY_COMMON_MAX_COUNT)
                    {
                        terLevel.AppendByte(TER_ONLY_COMMON_MIDDLE);
                    }
                    int marker = ter < Collation.COMMON_WEIGHT16
                        ? TER_ONLY_COMMON_LOW + pendingTers
                        : TER_ONLY_COMMON_HIGH - pendingTers;
                    terLevel.AppendByte(marker);
                    pendingTers = 0;
                }
                if (ter > Collation.COMMON_WEIGHT16)
                {
                    ter += 0xc000;
                }
                terLevel.AppendWeight16(ter);
            }
            else if (!upperFirst)
            {
                // Tertiary weights with caseFirst=lowerFirst.
                // Lead bytes 06..BF move to 46..FF for the common-weight range.
                if (pendingTers != 0)
                {
                    --pendingTers;
                    for (; pendingTers >= TER_LOWER_FIRST_COMMON_MAX_COUNT; pendingTers -= TER_LOWER_FIRST_COMMON_MAX_COUNT)
                    {
                        terLevel.AppendByte(TER_LOWER_FIRST_COMMON_MIDDLE);
                    }
                    int marker = ter < Collation.COMMON_WEIGHT16
                        ? TER_LOWER_FIRST_COMMON_LOW + pendingTers
                        : TER_LOWER_FIRST_COMMON_HIGH - pendingTers;
                    terLevel.AppendByte(marker);
                    pendingTers = 0;
                }
                if (ter > Collation.COMMON_WEIGHT16)
                {
                    ter += 0x4000;
                }
                terLevel.AppendWeight16(ter);
            }
            else
            {
                // Tertiary weights with caseFirst=upperFirst.
                // The artificial uppercase weight of a tertiary CE (0.0.ut) is left alone
                // so that tertiary CEs stay well-formed: their case+tertiary weights must
                // be greater than those of primary and secondary CEs.
                //
                // Separator         01 -> 01      (unchanged)
                // Lowercase     02..04 -> 82..84  (includes uncased)
                // Common weight     05 -> 85..C5  (common-weight compression range)
                // Lowercase     06..3F -> C6..FF
                // Mixed case    42..7F -> 42..7F
                // Uppercase     82..BF -> 02..3F
                // Tertiary CE   86..BF -> C6..FF
                if (ter > Collation.NO_CE_WEIGHT16) // separators stay unchanged
                {
                    if (low32.TripleShift(16) != 0)
                    {
                        // Primary & secondary CEs: invert the case bits.
                        ter ^= 0xc000;
                        if (ter < (TER_UPPER_FIRST_COMMON_HIGH << 8))
                        {
                            ter -= 0x4000;
                        }
                    }
                    else
                    {
                        // Tertiary CEs: keep the uppercase bits.
                        Debug.Assert(0x8600 <= ter && ter <= 0xbfff);
                        ter += 0x4000;
                    }
                }
                if (pendingTers != 0)
                {
                    --pendingTers;
                    for (; pendingTers >= TER_UPPER_FIRST_COMMON_MAX_COUNT; pendingTers -= TER_UPPER_FIRST_COMMON_MAX_COUNT)
                    {
                        terLevel.AppendByte(TER_UPPER_FIRST_COMMON_MIDDLE);
                    }
                    int marker = ter < (TER_UPPER_FIRST_COMMON_LOW << 8)
                        ? TER_UPPER_FIRST_COMMON_LOW + pendingTers
                        : TER_UPPER_FIRST_COMMON_HIGH - pendingTers;
                    terLevel.AppendByte(marker);
                    pendingTers = 0;
                }
                terLevel.AppendWeight16(ter);
            }
        }

        if (emitQuaternary)
        {
            int quat = low32 & 0xffff;
            if ((quat & 0xc0) == 0 && quat > Collation.NO_CE_WEIGHT16)
            {
                ++pendingQuats;
            }
            else if (quat == Collation.NO_CE_WEIGHT16 && !shiftVariables && quatLevel.IsEmpty)
            {
                // alternate=non-ignorable with nothing but common quaternary weights:
                // only the separator is needed. Weights between the merge separator and
                // the common weight are shifted primaries, which this mode never
                // generates. There are exactly as many quaternary as tertiary weights,
                // so length differences were already settled on the tertiary level, and
                // any above-common quaternary weight compares greater regardless.
                quatLevel.AppendByte(Collation.LEVEL_SEPARATOR_BYTE);
            }
            else
            {
                int quatByte = (quat == Collation.NO_CE_WEIGHT16)
                    ? Collation.LEVEL_SEPARATOR_BYTE
                    : 0xfc + (quat.TripleShift(6) & 3);
                if (pendingQuats != 0)
                {
                    --pendingQuats;
                    for (; pendingQuats >= QUAT_COMMON_MAX_COUNT; pendingQuats -= QUAT_COMMON_MAX_COUNT)
                    {
                        quatLevel.AppendByte(QUAT_COMMON_MIDDLE);
                    }
                    int marker = quatByte < QUAT_COMMON_LOW
                        ? QUAT_COMMON_LOW + pendingQuats
                        : QUAT_COMMON_HIGH - pendingQuats;
                    quatLevel.AppendByte(marker);
                    pendingQuats = 0;
                }
                quatLevel.AppendByte(quatByte);
            }
        }

        if (low32.TripleShift(24) == Collation.LEVEL_SEPARATOR_BYTE)
        {
            break; // ce == NO_CE
        }
    }

    // Append the levels beyond the primary, each behind its own separator.
    // The first level the callback declines ends the method.
    if (emitSecondary)
    {
        if (!callback.NeedToWrite(Collation.SECONDARY_LEVEL))
        {
            return;
        }
        sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
        secLevel.AppendTo(sink);
    }

    if (emitCase)
    {
        if (!callback.NeedToWrite(Collation.CASE_LEVEL))
        {
            return;
        }
        sink.Append(Collation.LEVEL_SEPARATOR_BYTE);

        // Two nibbles are packed per output byte; a trailing unpaired nibble is
        // written on its own. The final entry (the trailing NO_CE) is skipped.
        int caseCount = caseLevel.Length - 1;
        byte held = 0;
        for (int i = 0; i < caseCount; ++i)
        {
            byte current = caseLevel.GetAt(i);
            Debug.Assert((current & 0xf) == 0 && current != 0);
            if (held != 0)
            {
                sink.Append(held | ((current >> 4) & 0xf));
                held = 0;
            }
            else
            {
                held = current;
            }
        }
        if (held != 0)
        {
            sink.Append(held);
        }
    }

    if (emitTertiary)
    {
        if (!callback.NeedToWrite(Collation.TERTIARY_LEVEL))
        {
            return;
        }
        sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
        terLevel.AppendTo(sink);
    }

    if (emitQuaternary)
    {
        if (!callback.NeedToWrite(Collation.QUATERNARY_LEVEL))
        {
            return;
        }
        sink.Append(Collation.LEVEL_SEPARATOR_BYTE);
        quatLevel.AppendTo(sink);
    }
}
/// <summary>
/// Copies this level's bytes into <paramref name="sink"/>, leaving out the final byte,
/// which is expected to be the 01 terminator.
/// </summary>
/// <param name="sink">Destination for all bytes except the last one.</param>
internal void AppendTo(SortKeyByteSink sink)
{
    int payloadLength = len - 1;

    // The level must be non-empty and must end with the terminator byte.
    Debug.Assert(payloadLength >= 0 && buffer[payloadLength] == 1);

    sink.Append(buffer, payloadLength);
}