/// <summary>
/// Computes a non-randomized hash code for this string in which ASCII letters that differ
/// only by case contribute the same value.
/// </summary>
/// <remarks>
/// The string is read as 32-bit words (two UTF-16 chars per word). Inside the main loop the
/// even-numbered words feed <c>hash1</c> and the odd-numbered words feed <c>hash2</c>; a final
/// leftover word, if any, feeds <c>hash2</c>. As soon as a word containing a non-ASCII char is
/// seen, the partial result is discarded and the work is handed to
/// <c>GetNonRandomizedHashCodeOrdinalIgnoreCaseSlow</c>.
/// </remarks>
/// <returns>The combined 32-bit hash of both accumulators.</returns>
internal unsafe int GetNonRandomizedHashCodeOrdinalIgnoreCase()
{
    uint hash1 = (5381 << 16) + 5381;
    uint hash2 = hash1;

    fixed (char* src = &_firstChar)
    {
        // Reading whole 32-bit words may touch the char just past the end of the string,
        // so the code below relies on the string being null-terminated.
        Debug.Assert(src[this.Length] == '\0', "src[this.Length] == '\\0'");

        // Check the alignment on the full-width address: an (int) cast would truncate a
        // 64-bit pointer (and would overflow if this were ever compiled in a checked context).
        Debug.Assert(((nuint)src) % 4 == 0, "Managed string should start at 4 bytes boundary");

        uint* ptr = (uint*)src;
        int length = this.Length;

        // We "normalize to lowercase" every char by ORing with 0x0020. This casts
        // a very wide net because it will change, e.g., '^' to '~'. But that should
        // be ok because we expect this to be very rare in practice.
        const uint NormalizeToLowercase = 0x0020_0020u; // valid both for big-endian and for little-endian

        // Main loop: two words (four chars) per iteration.
        while (length > 2)
        {
            uint p0 = ptr[0];
            uint p1 = ptr[1];
            if (!Utf16Utility.AllCharsInUInt32AreAscii(p0 | p1))
            {
                goto NotAscii;
            }

            length -= 4;

            // Where length is 4n-1 (e.g. 3,7,11,15,19) this additionally consumes the null terminator
            hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ (p0 | NormalizeToLowercase);
            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (p1 | NormalizeToLowercase);
            ptr += 2;
        }

        // Tail: one or two chars remain and fit in a single word.
        if (length > 0)
        {
            uint p0 = ptr[0];
            if (!Utf16Utility.AllCharsInUInt32AreAscii(p0))
            {
                goto NotAscii;
            }

            // Where length is 4n-3 (e.g. 1,5,9,13,17) this additionally consumes the null terminator
            hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (p0 | NormalizeToLowercase);
        }
    }

    return (int)(hash1 + (hash2 * 1566083941));

NotAscii:
    // The OR-with-0x20 shortcut is only applied to all-ASCII words; defer everything else.
    return GetNonRandomizedHashCodeOrdinalIgnoreCaseSlow(this);
}
/// <summary>
/// Runs the Marvin hash over UTF-16 data with ASCII chars folded to uppercase, then
/// reduces the two state words to a single 32-bit value.
/// n.b. <paramref name="count"/> is measured in chars, not in bytes.
/// </summary>
/// <param name="data">Reference to the first char to hash.</param>
/// <param name="count">Number of chars to hash, starting at <paramref name="data"/>.</param>
/// <param name="p0">First word of the initial hash state.</param>
/// <param name="p1">Second word of the initial hash state.</param>
/// <returns>
/// <c>p1 ^ p0</c> after the final mixing rounds, or the result of
/// <c>ComputeHash32OrdinalIgnoreCaseSlow</c> when a non-ASCII char is encountered.
/// </returns>
public static int ComputeHash32OrdinalIgnoreCase(ref char data, int count, uint p0, uint p1)
{
    uint charsLeft = (uint)count;   // in chars
    nuint offsetInBytes = 0;        // in bytes

    // Consume one 32-bit word (two chars) per iteration. The counters are only advanced
    // after a word has been mixed in, so on an early exit they still describe the
    // first word that has not been hashed yet.
    for (; charsLeft >= 2; offsetInBytes += 4, charsLeft -= 2)
    {
        uint charPair = Unsafe.ReadUnaligned<uint>(
            ref Unsafe.As<char, byte>(ref Unsafe.AddByteOffset(ref data, offsetInBytes)));

        if (!Utf16Utility.AllCharsInUInt32AreAscii(charPair))
        {
            goto NotAscii;
        }

        p0 += Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(charPair);
        Block(ref p0, ref p1);
    }

    // Zero or one char (16 bits) is left at this point.
    Debug.Assert(charsLeft < 2);

    if (charsLeft > 0)
    {
        uint lastChar = Unsafe.AddByteOffset(ref data, offsetInBytes);
        if (lastChar > 0x7Fu)
        {
            goto NotAscii;
        }

        // The 0x80u subtracted here is added back unconditionally just below, so this
        // branch can simply fall through instead of having to jump over that statement.
        // Net effect for a trailing char: p0 += upper(lastChar) + 0x800000u.
        p0 += Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(lastChar) + (0x800000u - 0x80u);
    }

    p0 += 0x80u;

    // Two closing mixing rounds, then fold the state into 32 bits.
    Block(ref p0, ref p1);
    Block(ref p0, ref p1);

    return (int)(p1 ^ p0);

NotAscii:
    // Hand the not-yet-hashed remainder, together with the current state, to the slow path.
    Debug.Assert(0 <= charsLeft && charsLeft <= int.MaxValue); // this should fit into a signed int
    return ComputeHash32OrdinalIgnoreCaseSlow(
        ref Unsafe.AddByteOffset(ref data, offsetInBytes), (int)charsLeft, p0, p1);
}
/// <summary>
/// Returns <paramref name="source"/> converted to upper or lower case, as selected by
/// <typeparamref name="TConversion"/>. When no char needs to change, the original
/// instance is returned and nothing is allocated.
/// </summary>
/// <typeparam name="TConversion">
/// Either <c>ToUpperConversion</c> or <c>ToLowerConversion</c>.
/// </typeparam>
/// <param name="source">The string to convert; must not be null.</param>
/// <returns>The case-converted string, or <paramref name="source"/> itself if it is already in the requested case.</returns>
private unsafe string ChangeCaseCommon<TConversion>(string source) where TConversion : struct
{
    Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
    bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

    Debug.Assert(!_invariantMode);
    Debug.Assert(source != null);

    // Nothing to do for an empty string.
    if (source.Length == 0)
    {
        return string.Empty;
    }

    fixed (char* pSource = source)
    {
        // Number of leading chars already known to need no change (in chars).
        nuint scanned = 0;

        // When this culture cases ASCII exactly like the invariant culture, first try a
        // managed, ASCII-only scan instead of calling out to the OS for culture-aware casing.
        if (IsAsciiCasingSameAsInvariant)
        {
            nuint totalChars = (uint)source.Length;

            // Inspect 2 chars (one 32-bit integer) at a time.
            // See the comments in ChangeCaseCommon<TConversion>(ROS<char>, Span<char>) for a full explanation.
            while (totalChars - scanned >= 2)
            {
                uint charPair = Unsafe.ReadUnaligned<uint>(pSource + scanned);
                if (!Utf16Utility.AllCharsInUInt32AreAscii(charPair))
                {
                    goto NotAscii;
                }

                bool pairMustChange = toUpper
                    ? Utf16Utility.UInt32ContainsAnyLowercaseAsciiChar(charPair)
                    : Utf16Utility.UInt32ContainsAnyUppercaseAsciiChar(charPair);
                if (pairMustChange)
                {
                    goto AsciiMustChangeCase;
                }

                scanned += 2;
            }

            // An odd-length string leaves exactly one char that has not been inspected yet.
            if (scanned < totalChars)
            {
                uint lastChar = pSource[scanned];
                if (lastChar > 0x7Fu)
                {
                    goto NotAscii;
                }

                bool lastMustChange = toUpper
                    ? (lastChar >= 'a' && lastChar <= 'z')
                    : (lastChar >= 'A' && lastChar <= 'Z');
                if (lastMustChange)
                {
                    goto AsciiMustChangeCase;
                }
            }

            // Every char was ASCII and already in the requested case - done!
            return source;

        AsciiMustChangeCase:
            {
                // ASCII data that has to change case was found. A new string is unavoidable,
                // but try to stay on the managed (non-localization tables) conversion path.
                int unchangedPrefix = (int)scanned;

                // changing case uses simple folding: doesn't change UTF-16 code unit count
                string converted = string.FastAllocateString(source.Length);
                Span<char> convertedSpan = new Span<char>(ref converted.GetRawStringData(), converted.Length);

                // Carry over the prefix that is already known to be good...
                source.AsSpan(0, unchangedPrefix).CopyTo(convertedSpan);

                // ...and let the span-based fast path convert everything after it.
                ChangeCaseCommon<TConversion>(source.AsSpan(unchangedPrefix), convertedSpan.Slice(unchangedPrefix));
                return converted;
            }
        }

    NotAscii:
        {
            // Reached either because non-ASCII data was found *or* because this culture does not
            // case ASCII the same way as the invariant culture. Both need the localization tables.

            // changing case uses simple folding: doesn't change UTF-16 code unit count
            string converted = string.FastAllocateString(source.Length);

            if (scanned > 0)
            {
                // Carry over the prefix that is already known to be good.
                Span<char> convertedSpan = new Span<char>(ref converted.GetRawStringData(), converted.Length);
                source.AsSpan(0, (int)scanned).CopyTo(convertedSpan);
            }

            // Run the culture-aware conversion over whatever follows the prefix.
            fixed (char* pConverted = converted)
            {
                ChangeCase(
                    pSource + scanned,
                    source.Length - (int)scanned,
                    pConverted + scanned,
                    converted.Length - (int)scanned,
                    toUpper);
            }

            return converted;
        }
    }
}
/// <summary>
/// Converts <paramref name="charCount"/> chars starting at <paramref name="source"/> to upper or
/// lower case, as selected by <typeparamref name="TConversion"/>, and writes the converted chars
/// to the same positions relative to <paramref name="destination"/>.
/// </summary>
/// <typeparam name="TConversion">
/// Either <c>ToUpperConversion</c> or <c>ToLowerConversion</c>.
/// </typeparam>
/// <param name="source">Reference to the first char to read.</param>
/// <param name="destination">Reference to the first char to write.</param>
/// <param name="charCount">Number of chars to convert; must be non-negative.</param>
private unsafe void ChangeCaseCommon<TConversion>(ref char source, ref char destination, int charCount) where TConversion : struct
{
    Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
    bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

    Debug.Assert(!_invariantMode);
    Debug.Assert(charCount >= 0);

    if (charCount == 0)
    {
        return;
    }

    fixed (char* pSource = &source)
    fixed (char* pDestination = &destination)
    {
        nuint position = 0; // in chars

        if (IsAsciiCasingSameAsInvariant)
        {
            nuint totalChars = (uint)charCount;

            // Convert 4 chars (two 32-bit integers) per iteration.
            //
            // The routine is mostly branchless. The input is assumed to be predominantly ASCII,
            // so the 'if' checks are expected to be false almost always; within ASCII data,
            // however, upper and lower case are assumed to be about equally likely, so the
            // conversion itself avoids branching. Inputs are also assumed to be short
            // (under ~16 chars), which is why nothing here is vectorized.
            while (totalChars - position >= 4)
            {
                uint charPair = Unsafe.ReadUnaligned<uint>(pSource + position);
                if (!Utf16Utility.AllCharsInUInt32AreAscii(charPair))
                {
                    goto NonAscii;
                }

                uint convertedPair = toUpper
                    ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(charPair)
                    : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(charPair);
                Unsafe.WriteUnaligned<uint>(pDestination + position, convertedPair);

                charPair = Unsafe.ReadUnaligned<uint>(pSource + position + 2);
                if (!Utf16Utility.AllCharsInUInt32AreAscii(charPair))
                {
                    // The first two chars of this group were already written; step past them.
                    position += 2;
                    goto NonAscii;
                }

                convertedPair = toUpper
                    ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(charPair)
                    : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(charPair);
                Unsafe.WriteUnaligned<uint>(pDestination + position + 2, convertedPair);

                position += 4;
            }

            // Fewer than 4 chars remain from here on.
            Debug.Assert(totalChars - position < 4);

            // With 2 or 3 chars remaining, convert 2 of them now.
            if ((charCount & 2) != 0)
            {
                uint charPair = Unsafe.ReadUnaligned<uint>(pSource + position);
                if (!Utf16Utility.AllCharsInUInt32AreAscii(charPair))
                {
                    goto NonAscii;
                }

                uint convertedPair = toUpper
                    ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(charPair)
                    : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(charPair);
                Unsafe.WriteUnaligned<uint>(pDestination + position, convertedPair);

                position += 2;
            }

            // A single trailing char, if any, is converted on its own.
            if ((charCount & 1) != 0)
            {
                uint lastChar = pSource[position];
                if (lastChar > 0x7Fu)
                {
                    goto NonAscii;
                }

                uint convertedChar = toUpper
                    ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(lastChar)
                    : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(lastChar);
                pDestination[position] = (char)convertedChar;
            }

            // The whole input was ASCII and has been converted.
            return;

        NonAscii:
            // Non-ASCII data was found: shrink the count to what is still unconverted and
            // fall through to the p/invoke code path below.
            Debug.Assert(position < (uint)charCount, "We somehow read past the end of the buffer.");
            charCount -= (int)position;
        }

        // Reached either because non-ASCII data was encountered, or because this culture's case
        // conversion differs from the invariant culture even for ASCII data (e.g., tr-TR converts
        // 'i' (U+0069) to Latin Capital Letter I With Dot Above (U+0130)).
        ChangeCase(pSource + position, charCount, pDestination + position, charCount, toUpper);
    }
}