public void Utf8BytesUFFFF()
        {
            // Round-trip check: encode U+0800, U+D7FF and U+FFFF with Utf16Utility.GetUtf8Bytes,
            // decode the produced bytes with the framework's UTF-8 decoder, and require that
            // the very same code units come back.
            char[] source = { (char)0x800, (char)0xd7ff, (char)0xffff };
            var    text   = new string(source);

            Utf16Utility.GetUtf8Bytes(text, out var utf8, out var written);
            string decoded = Encoding.UTF8.GetString(utf8, 0, written);

            // Compare as integers so that a failure message shows numeric code units.
            var expectedCodes = source.Select(c => (int)c);
            var actualCodes   = decoded.ToCharArray().Select(c => (int)c);

            Assert.Equal(expectedCodes, actualCodes);
        }
        /// <summary>
        /// Encodes the characters U+0001..U+000A into a caller-supplied buffer via
        /// Utf16Utility.GetUtf8Bytes and verifies that decoding the written bytes
        /// yields the original characters.
        /// </summary>
        public void Utf8BytesU7FAllocated()
        {
            // Code units 1 through 10 inclusive.
            char[] source      = Enumerable.Range(1, 10).Select(code => (char)code).ToArray();
            byte[] destination = new byte[source.Length];
            var    text        = new string(source);

            Utf16Utility.GetUtf8Bytes(text, destination, 0, out var written);
            char[] decoded = Encoding.UTF8.GetString(destination, 0, written).ToCharArray();

            // Compare as integers so that a failure message shows numeric code units.
            var expectedCodes = source.Select(c => (int)c);
            var actualCodes   = decoded.Select(c => (int)c);

            Assert.Equal(expectedCodes, actualCodes);
        }
        /// <summary>
        /// Repeatedly encodes a string of <c>Length</c> copies of <c>CharacterCode</c> into a
        /// single pre-allocated buffer, <c>LoopNum</c> times. The encoded output is not inspected.
        /// </summary>
        public void GetUtf8BytesPreallocated()
        {
            char[] chars = Enumerable.Range(0, Length).Select(x => (char)CharacterCode).ToArray();
            string text  = new string(chars);

            // Three bytes per UTF-16 code unit is reserved for the output.
            byte[] destination = new byte[text.Length * 3];

            for (int iteration = 0; iteration < LoopNum; ++iteration)
            {
                Utf16Utility.GetUtf8Bytes(text, destination, 0, out var written);
            }
        }
Exemple #4
0
        /// <summary>
        /// Computes a non-randomized hash of this string in which every char is OR-ed with
        /// 0x0020 before mixing, so ASCII letters hash the same regardless of case.
        /// If any non-ASCII char is detected, the work is handed off to
        /// GetNonRandomizedHashCodeOrdinalIgnoreCaseSlow instead.
        /// </summary>
        /// <returns>The combined 32-bit hash value.</returns>
        internal unsafe int GetNonRandomizedHashCodeOrdinalIgnoreCase()
        {
            // Two accumulators that start from the same seed; they are combined at the end.
            uint hash1 = (5381 << 16) + 5381;
            uint hash2 = hash1;

            fixed(char *src = &_firstChar)
            {
                // The loops below rely on a trailing '\0' being readable and on 4-byte alignment,
                // because the string is read as 32-bit values (two chars at a time).
                Debug.Assert(src[this.Length] == '\0', "src[this.Length] == '\\0'");
                Debug.Assert(((int)src) % 4 == 0, "Managed string should start at 4 bytes boundary");

                uint *ptr    = (uint *)src;
                int   length = this.Length;

                // We "normalize to lowercase" every char by ORing with 0x0020. This casts
                // a very wide net because it will change, e.g., '^' to '~'. But that should
                // be ok because we expect this to be very rare in practice.
                const uint NormalizeToLowercase = 0x0020_0020u; // valid both for big-endian and for little-endian

                // Main loop: consume four chars (two 32-bit reads) per iteration,
                // feeding the first read into hash1 and the second into hash2.
                while (length > 2)
                {
                    uint p0 = ptr[0];
                    uint p1 = ptr[1];
                    // A single check on the OR of both reads covers all four chars.
                    if (!Utf16Utility.AllCharsInUInt32AreAscii(p0 | p1))
                    {
                        goto NotAscii;
                    }

                    length -= 4;
                    // Where length is 4n-1 (e.g. 3,7,11,15,19) this additionally consumes the null terminator
                    hash1 = (BitOperations.RotateLeft(hash1, 5) + hash1) ^ (p0 | NormalizeToLowercase);
                    hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (p1 | NormalizeToLowercase);
                    ptr  += 2;
                }

                // Tail: one or two chars remain; they are covered by a single 32-bit read
                // that is mixed into hash2 only.
                if (length > 0)
                {
                    uint p0 = ptr[0];
                    if (!Utf16Utility.AllCharsInUInt32AreAscii(p0))
                    {
                        goto NotAscii;
                    }

                    // Where length is 4n-3 (e.g. 1,5,9,13,17) this additionally consumes the null terminator
                    hash2 = (BitOperations.RotateLeft(hash2, 5) + hash2) ^ (p0 | NormalizeToLowercase);
                }
            }

            // Combine the two accumulators into the final result.
            return((int)(hash1 + (hash2 * 1566083941)));

NotAscii:
            // Non-ASCII data was seen: discard the partial state and let the slow routine
            // process the whole string.
            return(GetNonRandomizedHashCodeOrdinalIgnoreCaseSlow(this));
        // [Fact]
        public void AsciiString()
        {
            // Drains the byte enumerator returned by Utf16Utility.GetEnumerator for an
            // ASCII-only string and checks the bytes against Encoding.UTF8.GetBytes.
            const string text = "abcde012345=!";

            byte[] expected   = Encoding.UTF8.GetBytes(text);
            byte[] collected  = new byte[expected.Length];
            var    enumerator = Utf16Utility.GetEnumerator(text.ToCharArray(), 0, text.Length);

            int written = 0;
            while (enumerator.TryGetNext(out byte current))
            {
                collected[written++] = current;
            }

            Assert.Equal(expected, collected);
        }
Exemple #6
0
        /// <summary>
        /// Compute a Marvin OrdinalIgnoreCase hash and collapse it into a 32-bit hash.
        /// n.b. <paramref name="count"/> is specified as char count, not byte count.
        /// </summary>
        /// <param name="data">Reference to the first char of the input.</param>
        /// <param name="count">Number of chars to hash, starting at <paramref name="data"/>.</param>
        /// <param name="p0">First half of the initial hash state; updated by each mixing round.</param>
        /// <param name="p1">Second half of the initial hash state; updated by each mixing round.</param>
        /// <returns>
        /// <c>p1 ^ p0</c> after the final mixing rounds, or the result of
        /// ComputeHash32OrdinalIgnoreCaseSlow if a non-ASCII char is encountered.
        /// </returns>
        public static int ComputeHash32OrdinalIgnoreCase(ref char data, int count, uint p0, uint p1)
        {
            uint  ucount     = (uint)count; // in chars
            nuint byteOffset = 0;           // in bytes
            uint  tempValue;

            // We operate on 32-bit integers (two chars) at a time.

            while (ucount >= 2)
            {
                // Unaligned 32-bit read of the next two chars at the current byte offset.
                tempValue = Unsafe.ReadUnaligned <uint>(ref Unsafe.As <char, byte>(ref Unsafe.AddByteOffset(ref data, byteOffset)));
                if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue))
                {
                    goto NotAscii;
                }
                // Fold the case-normalized pair into the state, then run one mixing round.
                p0 += Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue);
                Block(ref p0, ref p1);

                byteOffset += 4;
                ucount     -= 2;
            }

            // We have either one char (16 bits) or zero chars left over.
            Debug.Assert(ucount < 2);

            if (ucount > 0)
            {
                // Read the single remaining char (widened to uint).
                tempValue = Unsafe.AddByteOffset(ref data, byteOffset);
                if (tempValue > 0x7Fu)
                {
                    goto NotAscii;
                }

                // addition is written with -0x80u to allow fall-through to next statement rather than jmp past it
                p0 += Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) + (0x800000u - 0x80u);
            }
            // Net effect with the statement below: 0x800000 is added alongside a trailing char,
            // or 0x80 alone when no char was left over.
            p0 += 0x80u;

            // Two final mixing rounds before collapsing the state.
            Block(ref p0, ref p1);
            Block(ref p0, ref p1);

            return((int)(p1 ^ p0));

NotAscii:
            // Hand the unprocessed remainder (from byteOffset onward, ucount chars) and the
            // current state to the slow routine.
            Debug.Assert(0 <= ucount && ucount <= Int32.MaxValue); // this should fit into a signed int
            return(ComputeHash32OrdinalIgnoreCaseSlow(ref Unsafe.AddByteOffset(ref data, byteOffset), (int)ucount, p0, p1));
        }
Exemple #7
0
        /// <summary>
        /// Returns <paramref name="source"/> converted to upper or lower case, as selected by
        /// <typeparamref name="TConversion"/>. The original instance is returned unchanged when the
        /// ASCII fast path finds nothing to convert; otherwise a new string of the same length is
        /// allocated and filled.
        /// </summary>
        /// <typeparam name="TConversion">Either ToUpperConversion or ToLowerConversion (asserted in debug builds).</typeparam>
        /// <param name="source">The string to convert; must not be null (asserted in debug builds).</param>
        /// <returns>The case-converted string, <see cref="string.Empty"/> for empty input, or <paramref name="source"/> itself if no change was needed.</returns>
        private unsafe string ChangeCaseCommon <TConversion>(string source) where TConversion : struct
        {
            Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
            bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

            Debug.Assert(!_invariantMode);
            Debug.Assert(source != null);

            // If the string is empty, we're done.
            if (source.Length == 0)
            {
                return(string.Empty);
            }

            fixed(char *pSource = source)
            {
                // Number of leading chars already inspected and known to need no change.
                nuint currIdx = 0; // in chars

                // If this culture's casing for ASCII is the same as invariant, try to take
                // a fast path that'll work in managed code and ASCII rather than calling out
                // to the OS for culture-aware casing.
                if (IsAsciiCasingSameAsInvariant)
                {
                    // Read 2 chars (one 32-bit integer) at a time

                    if (source.Length >= 2)
                    {
                        nuint lastIndexWhereCanReadTwoChars = (uint)source.Length - 2;
                        do
                        {
                            // See the comments in ChangeCaseCommon<TConversion>(ROS<char>, Span<char>) for a full explanation of the below code.

                            uint tempValue = Unsafe.ReadUnaligned <uint>(pSource + currIdx);
                            if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue))
                            {
                                goto NotAscii;
                            }
                            // Only a char of the "wrong" case forces an allocation; otherwise keep scanning.
                            if ((toUpper) ? Utf16Utility.UInt32ContainsAnyLowercaseAsciiChar(tempValue) : Utf16Utility.UInt32ContainsAnyUppercaseAsciiChar(tempValue))
                            {
                                goto AsciiMustChangeCase;
                            }

                            currIdx += 2;
                        } while (currIdx <= lastIndexWhereCanReadTwoChars);
                    }

                    // If there's a single character left to convert, do it now.
                    if ((source.Length & 1) != 0)
                    {
                        uint tempValue = pSource[currIdx];
                        if (tempValue > 0x7Fu)
                        {
                            goto NotAscii;
                        }
                        // Unsigned range check: true when the char is a letter of the case that must change.
                        if ((toUpper) ? ((tempValue - 'a') <= (uint)('z' - 'a')) : ((tempValue - 'A') <= (uint)('Z' - 'A')))
                        {
                            goto AsciiMustChangeCase;
                        }
                    }

                    // We got through all characters without finding anything that needed to change - done!
                    return(source);

AsciiMustChangeCase:
                    {
                        // We reached ASCII data that requires a case change.
                        // This will necessarily allocate a new string, but let's try to stay within the managed (non-localization tables)
                        // conversion code path if we can.

                        string result = string.FastAllocateString(source.Length); // changing case uses simple folding: doesn't change UTF-16 code unit count

                        // copy existing known-good data into the result
                        Span <char> resultSpan = new Span <char>(ref result.GetRawStringData(), result.Length);
                        source.AsSpan(0, (int)currIdx).CopyTo(resultSpan);

                        // and re-run the fast span-based logic over the remainder of the data
                        ChangeCaseCommon <TConversion>(source.AsSpan((int)currIdx), resultSpan.Slice((int)currIdx));
                        return(result);
                    }
                }

                // Reached either by an explicit jump from the fast path above, or by falling through
                // with currIdx == 0 when IsAsciiCasingSameAsInvariant is false.
NotAscii:
                {
                    // We reached non-ASCII data *or* the requested culture doesn't map ASCII data the same way as the invariant culture.
                    // In either case we need to fall back to the localization tables.

                    string result = string.FastAllocateString(source.Length); // changing case uses simple folding: doesn't change UTF-16 code unit count

                    if (currIdx > 0)
                    {
                        // copy existing known-good data into the result
                        Span <char> resultSpan = new Span <char>(ref result.GetRawStringData(), result.Length);
                        source.AsSpan(0, (int)currIdx).CopyTo(resultSpan);
                    }

                    // and run the culture-aware logic over the remainder of the data
                    fixed(char *pResult = result)
                    {
                        ChangeCase(pSource + currIdx, source.Length - (int)currIdx, pResult + currIdx, result.Length - (int)currIdx, toUpper);
                    }

                    return(result);
                }
            }
        }
Exemple #8
0
        /// <summary>
        /// Converts <paramref name="charCount"/> chars starting at <paramref name="source"/> to upper or
        /// lower case (selected by <typeparamref name="TConversion"/>) and writes the result to
        /// <paramref name="destination"/>. ASCII data is converted inline when
        /// IsAsciiCasingSameAsInvariant is true; any remaining data is passed to ChangeCase.
        /// </summary>
        /// <typeparam name="TConversion">Either ToUpperConversion or ToLowerConversion (asserted in debug builds).</typeparam>
        /// <param name="source">Reference to the first input char.</param>
        /// <param name="destination">Reference to the first output char; the same number of chars is written as is read.</param>
        /// <param name="charCount">Number of chars to convert; must be non-negative (asserted in debug builds).</param>
        private unsafe void ChangeCaseCommon <TConversion>(ref char source, ref char destination, int charCount) where TConversion : struct
        {
            Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion));
            bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds

            Debug.Assert(!_invariantMode);
            Debug.Assert(charCount >= 0);

            // Nothing to convert.
            if (charCount == 0)
            {
                goto Return;
            }

            fixed(char *pSource = &source)
            fixed(char *pDestination = &destination)
            {
                // Number of chars already converted and written to the destination.
                nuint currIdx = 0; // in chars

                if (IsAsciiCasingSameAsInvariant)
                {
                    // Read 4 chars (two 32-bit integers) at a time

                    if (charCount >= 4)
                    {
                        nuint lastIndexWhereCanReadFourChars = (uint)charCount - 4;
                        do
                        {
                            // This is a mostly branchless case change routine. Generally speaking, we assume that the majority
                            // of input is ASCII, so the 'if' checks below should normally evaluate to false. However, within
                            // the ASCII data, we expect that characters of either case might be about equally distributed, so
                            // we want the case change operation itself to be branchless. This gives optimal performance in the
                            // common case. We also expect that developers aren't passing very long (16+ character) strings into
                            // this method, so we won't bother vectorizing until data shows us that it's worthwhile to do so.

                            // First pair of the four-char group.
                            uint tempValue = Unsafe.ReadUnaligned <uint>(pSource + currIdx);
                            if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue))
                            {
                                goto NonAscii;
                            }
                            tempValue = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue);
                            Unsafe.WriteUnaligned <uint>(pDestination + currIdx, tempValue);

                            // Second pair. The first pair has already been written, so a failure here
                            // must advance currIdx by two before falling back (see NonAsciiSkipTwoChars).
                            tempValue = Unsafe.ReadUnaligned <uint>(pSource + currIdx + 2);
                            if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue))
                            {
                                goto NonAsciiSkipTwoChars;
                            }
                            tempValue = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue);
                            Unsafe.WriteUnaligned <uint>(pDestination + currIdx + 2, tempValue);
                            currIdx += 4;
                        } while (currIdx <= lastIndexWhereCanReadFourChars);

                        // At this point, there are fewer than 4 characters remaining to convert.
                        Debug.Assert((uint)charCount - currIdx < 4);
                    }

                    // If there are 2 or 3 characters left to convert, we'll convert 2 of them now.
                    // (The loop above consumes multiples of four, so bit 1 of charCount tells us this.)
                    if ((charCount & 2) != 0)
                    {
                        uint tempValue = Unsafe.ReadUnaligned <uint>(pSource + currIdx);
                        if (!Utf16Utility.AllCharsInUInt32AreAscii(tempValue))
                        {
                            goto NonAscii;
                        }
                        tempValue = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue);
                        Unsafe.WriteUnaligned <uint>(pDestination + currIdx, tempValue);
                        currIdx += 2;
                    }

                    // If there's a single character left to convert, do it now.
                    if ((charCount & 1) != 0)
                    {
                        uint tempValue = pSource[currIdx];
                        if (tempValue > 0x7Fu)
                        {
                            goto NonAscii;
                        }
                        // The two-char helper is reused on a single char held in the low 16 bits;
                        // only the low 16 bits of its result are stored.
                        tempValue             = (toUpper) ? Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) : Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(tempValue);
                        pDestination[currIdx] = (char)tempValue;
                    }

                    // And we're finished!

                    goto Return;

                    // If we reached this point, we found non-ASCII data.
                    // Fall back down the p/invoke code path.

NonAsciiSkipTwoChars:
                    // Account for the pair that was converted and written before the failing read.
                    currIdx += 2;

NonAscii:
                    Debug.Assert(currIdx < (uint)charCount, "We somehow read past the end of the buffer.");
                    // From here on charCount is the number of chars still to be converted.
                    charCount -= (int)currIdx;
                }

                // We encountered non-ASCII data and therefore can't perform invariant case conversion; or the requested culture
                // has a case conversion that's different from the invariant culture, even for ASCII data (e.g., tr-TR converts
                // 'i' (U+0069) to Latin Capital Letter I With Dot Above (U+0130)).

                ChangeCase(pSource + currIdx, charCount, pDestination + currIdx, charCount, toUpper);
            }

Return:
            return;
        }
Exemple #9
0
        // Transcodes UTF-16 input into a newly allocated, null-terminated UTF-8 byte array
        // (the array is one byte longer than the UTF-8 payload).
        // Returns 'null' if the input buffer does not represent well-formed UTF-16 data and 'replaceInvalidSequences' is false.
        // NOTE(review): for inputs longer than MAX_STACK_TRANSCODE_CHAR_COUNT the code below also returns
        // 'null' on ill-formed input when 'replaceInvalidSequences' is true (the validation step returns early
        // and the final transcode passes replaceInvalidSequences: false) - confirm callers expect this.
        private static byte[]? CreateBufferFromUtf16Common(ReadOnlySpan <char> value, bool replaceInvalidSequences)
        {
            // Shortcut: Since we expect most strings to be small-ish, first try a one-pass
            // operation where we transcode directly on to the stack and then copy the validated
            // data into the new buffer. It's still O(n), but it should have a smaller
            // constant factor than a typical "count + transcode" combo.

            OperationStatus status;

            byte[] newBuffer;

            if (value.Length <= MAX_STACK_TRANSCODE_CHAR_COUNT /* in chars */)
            {
                if (value.IsEmpty)
                {
                    // Empty input: hand back the backing array of the shared empty instance.
                    return(Utf8String.Empty._bytes);
                }

                Span <byte> scratch = stackalloc byte[MAX_STACK_TRANSCODE_CHAR_COUNT * MAX_UTF8_BYTES_PER_UTF16_CHAR]; // largest possible expansion, as explained below
                status = Utf8.FromUtf16(value, scratch, out _, out int scratchBytesWritten, replaceInvalidSequences);
                Debug.Assert(status == OperationStatus.Done || status == OperationStatus.InvalidData);

                if (status == OperationStatus.InvalidData)
                {
                    return(null);
                }

                // At this point we know transcoding succeeded (the input was well-formed, or invalid
                // sequences were replaced). We'll memcpy the scratch buffer into the new array, which is very fast.

                newBuffer = new byte[scratchBytesWritten + 1]; // null-terminated
                scratch.Slice(0, scratchBytesWritten).CopyTo(newBuffer);
                return(newBuffer);
            }

            // First, determine how many UTF-8 bytes we'll need in order to represent this data.
            // This also checks the input data for well-formedness.

            long utf8CodeUnitCountAdjustment;

            unsafe
            {
                fixed(char *pChars = &MemoryMarshal.GetReference(value))
                {
                    // Anything other than "pointer to end of input" means an invalid char was found.
                    if (Utf16Utility.GetPointerToFirstInvalidChar(pChars, value.Length, out utf8CodeUnitCountAdjustment, out int _) != (pChars + (uint)value.Length))
                    {
                        return(null);
                    }
                }
            }

            // The max possible expansion transcoding UTF-16 to UTF-8 is that each input char corresponds
            // to 3 UTF-8 bytes. This is most common in CJK languages. Since the input buffer could be
            // up to int.MaxValue elements in length, we need to use a 64-bit value to hold the total
            // required UTF-8 byte length. However, the VM places restrictions on how large a Utf8String
            // instance can be, and the maximum allowed element count is just under int.MaxValue. (This
            // mirrors the restrictions already in place for System.String.) The VM will throw an
            // OutOfMemoryException if anybody tries to create a Utf8String instance larger than that,
            // so if we detect any sort of overflow we'll end up passing int.MaxValue down to the allocation
            // routine. This normalizes the OutOfMemoryException the caller sees.

            long totalUtf8BytesRequired = (uint)value.Length + utf8CodeUnitCountAdjustment;

            if (totalUtf8BytesRequired >= int.MaxValue)
            {
                // Clamp so that the "+ 1" below yields exactly int.MaxValue instead of overflowing.
                totalUtf8BytesRequired = int.MaxValue - 1;
            }

            // 'new byte[]' zero-initializes the array, so the extra trailing byte already holds the
            // null terminator; only the first totalUtf8BytesRequired bytes are written below.

            newBuffer = new byte[(int)totalUtf8BytesRequired + 1]; // null-terminated

            // Now transcode the UTF-16 input into the newly allocated buffer. We can't call the
            // "skip validation" transcoder because the caller could've mutated the input buffer between the
            // initial counting step and the transcoding step below.

            status = Utf8.FromUtf16(value, newBuffer.AsSpan(0, newBuffer.Length - 1), out _, out int bytesWritten, replaceInvalidSequences: false);
            if (status != OperationStatus.Done || bytesWritten != newBuffer.Length - 1)
            {
                // Did somebody mutate our input buffer? Shouldn't be any other way this could happen.

                return(null);
            }

            return(newBuffer);
        }