/// <summary>
        /// Walks forward over <paramref name="length"/> bytes starting at <paramref name="utf8Data"/> and
        /// returns the byte offset of the first scalar value that is not whitespace. If every scalar in the
        /// buffer is whitespace (or the buffer is empty) the returned offset is at least <paramref name="length"/>.
        /// </summary>
        private static nuint GetIndexOfFirstNonWhiteSpaceChar(ref byte utf8Data, nuint length)
        {
            // Tuned for mostly-ASCII input with, at most, a short run of leading whitespace.

            nuint offset = 0;

            while (offset < length)
            {
                byte current = Unsafe.AddByteOffset(ref utf8Data, offset);

                // Cheap filter: reinterpreted as a signed byte, only values in [ 21 .. 7F ]
                // compare greater than 0x20. Those are never whitespace, so stop right away.
                if ((sbyte)current > (sbyte)0x20)
                {
                    break;
                }

                uint currentValue = current;

                if (!UnicodeUtility.IsAsciiCodePoint(currentValue))
                {
                    // Multi-byte (or ill-formed) data: decode one full scalar from the
                    // remaining bytes and consult the Unicode whitespace tables.
                    ReadOnlySpan<byte> remaining = new ReadOnlySpan<byte>(ref utf8Data, (int)length).Slice((int)offset);
                    Rune.DecodeFromUtf8(remaining, out Rune scalar, out int scalarByteCount);

                    if (!Rune.IsWhiteSpace(scalar))
                    {
                        break; // first non-whitespace scalar found
                    }

                    offset += (uint)scalarByteCount;
                    continue;
                }

                // ASCII byte in [ 00 .. 20 ]: let Rune decide whether it is whitespace.
                if (!Rune.IsWhiteSpace(Rune.UnsafeCreate(currentValue)))
                {
                    break; // first non-whitespace scalar found
                }

                offset++;
            }

            return offset;
        }
Пример #2
0
        /// <summary>
        /// Locates the start of the run of whitespace that terminates <paramref name="utf8Data"/>.
        /// </summary>
        /// <param name="utf8Data">The UTF-8 bytes to inspect.</param>
        /// <returns>
        /// The index at which the trailing whitespace begins; 0 when the data consists solely of
        /// whitespace (or is empty); the span length when the data does not end in whitespace.
        /// </returns>
        public static int GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan <byte> utf8Data)
        {
            // Tuned for mostly-ASCII input with, at most, a short run of trailing whitespace.

            int end = utf8Data.Length;

            while (end > 0)
            {
                byte trailingByte = utf8Data[end - 1];

                // Cheap filter: reinterpreted as a signed byte, only values in [ 21 .. 7F ]
                // compare greater than 0x20. Those are never whitespace, so stop right away.
                if ((sbyte)trailingByte > (sbyte)0x20)
                {
                    break;
                }

                uint trailingValue = trailingByte;

                if (!UnicodeUtility.IsAsciiCodePoint(trailingValue))
                {
                    // Multi-byte (or ill-formed) data: decode the last full scalar of the
                    // still-untrimmed prefix and consult the Unicode whitespace tables.
                    Rune.DecodeLastFromUtf8(utf8Data.Slice(0, end), out Rune scalar, out int scalarByteCount);

                    if (!Rune.IsWhiteSpace(scalar))
                    {
                        break; // the data ends with a non-whitespace scalar
                    }

                    end -= scalarByteCount;
                    continue;
                }

                // ASCII byte in [ 00 .. 20 ]: let Rune decide whether it is whitespace.
                if (!Rune.IsWhiteSpace(new Rune(trailingValue)))
                {
                    break; // the data ends with a non-whitespace scalar
                }

                end--;
            }

            return end;
        }
Пример #3
0
        /// <summary>
        /// Finds the first position in <paramref name="utf8Text"/> that cannot be copied verbatim.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 text to scan.</param>
        /// <returns>
        /// The index of the first byte that either begins an ill-formed UTF-8 subsequence or begins a
        /// scalar value that must be encoded; -1 when the whole input can be copied without escaping.
        /// </returns>
        public virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            // A single absolute cursor is advanced over the input; it always sits on the
            // first byte of the scalar value that is about to be examined.
            int position = 0;

            while (position < utf8Text.Length)
            {
                byte leadByte = utf8Text[position];

                if (!UnicodeUtility.IsAsciiCodePoint(leadByte))
                {
                    // Multi-byte path: decode one scalar from the unread tail of the input.
                    OperationStatus decodeStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(
                        utf8Text.Slice(position), out uint scalarValue, out int scalarByteCount);

                    if (decodeStatus != OperationStatus.Done || WillEncode((int)scalarValue))
                    {
                        return position; // ill-formed data, or a scalar that needs escaping
                    }

                    position += scalarByteCount;
                    continue;
                }

                // ASCII path: the per-byte lookup says whether this byte is emitted as-is.
                if (!ReferenceEquals(GetAsciiEncoding(leadByte), s_noEscape))
                {
                    return position;
                }

                position++;
            }

            return -1; // no input data needs to be escaped
        }
Пример #4
0
        /// <summary>
        /// Finds the first position in <paramref name="utf8Text"/> that cannot be copied verbatim.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 text to scan.</param>
        /// <returns>
        /// The index of the first byte that either begins an ill-formed UTF-8 subsequence or begins a
        /// scalar value that must be encoded; -1 when the whole input can be copied without escaping.
        /// </returns>
        /// <remarks>
        /// When compiled for NETCOREAPP and SSE2 or AdvSimd (Arm64) is available, the input is examined in
        /// 16-byte chunks first; whatever is left over (fewer than 16 bytes) is handled by the scalar loop
        /// at the end of the method.
        /// </remarks>
        public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            // Make sure the ASCII cache has been set up before any ASCII byte is tested below.
            if (!_isAsciiCacheInitialized)
            {
                InitializeAsciiCache();
            }

            // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value
            // that must be encoded. If we see either of these things then we'll return its index in the original
            // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate
            // that the text can be copied as-is without escaping.

            fixed(byte *ptr = utf8Text)
            {
                // idx is the absolute index (into utf8Text) of the next byte to examine.
                int idx = 0;

#if NETCOREAPP
                // Only enter the vectorized loop when at least 16 bytes are available from idx.
                if ((Sse2.IsSupported || AdvSimd.Arm64.IsSupported) && utf8Text.Length - 16 >= idx)
                {
                    // Hoist these outside the loop, as the JIT won't do it.
                    Vector128 <sbyte> bitMaskLookupAsciiNeedsEscaping = _bitMaskLookupAsciiNeedsEscaping;
                    Vector128 <sbyte> bitPosLookup    = Ssse3Helper.s_bitPosLookup;
                    Vector128 <sbyte> nibbleMaskSByte = Ssse3Helper.s_nibbleMaskSByte;
                    Vector128 <sbyte> nullMaskSByte   = Ssse3Helper.s_nullMaskSByte;

                    sbyte *startingAddress = (sbyte *)ptr;
                    do
                    {
                        Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                        // Load the next 16 bytes.
                        Vector128 <sbyte> sourceValue;
                        bool containsNonAsciiBytes;

                        // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                        // casted to signed byte.
                        if (Sse2.IsSupported)
                        {
                            sourceValue           = Sse2.LoadVector128(startingAddress);
                            containsNonAsciiBytes = Sse2Helper.ContainsNonAsciiByte(sourceValue);
                        }
                        else if (AdvSimd.Arm64.IsSupported)
                        {
                            sourceValue           = AdvSimd.LoadVector128(startingAddress);
                            containsNonAsciiBytes = AdvSimdHelper.ContainsNonAsciiByte(sourceValue);
                        }
                        else
                        {
                            // Not expected to be reached: the enclosing 'if' requires one of the two instruction sets.
                            throw new PlatformNotSupportedException();
                        }

                        if (!containsNonAsciiBytes)
                        {
                            // All of the following 16 bytes is ASCII.
                            // TODO AdvSimd: optimization maybe achievable using VectorTableLookup and/or VectorTableLookupExtension

                            if (Ssse3.IsSupported)
                            {
                                Vector128 <sbyte> mask = Ssse3Helper.CreateEscapingMask(sourceValue, bitMaskLookupAsciiNeedsEscaping, bitPosLookup, nibbleMaskSByte, nullMaskSByte);
                                int index = Sse2Helper.GetIndexOfFirstNonAsciiByte(mask.AsByte());

                                // NOTE(review): presumably the helper yields 16 when no byte in the mask is flagged,
                                // so index < 16 means "a byte in this chunk needs escaping" - confirm against the helper.
                                if (index < 16)
                                {
                                    idx += index;
                                    goto Return;
                                }
                            }
                            else
                            {
                                // No SSSE3: test the 16 ASCII bytes one at a time. idx still refers to the start
                                // of the chunk here, so each ReturnN label adds the in-chunk offset N to it.
                                byte *p = (byte *)startingAddress;
                                if (DoesAsciiNeedEncoding(p[0]))
                                {
                                    goto Return;
                                }
                                if (DoesAsciiNeedEncoding(p[1]))
                                {
                                    goto Return1;
                                }
                                if (DoesAsciiNeedEncoding(p[2]))
                                {
                                    goto Return2;
                                }
                                if (DoesAsciiNeedEncoding(p[3]))
                                {
                                    goto Return3;
                                }
                                if (DoesAsciiNeedEncoding(p[4]))
                                {
                                    goto Return4;
                                }
                                if (DoesAsciiNeedEncoding(p[5]))
                                {
                                    goto Return5;
                                }
                                if (DoesAsciiNeedEncoding(p[6]))
                                {
                                    goto Return6;
                                }
                                if (DoesAsciiNeedEncoding(p[7]))
                                {
                                    goto Return7;
                                }
                                if (DoesAsciiNeedEncoding(p[8]))
                                {
                                    goto Return8;
                                }
                                if (DoesAsciiNeedEncoding(p[9]))
                                {
                                    goto Return9;
                                }
                                if (DoesAsciiNeedEncoding(p[10]))
                                {
                                    goto Return10;
                                }
                                if (DoesAsciiNeedEncoding(p[11]))
                                {
                                    goto Return11;
                                }
                                if (DoesAsciiNeedEncoding(p[12]))
                                {
                                    goto Return12;
                                }
                                if (DoesAsciiNeedEncoding(p[13]))
                                {
                                    goto Return13;
                                }
                                if (DoesAsciiNeedEncoding(p[14]))
                                {
                                    goto Return14;
                                }
                                if (DoesAsciiNeedEncoding(p[15]))
                                {
                                    goto Return15;
                                }
                            }

                            // Nothing in this chunk needs escaping; move on to the next 16 bytes.
                            idx += 16;
                        }
                        else
                        {
                            // At least one of the following 16 bytes is non-ASCII.

                            int processNextSixteen = idx + 16;
                            Debug.Assert(processNextSixteen <= utf8Text.Length);

                            // Walk this chunk scalar by scalar. A multi-byte scalar that starts inside the chunk
                            // may advance idx beyond processNextSixteen; the next vector load is re-based on idx below.
                            while (idx < processNextSixteen)
                            {
                                Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                                if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                                {
                                    if (DoesAsciiNeedEncoding(ptr[idx]))
                                    {
                                        goto Return;
                                    }
                                    idx++;
                                }
                                else
                                {
                                    OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                                    Debug.Assert(nextScalarValue <= int.MaxValue);
                                    // Ill-formed or incomplete UTF-8, or a scalar that must be escaped: report idx.
                                    if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                                    {
                                        goto Return;
                                    }

                                    Debug.Assert(opStatus == OperationStatus.Done);
                                    idx += utf8BytesConsumedForScalar;
                                }
                            }
                        }
                        // Re-base the load address on idx, which may not have advanced by exactly 16.
                        startingAddress = (sbyte *)ptr + idx;
                    }while (utf8Text.Length - 16 >= idx);

                    // Process the remaining bytes.
                    Debug.Assert(utf8Text.Length - idx < 16);
                }
#endif

                // Scalar loop: handles the whole input when the vectorized path is unavailable or skipped,
                // and otherwise the tail of fewer than 16 bytes.
                while (idx < utf8Text.Length)
                {
                    Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                    if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                    {
                        if (DoesAsciiNeedEncoding(ptr[idx]))
                        {
                            goto Return;
                        }
                        idx++;
                    }
                    else
                    {
                        OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                        Debug.Assert(nextScalarValue <= int.MaxValue);
                        // Ill-formed or incomplete UTF-8, or a scalar that must be escaped: report idx.
                        if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                        {
                            goto Return;
                        }

                        Debug.Assert(opStatus == OperationStatus.Done);
                        idx += utf8BytesConsumedForScalar;
                    }
                }
                Debug.Assert(idx == utf8Text.Length);

                idx = -1; // All bytes are allowed.
                goto Return;

                // Exit labels for the unrolled ASCII checks above: ReturnN reports the byte at offset N
                // within the chunk that starts at idx.
#if NETCOREAPP
Return15:
                return(idx + 15);

Return14:
                return(idx + 14);

Return13:
                return(idx + 13);

Return12:
                return(idx + 12);

Return11:
                return(idx + 11);

Return10:
                return(idx + 10);

Return9:
                return(idx + 9);

Return8:
                return(idx + 8);

Return7:
                return(idx + 7);

Return6:
                return(idx + 6);

Return5:
                return(idx + 5);

Return4:
                return(idx + 4);

Return3:
                return(idx + 3);

Return2:
                return(idx + 2);

Return1:
                return(idx + 1);
#endif
Return:
                return(idx);
            }
        }
Пример #5
0
        /// <summary>
        /// Encodes the supplied UTF-8 text.
        /// </summary>
        /// <param name="utf8Source">A source buffer containing the UTF-8 text to encode.</param>
        /// <param name="utf8Destination">The destination buffer to which the encoded form of <paramref name="utf8Source"/>
        /// will be written.</param>
        /// <param name="bytesConsumed">The number of bytes consumed from the <paramref name="utf8Source"/> buffer.</param>
        /// <param name="bytesWritten">The number of bytes written to the <paramref name="utf8Destination"/> buffer.</param>
        /// <param name="isFinalBlock"><see langword="true"/> if there is no further source data that needs to be encoded,
        /// in which case an incomplete UTF-8 sequence at the end of <paramref name="utf8Source"/> is handled like any other
        /// invalid subsequence; <see langword="false"/> if further source data may follow, in which case an incomplete
        /// trailing sequence is left unconsumed and <see cref="OperationStatus.NeedMoreData"/> is returned.</param>
        /// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation:
        /// <see cref="OperationStatus.Done"/> when the whole source was processed;
        /// <see cref="OperationStatus.DestinationTooSmall"/> when <paramref name="utf8Destination"/> cannot hold the next piece of output;
        /// <see cref="OperationStatus.NeedMoreData"/> when the source ends in an incomplete sequence and
        /// <paramref name="isFinalBlock"/> is <see langword="false"/>;
        /// <see cref="OperationStatus.InvalidData"/> when encoding a scalar value into the temporary buffer fails.</returns>
        /// <remarks>The buffers <paramref name="utf8Source"/> and <paramref name="utf8Destination"/> must not overlap.</remarks>
        public unsafe virtual OperationStatus EncodeUtf8(
            ReadOnlySpan <byte> utf8Source,
            Span <byte> utf8Destination,
            out int bytesConsumed,
            out int bytesWritten,
            bool isFinalBlock = true)
        {
            // Both spans are re-sliced as data is consumed/produced; the original lengths are kept so the
            // out parameters can be computed as "original length - remaining length" at every exit.
            int originalUtf8SourceLength      = utf8Source.Length;
            int originalUtf8DestinationLength = utf8Destination.Length;

            const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
            char *    pTempCharBuffer           = stackalloc char[TempUtf16CharBufferLength];

            const int TempUtf8ByteBufferLength = TempUtf16CharBufferLength * 3 /* max UTF-8 output code units per UTF-16 input code unit */;
            byte *    pTempUtf8Buffer          = stackalloc byte[TempUtf8ByteBufferLength];

            uint            nextScalarValue;
            int             utf8BytesConsumedForScalar = 0;
            // Number of bytes at the front of utf8Source already known to need no escaping but not yet copied.
            int             nonEscapedByteCount        = 0;
            OperationStatus opStatus = OperationStatus.Done;

            while (!utf8Source.IsEmpty)
            {
                // For performance, read until we require escaping.
                do
                {
                    nextScalarValue = utf8Source[nonEscapedByteCount];
                    if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
                    {
                        // Check Ascii cache.
                        byte[]? encodedBytes = GetAsciiEncoding((byte)nextScalarValue);

                        if (ReferenceEquals(encodedBytes, s_noEscape))
                        {
                            // Extend the pending run only if the destination can still hold all of it.
                            if (++nonEscapedByteCount <= utf8Destination.Length)
                            {
                                // Source data can be copied as-is.
                                continue;
                            }

                            // Undo the increment so only bytes that fit are flushed after the loop.
                            --nonEscapedByteCount;
                            opStatus = OperationStatus.DestinationTooSmall;
                            break;
                        }

                        if (encodedBytes == null)
                        {
                            // We need to escape and update the cache, so break out of this loop.
                            opStatus = OperationStatus.Done;
                            utf8BytesConsumedForScalar = 1;
                            break;
                        }

                        // For performance, handle the non-escaped bytes and encoding here instead of breaking out of the loop.
                        if (nonEscapedByteCount > 0)
                        {
                            // We previously verified the destination size.
                            Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);

                            utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
                            utf8Source          = utf8Source.Slice(nonEscapedByteCount);
                            utf8Destination     = utf8Destination.Slice(nonEscapedByteCount);
                            nonEscapedByteCount = 0;
                        }

                        // Write the cached escaped form of this ASCII byte.
                        if (!((ReadOnlySpan <byte>)encodedBytes).TryCopyTo(utf8Destination))
                        {
                            opStatus = OperationStatus.DestinationTooSmall;
                            break;
                        }

                        utf8Destination = utf8Destination.Slice(encodedBytes.Length);
                        utf8Source      = utf8Source.Slice(1);
                        continue;
                    }

                    // Code path for non-Ascii.
                    opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Source.Slice(nonEscapedByteCount), out nextScalarValue, out utf8BytesConsumedForScalar);
                    if (opStatus == OperationStatus.Done)
                    {
                        if (!WillEncode((int)nextScalarValue))
                        {
                            nonEscapedByteCount += utf8BytesConsumedForScalar;
                            if (nonEscapedByteCount <= utf8Destination.Length)
                            {
                                // Source data can be copied as-is.
                                continue;
                            }

                            // The scalar does not fit: take it back out of the pending run.
                            nonEscapedByteCount -= utf8BytesConsumedForScalar;
                            opStatus             = OperationStatus.DestinationTooSmall;
                        }
                    }

                    // We need to escape.
                    break;
                } while (nonEscapedByteCount < utf8Source.Length);

                // Flush whatever run of non-escaped bytes was accumulated by the inner loop.
                if (nonEscapedByteCount > 0)
                {
                    // We previously verified the destination size.
                    Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);

                    utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
                    utf8Source          = utf8Source.Slice(nonEscapedByteCount);
                    utf8Destination     = utf8Destination.Slice(nonEscapedByteCount);
                    nonEscapedByteCount = 0;
                }

                if (utf8Source.IsEmpty)
                {
                    goto Done;
                }

                // This code path is hit for ill-formed input data (where decoding has replaced it with U+FFFD)
                // and for well-formed input data that must be escaped.

                if (opStatus != OperationStatus.Done) // Optimize happy path.
                {
                    if (opStatus == OperationStatus.NeedMoreData)
                    {
                        // An incomplete sequence is only reported to the caller when more input may follow.
                        if (!isFinalBlock)
                        {
                            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
                            bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
                            return(OperationStatus.NeedMoreData);
                        }
                        // else treat this as a normal invalid subsequence.
                    }
                    else if (opStatus == OperationStatus.DestinationTooSmall)
                    {
                        goto ReturnDestinationTooSmall;
                    }
                }

                if (TryEncodeUnicodeScalar((int)nextScalarValue, pTempCharBuffer, TempUtf16CharBufferLength, out int charsWrittenJustNow))
                {
                    // Now that we have it as UTF-16, transcode it to UTF-8.
                    // Need to copy it to a temporary buffer first, otherwise GetBytes might throw an exception
                    // due to lack of output space.

                    int transcodedByteCountThisIteration = Encoding.UTF8.GetBytes(pTempCharBuffer, charsWrittenJustNow, pTempUtf8Buffer, TempUtf8ByteBufferLength);
                    ReadOnlySpan <byte> transcodedUtf8BytesThisIteration = new ReadOnlySpan <byte>(pTempUtf8Buffer, transcodedByteCountThisIteration);

                    // Update cache for Ascii
                    // (this happens before the destination-size check below, so the entry is stored even if the copy fails).
                    if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
                    {
                        _asciiEscape[nextScalarValue] = transcodedUtf8BytesThisIteration.ToArray();
                    }

                    if (!transcodedUtf8BytesThisIteration.TryCopyTo(utf8Destination))
                    {
                        goto ReturnDestinationTooSmall;
                    }

                    utf8Destination = utf8Destination.Slice(transcodedByteCountThisIteration);
                }
                else
                {
                    // We really don't expect this to fail. If that happens we'll report an error to our caller.
                    bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
                    bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
                    return(OperationStatus.InvalidData);
                }

                // The escaped form was written; now step past the source bytes it represents.
                utf8Source = utf8Source.Slice(utf8BytesConsumedForScalar);
            }

Done:
            // Input buffer has been fully processed!
            bytesConsumed = originalUtf8SourceLength;
            bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
            return(OperationStatus.Done);

ReturnDestinationTooSmall:
            // Report only what was fully consumed and written before the destination ran out of room.
            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
            bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
            return(OperationStatus.DestinationTooSmall);
        }
Пример #6
0
        /// <summary>
        /// Decodes the single scalar value at the start of a UTF-8 buffer.
        /// A copy of the logic in Rune.DecodeFromUtf8.
        /// </summary>
        /// <param name="source">The UTF-8 bytes to decode from; only the leading subsequence is examined.</param>
        /// <param name="result">On success, the decoded scalar value; otherwise U+FFFD (the replacement character).</param>
        /// <param name="bytesConsumed">On success, the length (1..4) of the decoded sequence; for invalid data, the length
        /// (1..3) of the invalid subsequence; for incomplete data, the number of bytes (0..3) examined before the input ended.</param>
        /// <returns>
        /// <see cref="OperationStatus.Done"/> when a well-formed sequence was decoded;
        /// <see cref="OperationStatus.InvalidData"/> when the input starts with an ill-formed subsequence;
        /// <see cref="OperationStatus.NeedMoreData"/> when the input is empty or ends partway through a sequence.
        /// </returns>
        public static OperationStatus DecodeScalarValueFromUtf8(ReadOnlySpan <byte> source, out uint result, out int bytesConsumed)
        {
            const char ReplacementChar = '\uFFFD';

            // This method follows the Unicode Standard's recommendation for detecting
            // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
            // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
            // it tries to consume as many code units as possible as long as those code
            // units constitute the beginning of a longer well-formed subsequence per Table 3-7.

            // index is the position of the byte currently being examined; the exit labels
            // below derive bytesConsumed from it.
            int index = 0;

            // Try reading input[0].

            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            // tempValue starts as the first byte and is built up into the decoded scalar value.
            uint tempValue = source[index];

            if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
            {
                goto NotAscii;
            }

Finish:

            // Shared success exit: index is the position of the last byte of the sequence.
            bytesConsumed = index + 1;
            Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
            result = tempValue;
            return(OperationStatus.Done);

NotAscii:

            // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
            // the range [C2..F4]. If it's outside of that range, it's either a standalone
            // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
            // four-byte sequence.

            if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
            {
                goto FirstByteInvalid;
            }

            // Rebase the lead byte to C2 and make room for the six payload bits of the next byte.
            tempValue = (tempValue - 0xC2) << 6;

            // Try reading input[1].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            // Continuation bytes are of the form [10xxxxxx], which means that their two's
            // complement representation is in the range [-65..-128]. This allows us to
            // perform a single comparison to see if a byte is a continuation byte.

            int thisByteSignExtended = (sbyte)source[index];

            if (thisByteSignExtended >= -64)
            {
                goto Invalid;
            }

            tempValue += (uint)thisByteSignExtended;
            tempValue += 0x80;               // remove the continuation byte marker
            tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker

            if (tempValue < 0x0800)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
                goto Finish; // this is a valid 2-byte sequence
            }

            // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
            // enough information (from just two code units) to detect overlong or surrogate
            // sequences, we need to perform these checks now.

            if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
                // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
            {
                // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
                goto Invalid;
            }

            if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
            {
                // This is an overlong 4-byte sequence.
                goto Invalid;
            }

            // The first two bytes were just fine. We don't need to perform any other checks
            // on the remaining bytes other than to see that they're valid continuation bytes.

            // Try reading input[2].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xE0 - 0xC0) << 12; // remove the leading byte marker

            if (tempValue <= 0xFFFF)
            {
                Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
                goto Finish; // this is a valid 3-byte sequence
            }

            // Try reading input[3].

            index++;
            if ((uint)index >= (uint)source.Length)
            {
                goto NeedsMoreData;
            }

            thisByteSignExtended = (sbyte)source[index];
            if (thisByteSignExtended >= -64)
            {
                goto Invalid; // this byte is not a UTF-8 continuation byte
            }

            tempValue <<= 6;
            tempValue  += (uint)thisByteSignExtended;
            tempValue  += 0x80;                // remove the continuation byte marker
            tempValue  -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

            UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
            goto Finish; // this is a valid 4-byte sequence

FirstByteInvalid:

            index = 1; // Invalid subsequences are always at least length 1.

Invalid:

            // Shared failure exit: index counts the bytes that precede the offending byte, so the
            // offending byte itself is not consumed and will be examined again by the next call.
            Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.InvalidData);

NeedsMoreData:

            // The input ended before the sequence was complete; index is the number of bytes that were available.
            Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
            bytesConsumed = index;
            result        = ReplacementChar;
            return(OperationStatus.NeedMoreData);
        }
Пример #7
0
        /// <summary>
        /// Scans <paramref name="utf8Text"/> from the start and reports where the scan has to stop.
        /// The scan stops at an ASCII byte for which <c>DoesAsciiNeedEncoding</c> returns 1, at a
        /// non-ASCII scalar for which <c>WillEncode</c> returns true, or at a sequence that
        /// <c>UnicodeHelpers.DecodeScalarValueFromUtf8</c> does not report as <c>Done</c>.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to scan.</param>
        /// <returns>The zero-based byte offset at which the scan stopped, or -1 if it reached the end.</returns>
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
        {
            fixed (byte* pText = utf8Text)
            {
                int length = utf8Text.Length;
                int offset = 0;

#if NETCOREAPP
                if (Sse2.IsSupported)
                {
                    sbyte* pWindow = (sbyte*)pText;

                    // Work through the input one 16-byte window at a time while a full window remains.
                    while (offset <= length - 16)
                    {
                        Debug.Assert(pWindow >= pText && pWindow <= (pText + length - 16));

                        // Load the window and ask the helper which lanes hold non-ASCII bytes.
                        Vector128<sbyte> window = Sse2.LoadVector128(pWindow);
                        int nonAsciiBits = Sse2.MoveMask(Sse2Helper.CreateAsciiMask(window));

                        if (nonAsciiBits == 0)
                        {
                            // All 16 bytes are ASCII. The checks are deliberately unrolled and rely on
                            // short-circuit evaluation: each ++offset only runs when every earlier byte
                            // passed, so on a hit 'offset' is exactly the position of the offending byte.
                            if (DoesAsciiNeedEncoding(pText[offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1)
                            {
                                return offset;
                            }

                            // The chain left 'offset' on the 16th byte; step past it.
                            offset++;
                        }
                        else
                        {
                            // At least one of these 16 bytes is non-ASCII: walk the window scalar by scalar.
                            // A multi-byte scalar may carry 'offset' beyond the end of the window.
                            int windowEnd = offset + 16;
                            Debug.Assert(windowEnd <= length);

                            while (offset < windowEnd)
                            {
                                Debug.Assert((pText + offset) <= (pText + length));

                                byte current = pText[offset];
                                if (!UnicodeUtility.IsAsciiCodePoint(current))
                                {
                                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int scalarLength);

                                    Debug.Assert(scalar <= int.MaxValue);
                                    if (status != OperationStatus.Done || WillEncode((int)scalar))
                                    {
                                        return offset;
                                    }

                                    Debug.Assert(status == OperationStatus.Done);
                                    offset += scalarLength;
                                }
                                else
                                {
                                    if (DoesAsciiNeedEncoding(current) == 1)
                                    {
                                        return offset;
                                    }

                                    offset++;
                                }
                            }
                        }

                        // Re-anchor the window pointer on the first unprocessed byte.
                        pWindow = (sbyte*)pText + offset;
                    }

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(length - offset < 16);
                }
#endif

                // Scalar loop: handles the tail, or the whole input when the vector path is unavailable.
                while (offset < length)
                {
                    Debug.Assert((pText + offset) <= (pText + length));

                    byte current = pText[offset];
                    if (!UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int scalarLength);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return offset;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        offset += scalarLength;
                    }
                    else
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return offset;
                        }

                        offset++;
                    }
                }

                // The scan reached the end without stopping: all bytes are allowed.
                return -1;
            }
        }
Пример #8
0
        /// <summary>
        /// Scans <paramref name="utf8Text"/> from the start and reports where the scan has to stop.
        /// The scan stops at an ASCII byte that <c>_allowedCharacters.IsUnicodeScalarAllowed</c> rejects
        /// (or, on the vectorized path, that the escaping mask flags), at a non-ASCII scalar for which
        /// <c>WillEncode</c> returns true, or at a sequence that
        /// <c>UnicodeHelpers.DecodeScalarValueFromUtf8</c> does not report as <c>Done</c>.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to scan.</param>
        /// <returns>The zero-based byte offset at which the scan stopped, or -1 if it reached the end.</returns>
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
        {
            fixed (byte* ptr = utf8Text)
            {
                int idx = 0;

#if NETCOREAPP
                if (Sse2.IsSupported)
                {
                    sbyte* startingAddress = (sbyte*)ptr;

                    // Process 16 bytes per iteration while a full 16-byte window remains.
                    while (utf8Text.Length - 16 >= idx)
                    {
                        Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                        // Load the next 16 bytes.
                        Vector128<sbyte> sourceValue = Sse2.LoadVector128(startingAddress);

                        // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                        // cast to signed byte, and MoveMask gathers the sign bit of each of the 16 lanes.
                        int index = Sse2.MoveMask(sourceValue);

                        if (index != 0)
                        {
                            // At least one of the following 16 bytes is non-ASCII: walk them scalar by scalar.
                            // A multi-byte scalar may carry idx beyond the end of this window.

                            int processNextSixteen = idx + 16;
                            Debug.Assert(processNextSixteen <= utf8Text.Length);

                            while (idx < processNextSixteen)
                            {
                                Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                                if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                                {
                                    if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                                    {
                                        goto Return;
                                    }
                                    idx++;
                                }
                                else
                                {
                                    OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                                    Debug.Assert(nextScalarValue <= int.MaxValue);
                                    if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                                    {
                                        goto Return;
                                    }

                                    Debug.Assert(opStatus == OperationStatus.Done);
                                    idx += utf8BytesConsumedForScalar;
                                }
                            }

                            // Re-anchor the window pointer on the first unprocessed byte.
                            startingAddress = (sbyte*)ptr + idx;
                        }
                        else
                        {
                            // Check if any of the 16 bytes need to be escaped.
                            Vector128<sbyte> mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue);

                            index = Sse2.MoveMask(mask);
                            // If index == 0, that means none of the 16 bytes needed to be escaped.
                            // TrailingZeroCount is relatively expensive, avoid it if possible.
                            if (index != 0)
                            {
                                // Found at least one byte that needs to be escaped, figure out the index of
                                // the first one found that needed to be escaped within the 16 bytes.
                                Debug.Assert(index > 0 && index <= 65_535);
                                int tzc = BitOperations.TrailingZeroCount(index);

                                // A non-zero 16-bit mask has its lowest set bit at position 0..15, so the
                                // offset always falls inside the current 16-byte window.
                                Debug.Assert(tzc >= 0 && tzc < 16);
                                idx += tzc;
                                goto Return;
                            }
                            idx += 16;
                            startingAddress += 16;
                        }
                    }

                    // Process the remaining bytes.
                    Debug.Assert(utf8Text.Length - idx < 16);
                }
#endif

                // Scalar loop: handles the tail, or the whole input when the vector path is unavailable.
                while (idx < utf8Text.Length)
                {
                    Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                    if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                    {
                        if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                        {
                            goto Return;
                        }
                        idx++;
                    }
                    else
                    {
                        OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                        Debug.Assert(nextScalarValue <= int.MaxValue);
                        if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                        {
                            goto Return;
                        }

                        Debug.Assert(opStatus == OperationStatus.Done);
                        idx += utf8BytesConsumedForScalar;
                    }
                }
                Debug.Assert(idx == utf8Text.Length);

                idx = -1; // All bytes are allowed.

Return:
                return idx;
            }
        }
Пример #9
0
        /// <summary>
        /// Walks <paramref name="utf8Text"/> until it meets ill-formed UTF-8 or a scalar value that must
        /// be encoded, and reports the offset of that position in the original input.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to scan.</param>
        /// <returns>
        /// The zero-based byte offset at which the scan stopped, or -1 when the whole text was consumed,
        /// meaning it can be copied as-is without escaping.
        /// </returns>
        public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
        {
            fixed (byte* pText = utf8Text)
            {
                int length = utf8Text.Length;
                int offset = 0;

#if NETCOREAPP
                if (Sse2.IsSupported)
                {
                    sbyte* pWindow = (sbyte*)pText;

                    // Work through the input one 16-byte window at a time while a full window remains.
                    while (offset <= length - 16)
                    {
                        Debug.Assert(pWindow >= pText && pWindow <= (pText + length - 16));

                        // A non-ASCII byte is negative when viewed as a signed byte, so gathering the
                        // sign bits of the 16 lanes is enough to detect one.
                        Vector128<sbyte> window = Sse2.LoadVector128(pWindow);
                        int nonAsciiBits = Sse2.MoveMask(window);

                        if (nonAsciiBits == 0)
                        {
                            // All 16 bytes are ASCII. The checks are deliberately unrolled and rely on
                            // short-circuit evaluation: each ++offset only runs when every earlier byte
                            // passed, so on a hit 'offset' is exactly the position of the offending byte.
                            if (DoesAsciiNeedEncoding(pText[offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1)
                            {
                                return offset;
                            }

                            // The chain left 'offset' on the 16th byte; step past it.
                            offset++;
                        }
                        else
                        {
                            // At least one of these 16 bytes is non-ASCII: walk the window scalar by scalar.
                            // A multi-byte scalar may carry 'offset' beyond the end of the window.
                            int windowEnd = offset + 16;
                            Debug.Assert(windowEnd <= length);

                            while (offset < windowEnd)
                            {
                                Debug.Assert((pText + offset) <= (pText + length));

                                byte current = pText[offset];
                                if (!UnicodeUtility.IsAsciiCodePoint(current))
                                {
                                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int scalarLength);

                                    Debug.Assert(scalar <= int.MaxValue);
                                    if (status != OperationStatus.Done || WillEncode((int)scalar))
                                    {
                                        return offset;
                                    }

                                    Debug.Assert(status == OperationStatus.Done);
                                    offset += scalarLength;
                                }
                                else
                                {
                                    if (DoesAsciiNeedEncoding(current) == 1)
                                    {
                                        return offset;
                                    }

                                    offset++;
                                }
                            }
                        }

                        // Re-anchor the window pointer on the first unprocessed byte.
                        pWindow = (sbyte*)pText + offset;
                    }

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(length - offset < 16);
                }
#endif

                // Scalar loop: handles the tail, or the whole input when the vector path is unavailable.
                while (offset < length)
                {
                    Debug.Assert((pText + offset) <= (pText + length));

                    byte current = pText[offset];
                    if (!UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int scalarLength);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return offset;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        offset += scalarLength;
                    }
                    else
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return offset;
                        }

                        offset++;
                    }
                }

                Debug.Assert(offset == length);

                // The scan reached the end without stopping: all bytes are allowed.
                return -1;
            }
        }
        /// <summary>
        /// Scans <paramref name="utf8Text"/> from the start and reports where the scan has to stop.
        /// The scan stops at an ASCII byte for which <c>DoesAsciiNeedEncoding</c> returns 1, at a
        /// non-ASCII scalar for which <c>WillEncode</c> returns true, or at a sequence that
        /// <c>UnicodeHelpers.DecodeScalarValueFromUtf8</c> does not report as <c>Done</c>.
        /// When SSE2 or Arm64 AdvSimd is available, 16-byte windows are first classified with a vector load.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to scan.</param>
        /// <returns>The zero-based byte offset at which the scan stopped, or -1 if it reached the end.</returns>
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
        {
            fixed (byte* pText = utf8Text)
            {
                int length = utf8Text.Length;
                int offset = 0;

#if NETCOREAPP
                if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
                {
                    sbyte* pWindow = (sbyte*)pText;

                    // Work through the input one 16-byte window at a time while a full window remains.
                    while (offset <= length - 16)
                    {
                        Debug.Assert(pWindow >= pText && pWindow <= (pText + length - 16));

                        // Load the window with whichever instruction set is present and ask the matching
                        // helper whether any lane holds a non-ASCII byte.
                        bool windowHasNonAscii;
                        if (Sse2.IsSupported)
                        {
                            windowHasNonAscii = Sse2Helper.ContainsNonAsciiByte(Sse2.LoadVector128(pWindow));
                        }
                        else if (AdvSimd.Arm64.IsSupported)
                        {
                            windowHasNonAscii = AdvSimdHelper.ContainsNonAsciiByte(AdvSimd.LoadVector128(pWindow));
                        }
                        else
                        {
                            // Not reachable given the enclosing check; it also satisfies definite assignment.
                            throw new PlatformNotSupportedException();
                        }

                        if (!windowHasNonAscii)
                        {
                            // All 16 bytes are ASCII. The checks are deliberately unrolled and rely on
                            // short-circuit evaluation: each ++offset only runs when every earlier byte
                            // passed, so on a hit 'offset' is exactly the position of the offending byte.
                            if (DoesAsciiNeedEncoding(pText[offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1
                                || DoesAsciiNeedEncoding(pText[++offset]) == 1)
                            {
                                return offset;
                            }

                            // The chain left 'offset' on the 16th byte; step past it.
                            offset++;
                        }
                        else
                        {
                            // At least one of these 16 bytes is non-ASCII: walk the window scalar by scalar.
                            // A multi-byte scalar may carry 'offset' beyond the end of the window.
                            int windowEnd = offset + 16;
                            Debug.Assert(windowEnd <= length);

                            while (offset < windowEnd)
                            {
                                Debug.Assert((pText + offset) <= (pText + length));

                                byte current = pText[offset];
                                if (!UnicodeUtility.IsAsciiCodePoint(current))
                                {
                                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int scalarLength);

                                    Debug.Assert(scalar <= int.MaxValue);
                                    if (status != OperationStatus.Done || WillEncode((int)scalar))
                                    {
                                        return offset;
                                    }

                                    Debug.Assert(status == OperationStatus.Done);
                                    offset += scalarLength;
                                }
                                else
                                {
                                    if (DoesAsciiNeedEncoding(current) == 1)
                                    {
                                        return offset;
                                    }

                                    offset++;
                                }
                            }
                        }

                        // Re-anchor the window pointer on the first unprocessed byte.
                        pWindow = (sbyte*)pText + offset;
                    }

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(length - offset < 16);
                }
#endif

                // Scalar loop: handles the tail, or the whole input when the vector path is unavailable.
                while (offset < length)
                {
                    Debug.Assert((pText + offset) <= (pText + length));

                    byte current = pText[offset];
                    if (!UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(offset), out uint scalar, out int scalarLength);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return offset;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        offset += scalarLength;
                    }
                    else
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return offset;
                        }

                        offset++;
                    }
                }

                Debug.Assert(offset == length);

                // The scan reached the end without stopping: all bytes are allowed.
                return -1;
            }
        }