/// <summary>
/// Walks forward from the start of the buffer and returns the byte offset of the first
/// UTF-8 subsequence that is not white space. If the loop consumes the whole buffer,
/// the returned offset equals <paramref name="length"/>.
/// </summary>
/// <param name="utf8Data">Reference to the first byte of the UTF-8 data.</param>
/// <param name="length">Number of bytes available starting at <paramref name="utf8Data"/>.</param>
private static nuint GetIndexOfFirstNonWhiteSpaceChar(ref byte utf8Data, nuint length)
{
    // Tuned for ASCII input where, when trimming is needed at all, only a
    // small number of leading bytes are expected to be removed.

    nuint offset = 0;

    while (offset < length)
    {
        byte current = Unsafe.AddByteOffset(ref utf8Data, offset);

        // Cheap filter: viewed as a signed byte, only values in [ 21 .. 7F ] compare
        // greater than 0x20. Such a byte ends the scan without any further work.
        if ((sbyte)current > (sbyte)0x20)
        {
            break;
        }

        uint candidate = current;
        nuint advance;

        if (UnicodeUtility.IsAsciiCodePoint(candidate))
        {
            // ASCII byte that failed the cheap filter: let Rune classify it.
            if (!Rune.IsWhiteSpace(Rune.UnsafeCreate(candidate)))
            {
                break;
            }

            advance = 1;
        }
        else
        {
            // Non-ASCII lead byte: decode the full scalar value from the remaining
            // data and classify the decoded rune.
            ReadOnlySpan<byte> remaining = new ReadOnlySpan<byte>(ref utf8Data, (int)length).Slice((int)offset);
            Rune.DecodeFromUtf8(remaining, out Rune rune, out int runeLength);

            if (!Rune.IsWhiteSpace(rune))
            {
                break;
            }

            advance = (uint)runeLength;
        }

        // The subsequence just examined was white space; step over it and keep scanning.
        offset += advance;
    }

    return offset;
}
/// <summary>
/// Locates the start of the run of white space that ends <paramref name="utf8Data"/>.
/// </summary>
/// <param name="utf8Data">The UTF-8 data to inspect.</param>
/// <returns>
/// The index at which the trailing white-space run begins. This is 0 when the data consists
/// solely of white space, and the span length when the data does not end in white space.
/// </returns>
public static int GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan<byte> utf8Data)
{
    // Tuned for ASCII input where, when trimming is needed at all, only a
    // small number of trailing bytes are expected to be removed.

    int end = utf8Data.Length;

    while (end > 0)
    {
        byte last = utf8Data[end - 1];

        // Cheap filter: viewed as a signed byte, only values in [ 21 .. 7F ] compare
        // greater than 0x20. Such a byte ends the scan without any further work.
        if ((sbyte)last > (sbyte)0x20)
        {
            break;
        }

        uint candidate = last;
        int width;

        if (UnicodeUtility.IsAsciiCodePoint(candidate))
        {
            // ASCII byte that failed the cheap filter: let Rune classify it.
            if (!Rune.IsWhiteSpace(new Rune(candidate)))
            {
                break;
            }

            width = 1;
        }
        else
        {
            // Non-ASCII byte: decode the last scalar value of the still-untrimmed
            // prefix and classify the decoded rune.
            Rune.DecodeLastFromUtf8(utf8Data.Slice(0, end), out Rune rune, out int runeLength);

            if (!Rune.IsWhiteSpace(rune))
            {
                break;
            }

            width = runeLength;
        }

        // The subsequence just examined was white space; drop it and keep scanning backwards.
        end -= width;
    }

    return end;
}
/// <summary>
/// Scans <paramref name="utf8Text"/> for the first position that cannot be copied through unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 text to scan.</param>
/// <returns>
/// The index of the first byte that either begins ill-formed UTF-8 or begins a scalar value
/// that must be encoded; -1 when the whole input can be copied as-is without escaping.
/// </returns>
public virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    // A single absolute position is tracked through the input; multi-byte sequences are
    // decoded from a slice that starts at that position.
    int position = 0;

    while (position < utf8Text.Length)
    {
        byte current = utf8Text[position];

        if (!UnicodeUtility.IsAsciiCodePoint(current))
        {
            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                utf8Text.Slice(position), out uint scalar, out int scalarLength);

            // Decoding failures are reported before the encoder is consulted.
            if (status != OperationStatus.Done)
            {
                return position;
            }

            if (WillEncode((int)scalar))
            {
                return position;
            }

            position += scalarLength;
            continue;
        }

        // ASCII: the byte passes through only if its encoding is the "no escape" sentinel.
        if (!ReferenceEquals(GetAsciiEncoding(current), s_noEscape))
        {
            return position;
        }

        position++;
    }

    return -1; // nothing in the input needs to be escaped
}
/// <summary>
/// Finds the first position in <paramref name="utf8Text"/> that cannot be copied through unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 text to scan.</param>
/// <returns>
/// The index of the first byte that either begins a subsequence which does not decode with
/// <see cref="OperationStatus.Done"/> or begins a scalar value that must be encoded;
/// -1 if every byte can be copied as-is.
/// </returns>
public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
{
    // Make sure the ASCII cache has been initialized before any lookups below.
    if (!_isAsciiCacheInitialized)
    {
        InitializeAsciiCache();
    }

    // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value
    // that must be encoded. If we see either of these things then we'll return its index in the original
    // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate
    // that the text can be copied as-is without escaping.

    fixed(byte *ptr = utf8Text)
    {
        int idx = 0;

#if NETCOREAPP
        // Vectorized path: only entered when at least 16 bytes are available.
        if ((Sse2.IsSupported || AdvSimd.Arm64.IsSupported) && utf8Text.Length - 16 >= idx)
        {
            // Hoist these outside the loop, as the JIT won't do it.
            Vector128 <sbyte> bitMaskLookupAsciiNeedsEscaping = _bitMaskLookupAsciiNeedsEscaping;
            Vector128 <sbyte> bitPosLookup = Ssse3Helper.s_bitPosLookup;
            Vector128 <sbyte> nibbleMaskSByte = Ssse3Helper.s_nibbleMaskSByte;
            Vector128 <sbyte> nullMaskSByte = Ssse3Helper.s_nullMaskSByte;

            sbyte *startingAddress = (sbyte *)ptr;
            do
            {
                Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                // Load the next 16 bytes.
                Vector128 <sbyte> sourceValue;
                bool containsNonAsciiBytes;

                // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                // cast to signed byte.
                if (Sse2.IsSupported)
                {
                    sourceValue = Sse2.LoadVector128(startingAddress);
                    containsNonAsciiBytes = Sse2Helper.ContainsNonAsciiByte(sourceValue);
                }
                else if (AdvSimd.Arm64.IsSupported)
                {
                    sourceValue = AdvSimd.LoadVector128(startingAddress);
                    containsNonAsciiBytes = AdvSimdHelper.ContainsNonAsciiByte(sourceValue);
                }
                else
                {
                    throw new PlatformNotSupportedException();
                }

                if (!containsNonAsciiBytes)
                {
                    // All of the following 16 bytes are ASCII.
                    // TODO AdvSimd: optimization maybe achievable using VectorTableLookup and/or VectorTableLookupExtension

                    if (Ssse3.IsSupported)
                    {
                        Vector128 <sbyte> mask = Ssse3Helper.CreateEscapingMask(sourceValue, bitMaskLookupAsciiNeedsEscaping, bitPosLookup, nibbleMaskSByte, nullMaskSByte);
                        int index = Sse2Helper.GetIndexOfFirstNonAsciiByte(mask.AsByte());

                        // An index below 16 is treated as a hit inside this block; it is
                        // added to the block's starting offset and returned.
                        if (index < 16)
                        {
                            idx += index;
                            goto Return;
                        }
                    }
                    else
                    {
                        // Scalar fallback over the 16 bytes. idx still holds the offset of the
                        // start of this block, so each ReturnN label returns idx + N.
                        byte *p = (byte *)startingAddress;
                        if (DoesAsciiNeedEncoding(p[0]))
                        {
                            goto Return;
                        }
                        if (DoesAsciiNeedEncoding(p[1]))
                        {
                            goto Return1;
                        }
                        if (DoesAsciiNeedEncoding(p[2]))
                        {
                            goto Return2;
                        }
                        if (DoesAsciiNeedEncoding(p[3]))
                        {
                            goto Return3;
                        }
                        if (DoesAsciiNeedEncoding(p[4]))
                        {
                            goto Return4;
                        }
                        if (DoesAsciiNeedEncoding(p[5]))
                        {
                            goto Return5;
                        }
                        if (DoesAsciiNeedEncoding(p[6]))
                        {
                            goto Return6;
                        }
                        if (DoesAsciiNeedEncoding(p[7]))
                        {
                            goto Return7;
                        }
                        if (DoesAsciiNeedEncoding(p[8]))
                        {
                            goto Return8;
                        }
                        if (DoesAsciiNeedEncoding(p[9]))
                        {
                            goto Return9;
                        }
                        if (DoesAsciiNeedEncoding(p[10]))
                        {
                            goto Return10;
                        }
                        if (DoesAsciiNeedEncoding(p[11]))
                        {
                            goto Return11;
                        }
                        if (DoesAsciiNeedEncoding(p[12]))
                        {
                            goto Return12;
                        }
                        if (DoesAsciiNeedEncoding(p[13]))
                        {
                            goto Return13;
                        }
                        if (DoesAsciiNeedEncoding(p[14]))
                        {
                            goto Return14;
                        }
                        if (DoesAsciiNeedEncoding(p[15]))
                        {
                            goto Return15;
                        }
                    }

                    // Nothing in this block needs encoding; move on to the next 16 bytes.
                    idx += 16;
                }
                else
                {
                    // At least one of the following 16 bytes is non-ASCII.

                    int processNextSixteen = idx + 16;
                    Debug.Assert(processNextSixteen <= utf8Text.Length);

                    // Walk this block one scalar value at a time. A multi-byte sequence that
                    // starts inside the block may advance idx past processNextSixteen.
                    while (idx < processNextSixteen)
                    {
                        Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                        if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                        {
                            if (DoesAsciiNeedEncoding(ptr[idx]))
                            {
                                goto Return;
                            }
                            idx++;
                        }
                        else
                        {
                            OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                            Debug.Assert(nextScalarValue <= int.MaxValue);
                            if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                            {
                                goto Return;
                            }

                            Debug.Assert(opStatus == OperationStatus.Done);
                            idx += utf8BytesConsumedForScalar;
                        }
                    }
                }

                // Re-derive the load address from idx, which both branches above have advanced.
                startingAddress = (sbyte *)ptr + idx;
            }while (utf8Text.Length - 16 >= idx);

            // Process the remaining bytes.
            Debug.Assert(utf8Text.Length - idx < 16);
        }
#endif

        // Scalar loop: handles the tail left by the vectorized path, or the whole input
        // when that path was not taken.
        while (idx < utf8Text.Length)
        {
            Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

            if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
            {
                if (DoesAsciiNeedEncoding(ptr[idx]))
                {
                    goto Return;
                }
                idx++;
            }
            else
            {
                OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                Debug.Assert(nextScalarValue <= int.MaxValue);
                if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                {
                    goto Return;
                }

                Debug.Assert(opStatus == OperationStatus.Done);
                idx += utf8BytesConsumedForScalar;
            }
        }
        Debug.Assert(idx == utf8Text.Length);

        idx = -1; // All bytes are allowed.
        goto Return;

#if NETCOREAPP
        // Exit points for the unrolled scalar fallback: idx is the block start, N the byte within it.
    Return15: return(idx + 15);
    Return14: return(idx + 14);
    Return13: return(idx + 13);
    Return12: return(idx + 12);
    Return11: return(idx + 11);
    Return10: return(idx + 10);
    Return9: return(idx + 9);
    Return8: return(idx + 8);
    Return7: return(idx + 7);
    Return6: return(idx + 6);
    Return5: return(idx + 5);
    Return4: return(idx + 4);
    Return3: return(idx + 3);
    Return2: return(idx + 2);
    Return1: return(idx + 1);
#endif
    Return:
        return(idx);
    }
}
/// <summary>
/// Encodes the supplied UTF-8 text.
/// </summary>
/// <param name="utf8Source">A source buffer containing the UTF-8 text to encode.</param>
/// <param name="utf8Destination">The destination buffer to which the encoded form of <paramref name="utf8Source"/>
/// will be written.</param>
/// <param name="bytesConsumed">The number of bytes consumed from the <paramref name="utf8Source"/> buffer.</param>
/// <param name="bytesWritten">The number of bytes written to the <paramref name="utf8Destination"/> buffer.</param>
/// <param name="isFinalBlock"><see langword="true"/> if there is no further source data that needs to be encoded;
/// <see langword="false"/> if there is further source data that needs to be encoded. When <see langword="false"/>,
/// an incomplete sequence at the end of <paramref name="utf8Source"/> yields <see cref="OperationStatus.NeedMoreData"/>;
/// when <see langword="true"/>, it is handled like any other invalid subsequence.</param>
/// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation:
/// <see cref="OperationStatus.Done"/> when the whole source was processed,
/// <see cref="OperationStatus.DestinationTooSmall"/> when the output did not fit,
/// <see cref="OperationStatus.NeedMoreData"/> as described for <paramref name="isFinalBlock"/>, or
/// <see cref="OperationStatus.InvalidData"/> when a scalar value could not be encoded.</returns>
/// <remarks>The buffers <paramref name="utf8Source"/> and <paramref name="utf8Destination"/> must not overlap.</remarks>
public unsafe virtual OperationStatus EncodeUtf8(
    ReadOnlySpan <byte> utf8Source,
    Span <byte> utf8Destination,
    out int bytesConsumed,
    out int bytesWritten,
    bool isFinalBlock = true)
{
    // Both spans are re-sliced as work progresses, so the original lengths are kept
    // to compute bytesConsumed / bytesWritten on exit.
    int originalUtf8SourceLength = utf8Source.Length;
    int originalUtf8DestinationLength = utf8Destination.Length;

    const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
    char * pTempCharBuffer = stackalloc char[TempUtf16CharBufferLength];

    const int TempUtf8ByteBufferLength = TempUtf16CharBufferLength * 3 /* max UTF-8 output code units per UTF-16 input code unit */;
    byte * pTempUtf8Buffer = stackalloc byte[TempUtf8ByteBufferLength];

    uint nextScalarValue;
    int utf8BytesConsumedForScalar = 0;
    // Count of leading source bytes verified as copyable as-is but not yet copied.
    int nonEscapedByteCount = 0;
    OperationStatus opStatus = OperationStatus.Done;

    while (!utf8Source.IsEmpty)
    {
        // For performance, read until we require escaping.
        // Note: each 'continue' inside this do/while jumps to the loop condition at the bottom.
        do
        {
            nextScalarValue = utf8Source[nonEscapedByteCount];
            if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
            {
                // Check Ascii cache.
                byte[]? encodedBytes = GetAsciiEncoding((byte)nextScalarValue);

                if (ReferenceEquals(encodedBytes, s_noEscape))
                {
                    if (++nonEscapedByteCount <= utf8Destination.Length)
                    {
                        // Source data can be copied as-is.
                        continue;
                    }

                    // The destination cannot hold this byte; undo the increment and stop.
                    --nonEscapedByteCount;
                    opStatus = OperationStatus.DestinationTooSmall;
                    break;
                }

                if (encodedBytes == null)
                {
                    // We need to escape and update the cache, so break out of this loop.
                    opStatus = OperationStatus.Done;
                    utf8BytesConsumedForScalar = 1;
                    break;
                }

                // For performance, handle the non-escaped bytes and encoding here instead of breaking out of the loop.
                if (nonEscapedByteCount > 0)
                {
                    // We previously verified the destination size.
                    Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);
                    utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
                    utf8Source = utf8Source.Slice(nonEscapedByteCount);
                    utf8Destination = utf8Destination.Slice(nonEscapedByteCount);
                    nonEscapedByteCount = 0;
                }

                // Write the cached escaped form of this ASCII byte.
                if (!((ReadOnlySpan <byte>)encodedBytes).TryCopyTo(utf8Destination))
                {
                    opStatus = OperationStatus.DestinationTooSmall;
                    break;
                }

                utf8Destination = utf8Destination.Slice(encodedBytes.Length);
                utf8Source = utf8Source.Slice(1);
                continue;
            }

            // Code path for non-Ascii.
            opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Source.Slice(nonEscapedByteCount), out nextScalarValue, out utf8BytesConsumedForScalar);
            if (opStatus == OperationStatus.Done)
            {
                if (!WillEncode((int)nextScalarValue))
                {
                    nonEscapedByteCount += utf8BytesConsumedForScalar;
                    if (nonEscapedByteCount <= utf8Destination.Length)
                    {
                        // Source data can be copied as-is.
                        continue;
                    }

                    // The destination cannot hold this sequence; undo the advance and stop.
                    nonEscapedByteCount -= utf8BytesConsumedForScalar;
                    opStatus = OperationStatus.DestinationTooSmall;
                }
            }

            // We need to escape.
            break;
        } while (nonEscapedByteCount < utf8Source.Length);

        // Flush any run of pass-through bytes accumulated by the loop above.
        if (nonEscapedByteCount > 0)
        {
            // We previously verified the destination size.
            Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);
            utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
            utf8Source = utf8Source.Slice(nonEscapedByteCount);
            utf8Destination = utf8Destination.Slice(nonEscapedByteCount);
            nonEscapedByteCount = 0;
        }

        if (utf8Source.IsEmpty)
        {
            goto Done;
        }

        // This code path is hit for ill-formed input data (where decoding has replaced it with U+FFFD)
        // and for well-formed input data that must be escaped.

        if (opStatus != OperationStatus.Done) // Optimize happy path.
        {
            if (opStatus == OperationStatus.NeedMoreData)
            {
                if (!isFinalBlock)
                {
                    // More input may follow: report progress so far and let the caller supply the rest.
                    bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
                    bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
                    return(OperationStatus.NeedMoreData);
                }
                // else treat this as a normal invalid subsequence.
            }
            else if (opStatus == OperationStatus.DestinationTooSmall)
            {
                goto ReturnDestinationTooSmall;
            }
        }

        if (TryEncodeUnicodeScalar((int)nextScalarValue, pTempCharBuffer, TempUtf16CharBufferLength, out int charsWrittenJustNow))
        {
            // Now that we have it as UTF-16, transcode it to UTF-8.
            // Need to copy it to a temporary buffer first, otherwise GetBytes might throw an exception
            // due to lack of output space.

            int transcodedByteCountThisIteration = Encoding.UTF8.GetBytes(pTempCharBuffer, charsWrittenJustNow, pTempUtf8Buffer, TempUtf8ByteBufferLength);
            ReadOnlySpan <byte> transcodedUtf8BytesThisIteration = new ReadOnlySpan <byte>(pTempUtf8Buffer, transcodedByteCountThisIteration);

            // Update cache for Ascii
            if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
            {
                _asciiEscape[nextScalarValue] = transcodedUtf8BytesThisIteration.ToArray();
            }

            if (!transcodedUtf8BytesThisIteration.TryCopyTo(utf8Destination))
            {
                goto ReturnDestinationTooSmall;
            }

            utf8Destination = utf8Destination.Slice(transcodedByteCountThisIteration);
        }
        else
        {
            // We really don't expect this to fail. If that happens we'll report an error to our caller.
            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
            bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
            return(OperationStatus.InvalidData);
        }

        // Step past the source bytes that produced the escaped output just written.
        utf8Source = utf8Source.Slice(utf8BytesConsumedForScalar);
    }

Done:
    // Input buffer has been fully processed!
    bytesConsumed = originalUtf8SourceLength;
    bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
    return(OperationStatus.Done);

ReturnDestinationTooSmall:
    // Report only what was fully consumed and written before space ran out.
    bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
    bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
    return(OperationStatus.DestinationTooSmall);
}
/// <summary>
/// A copy of the logic in Rune.DecodeFromUtf8. Decodes a single scalar value from the
/// beginning of <paramref name="source"/>.
/// </summary>
/// <param name="source">The UTF-8 bytes to decode from.</param>
/// <param name="result">The decoded scalar value on success; U+FFFD when the status is not
/// <see cref="OperationStatus.Done"/>.</param>
/// <param name="bytesConsumed">On success, the length (1..4) of the decoded sequence. For
/// <see cref="OperationStatus.InvalidData"/>, the length (1..3) of the invalid subsequence. For
/// <see cref="OperationStatus.NeedMoreData"/>, the number of bytes (0..3) read before the input ended.</param>
/// <returns><see cref="OperationStatus.Done"/>, <see cref="OperationStatus.InvalidData"/>, or
/// <see cref="OperationStatus.NeedMoreData"/>.</returns>
public static OperationStatus DecodeScalarValueFromUtf8(ReadOnlySpan <byte> source, out uint result, out int bytesConsumed)
{
    const char ReplacementChar = '\uFFFD';

    // This method follows the Unicode Standard's recommendation for detecting
    // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
    // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
    // it tries to consume as many code units as possible as long as those code
    // units constitute the beginning of a longer well-formed subsequence per Table 3-7.

    int index = 0;

    // Try reading input[0].

    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    uint tempValue = source[index];
    if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
    {
        goto NotAscii;
    }

Finish:
    // Shared success exit: index is the offset of the last byte of the sequence,
    // and tempValue holds the fully decoded scalar value.

    bytesConsumed = index + 1;
    Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
    result = tempValue;
    return(OperationStatus.Done);

NotAscii:

    // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
    // the range [C2..F4]. If it's outside of that range, it's either a standalone
    // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
    // four-byte sequence.

    if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
    {
        goto FirstByteInvalid;
    }

    // Rebase the lead byte against 0xC2 and make room for the first continuation byte's payload.
    tempValue = (tempValue - 0xC2) << 6;

    // Try reading input[1].

    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    // Continuation bytes are of the form [10xxxxxx], which means that their two's
    // complement representation is in the range [-65..-128]. This allows us to
    // perform a single comparison to see if a byte is a continuation byte.

    int thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        // Not a continuation byte. index has not moved past it, so it is not counted as consumed.
        goto Invalid;
    }

    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker

    if (tempValue < 0x0800)
    {
        Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
        goto Finish; // this is a valid 2-byte sequence
    }

    // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
    // enough information (from just two code units) to detect overlong or surrogate
    // sequences, we need to perform these checks now.

    if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
    {
        // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
        // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
        goto Invalid;
    }

    if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
    {
        // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
        goto Invalid;
    }

    if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
    {
        // This is an overlong 4-byte sequence.
        goto Invalid;
    }

    // The first two bytes were just fine. We don't need to perform any other checks
    // on the remaining bytes other than to see that they're valid continuation bytes.

    // Try reading input[2].

    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid; // this byte is not a UTF-8 continuation byte
    }

    tempValue <<= 6;
    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker

    if (tempValue <= 0xFFFF)
    {
        Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
        goto Finish; // this is a valid 3-byte sequence
    }

    // Try reading input[3].

    index++;
    if ((uint)index >= (uint)source.Length)
    {
        goto NeedsMoreData;
    }

    thisByteSignExtended = (sbyte)source[index];
    if (thisByteSignExtended >= -64)
    {
        goto Invalid; // this byte is not a UTF-8 continuation byte
    }

    tempValue <<= 6;
    tempValue += (uint)thisByteSignExtended;
    tempValue += 0x80; // remove the continuation byte marker
    tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker

    UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
    goto Finish; // this is a valid 4-byte sequence

FirstByteInvalid:

    index = 1; // Invalid subsequences are always at least length 1.
    // Falls through into Invalid below.

Invalid:
    // index counts the bytes that formed the invalid subsequence; the byte that
    // failed validation (when it was not the first byte) is left unconsumed.

    Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
    bytesConsumed = index;
    result = ReplacementChar;
    return(OperationStatus.InvalidData);

NeedsMoreData:
    // The input ended before a complete sequence could be read; index is the number of bytes seen.

    Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
    bytesConsumed = index;
    result = ReplacementChar;
    return(OperationStatus.NeedMoreData);
}
/// <summary>
/// Finds the first position in <paramref name="utf8Text"/> that cannot be copied through unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 text to scan.</param>
/// <returns>
/// The index of the first byte that either begins a subsequence which does not decode with
/// <see cref="OperationStatus.Done"/> or begins a scalar value that must be encoded;
/// -1 if every byte is allowed.
/// </returns>
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* pText = utf8Text)
    {
        int position = 0;

#if NETCOREAPP
        if (Sse2.IsSupported)
        {
            sbyte* blockStart = (sbyte*)pText;

            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (utf8Text.Length - 16 >= position)
            {
                Debug.Assert(blockStart >= pText && blockStart <= (pText + utf8Text.Length - 16));

                // Load the next 16 bytes.
                Vector128<sbyte> block = Sse2.LoadVector128(blockStart);
                Vector128<sbyte> asciiMask = Sse2Helper.CreateAsciiMask(block);
                int nonAsciiBits = Sse2.MoveMask(asciiMask);

                if (nonAsciiBits == 0)
                {
                    // No non-ASCII byte was flagged in this block. The sixteen checks are written
                    // out explicitly; each ++position leaves position on the byte just tested,
                    // so a hit returns that byte's index.
                    if (DoesAsciiNeedEncoding(pText[position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1)
                    {
                        return position;
                    }

                    // Step past the sixteenth byte.
                    position++;
                }
                else
                {
                    // At least one of the following 16 bytes is non-ASCII, so walk the block
                    // one scalar value at a time.
                    int blockEnd = position + 16;
                    Debug.Assert(blockEnd <= utf8Text.Length);

                    while (position < blockEnd)
                    {
                        Debug.Assert((pText + position) <= (pText + utf8Text.Length));

                        if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
                        {
                            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                                utf8Text.Slice(position), out uint scalar, out int scalarLength);

                            Debug.Assert(scalar <= int.MaxValue);
                            if (status != OperationStatus.Done || WillEncode((int)scalar))
                            {
                                return position;
                            }

                            Debug.Assert(status == OperationStatus.Done);
                            position += scalarLength;
                        }
                        else
                        {
                            if (DoesAsciiNeedEncoding(pText[position]) == 1)
                            {
                                return position;
                            }

                            position++;
                        }
                    }
                }

                blockStart = (sbyte*)pText + position;
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(utf8Text.Length - position < 16);
        }
#endif

        // Scalar loop over whatever the vectorized path did not consume.
        while (position < utf8Text.Length)
        {
            Debug.Assert((pText + position) <= (pText + utf8Text.Length));

            if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
            {
                OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                    utf8Text.Slice(position), out uint scalar, out int scalarLength);

                Debug.Assert(scalar <= int.MaxValue);
                if (status != OperationStatus.Done || WillEncode((int)scalar))
                {
                    return position;
                }

                Debug.Assert(status == OperationStatus.Done);
                position += scalarLength;
            }
            else
            {
                if (DoesAsciiNeedEncoding(pText[position]) == 1)
                {
                    return position;
                }

                position++;
            }
        }

        return -1; // All bytes are allowed.
    }
}
/// <summary>
/// Finds the first position in <paramref name="utf8Text"/> that cannot be copied through unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 text to scan.</param>
/// <returns>
/// The index of the first byte that either begins a subsequence which does not decode with
/// <see cref="OperationStatus.Done"/> or begins a scalar value that must be encoded;
/// -1 if every byte is allowed.
/// </returns>
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* pText = utf8Text)
    {
        int position = 0;

#if NETCOREAPP
        if (Sse2.IsSupported)
        {
            sbyte* blockStart = (sbyte*)pText;

            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (utf8Text.Length - 16 >= position)
            {
                Debug.Assert(blockStart >= pText && blockStart <= (pText + utf8Text.Length - 16));

                // Load the next 16 bytes.
                Vector128<sbyte> block = Sse2.LoadVector128(blockStart);

                // A non-ASCII byte is negative when viewed as a signed byte, so the
                // per-byte sign bits gathered here are non-zero iff the block has one.
                int signBits = Sse2.MoveMask(block);

                if (signBits == 0)
                {
                    // Pure ASCII block: check whether any of the 16 bytes need to be escaped.
                    Vector128<sbyte> escapeMask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(block);
                    int escapeBits = Sse2.MoveMask(escapeMask);

                    // Zero means none of the 16 bytes needed escaping. TrailingZeroCount is
                    // relatively expensive, so it is only computed when there is a hit.
                    if (escapeBits != 0)
                    {
                        // The lowest set bit identifies the first byte in the block that needs escaping.
                        Debug.Assert(escapeBits > 0 && escapeBits <= 65_535);
                        int firstHit = BitOperations.TrailingZeroCount(escapeBits);
                        Debug.Assert(firstHit >= 0 && firstHit <= 16);
                        return position + firstHit;
                    }

                    position += 16;
                    blockStart += 16;
                }
                else
                {
                    // At least one of the following 16 bytes is non-ASCII, so walk the block
                    // one scalar value at a time.
                    int blockEnd = position + 16;
                    Debug.Assert(blockEnd <= utf8Text.Length);

                    while (position < blockEnd)
                    {
                        Debug.Assert((pText + position) <= (pText + utf8Text.Length));

                        if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
                        {
                            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                                utf8Text.Slice(position), out uint scalar, out int scalarLength);

                            Debug.Assert(scalar <= int.MaxValue);
                            if (status != OperationStatus.Done || WillEncode((int)scalar))
                            {
                                return position;
                            }

                            Debug.Assert(status == OperationStatus.Done);
                            position += scalarLength;
                        }
                        else
                        {
                            if (!_allowedCharacters.IsUnicodeScalarAllowed(pText[position]))
                            {
                                return position;
                            }

                            position++;
                        }
                    }

                    // Decoding may have moved past the block boundary; re-derive the load address.
                    blockStart = (sbyte*)pText + position;
                }
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(utf8Text.Length - position < 16);
        }
#endif

        // Scalar loop over whatever the vectorized path did not consume.
        while (position < utf8Text.Length)
        {
            Debug.Assert((pText + position) <= (pText + utf8Text.Length));

            if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
            {
                OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                    utf8Text.Slice(position), out uint scalar, out int scalarLength);

                Debug.Assert(scalar <= int.MaxValue);
                if (status != OperationStatus.Done || WillEncode((int)scalar))
                {
                    return position;
                }

                Debug.Assert(status == OperationStatus.Done);
                position += scalarLength;
            }
            else
            {
                if (!_allowedCharacters.IsUnicodeScalarAllowed(pText[position]))
                {
                    return position;
                }

                position++;
            }
        }

        Debug.Assert(position == utf8Text.Length);
        return -1; // All bytes are allowed.
    }
}
/// <summary>
/// Finds the first position in <paramref name="utf8Text"/> that cannot be copied through unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 text to scan.</param>
/// <returns>
/// The index of the first byte that either begins a subsequence which does not decode with
/// <see cref="OperationStatus.Done"/> or begins a scalar value that must be encoded;
/// -1 if the whole text can be copied as-is without escaping.
/// </returns>
public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    // The scan stops at the first ill-formed UTF-8 subsequence or the first scalar value that
    // must be encoded, and reports that position relative to the start of the input.

    fixed (byte* pText = utf8Text)
    {
        int position = 0;

#if NETCOREAPP
        if (Sse2.IsSupported)
        {
            sbyte* blockStart = (sbyte*)pText;

            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (utf8Text.Length - 16 >= position)
            {
                Debug.Assert(blockStart >= pText && blockStart <= (pText + utf8Text.Length - 16));

                // Load the next 16 bytes.
                Vector128<sbyte> block = Sse2.LoadVector128(blockStart);

                // A non-ASCII byte is negative when viewed as a signed byte, so the
                // per-byte sign bits gathered here are non-zero iff the block has one.
                int signBits = Sse2.MoveMask(block);

                if (signBits == 0)
                {
                    // Pure ASCII block. The sixteen checks are written out explicitly; each
                    // ++position leaves position on the byte just tested, so a hit returns
                    // that byte's index.
                    if (DoesAsciiNeedEncoding(pText[position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1)
                    {
                        return position;
                    }

                    // Step past the sixteenth byte.
                    position++;
                }
                else
                {
                    // At least one of the following 16 bytes is non-ASCII, so walk the block
                    // one scalar value at a time.
                    int blockEnd = position + 16;
                    Debug.Assert(blockEnd <= utf8Text.Length);

                    while (position < blockEnd)
                    {
                        Debug.Assert((pText + position) <= (pText + utf8Text.Length));

                        if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
                        {
                            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                                utf8Text.Slice(position), out uint scalar, out int scalarLength);

                            Debug.Assert(scalar <= int.MaxValue);
                            if (status != OperationStatus.Done || WillEncode((int)scalar))
                            {
                                return position;
                            }

                            Debug.Assert(status == OperationStatus.Done);
                            position += scalarLength;
                        }
                        else
                        {
                            if (DoesAsciiNeedEncoding(pText[position]) == 1)
                            {
                                return position;
                            }

                            position++;
                        }
                    }
                }

                blockStart = (sbyte*)pText + position;
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(utf8Text.Length - position < 16);
        }
#endif

        // Scalar loop over whatever the vectorized path did not consume.
        while (position < utf8Text.Length)
        {
            Debug.Assert((pText + position) <= (pText + utf8Text.Length));

            if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
            {
                OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                    utf8Text.Slice(position), out uint scalar, out int scalarLength);

                Debug.Assert(scalar <= int.MaxValue);
                if (status != OperationStatus.Done || WillEncode((int)scalar))
                {
                    return position;
                }

                Debug.Assert(status == OperationStatus.Done);
                position += scalarLength;
            }
            else
            {
                if (DoesAsciiNeedEncoding(pText[position]) == 1)
                {
                    return position;
                }

                position++;
            }
        }

        Debug.Assert(position == utf8Text.Length);
        return -1; // All bytes are allowed.
    }
}
/// <summary>
/// Finds the first position in <paramref name="utf8Text"/> that cannot be copied through unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 text to scan.</param>
/// <returns>
/// The index of the first byte that either begins a subsequence which does not decode with
/// <see cref="OperationStatus.Done"/> or begins a scalar value that must be encoded;
/// -1 if every byte is allowed.
/// </returns>
/// <exception cref="PlatformNotSupportedException">
/// Thrown from the vectorized path if neither Sse2 nor AdvSimd.Arm64 reports support inside the loop.
/// </exception>
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* pText = utf8Text)
    {
        int position = 0;

#if NETCOREAPP
        if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
        {
            sbyte* blockStart = (sbyte*)pText;

            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (utf8Text.Length - 16 >= position)
            {
                Debug.Assert(blockStart >= pText && blockStart <= (pText + utf8Text.Length - 16));

                // Load the next 16 bytes with whichever instruction set is available and ask
                // the matching helper whether the block contains a non-ASCII byte.
                bool hasNonAscii;

                if (Sse2.IsSupported)
                {
                    Vector128<sbyte> block = Sse2.LoadVector128(blockStart);
                    hasNonAscii = Sse2Helper.ContainsNonAsciiByte(block);
                }
                else if (AdvSimd.Arm64.IsSupported)
                {
                    Vector128<sbyte> block = AdvSimd.LoadVector128(blockStart);
                    hasNonAscii = AdvSimdHelper.ContainsNonAsciiByte(block);
                }
                else
                {
                    throw new PlatformNotSupportedException();
                }

                if (!hasNonAscii)
                {
                    // Pure ASCII block. The sixteen checks are written out explicitly; each
                    // ++position leaves position on the byte just tested, so a hit returns
                    // that byte's index.
                    if (DoesAsciiNeedEncoding(pText[position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1
                        || DoesAsciiNeedEncoding(pText[++position]) == 1)
                    {
                        return position;
                    }

                    // Step past the sixteenth byte.
                    position++;
                }
                else
                {
                    // At least one of the following 16 bytes is non-ASCII, so walk the block
                    // one scalar value at a time.
                    int blockEnd = position + 16;
                    Debug.Assert(blockEnd <= utf8Text.Length);

                    while (position < blockEnd)
                    {
                        Debug.Assert((pText + position) <= (pText + utf8Text.Length));

                        if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
                        {
                            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                                utf8Text.Slice(position), out uint scalar, out int scalarLength);

                            Debug.Assert(scalar <= int.MaxValue);
                            if (status != OperationStatus.Done || WillEncode((int)scalar))
                            {
                                return position;
                            }

                            Debug.Assert(status == OperationStatus.Done);
                            position += scalarLength;
                        }
                        else
                        {
                            if (DoesAsciiNeedEncoding(pText[position]) == 1)
                            {
                                return position;
                            }

                            position++;
                        }
                    }
                }

                blockStart = (sbyte*)pText + position;
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(utf8Text.Length - position < 16);
        }
#endif

        // Scalar loop over whatever the vectorized path did not consume.
        while (position < utf8Text.Length)
        {
            Debug.Assert((pText + position) <= (pText + utf8Text.Length));

            if (!UnicodeUtility.IsAsciiCodePoint(pText[position]))
            {
                OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                    utf8Text.Slice(position), out uint scalar, out int scalarLength);

                Debug.Assert(scalar <= int.MaxValue);
                if (status != OperationStatus.Done || WillEncode((int)scalar))
                {
                    return position;
                }

                Debug.Assert(status == OperationStatus.Done);
                position += scalarLength;
            }
            else
            {
                if (DoesAsciiNeedEncoding(pText[position]) == 1)
                {
                    return position;
                }

                position++;
            }
        }

        Debug.Assert(position == utf8Text.Length);
        return -1; // All bytes are allowed.
    }
}