/// <summary>
/// Locates the first position in <paramref name="utf8Text"/> that cannot be copied verbatim.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index of the first byte that either must be escaped or starts a sequence the decoder
/// did not report as <see cref="OperationStatus.Done"/>; -1 when no such byte exists.
/// </returns>
public virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    // A single absolute cursor is kept, so the value returned is always an index into the
    // caller's original span.
    int position = 0;

    while (position < utf8Text.Length)
    {
        byte current = utf8Text[position];

        if (UnicodeUtility.IsAsciiCodePoint(current))
        {
            // Anything other than the shared "no escape" marker means this byte stops the scan.
            if (!ReferenceEquals(GetAsciiEncoding(current), s_noEscape))
            {
                return position;
            }

            position++;
            continue;
        }

        // Non-ASCII lead byte: decode one scalar starting at the cursor.
        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
            utf8Text.Slice(position), out uint scalar, out int scalarLength);

        if (status != OperationStatus.Done)
        {
            return position;
        }

        if (WillEncode((int)scalar))
        {
            return position;
        }

        position += scalarLength;
    }

    // Every byte was inspected and none needs escaping.
    return -1;
}
/// <summary>
/// Decodes <paramref name="utf8Text"/> one scalar value at a time and reports where the first
/// problematic scalar begins.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index of the first sequence that the decoder did not report as
/// <see cref="OperationStatus.Done"/> or whose scalar value <c>WillEncode</c> flags;
/// -1 when the whole input can be copied as-is.
/// </returns>
internal virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    int offset = 0;

    while (offset < utf8Text.Length)
    {
        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
            utf8Text.Slice(offset), out uint scalar, out int scalarLength);

        // Stop at the first ill-formed sequence...
        if (status != OperationStatus.Done)
        {
            return offset;
        }

        // ...or at the first well-formed scalar that requires escaping.
        if (WillEncode((int)scalar))
        {
            return offset;
        }

        offset += scalarLength;
    }

    return -1; // no input data needs to be escaped
}
/// <summary>
/// Scans <paramref name="utf8Text"/> and reports the position of the first byte that cannot be
/// copied to the output unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index of the first byte that needs escaping or that begins a sequence the decoder did not
/// report as <see cref="OperationStatus.Done"/>; -1 when every byte is allowed.
/// </returns>
public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
{
    // The ASCII lookup state is built lazily on first use.
    if (!_isAsciiCacheInitialized)
    {
        InitializeAsciiCache();
    }

    // Loop through the input text, terminating when we see ill-formed UTF-8 or when we decode a scalar value
    // that must be encoded. If we see either of these things then we'll return its index in the original
    // input sequence. If we consume the entire text without seeing either of these, return -1 to indicate
    // that the text can be copied as-is without escaping.
    fixed(byte *ptr = utf8Text)
    {
        // idx is the absolute index of the next byte to examine; it is also the value returned
        // through the Return label (plus a fixed offset for Return1..Return15).
        int idx = 0;

#if NETCOREAPP
        // Vectorized path: only entered when at least one full 16-byte block is available.
        if ((Sse2.IsSupported || AdvSimd.Arm64.IsSupported) && utf8Text.Length - 16 >= idx)
        {
            // Hoist these outside the loop, as the JIT won't do it.
            Vector128 <sbyte> bitMaskLookupAsciiNeedsEscaping = _bitMaskLookupAsciiNeedsEscaping;
            Vector128 <sbyte> bitPosLookup = Ssse3Helper.s_bitPosLookup;
            Vector128 <sbyte> nibbleMaskSByte = Ssse3Helper.s_nibbleMaskSByte;
            Vector128 <sbyte> nullMaskSByte = Ssse3Helper.s_nullMaskSByte;

            sbyte *startingAddress = (sbyte *)ptr;
            do
            {
                Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                // Load the next 16 bytes.
                Vector128 <sbyte> sourceValue;
                bool containsNonAsciiBytes;

                // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                // casted to signed byte.
                if (Sse2.IsSupported)
                {
                    sourceValue = Sse2.LoadVector128(startingAddress);
                    containsNonAsciiBytes = Sse2Helper.ContainsNonAsciiByte(sourceValue);
                }
                else if (AdvSimd.Arm64.IsSupported)
                {
                    sourceValue = AdvSimd.LoadVector128(startingAddress);
                    containsNonAsciiBytes = AdvSimdHelper.ContainsNonAsciiByte(sourceValue);
                }
                else
                {
                    // Not reachable through the guard above, which requires one of the two instruction sets.
                    throw new PlatformNotSupportedException();
                }

                if (!containsNonAsciiBytes)
                {
                    // All of the following 16 bytes is ASCII.
                    // TODO AdvSimd: optimization maybe achievable using VectorTableLookup and/or VectorTableLookupExtension
                    if (Ssse3.IsSupported)
                    {
                        Vector128 <sbyte> mask = Ssse3Helper.CreateEscapingMask(sourceValue, bitMaskLookupAsciiNeedsEscaping, bitPosLookup, nibbleMaskSByte, nullMaskSByte);
                        // NOTE(review): presumably the helper yields 16 when no lane of the mask is set - confirm.
                        int index = Sse2Helper.GetIndexOfFirstNonAsciiByte(mask.AsByte());
                        if (index < 16)
                        {
                            idx += index;
                            goto Return;
                        }
                    }
                    else
                    {
                        // Without SSSE3 each of the 16 bytes is tested individually. idx is left at the
                        // start of the block, so the ReturnN labels add the byte's offset N within it.
                        byte *p = (byte *)startingAddress;
                        if (DoesAsciiNeedEncoding(p[0])) { goto Return; }
                        if (DoesAsciiNeedEncoding(p[1])) { goto Return1; }
                        if (DoesAsciiNeedEncoding(p[2])) { goto Return2; }
                        if (DoesAsciiNeedEncoding(p[3])) { goto Return3; }
                        if (DoesAsciiNeedEncoding(p[4])) { goto Return4; }
                        if (DoesAsciiNeedEncoding(p[5])) { goto Return5; }
                        if (DoesAsciiNeedEncoding(p[6])) { goto Return6; }
                        if (DoesAsciiNeedEncoding(p[7])) { goto Return7; }
                        if (DoesAsciiNeedEncoding(p[8])) { goto Return8; }
                        if (DoesAsciiNeedEncoding(p[9])) { goto Return9; }
                        if (DoesAsciiNeedEncoding(p[10])) { goto Return10; }
                        if (DoesAsciiNeedEncoding(p[11])) { goto Return11; }
                        if (DoesAsciiNeedEncoding(p[12])) { goto Return12; }
                        if (DoesAsciiNeedEncoding(p[13])) { goto Return13; }
                        if (DoesAsciiNeedEncoding(p[14])) { goto Return14; }
                        if (DoesAsciiNeedEncoding(p[15])) { goto Return15; }
                    }

                    // The whole block is clean; move on to the next one.
                    idx += 16;
                }
                else
                {
                    // At least one of the following 16 bytes is non-ASCII.
                    // Walk this block scalar by scalar. A multi-byte sequence that starts inside the
                    // block may push idx past processNextSixteen; the next block then starts at idx.
                    int processNextSixteen = idx + 16;
                    Debug.Assert(processNextSixteen <= utf8Text.Length);

                    while (idx < processNextSixteen)
                    {
                        Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                        if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                        {
                            if (DoesAsciiNeedEncoding(ptr[idx]))
                            {
                                goto Return;
                            }
                            idx++;
                        }
                        else
                        {
                            OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                            Debug.Assert(nextScalarValue <= int.MaxValue);
                            if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                            {
                                goto Return;
                            }

                            Debug.Assert(opStatus == OperationStatus.Done);
                            idx += utf8BytesConsumedForScalar;
                        }
                    }
                }
                startingAddress = (sbyte *)ptr + idx;
            }while (utf8Text.Length - 16 >= idx);

            // Process the remaining bytes.
            Debug.Assert(utf8Text.Length - idx < 16);
        }
#endif

        // Scalar tail: handles the final (fewer than 16) bytes, or the entire input when the
        // vectorized path above was not compiled in or not taken.
        while (idx < utf8Text.Length)
        {
            Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

            if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
            {
                if (DoesAsciiNeedEncoding(ptr[idx]))
                {
                    goto Return;
                }
                idx++;
            }
            else
            {
                OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                Debug.Assert(nextScalarValue <= int.MaxValue);
                if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                {
                    goto Return;
                }

                Debug.Assert(opStatus == OperationStatus.Done);
                idx += utf8BytesConsumedForScalar;
            }
        }
        Debug.Assert(idx == utf8Text.Length);

        idx = -1; // All bytes are allowed.
        goto Return;

#if NETCOREAPP
        // Exit ladder for the per-byte ASCII checks: idx is the block start, N the byte's offset in the block.
    Return15: return(idx + 15);
    Return14: return(idx + 14);
    Return13: return(idx + 13);
    Return12: return(idx + 12);
    Return11: return(idx + 11);
    Return10: return(idx + 10);
    Return9: return(idx + 9);
    Return8: return(idx + 8);
    Return7: return(idx + 7);
    Return6: return(idx + 6);
    Return5: return(idx + 5);
    Return4: return(idx + 4);
    Return3: return(idx + 3);
    Return2: return(idx + 2);
    Return1: return(idx + 1);
#endif
    Return: return(idx);
    }
}
/// <summary>
/// Encodes the supplied UTF-8 text.
/// </summary>
/// <param name="utf8Source">A source buffer containing the UTF-8 text to encode.</param>
/// <param name="utf8Destination">The destination buffer to which the encoded form of <paramref name="utf8Source"/>
/// will be written.</param>
/// <param name="bytesConsumed">The number of bytes consumed from the <paramref name="utf8Source"/> buffer.</param>
/// <param name="bytesWritten">The number of bytes written to the <paramref name="utf8Destination"/> buffer.</param>
/// <param name="isFinalBlock"><see langword="true"/> if there is no further source data that needs to be encoded
/// (an incomplete trailing sequence is then handled like any other invalid subsequence);
/// <see langword="false"/> if more source data will follow (an incomplete trailing sequence then yields
/// <see cref="OperationStatus.NeedMoreData"/>).</param>
/// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation.</returns>
/// <remarks>The buffers <paramref name="utf8Source"/> and <paramref name="utf8Destination"/> must not overlap.</remarks>
public unsafe virtual OperationStatus EncodeUtf8( ReadOnlySpan <byte> utf8Source, Span <byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
{
    // Both spans are re-sliced as data is processed; the original lengths are kept so the
    // consumed/written counts can be computed as differences on exit.
    int originalUtf8SourceLength = utf8Source.Length;
    int originalUtf8DestinationLength = utf8Destination.Length;

    const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
    char * pTempCharBuffer = stackalloc char[TempUtf16CharBufferLength];

    const int TempUtf8ByteBufferLength = TempUtf16CharBufferLength * 3 /* max UTF-8 output code units per UTF-16 input code unit */;
    byte * pTempUtf8Buffer = stackalloc byte[TempUtf8ByteBufferLength];

    uint nextScalarValue;
    int utf8BytesConsumedForScalar = 0;
    // Number of bytes at the front of utf8Source that were inspected and found copyable but
    // have not been copied to the destination yet.
    int nonEscapedByteCount = 0;
    OperationStatus opStatus = OperationStatus.Done;

    while (!utf8Source.IsEmpty)
    {
        // For performance, read until we require escaping.
        do
        {
            nextScalarValue = utf8Source[nonEscapedByteCount];
            if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
            {
                // Check Ascii cache.
                byte[]? encodedBytes = GetAsciiEncoding((byte)nextScalarValue);

                if (ReferenceEquals(encodedBytes, s_noEscape))
                {
                    // The byte only joins the pending run if the destination can hold the whole run.
                    if (++nonEscapedByteCount <= utf8Destination.Length)
                    {
                        // Source data can be copied as-is.
                        continue;
                    }

                    // Undo the increment so only bytes that fit are flushed below.
                    --nonEscapedByteCount;
                    opStatus = OperationStatus.DestinationTooSmall;
                    break;
                }

                if (encodedBytes == null)
                {
                    // We need to escape and update the cache, so break out of this loop.
                    opStatus = OperationStatus.Done;
                    utf8BytesConsumedForScalar = 1;
                    break;
                }

                // For performance, handle the non-escaped bytes and encoding here instead of breaking out of the loop.
                if (nonEscapedByteCount > 0)
                {
                    // We previously verified the destination size.
                    Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);
                    utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
                    utf8Source = utf8Source.Slice(nonEscapedByteCount);
                    utf8Destination = utf8Destination.Slice(nonEscapedByteCount);
                    nonEscapedByteCount = 0;
                }

                // Write the cached escape sequence for this ASCII byte.
                if (!((ReadOnlySpan <byte>)encodedBytes).TryCopyTo(utf8Destination))
                {
                    opStatus = OperationStatus.DestinationTooSmall;
                    break;
                }

                utf8Destination = utf8Destination.Slice(encodedBytes.Length);
                utf8Source = utf8Source.Slice(1);
                continue;
            }

            // Code path for non-Ascii.
            opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Source.Slice(nonEscapedByteCount), out nextScalarValue, out utf8BytesConsumedForScalar);
            if (opStatus == OperationStatus.Done)
            {
                if (!WillEncode((int)nextScalarValue))
                {
                    nonEscapedByteCount += utf8BytesConsumedForScalar;
                    if (nonEscapedByteCount <= utf8Destination.Length)
                    {
                        // Source data can be copied as-is.
                        continue;
                    }

                    // The scalar does not fit; roll the pending count back to exclude it.
                    nonEscapedByteCount -= utf8BytesConsumedForScalar;
                    opStatus = OperationStatus.DestinationTooSmall;
                }
            }

            // We need to escape.
            break;
        } while (nonEscapedByteCount < utf8Source.Length);

        // Flush whatever copyable run was accumulated by the loop above.
        if (nonEscapedByteCount > 0)
        {
            // We previously verified the destination size.
            Debug.Assert(nonEscapedByteCount <= utf8Destination.Length);
            utf8Source.Slice(0, nonEscapedByteCount).CopyTo(utf8Destination);
            utf8Source = utf8Source.Slice(nonEscapedByteCount);
            utf8Destination = utf8Destination.Slice(nonEscapedByteCount);
            nonEscapedByteCount = 0;
        }

        if (utf8Source.IsEmpty)
        {
            goto Done;
        }

        // This code path is hit for ill-formed input data (where decoding has replaced it with U+FFFD)
        // and for well-formed input data that must be escaped.

        if (opStatus != OperationStatus.Done) // Optimize happy path.
        {
            if (opStatus == OperationStatus.NeedMoreData)
            {
                if (!isFinalBlock)
                {
                    bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
                    bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
                    return(OperationStatus.NeedMoreData);
                }
                // else treat this as a normal invalid subsequence.
            }
            else if (opStatus == OperationStatus.DestinationTooSmall)
            {
                goto ReturnDestinationTooSmall;
            }
        }

        if (TryEncodeUnicodeScalar((int)nextScalarValue, pTempCharBuffer, TempUtf16CharBufferLength, out int charsWrittenJustNow))
        {
            // Now that we have it as UTF-16, transcode it to UTF-8.
            // Need to copy it to a temporary buffer first, otherwise GetBytes might throw an exception
            // due to lack of output space.

            int transcodedByteCountThisIteration = Encoding.UTF8.GetBytes(pTempCharBuffer, charsWrittenJustNow, pTempUtf8Buffer, TempUtf8ByteBufferLength);
            ReadOnlySpan <byte> transcodedUtf8BytesThisIteration = new ReadOnlySpan <byte>(pTempUtf8Buffer, transcodedByteCountThisIteration);

            // Update cache for Ascii
            // The cache entry is stored before the copy below, so it is populated even when the
            // destination turns out to be too small.
            if (UnicodeUtility.IsAsciiCodePoint(nextScalarValue))
            {
                _asciiEscape[nextScalarValue] = transcodedUtf8BytesThisIteration.ToArray();
            }

            if (!transcodedUtf8BytesThisIteration.TryCopyTo(utf8Destination))
            {
                goto ReturnDestinationTooSmall;
            }

            utf8Destination = utf8Destination.Slice(transcodedByteCountThisIteration);
        }
        else
        {
            // We really don't expect this to fail. If that happens we'll report an error to our caller.
            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
            bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
            return(OperationStatus.InvalidData);
        }

        // Step over the source bytes that produced the escape sequence just written.
        utf8Source = utf8Source.Slice(utf8BytesConsumedForScalar);
    }

Done:
    // Input buffer has been fully processed!
    bytesConsumed = originalUtf8SourceLength;
    bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
    return(OperationStatus.Done);

ReturnDestinationTooSmall:
    bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
    bytesWritten = originalUtf8DestinationLength - utf8Destination.Length;
    return(OperationStatus.DestinationTooSmall);
}
/// <summary>
/// Finds the first byte of <paramref name="utf8Text"/> that cannot be emitted unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index of the first byte that needs escaping or begins a sequence the decoder did not
/// report as <see cref="OperationStatus.Done"/>; -1 when all bytes are allowed.
/// </returns>
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* ptr = utf8Text)
    {
        int length = utf8Text.Length;
        int idx = 0;

#if NETCOREAPP
        if (Sse2.IsSupported)
        {
            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (idx <= length - 16)
            {
                sbyte* blockStart = (sbyte*)ptr + idx;
                Debug.Assert(blockStart >= ptr && blockStart <= (ptr + length - 16));

                // A non-zero bit pattern means the block holds at least one non-ASCII byte.
                int nonAsciiBits = Sse2.MoveMask(Sse2Helper.CreateAsciiMask(Sse2.LoadVector128(blockStart)));
                int blockEnd = idx + 16;

                if (nonAsciiBits == 0)
                {
                    // Pure ASCII block: test each of the 16 bytes in order.
                    for (; idx < blockEnd; idx++)
                    {
                        if (DoesAsciiNeedEncoding(ptr[idx]) == 1)
                        {
                            return idx;
                        }
                    }

                    continue;
                }

                // Mixed block: walk it scalar by scalar. A multi-byte sequence may carry idx
                // beyond blockEnd, in which case the next block simply starts there.
                Debug.Assert(blockEnd <= length);
                while (idx < blockEnd)
                {
                    Debug.Assert((ptr + idx) <= (ptr + length));

                    byte current = ptr[idx];
                    if (UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return idx;
                        }

                        idx++;
                        continue;
                    }

                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                        utf8Text.Slice(idx), out uint scalar, out int scalarLength);

                    Debug.Assert(scalar <= int.MaxValue);
                    if (status != OperationStatus.Done)
                    {
                        return idx;
                    }

                    if (WillEncode((int)scalar))
                    {
                        return idx;
                    }

                    idx += scalarLength;
                }
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(length - idx < 16);
        }
#endif

        // Scalar loop: the tail of the input, or all of it when the vector path is unavailable.
        while (idx < length)
        {
            Debug.Assert((ptr + idx) <= (ptr + length));

            byte current = ptr[idx];
            if (UnicodeUtility.IsAsciiCodePoint(current))
            {
                if (DoesAsciiNeedEncoding(current) == 1)
                {
                    return idx;
                }

                idx++;
                continue;
            }

            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                utf8Text.Slice(idx), out uint scalar, out int scalarLength);

            Debug.Assert(scalar <= int.MaxValue);
            if (status != OperationStatus.Done)
            {
                return idx;
            }

            if (WillEncode((int)scalar))
            {
                return idx;
            }

            idx += scalarLength;
        }

        // All bytes are allowed.
        return -1;
    }
}
/// <summary>
/// Finds the first byte of <paramref name="utf8Text"/> that cannot be emitted unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index of the first byte that is not allowed or begins a sequence the decoder did not
/// report as <see cref="OperationStatus.Done"/>; -1 when all bytes are allowed.
/// </returns>
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* ptr = utf8Text)
    {
        int length = utf8Text.Length;
        int idx = 0;

#if NETCOREAPP
        if (Sse2.IsSupported)
        {
            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (idx <= length - 16)
            {
                sbyte* blockStart = (sbyte*)ptr + idx;
                Debug.Assert(blockStart >= ptr && blockStart <= (ptr + length - 16));

                Vector128<sbyte> block = Sse2.LoadVector128(blockStart);

                // Non-ASCII bytes are negative as signed bytes, so their sign bits show up here.
                int nonAsciiBits = Sse2.MoveMask(block);

                if (nonAsciiBits == 0)
                {
                    // Pure ASCII block: one vector test tells whether any byte needs escaping.
                    int escapeBits = Sse2.MoveMask(Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(block));

                    // TrailingZeroCount is only paid for when something was actually found.
                    if (escapeBits != 0)
                    {
                        Debug.Assert(escapeBits > 0 && escapeBits <= 65_535);
                        int firstHit = BitOperations.TrailingZeroCount(escapeBits);
                        Debug.Assert(firstHit >= 0 && firstHit <= 16);
                        return idx + firstHit;
                    }

                    idx += 16;
                    continue;
                }

                // Mixed block: walk it scalar by scalar. A multi-byte sequence may carry idx
                // beyond blockEnd, in which case the next block simply starts there.
                int blockEnd = idx + 16;
                Debug.Assert(blockEnd <= length);

                while (idx < blockEnd)
                {
                    Debug.Assert((ptr + idx) <= (ptr + length));

                    byte current = ptr[idx];
                    if (UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        if (!_allowedCharacters.IsUnicodeScalarAllowed(current))
                        {
                            return idx;
                        }

                        idx++;
                        continue;
                    }

                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                        utf8Text.Slice(idx), out uint scalar, out int scalarLength);

                    Debug.Assert(scalar <= int.MaxValue);
                    if (status != OperationStatus.Done)
                    {
                        return idx;
                    }

                    if (WillEncode((int)scalar))
                    {
                        return idx;
                    }

                    idx += scalarLength;
                }
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(length - idx < 16);
        }
#endif

        // Scalar loop: the tail of the input, or all of it when the vector path is unavailable.
        while (idx < length)
        {
            Debug.Assert((ptr + idx) <= (ptr + length));

            byte current = ptr[idx];
            if (UnicodeUtility.IsAsciiCodePoint(current))
            {
                if (!_allowedCharacters.IsUnicodeScalarAllowed(current))
                {
                    return idx;
                }

                idx++;
                continue;
            }

            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                utf8Text.Slice(idx), out uint scalar, out int scalarLength);

            Debug.Assert(scalar <= int.MaxValue);
            if (status != OperationStatus.Done)
            {
                return idx;
            }

            if (WillEncode((int)scalar))
            {
                return idx;
            }

            idx += scalarLength;
        }

        Debug.Assert(idx == length);

        // All bytes are allowed.
        return -1;
    }
}
/// <summary>
/// Encodes the supplied UTF-8 text.
/// </summary>
/// <param name="utf8Source">A source buffer containing the UTF-8 text to encode.</param>
/// <param name="utf8Destination">The destination buffer that receives the encoded form of
/// <paramref name="utf8Source"/>.</param>
/// <param name="bytesConsumed">The number of bytes consumed from <paramref name="utf8Source"/>.</param>
/// <param name="bytesWritten">The number of bytes written to <paramref name="utf8Destination"/>.</param>
/// <param name="isFinalBlock"><see langword="true"/> if no further source data will follow, so an
/// incomplete trailing sequence is handled like any other invalid subsequence;
/// <see langword="false"/> if more data will follow, so an incomplete trailing sequence yields
/// <see cref="OperationStatus.NeedMoreData"/>.</param>
/// <returns>An <see cref="OperationStatus"/> describing the result of the encoding operation.</returns>
/// <remarks>The buffers <paramref name="utf8Source"/> and <paramref name="utf8Destination"/> must not overlap.</remarks>
internal unsafe virtual OperationStatus EncodeUtf8(ReadOnlySpan<byte> utf8Source, Span<byte> utf8Destination, out int bytesConsumed, out int bytesWritten, bool isFinalBlock = true)
{
    // Fast path: measure the leading run that needs no escaping and copy it in one go.
    int verbatimPrefixLength = FindFirstCharacterToEncodeUtf8(utf8Source);
    if (verbatimPrefixLength < 0)
    {
        verbatimPrefixLength = utf8Source.Length;
    }

    if (!utf8Source.Slice(0, verbatimPrefixLength).TryCopyTo(utf8Destination))
    {
        // The destination cannot hold the whole run. Copy as much as fits, but never cut a
        // multi-byte UTF-8 sequence in half: a later call on the remainder would otherwise start
        // in the middle of a sequence. Looking one byte past the destination size (safe, since
        // the run is longer than the destination) and backing up to the nearest byte that is not
        // a continuation byte yields the largest safe cut point.
        ReadOnlySpan<byte> candidate = utf8Source.Slice(0, utf8Destination.Length + 1);

        int cut = candidate.Length - 1;
        while (cut >= 0 && UnicodeHelpers.IsUtf8ContinuationByte(in candidate[cut]))
        {
            cut--;
        }

        if (cut >= 0)
        {
            candidate.Slice(0, cut).CopyTo(utf8Destination);
        }
        else
        {
            // Only continuation bytes were seen: either the input was mutated underneath us or an
            // override of FindFirstCharacterToEncodeUtf8 skipped over ill-formed data. No partial
            // copy is attempted; the caller can recover by supplying a destination at least as
            // large as the source.
            cut = 0;
        }

        bytesConsumed = cut;
        bytesWritten = cut;
        return OperationStatus.DestinationTooSmall;
    }

    // The entire input was copied verbatim.
    if (verbatimPrefixLength == utf8Source.Length)
    {
        bytesConsumed = verbatimPrefixLength;
        bytesWritten = verbatimPrefixLength;
        return OperationStatus.Done;
    }

    // Slow path: something has to be escaped, so continue scalar by scalar after the prefix.
    int totalSourceLength = utf8Source.Length;
    int totalDestinationLength = utf8Destination.Length;

    utf8Source = utf8Source.Slice(verbatimPrefixLength);
    utf8Destination = utf8Destination.Slice(verbatimPrefixLength);

    const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
    char* utf16Scratch = stackalloc char[TempUtf16CharBufferLength];

    const int TempUtf8ByteBufferLength = TempUtf16CharBufferLength * 3; // max UTF-8 output code units per UTF-16 input code unit
    byte* utf8Scratch = stackalloc byte[TempUtf8ByteBufferLength];

    OperationStatus result = OperationStatus.Done;

    while (!utf8Source.IsEmpty)
    {
        OperationStatus decodeStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(
            utf8Source, out uint scalar, out int sourceBytesUsed);

        // Only a successfully decoded scalar is ever asked whether it needs encoding.
        bool copyVerbatim = decodeStatus == OperationStatus.Done && !WillEncode((int)scalar);

        if (copyVerbatim)
        {
            // Source data can be copied as-is.
            if (!utf8Source.Slice(0, sourceBytesUsed).TryCopyTo(utf8Destination))
            {
                result = OperationStatus.DestinationTooSmall;
                break;
            }

            utf8Destination = utf8Destination.Slice(sourceBytesUsed);
        }
        else
        {
            // An incomplete trailing sequence is only an error once no more input is coming.
            if (decodeStatus == OperationStatus.NeedMoreData && !isFinalBlock)
            {
                result = OperationStatus.NeedMoreData;
                break;
            }

            // Reached for ill-formed input (where decoding has replaced it with U+FFFD) and for
            // well-formed input that must be escaped.
            if (!TryEncodeUnicodeScalar((int)scalar, utf16Scratch, TempUtf16CharBufferLength, out int utf16Count))
            {
                // Not expected to happen; surface it to the caller as an error.
                result = OperationStatus.InvalidData;
                break;
            }

            // Transcode the UTF-16 escape into a scratch buffer first; writing straight into the
            // destination could make GetBytes throw when there is too little room.
            int escapedLength = Encoding.UTF8.GetBytes(utf16Scratch, utf16Count, utf8Scratch, TempUtf8ByteBufferLength);
            ReadOnlySpan<byte> escaped = new ReadOnlySpan<byte>(utf8Scratch, escapedLength);

            if (!escaped.TryCopyTo(utf8Destination))
            {
                result = OperationStatus.DestinationTooSmall;
                break;
            }

            utf8Destination = utf8Destination.Slice(escapedLength);
        }

        utf8Source = utf8Source.Slice(sourceBytesUsed);
    }

    // Single exit: when the loop ran to completion the source is empty, so the consumed count
    // equals the full source length.
    bytesConsumed = totalSourceLength - utf8Source.Length;
    bytesWritten = totalDestinationLength - utf8Destination.Length;
    return result;
}
/// <summary>
/// Walks <paramref name="utf8Text"/> until it meets ill-formed UTF-8 or a scalar value that must
/// be encoded.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index, in the original input, of the first byte that needs escaping or begins a sequence
/// the decoder did not report as <see cref="OperationStatus.Done"/>; -1 when the text can be
/// copied as-is without escaping.
/// </returns>
public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* ptr = utf8Text)
    {
        int length = utf8Text.Length;
        int idx = 0;

#if NETCOREAPP
        if (Sse2.IsSupported)
        {
            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (idx <= length - 16)
            {
                sbyte* blockStart = (sbyte*)ptr + idx;
                Debug.Assert(blockStart >= ptr && blockStart <= (ptr + length - 16));

                // Non-ASCII bytes are negative as signed bytes, so their sign bits show up here.
                int nonAsciiBits = Sse2.MoveMask(Sse2.LoadVector128(blockStart));
                int blockEnd = idx + 16;

                if (nonAsciiBits == 0)
                {
                    // Pure ASCII block: test each of the 16 bytes in order.
                    for (; idx < blockEnd; idx++)
                    {
                        if (DoesAsciiNeedEncoding(ptr[idx]) == 1)
                        {
                            return idx;
                        }
                    }

                    continue;
                }

                // Mixed block: walk it scalar by scalar. A multi-byte sequence may carry idx
                // beyond blockEnd, in which case the next block simply starts there.
                Debug.Assert(blockEnd <= length);
                while (idx < blockEnd)
                {
                    Debug.Assert((ptr + idx) <= (ptr + length));

                    byte current = ptr[idx];
                    if (UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return idx;
                        }

                        idx++;
                        continue;
                    }

                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                        utf8Text.Slice(idx), out uint scalar, out int scalarLength);

                    Debug.Assert(scalar <= int.MaxValue);
                    if (status != OperationStatus.Done)
                    {
                        return idx;
                    }

                    if (WillEncode((int)scalar))
                    {
                        return idx;
                    }

                    idx += scalarLength;
                }
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(length - idx < 16);
        }
#endif

        // Scalar loop: the tail of the input, or all of it when the vector path is unavailable.
        while (idx < length)
        {
            Debug.Assert((ptr + idx) <= (ptr + length));

            byte current = ptr[idx];
            if (UnicodeUtility.IsAsciiCodePoint(current))
            {
                if (DoesAsciiNeedEncoding(current) == 1)
                {
                    return idx;
                }

                idx++;
                continue;
            }

            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                utf8Text.Slice(idx), out uint scalar, out int scalarLength);

            Debug.Assert(scalar <= int.MaxValue);
            if (status != OperationStatus.Done)
            {
                return idx;
            }

            if (WillEncode((int)scalar))
            {
                return idx;
            }

            idx += scalarLength;
        }

        Debug.Assert(idx == length);

        // All bytes are allowed.
        return -1;
    }
}
/// <summary>
/// Finds the first byte of <paramref name="utf8Text"/> that cannot be emitted unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index of the first byte that needs escaping or begins a sequence the decoder did not
/// report as <see cref="OperationStatus.Done"/>; -1 when all bytes are allowed.
/// </returns>
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* ptr = utf8Text)
    {
        int length = utf8Text.Length;
        int idx = 0;

#if NETCOREAPP
        if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
        {
            // Consume the input in 16-byte blocks for as long as a full block remains.
            while (idx <= length - 16)
            {
                sbyte* blockStart = (sbyte*)ptr + idx;
                Debug.Assert(blockStart >= ptr && blockStart <= (ptr + length - 16));

                // Load 16 bytes with whichever instruction set is present and ask whether any of
                // them falls outside the ASCII range.
                bool hasNonAscii;
                if (Sse2.IsSupported)
                {
                    hasNonAscii = Sse2Helper.ContainsNonAsciiByte(Sse2.LoadVector128(blockStart));
                }
                else if (AdvSimd.Arm64.IsSupported)
                {
                    hasNonAscii = AdvSimdHelper.ContainsNonAsciiByte(AdvSimd.LoadVector128(blockStart));
                }
                else
                {
                    throw new PlatformNotSupportedException();
                }

                int blockEnd = idx + 16;

                if (!hasNonAscii)
                {
                    // Pure ASCII block: test each of the 16 bytes in order.
                    for (; idx < blockEnd; idx++)
                    {
                        if (DoesAsciiNeedEncoding(ptr[idx]) == 1)
                        {
                            return idx;
                        }
                    }

                    continue;
                }

                // Mixed block: walk it scalar by scalar. A multi-byte sequence may carry idx
                // beyond blockEnd, in which case the next block simply starts there.
                Debug.Assert(blockEnd <= length);
                while (idx < blockEnd)
                {
                    Debug.Assert((ptr + idx) <= (ptr + length));

                    byte current = ptr[idx];
                    if (UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        if (DoesAsciiNeedEncoding(current) == 1)
                        {
                            return idx;
                        }

                        idx++;
                        continue;
                    }

                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                        utf8Text.Slice(idx), out uint scalar, out int scalarLength);

                    Debug.Assert(scalar <= int.MaxValue);
                    if (status != OperationStatus.Done)
                    {
                        return idx;
                    }

                    if (WillEncode((int)scalar))
                    {
                        return idx;
                    }

                    idx += scalarLength;
                }
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(length - idx < 16);
        }
#endif

        // Scalar loop: the tail of the input, or all of it when the vector path is unavailable.
        while (idx < length)
        {
            Debug.Assert((ptr + idx) <= (ptr + length));

            byte current = ptr[idx];
            if (UnicodeUtility.IsAsciiCodePoint(current))
            {
                if (DoesAsciiNeedEncoding(current) == 1)
                {
                    return idx;
                }

                idx++;
                continue;
            }

            OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(
                utf8Text.Slice(idx), out uint scalar, out int scalarLength);

            Debug.Assert(scalar <= int.MaxValue);
            if (status != OperationStatus.Done)
            {
                return idx;
            }

            if (WillEncode((int)scalar))
            {
                return idx;
            }

            idx += scalarLength;
        }

        Debug.Assert(idx == length);

        // All bytes are allowed.
        return -1;
    }
}