/// <summary>
/// Scans UTF-8 text for the first position that cannot be passed through unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The byte offset of the first ASCII byte rejected by <c>_allowedCharacters</c>, of the first
/// sequence that does not decode with <see cref="OperationStatus.Done"/>, or of the first
/// non-ASCII scalar for which <c>WillEncode</c> returns true; -1 when the scan reaches the end
/// of <paramref name="utf8Text"/> without finding any of those.
/// </returns>
public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    fixed (byte* pText = utf8Text)
    {
        int textLength = utf8Text.Length;
        int offset = 0;

#if NETCOREAPP
        if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
        {
            // Vectorized pass: look at 16 bytes at a time while a full chunk remains.
            while (offset <= textLength - 16)
            {
                sbyte* pChunk = (sbyte*)pText + offset;
                Debug.Assert(pChunk >= pText && pChunk <= (pText + textLength - 16));

                // Load the chunk and test for non-ASCII content. Viewed as signed bytes,
                // anything outside the ASCII range is negative.
                Vector128<sbyte> chunk;
                bool chunkHasNonAscii;
                if (Sse2.IsSupported)
                {
                    chunk = Sse2.LoadVector128(pChunk);
                    chunkHasNonAscii = Sse2Helper.ContainsNonAsciiByte(chunk);
                }
                else
                {
                    chunk = AdvSimd.LoadVector128(pChunk);
                    chunkHasNonAscii = AdvSimdHelper.ContainsNonAsciiByte(chunk);
                }

                if (!chunkHasNonAscii)
                {
                    // Pure ASCII chunk: ask the escaping mask for the first byte that must be escaped.
                    int firstEscaped;
                    if (Sse2.IsSupported)
                    {
                        firstEscaped = Sse2Helper.GetIndexOfFirstNonAsciiByte(
                            Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(chunk).AsByte());
                    }
                    else
                    {
                        firstEscaped = AdvSimdHelper.GetIndexOfFirstNonAsciiByte(
                            AdvSimdHelper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(chunk).AsByte());
                    }

                    // A result of 16 or more means nothing in this chunk needs escaping.
                    if (firstEscaped < 16)
                    {
                        return offset + firstEscaped;
                    }

                    offset += 16;
                    continue;
                }

                // Mixed chunk: walk it scalar by scalar. A multi-byte scalar may carry the
                // offset past the chunk boundary; the next iteration simply resumes from there.
                int chunkEnd = offset + 16;
                Debug.Assert(chunkEnd <= textLength);

                while (offset < chunkEnd)
                {
                    Debug.Assert((pText + offset) <= (pText + textLength));

                    byte current = pText[offset];
                    if (UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        if (!_allowedCharacters.IsUnicodeScalarAllowed(current))
                        {
                            return offset;
                        }

                        offset++;
                        continue;
                    }

                    OperationStatus decodeStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(
                        utf8Text.Slice(offset), out uint scalar, out int scalarByteCount);

                    Debug.Assert(scalar <= int.MaxValue);
                    if (decodeStatus != OperationStatus.Done || WillEncode((int)scalar))
                    {
                        return offset;
                    }

                    Debug.Assert(decodeStatus == OperationStatus.Done);
                    offset += scalarByteCount;
                }
            }

            // Fewer than 16 bytes are left for the scalar loop below.
            Debug.Assert(textLength - offset < 16);
        }
#endif

        // Scalar pass over whatever the vectorized pass did not cover.
        while (offset < textLength)
        {
            Debug.Assert((pText + offset) <= (pText + textLength));

            byte current = pText[offset];
            if (UnicodeUtility.IsAsciiCodePoint(current))
            {
                if (!_allowedCharacters.IsUnicodeScalarAllowed(current))
                {
                    return offset;
                }

                offset++;
                continue;
            }

            OperationStatus decodeStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(
                utf8Text.Slice(offset), out uint scalar, out int scalarByteCount);

            Debug.Assert(scalar <= int.MaxValue);
            if (decodeStatus != OperationStatus.Done || WillEncode((int)scalar))
            {
                return offset;
            }

            Debug.Assert(decodeStatus == OperationStatus.Done);
            offset += scalarByteCount;
        }

        Debug.Assert(offset == textLength);
        return -1; // Every byte is allowed.
    }
}
/// <summary>
/// Encodes <paramref name="value"/> with <paramref name="encoder"/> and returns the result as a string.
/// </summary>
/// <param name="encoder">The encoder that decides which characters are encoded and how.</param>
/// <param name="value">The text to encode. May be null.</param>
/// <returns>
/// <paramref name="value"/> itself when it is null or when
/// <c>encoder.FindFirstCharacterToEncode</c> reports -1; otherwise a new string holding the
/// encoded text.
/// </returns>
/// <exception cref="ArgumentException">
/// The output did not fit in <c>MaxOutputCharsPerInputChar * value.Length</c> characters.
/// </exception>
/// <exception cref="OverflowException">
/// <c>MaxOutputCharsPerInputChar * value.Length</c> does not fit in an <see cref="int"/>.
/// </exception>
public static string Encode(this TextEncoder encoder, string value)
{
    // Largest scratch buffer (in chars) that is placed on the stack; larger ones go to the heap
    // so that long inputs cannot overflow the stack.
    const int StackallocCharLimit = 1024;

    if (value == null)
    {
        return value;
    }

    unsafe
    {
        fixed (char* valuePointer = value)
        {
            int firstCharacterToEncode = encoder.FindFirstCharacterToEncode(valuePointer, value.Length);
            if (firstCharacterToEncode == -1)
            {
                // Nothing needs encoding; hand back the input unchanged.
                return value;
            }

            // checked: a wrapped-around product must not be used as a buffer size.
            int bufferSize = checked(encoder.MaxOutputCharsPerInputChar * value.Length);

            if (bufferSize <= StackallocCharLimit)
            {
                char* stackBuffer = stackalloc char[bufferSize];
                int stackWritten = EncodeIntoBuffer(encoder, value, valuePointer, firstCharacterToEncode, stackBuffer, bufferSize);
                return new string(stackBuffer, 0, stackWritten);
            }

            char[] heapBuffer = new char[bufferSize];
            fixed (char* heapPointer = heapBuffer)
            {
                int heapWritten = EncodeIntoBuffer(encoder, value, valuePointer, firstCharacterToEncode, heapPointer, bufferSize);
                return new string(heapPointer, 0, heapWritten);
            }
        }
    }
}

/// <summary>
/// Writes the encoded form of <paramref name="value"/> into <paramref name="buffer"/> and returns
/// the number of characters written.
/// </summary>
/// <param name="encoder">The encoder to consult.</param>
/// <param name="value">The text being encoded.</param>
/// <param name="valuePointer">Pointer to the pinned characters of <paramref name="value"/>.</param>
/// <param name="firstCharacterToEncode">Index of the first character that needs encoding; everything before it is copied verbatim.</param>
/// <param name="buffer">Destination buffer.</param>
/// <param name="bufferSize">Capacity of <paramref name="buffer"/> in characters.</param>
private static unsafe int EncodeIntoBuffer(TextEncoder encoder, string value, char* valuePointer, int firstCharacterToEncode, char* buffer, int bufferSize)
{
    int totalWritten = 0;

    if (firstCharacterToEncode > 0)
    {
        // Copy the untouched prefix in one go. The real remaining capacity is passed as the
        // destination size so MemoryCopy can reject a copy that would not fit.
        long bytesToCopy = (long)firstCharacterToEncode * sizeof(char);
        Buffer.MemoryCopy(valuePointer, buffer, (long)bufferSize * sizeof(char), bytesToCopy);
        totalWritten += firstCharacterToEncode;
        bufferSize -= firstCharacterToEncode;
        buffer += firstCharacterToEncode;
    }

    int valueIndex = firstCharacterToEncode;

    char firstChar = value[valueIndex];
    char secondChar = firstChar;
    bool wasSurrogatePair = false;
    int charsWritten;

    // This loop looks at characters two at a time (in case they form a surrogate pair) and
    // emits the first of the two. The index lives outside the loop because its final value
    // tells whether the last character is still pending (see below).
    int secondCharIndex;
    for (secondCharIndex = valueIndex + 1; secondCharIndex < value.Length; secondCharIndex++)
    {
        if (!wasSurrogatePair)
        {
            firstChar = secondChar;
        }
        else
        {
            // The previous step consumed two characters, so re-read from the string.
            firstChar = value[secondCharIndex - 1];
        }
        secondChar = value[secondCharIndex];

        if (!encoder.Encodes(firstChar))
        {
            wasSurrogatePair = false;
            if (bufferSize < 1)
            {
                // Never write past the end of the buffer, even for a misbehaving encoder.
                throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
            }

            *buffer = firstChar;
            buffer++;
            bufferSize--;
            totalWritten++;
        }
        else
        {
            int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair);
            if (!encoder.TryEncodeUnicodeScalar(nextScalar, buffer, bufferSize, out charsWritten))
            {
                throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
            }

            buffer += charsWritten;
            bufferSize -= charsWritten;
            totalWritten += charsWritten;
            if (wasSurrogatePair)
            {
                // Both halves of the pair were consumed; skip the low surrogate.
                secondCharIndex++;
            }
        }
    }

    // The loop only ever emits the first character of each pair it looks at, so the final
    // character is still pending unless a surrogate pair consumed it. In that case the index
    // ends one past value.Length. Testing the index rather than wasSurrogatePair matters when
    // a surrogate pair is followed by exactly one more character: the flag is still true
    // there, yet that last character has not been written.
    if (secondCharIndex == value.Length)
    {
        firstChar = value[value.Length - 1];
        int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair);
        if (!encoder.TryEncodeUnicodeScalar(nextScalar, buffer, bufferSize, out charsWritten))
        {
            throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
        }

        buffer += charsWritten;
        bufferSize -= charsWritten;
        totalWritten += charsWritten;
    }

    return totalWritten;
}
/// <summary>
/// Encodes UTF-8 input into a UTF-8 destination one scalar at a time, without first calling
/// FindFirstCharacterToEncodeUtf8.
/// </summary>
/// <param name="utf8Source">The UTF-8 bytes to encode.</param>
/// <param name="utf8Destination">Receives the encoded UTF-8 bytes.</param>
/// <param name="bytesConsumed">Number of source bytes that were fully processed.</param>
/// <param name="bytesWritten">Number of destination bytes accounted for by fully processed scalars.</param>
/// <param name="isFinalBlock">
/// When false, an incomplete trailing sequence stops processing with
/// <see cref="OperationStatus.NeedMoreData"/> instead of being encoded.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> when the whole source was processed,
/// <see cref="OperationStatus.NeedMoreData"/> or <see cref="OperationStatus.DestinationTooSmall"/> otherwise.
/// </returns>
private protected virtual OperationStatus EncodeUtf8Core(
    ReadOnlySpan<byte> utf8Source,
    Span<byte> utf8Destination,
    out int bytesConsumed,
    out int bytesWritten,
    bool isFinalBlock)
{
    int sourceLengthAtEntry = utf8Source.Length;
    int destinationLengthAtEntry = utf8Destination.Length;

    const int TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
    Span<char> utf16ScratchBuffer = stackalloc char[TempUtf16CharBufferLength];

    OperationStatus result = OperationStatus.Done;

    while (!utf8Source.IsEmpty)
    {
        OperationStatus decodeStatus = Rune.DecodeFromUtf8(utf8Source, out Rune rune, out int sourceBytesUsed);
        bool decodedCleanly = decodeStatus == OperationStatus.Done;

        if (!decodedCleanly && !isFinalBlock && decodeStatus == OperationStatus.NeedMoreData)
        {
            // The caller may still supply the rest of this sequence.
            result = OperationStatus.NeedMoreData;
            break;
        }

        int destinationBytesUsed;

        if (decodedCleanly && !WillEncode(rune.Value))
        {
            // Pass-through: emit the scalar's own UTF-8 bytes, lowest byte of the packed
            // representation first, until the remaining packed value is zero.
            uint packedUtf8 = (uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)rune.Value);
            int copied = 0;
            bool outOfRoom = false;

            do
            {
                if ((uint)copied >= (uint)utf8Destination.Length)
                {
                    outOfRoom = true;
                    break;
                }

                utf8Destination[copied] = (byte)packedUtf8;
                copied++;
                packedUtf8 >>= 8;
            }
            while (packedUtf8 != 0);

            if (outOfRoom)
            {
                // The destination is not advanced, so partially copied bytes are not counted.
                result = OperationStatus.DestinationTooSmall;
                break;
            }

            destinationBytesUsed = copied;
        }
        else
        {
            // Either the scalar must be encoded, or decoding failed and the decoder is
            // expected to have substituted the replacement character.
            Debug.Assert(decodedCleanly || rune == Rune.ReplacementChar); // DecodeFromUtf8 should've performed substitution

            if (!TryEncodeUnicodeScalarUtf8((uint)rune.Value, utf16ScratchBuffer, utf8Destination, out destinationBytesUsed))
            {
                result = OperationStatus.DestinationTooSmall;
                break;
            }
        }

        utf8Source = utf8Source.Slice(sourceBytesUsed);
        utf8Destination = utf8Destination.Slice(destinationBytesUsed);
    }

    // Both counts are derived from how far the two spans were advanced.
    bytesConsumed = sourceLengthAtEntry - utf8Source.Length;
    bytesWritten = destinationLengthAtEntry - utf8Destination.Length;
    return result;
}