[MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharsCommon
private protected sealed override unsafe int GetCharsFast(byte* pBytes, int bytesLength, char* pChars, int charsLength, out int bytesConsumed)
{
    // The status value returned by the transcoder is deliberately ignored: all that
    // matters here is how far it advanced through the input and output buffers.
    // Whatever input is left unconsumed is dealt with by the fallback routine.
    Utf8Utility.TranscodeToUtf16(
        pBytes,
        bytesLength,
        pChars,
        charsLength,
        out byte* pSourceRemaining,
        out char* pDestinationRemaining);

    // Distance travelled in the source buffer, in bytes.
    bytesConsumed = (int)(pSourceRemaining - pBytes);

    // Distance travelled in the destination buffer, in chars.
    int charsWritten = (int)(pDestinationRemaining - pChars);
    return charsWritten;
}
// Builds a Utf8String from a copy of 'value'. Empty input yields the shared Empty
// instance; otherwise the bytes are copied into a fresh instance which is then handed
// to Utf8Utility.ValidateAndFixupUtf8String, whose result is returned.
private Utf8String Ctor(ReadOnlySpan<byte> value)
{
    int byteCount = value.Length;
    if (byteCount == 0)
    {
        return Empty;
    }

    Utf8String result = FastAllocate(byteCount);
    ref byte destination = ref result.DangerousGetMutableReference();
    ref byte source = ref MemoryMarshal.GetReference(value);
    Buffer.Memmove(ref destination, ref source, (uint)byteCount);

    return Utf8Utility.ValidateAndFixupUtf8String(result)!; // TODO-NULLABLE: https://github.com/dotnet/roslyn/issues/26761
}
// Builds a Utf8String from a copy of 'value'. Empty input yields the shared Empty
// instance; otherwise the bytes are copied into a fresh instance which is then handed
// to Utf8Utility.ValidateAndFixupUtf8String, whose result is returned.
private Utf8String Ctor(ReadOnlySpan<byte> value)
{
    int byteCount = value.Length;
    if (byteCount == 0)
    {
        return Empty;
    }

    Utf8String result = FastAllocate(byteCount);
    ref byte destination = ref result.DangerousGetMutableReference();
    ref byte source = ref MemoryMarshal.GetReference(value);
    Buffer.Memmove(ref destination, ref source, (uint)byteCount);

    return Utf8Utility.ValidateAndFixupUtf8String(result)!; // TODO-NULLABLE: Remove ! when nullable attributes are respected
}
// Builds a Utf8String from a copy of 'value'. Empty input yields the shared Empty
// instance; otherwise the bytes are copied into a fresh instance which is then handed
// to Utf8Utility.ValidateAndFixupUtf8String, whose result is returned.
private Utf8String Ctor(ReadOnlySpan<byte> value)
{
    int byteCount = value.Length;
    if (byteCount == 0)
    {
        return Empty;
    }

    Utf8String result = FastAllocate(byteCount);
    ref byte destination = ref result.DangerousGetMutableReference();
    ref byte source = ref MemoryMarshal.GetReference(value);
    Buffer.Memmove(ref destination, ref source, (uint)byteCount);

    return Utf8Utility.ValidateAndFixupUtf8String(result);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)] // called directly by GetCharCountCommon
private protected sealed override unsafe int GetCharCountFast(byte* pBytes, int bytesLength, DecoderFallback? fallback, out int bytesConsumed)
{
    byte* pFirstInvalid = Utf8Utility.GetPointerToFirstInvalidByte(
        pBytes,
        bytesLength,
        out int utf16CodeUnitCountAdjustment,
        out _);

    // Number of bytes that precede the first invalid byte.
    int validByteCount = (int)(pFirstInvalid - pBytes);
    bytesConsumed = validByteCount;

    // A UTF-8 sequence never produces more UTF-16 code units than it has bytes,
    // so combining the byte count with the adjustment cannot overflow.
    return validByteCount + utf16CodeUnitCountAdjustment;
}
// Verifies that peeking at an incomplete sequence reports Incomplete, consumes the
// whole input, and yields the replacement character.
public void PeekFirstSequence_WithIncompleteSequence_ReturnsIncomplete(byte[] sequence)
{
    // Arrange
    var protectedInput = NativeMemory.GetProtectedReadonlyBuffer(sequence);

    // Act
    SequenceValidity result = Utf8Utility.PeekFirstSequence(
        protectedInput,
        out int consumed,
        out UnicodeScalar decoded);

    // Assert
    Assert.Equal(SequenceValidity.Incomplete, result);
    Assert.Equal(sequence.Length, consumed);
    Assert.Equal(UnicodeScalar.ReplacementChar, decoded);
}
// Verifies that the UTF-8 encoding of the all-scalars test string is reported as
// well-formed.
public void IsWellFormedUtf8String_WithStringOfAllPossibleScalarValues_ReturnsTrue()
{
    // Arrange
    byte[] allScalarsAsUtf8 = _utf8EncodingWithoutReplacement.GetBytes(_stringWithAllScalars.Value);
    var buffer = NativeMemory.AllocateFromExistingData(allScalarsAsUtf8, PoisonPagePlacement.AfterSpan);

    try
    {
        buffer.MakeReadonly();

        // Act & assert
        Assert.True(Utf8Utility.IsWellFormedUtf8String(buffer.Span));
    }
    finally
    {
        // Dispose in a finally block so the buffer is released even when the
        // assertion above throws.
        buffer.Dispose();
    }
}
// Verifies that peeking at an empty buffer reports Empty, consumes nothing, and
// yields the replacement character.
public void PeekFirstSequence_WithEmptyInput_ReturnsEmptyValidity()
{
    // Arrange
    var buffer = NativeMemory.Allocate(0, PoisonPagePlacement.AfterSpan);

    try
    {
        // Act
        var validity = Utf8Utility.PeekFirstSequence(buffer.Span, out var numBytesConsumed, out var scalarValue);

        // Assert
        Assert.Equal(SequenceValidity.Empty, validity);
        Assert.Equal(0, numBytesConsumed);
        Assert.Equal(UnicodeScalar.ReplacementChar, scalarValue);
    }
    finally
    {
        // Dispose in a finally block so the buffer is released even when one of
        // the assertions above throws.
        buffer.Dispose();
    }
}
// Verifies that flipping the high bit of a single byte in the all-scalars UTF-8
// payload causes the payload to be reported as not well-formed.
public void IsWellFormedUtf8String_WithCorruptedStringOfAllPossibleScalarValues_ReturnsFalse()
{
    // Arrange
    byte[] allScalarsAsUtf8 = _utf8EncodingWithoutReplacement.GetBytes(_stringWithAllScalars.Value);
    allScalarsAsUtf8[0x1000] ^= 0x80; // modify the high bit of one of the characters, which will corrupt the header
    var buffer = NativeMemory.AllocateFromExistingData(allScalarsAsUtf8, PoisonPagePlacement.AfterSpan);

    try
    {
        buffer.MakeReadonly();

        // Act & assert
        Assert.False(Utf8Utility.IsWellFormedUtf8String(buffer.Span));
    }
    finally
    {
        // Dispose in a finally block so the buffer is released even when the
        // assertion above throws.
        buffer.Dispose();
    }
}
// Encodes the given scalar value as UTF-8, reads it back with TryReadFirstRune, and
// checks that the call succeeds, returns the same scalar, and consumes every byte.
private void TryReadFirstRune_WithValidInput_ReturnsScalarValue_Core(int scalarValue)
{
    // Arrange
    string utf16Text = char.ConvertFromUtf32(scalarValue);
    byte[] encoded = _utf8EncodingWithoutReplacement.GetBytes(utf16Text);
    var protectedInput = NativeMemory.GetProtectedReadonlyBuffer(encoded);

    // Act
    bool succeeded = Utf8Utility.TryReadFirstRune(protectedInput, out var decodedRune, out var consumed);

    // Assert
    Assert.True(succeeded);
    Assert.Equal(scalarValue, decodedRune);
    Assert.Equal(protectedInput.Length, consumed);
}
/// <summary>
/// Builds a <see cref="Utf8String"/> from a copy of existing UTF-8 data.
/// </summary>
/// <param name="buffer">The data to copy into the new <see cref="Utf8String"/>.</param>
/// <remarks>
/// Any ill-formed UTF-8 subsequences found in <paramref name="buffer"/> are replaced with
/// <see cref="Rune.ReplacementChar"/> in the returned instance. As a consequence, the returned
/// <see cref="Utf8String"/> may differ from <paramref name="buffer"/> in both content and
/// total byte length.
/// </remarks>
public static Utf8String CreateFromRelaxed(ReadOnlySpan<byte> buffer)
{
    int byteCount = buffer.Length;
    if (byteCount == 0)
    {
        return Empty;
    }

    // Allocate the instance and copy the caller's bytes into it.
    Utf8String result = FastAllocateSkipZeroInit(byteCount);
    ref byte destination = ref result.DangerousGetMutableReference();
    ref byte source = ref MemoryMarshal.GetReference(buffer);
    Buffer.Memmove(ref destination, ref source, (uint)byteCount);

    // Validation and fixup run over the copy.
    return Utf8Utility.ValidateAndFixupUtf8String(result);
}
// Runs Utf8Utility.GetPointerToFirstInvalidByte over 'input' and checks the index of the
// first invalid byte (-1 when the whole input was accepted), the rune count, and the
// surrogate pair count against the expected values.
private static unsafe void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
{
    // Arrange
    using BoundedMemory<byte> protectedInput = BoundedMemory.AllocateFromExistingData(input);
    protectedInput.MakeReadonly();

    // Act
    int observedRetVal;
    int observedRuneCount;
    int observedSurrogatePairCount;

    fixed (byte* pStart = &MemoryMarshal.GetReference(protectedInput.Span))
    {
        byte* pInvalid = Utf8Utility.GetPointerToFirstInvalidByte(
            pStart,
            input.Length,
            out int utf16Adjustment,
            out int scalarAdjustment);

        long validByteCount = pInvalid - pStart;

        Assert.True((ulong)validByteCount <= (uint)input.Length, "ptrDiff was outside expected range.");
        Assert.True(utf16Adjustment <= 0, "UTF-16 code unit count adjustment must be 0 or negative.");
        Assert.True(scalarAdjustment <= 0, "Scalar count adjustment must be 0 or negative.");

        // Reaching the end of the input means no invalid byte was found.
        if (validByteCount == input.Length)
        {
            observedRetVal = -1;
        }
        else
        {
            observedRetVal = (int)validByteCount;
        }

        // The two adjustments chain together:
        //   bytes processed + utf16Adjustment        = total UTF-16 code unit count
        //   UTF-16 code unit count + scalarAdjustment = total scalar (rune) count
        int utf16CodeUnitCount = (int)validByteCount + utf16Adjustment;
        observedRuneCount = utf16CodeUnitCount + scalarAdjustment;

        // Each surrogate pair contributes two UTF-16 code units but only one scalar.
        observedSurrogatePairCount = utf16CodeUnitCount - observedRuneCount;
    }

    // Assert
    Assert.Equal(expectedRetVal, observedRetVal);
    Assert.Equal(expectedRuneCount, observedRuneCount);
    Assert.Equal(expectedSurrogatePairCount, observedSurrogatePairCount);
}
// Exercises Utf8Utility.GetExpectedNumberOfContinuationBytes over every possible byte
// value [ 00..FF ], grouped by the role the byte plays in UTF-8.
public void GetExpectedNumberOfContinuationBytes_ForAllInputs()
{
    // For [ 00..7F ], ASCII characters
    for (uint i = 0x00; i <= 0x7F; i++)
    {
        Assert.Equal(0, Utf8Utility.GetExpectedNumberOfContinuationBytes((byte)i));
    }

    // For [ 80..BF ], continuation code units (never valid start bytes)
    for (uint i = 0x80; i <= 0xBF; i++)
    {
        Assert.Equal(0, Utf8Utility.GetExpectedNumberOfContinuationBytes((byte)i));
    }

    // For [ C0..C1 ], overlong 2-byte starting code units (never valid bytes)
    // (This loop previously re-iterated 80..BF, leaving C0 and C1 untested.)
    for (uint i = 0xC0; i <= 0xC1; i++)
    {
        Assert.Equal(0, Utf8Utility.GetExpectedNumberOfContinuationBytes((byte)i));
    }

    // For [ C2..DF ], 2-byte sequence starting markers
    for (uint i = 0xC2; i <= 0xDF; i++)
    {
        Assert.Equal(1, Utf8Utility.GetExpectedNumberOfContinuationBytes((byte)i));
    }

    // For [ E0..EF ], 3-byte sequence starting markers
    for (uint i = 0xE0; i <= 0xEF; i++)
    {
        Assert.Equal(2, Utf8Utility.GetExpectedNumberOfContinuationBytes((byte)i));
    }

    // For [ F0..F4 ], 4-byte sequence starting markers
    for (uint i = 0xF0; i <= 0xF4; i++)
    {
        Assert.Equal(3, Utf8Utility.GetExpectedNumberOfContinuationBytes((byte)i));
    }

    // For [ F5..FF ], never valid UTF-8 code units
    for (uint i = 0xF5; i <= 0xFF; i++)
    {
        Assert.Equal(0, Utf8Utility.GetExpectedNumberOfContinuationBytes((byte)i));
    }
}
// Encodes the given scalar value as UTF-8, transcodes it back with
// TryReadFirstRuneAsUtf16, and checks that the call succeeds, reproduces the original
// UTF-16 text, and reports the expected consumed/written counts.
private void TryReadFirstRuneAsUtf16_WithValidInput_ReturnsScalarValue_Core(int scalarValue)
{
    // Arrange
    string utf16Text = char.ConvertFromUtf32(scalarValue);
    byte[] encoded = _utf8EncodingWithoutReplacement.GetBytes(utf16Text);
    var protectedInput = NativeMemory.GetProtectedReadonlyBuffer(encoded);
    var output = NativeMemory.GetProtectedWriteableCharBuffer(2);

    // Act
    bool succeeded = Utf8Utility.TryReadFirstRuneAsUtf16(protectedInput, output, out var consumed, out var written);

    // Assert
    Assert.True(succeeded);
    Assert.Equal(utf16Text, new string(output.ToArray(), 0, written));
    Assert.Equal(utf16Text.Length, written);
    Assert.Equal(protectedInput.Length, consumed);
}
/// <summary>
/// Builds a new <see cref="Utf8String"/> holding a copy of the supplied contents, without
/// validating them. Read the remarks for important safety information before calling this method.
/// </summary>
/// <param name="utf8Contents">The contents to copy into the new <see cref="Utf8String"/>.</param>
/// <remarks>
/// This factory exists as an optimization that bypasses the validation step normally performed by
/// the <see cref="Utf8String"/> constructors. Callers must guarantee that
/// <paramref name="utf8Contents"/> holds only well-formed UTF-8 data: <see cref="Utf8String"/>
/// contractually promises that its contents are well-formed UTF-8, and breaking that promise
/// could lead to runtime instability.
/// </remarks>
public static Utf8String UnsafeCreateWithoutValidation(ReadOnlySpan<byte> utf8Contents)
{
    int byteCount = utf8Contents.Length;
    if (byteCount == 0)
    {
        return Empty; // special-case empty input
    }

    // Allocate the instance and copy the caller's bytes into it.
    Utf8String result = FastAllocateSkipZeroInit(byteCount);
    Span<byte> destination = result.DangerousGetMutableSpan();
    utf8Contents.CopyTo(destination);

    // Debug-only sanity check; this call is compiled out of release builds.
    Debug.Assert(Utf8Utility.IsWellFormedUtf8(result.AsBytes()), "Buffer contained ill-formed UTF-8 data.");

    return result;
}
[InlineData("25249-0.txt")] // Chinese, UTF-8 (primarily 3-byte sequences)
public void GetUtf16CharCount_UsingNewEncoder(string resourceName)
{
    ReadOnlySpan<byte> utf8Text = ReadTestResource(resourceName);

    // Warm-up: invoke the routine under test once before measuring so that it is
    // already JITted, and confirm that it succeeds on this resource.
    bool warmupSucceeded = Utf8Utility.TryGetUtf16CharCount(utf8Text, out _);
    Assert.True(warmupSucceeded);

    // Measured portion of the perf test.
    foreach (var iteration in Benchmark.Iterations)
    {
        using (iteration.StartMeasurement())
        {
            for (int pass = 0; pass < Benchmark.InnerIterationCount; pass++)
            {
                Utf8Utility.TryGetUtf16CharCount(utf8Text, out _);
            }
        }
    }
}
// Produces the backing byte[] for a Utf8String from 'value'. Empty input reuses the
// Empty instance's buffer; otherwise the bytes are copied into a new buffer, which
// must be well-formed UTF-8 or an ArgumentException is thrown.
private static byte[] InitializeBuffer(ReadOnlySpan<byte> value)
{
    int byteCount = value.Length;
    if (byteCount == 0)
    {
        return Empty._bytes;
    }

    // Allocate the backing buffer and copy the caller's bytes into it.
    byte[] copy = AllocateBuffer(byteCount);
    value.CopyTo(copy);

    // Validate the copy rather than the source span.
    bool isWellFormed = Utf8Utility.IsWellFormedUtf8(copy);
    if (isWellFormed)
    {
        return copy;
    }

    throw new ArgumentException(
        message: SR.Utf8String_InputContainedMalformedUtf8,
        paramName: nameof(value));
}
// Decodes the first UTF-8 sequence in 'value' and writes its backslash-escaped form
// into 'destination' starting at index 'written', advancing 'written' past the bytes
// emitted. Returns the number of input bytes the decoded sequence occupied.
// Input that is not well-formed is reported through JsonThrowHelper.
private static int EscapeNextBytes(ReadOnlySpan<byte> value, ref Span<byte> destination, ref int written)
{
    SequenceValidity validity = Utf8Utility.PeekFirstSequence(value, out int consumed, out UnicodeScalar decoded);
    if (validity != SequenceValidity.WellFormed)
    {
        JsonThrowHelper.ThrowJsonWriterException("Invalid UTF-8 string.");
    }

    // Every escape begins with a backslash.
    destination[written++] = (byte)'\\';

    int codePoint = decoded.Value;

    // Characters that have a dedicated two-character escape; 0 means "none".
    byte shortEscape;
    switch (codePoint)
    {
        case '\n': shortEscape = (byte)'n'; break;
        case '\r': shortEscape = (byte)'r'; break;
        case '\t': shortEscape = (byte)'t'; break;
        case '\\': shortEscape = (byte)'\\'; break;
        case '/': shortEscape = (byte)'/'; break;
        case '\b': shortEscape = (byte)'b'; break;
        case '\f': shortEscape = (byte)'f'; break;
        default: shortEscape = 0; break;
    }

    if (shortEscape != 0)
    {
        destination[written++] = shortEscape;
        return consumed;
    }

    // Everything else is written as \uXXXX.
    destination[written++] = (byte)'u';

    if (codePoint < 0x10000)
    {
        WriteHex(codePoint, ref destination, ref written);
        return consumed;
    }

    // Values at or above 0x10000 are split into two \uXXXX escapes:
    // the quotient offset by 0xD800, then the remainder offset by 0xDC00.
    int quotient = DivMod(codePoint - 0x10000, 0x400, out int remainder);
    int highSurrogate = quotient + 0xD800;
    int lowSurrogate = remainder + 0xDC00;

    WriteHex(highSurrogate, ref destination, ref written);
    destination[written++] = (byte)'\\';
    destination[written++] = (byte)'u';
    WriteHex(lowSurrogate, ref destination, ref written);

    return consumed;
}
// Runs Utf8Utility.TryGetUtf16CharCount over utf8Text, discarding the count and
// returning only whether the call succeeded.
public bool UsingNewEncoder() => Utf8Utility.TryGetUtf16CharCount(utf8Text, out _);