public static bool IsValidUtf8Slice(ReadOnlySpan <byte> span, int start, int count) { ReadOnlySpan <byte> slice = span.Slice(start, count); int sliceStart = start; int startOffset = 0; for (; startOffset < 3 && sliceStart > 0; startOffset++) { sliceStart--; } if (sliceStart == 0 && count == span.Length) { return(true); } OperationStatus statusOfLast = Rune.DecodeLastFromUtf8(slice, out _, out _); if (statusOfLast != OperationStatus.Done) { return(false); } for (int i = startOffset; i > 0;) { ReadOnlySpan <byte> preSlice = span[(start - i)..];
/// <summary>
/// Walks <paramref name="utf8Data"/> backward from its end and locates the start of the
/// run of whitespace that terminates it.
/// </summary>
/// <param name="utf8Data">The UTF-8 bytes to inspect.</param>
/// <returns>
/// The index at which the trailing whitespace run begins. This is 0 when every scalar in
/// the data is whitespace (or the span is empty), and the span's length when the data
/// does not end in whitespace.
/// </returns>
public static int GetIndexOfTrailingWhiteSpaceSequence(ReadOnlySpan<byte> utf8Data)
{
    // Tuned for ASCII input where, if anything needs trimming at all, only a handful
    // of bytes at the end are expected to be removed.
    int end = utf8Data.Length;

    while (end > 0)
    {
        byte lastByte = utf8Data[end - 1];

        // Fast path: reinterpreted as a signed byte, 0x80..0xFF become negative, so this
        // comparison only succeeds for 0x21..0x7F. Such a byte ends the scan immediately.
        if ((sbyte)lastByte > (sbyte)0x20)
        {
            return end;
        }

        // From here on the byte is either 0x00..0x20 or 0x80..0xFF.
        uint lastValue = lastByte;
        int scalarWidth = 1;
        bool isWhiteSpace;

        // NOTE(review): UnicodeUtility.IsAsciiCodePoint is declared outside this view;
        // presumably it is true for values <= 0x7F - confirm against its definition.
        if (UnicodeUtility.IsAsciiCodePoint(lastValue))
        {
            // Single-byte scalar: hand the value straight to Rune's whitespace check.
            isWhiteSpace = Rune.IsWhiteSpace(new Rune(lastValue));
        }
        else
        {
            // Multi-byte territory: decode the final scalar of the remaining prefix and
            // test that. The OperationStatus returned by the decode is not inspected;
            // only the decoded rune and its byte width are used.
            Rune.DecodeLastFromUtf8(utf8Data.Slice(0, end), out Rune lastRune, out scalarWidth);
            isWhiteSpace = Rune.IsWhiteSpace(lastRune);
        }

        if (!isWhiteSpace)
        {
            // Hit a non-whitespace subsequence; everything before 'end' is kept.
            return end;
        }

        end -= scalarWidth;
    }

    return end;
}
public static void Trim(string input) { // Arrange using BoundedUtf8Span boundedSpan = new BoundedUtf8Span(input); Utf8Span span = boundedSpan.Span; // Act Utf8Span trimmed = span.Trim(); // Assert // Compute the trim manually and ensure it matches the trimmed span's characteristics. ReadOnlySpan <byte> utf8Bytes = span.Bytes; while (!utf8Bytes.IsEmpty) { OperationStatus status = Rune.DecodeFromUtf8(utf8Bytes, out Rune decodedRune, out int bytesConsumed); Assert.Equal(OperationStatus.Done, status); if (!Rune.IsWhiteSpace(decodedRune)) { break; } utf8Bytes = utf8Bytes.Slice(bytesConsumed); } while (!utf8Bytes.IsEmpty) { OperationStatus status = Rune.DecodeLastFromUtf8(utf8Bytes, out Rune decodedRune, out int bytesConsumed); Assert.Equal(OperationStatus.Done, status); if (!Rune.IsWhiteSpace(decodedRune)) { break; } utf8Bytes = utf8Bytes[..^ bytesConsumed];
[InlineData(new byte[] { 0xF0, 0x9F, 0x98, }, OperationStatus.NeedMoreData, 0xFFFD, 3)] // [ F0 9F 98 ] is valid 3-byte start of 4-byte sequence
public static void DecodeLastFromUtf8(byte[] data, OperationStatus expectedOperationStatus, int expectedRuneValue, int expectedBytesConsumed)
{
    // Act: decode the final scalar of the supplied bytes.
    OperationStatus actualStatus = Rune.DecodeLastFromUtf8(data, out Rune decodedRune, out int consumedByteCount);

    // Assert: the status, the decoded scalar value and the consumed byte count must each
    // match the expectations supplied by the data row.
    Assert.Equal(expectedOperationStatus, actualStatus);
    Assert.Equal(expectedRuneValue, decodedRune.Value);
    Assert.Equal(expectedBytesConsumed, consumedByteCount);
}