/// <summary>
/// Calculates the number of UTF-16 code units (<see cref="char"/>s) that would result from converting
/// the provided UTF-8 string to UTF-16 representation.
/// </summary>
/// <param name="inputBuffer">The buffer containing UTF-8 text.</param>
/// <param name="charCount">
/// If this method returns <see langword="true"/>, contains the equivalent <see cref="char"/> count of the input string.
/// If this method returns <see langword="false"/>, the value is undefined (callers must not rely on it).
/// </param>
/// <returns>
/// <see langword="true"/> on success, <see langword="false"/> if the input is not a well-formed UTF-8 string.
/// </returns>
public static bool TryGetUtf16CharCount(ReadOnlySpan<byte> inputBuffer, out int charCount)
{
    // A negative index from the validator means no invalid sequence was found.
    if (Utf8Util.GetIndexOfFirstInvalidUtf8Sequence(inputBuffer, out int runeCount, out int surrogatePairCount) < 0)
    {
        // can't overflow because UTF-16 code unit count is always <= UTF-8 code unit count for well-formed strings
        charCount = runeCount + surrogatePairCount;
        return true;
    }

    // Ill-formed input: the out parameter still has to be definitely assigned before returning.
    // The contract leaves its value unspecified on failure, so a neutral zero is used.
    charCount = 0;
    return false;
}
/// <summary>
/// Shared assertion helper: runs <c>Utf8Util.GetIndexOfFirstInvalidUtf8Sequence</c> over
/// <paramref name="input"/> and verifies the returned index together with both out counters.
/// </summary>
/// <param name="input">The raw bytes handed to the validator.</param>
/// <param name="expectedRetVal">The index the validator is expected to return.</param>
/// <param name="expectedRuneCount">The expected value of the rune-count out parameter.</param>
/// <param name="expectedSurrogatePairCount">The expected value of the surrogate-pair-count out parameter.</param>
private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
{
    // Arrange
    // NOTE(review): presumably this copies the bytes into guarded memory so that reads past the
    // end of the buffer fault instead of passing silently - confirm against NativeMemory.
    var protectedInput = NativeMemory.GetProtectedReadonlyBuffer(input);

    // Act
    var actualRetVal = Utf8Util.GetIndexOfFirstInvalidUtf8Sequence(protectedInput, out int observedRuneCount, out int observedSurrogatePairCount);

    // Assert
    Assert.Equal(expectedRetVal, actualRetVal);
    Assert.Equal(expectedRuneCount, observedRuneCount);
    Assert.Equal(expectedSurrogatePairCount, observedSurrogatePairCount);
}
/// <summary>
/// Inspects the UTF-8 sequence at the start of <paramref name="data"/> and classifies it
/// without advancing any reader state.
/// </summary>
/// <param name="data">The buffer whose leading sequence is examined.</param>
/// <param name="numBytesConsumed">
/// Receives the length, in UTF-8 code units, attributed to the leading sequence:
/// 0 for an empty buffer, 1 to 4 for a well-formed sequence, and 1 to 3 for an
/// invalid or incomplete one.
/// </param>
/// <param name="scalarValue">
/// Receives the decoded scalar when the result is <see cref="SequenceValidity.WellFormed"/>;
/// otherwise receives <see cref="UnicodeScalar.ReplacementChar"/>.
/// </param>
/// <returns>
/// <see cref="SequenceValidity.Empty"/> when there is no data,
/// <see cref="SequenceValidity.WellFormed"/> for a complete valid sequence,
/// <see cref="SequenceValidity.Incomplete"/> when the buffer ends in the middle of a
/// so-far-valid sequence, and <see cref="SequenceValidity.Invalid"/> otherwise.
/// </returns>
public static SequenceValidity PeekFirstSequence(ReadOnlySpan<byte> data, out int numBytesConsumed, out UnicodeScalar scalarValue)
{
    // The reported lengths are chosen to match System.Text.Encoding.UTF8 when it
    // reports invalid sequences:
    //
    // - Bytes that can never appear in UTF-8 ([ C0..C1 ] and [ F5..FF ]) and stray
    //   continuation bytes form an invalid sequence of length 1.
    //
    // - Overlong, surrogate and out-of-range multi-byte sequences are invalid with
    //   length 2, because the second byte is always enough to detect them
    //   (Unicode Standard, Table 3-7). Leads [ C0..C1 ] fall under the rule above.
    //
    // - A multi-byte sequence missing an expected continuation byte is invalid up to
    //   and including the last continuation byte that was seen.

    // Every non-well-formed outcome reports the replacement character.
    scalarValue = UnicodeScalar.ReplacementChar;

    if (data.IsEmpty)
    {
        numBytesConsumed = 0;
        return SequenceValidity.Empty;
    }

    byte lead = data[0];

    if (IsAsciiValue(lead))
    {
        // A single ASCII byte is a complete sequence.
        scalarValue = UnicodeScalar.CreateWithoutValidation(lead);
        numBytesConsumed = 1;
        return SequenceValidity.WellFormed;
    }

    if (!Utf8Util.IsInRangeInclusive(lead, (byte)0xC2U, (byte)0xF4U))
    {
        // Stray continuation byte, or a byte that is never legal as a lead.
        numBytesConsumed = 1;
        return SequenceValidity.Invalid;
    }

    // From here on the lead byte announces a multi-byte sequence.

    if (data.Length < 2)
    {
        numBytesConsumed = 1;
        return SequenceValidity.Incomplete;
    }

    byte second = data[1];

    if (!IsUtf8ContinuationByte(second))
    {
        // The lead byte was not followed by a continuation byte.
        numBytesConsumed = 1;
        return SequenceValidity.Invalid;
    }

    if (lead < (byte)0xE0U)
    {
        // Two-byte sequence; leads below C2 were already rejected, so it cannot be overlong.
        scalarValue = UnicodeScalar.CreateWithoutValidation((((uint)lead & 0x1FU) << 6) | ((uint)second & 0x3FU));
        numBytesConsumed = 2;
        return SequenceValidity.WellFormed;
    }

    // Three- and four-byte sequences share the same shape: the first two bytes already
    // decide overlong / surrogate / out-of-range, then the remaining bytes must be
    // continuation bytes.
    bool isThreeByteSequence = lead < (byte)0xF0U;
    uint partialScalar;
    bool prefixIsValid;

    if (isThreeByteSequence)
    {
        partialScalar = (((uint)lead & 0x0FU) << 12) | (((uint)second & 0x3FU) << 6);

        // Rejects overlong encodings (< U+0800) and the surrogate range.
        prefixIsValid = partialScalar >= 0x800U && !Utf8Util.IsLowWordSurrogate(partialScalar);
    }
    else
    {
        partialScalar = (((uint)lead & 0x07U) << 18) | (((uint)second & 0x3FU) << 12);

        // Rejects overlong encodings (< U+10000) and values beyond U+10FFFF.
        prefixIsValid = Utf8Util.IsInRangeInclusive(partialScalar, 0x10000U, 0x10FFFFU);
    }

    if (!prefixIsValid)
    {
        numBytesConsumed = 2;
        return SequenceValidity.Invalid;
    }

    if (data.Length < 3)
    {
        // Valid two-byte prefix, but the buffer ends here.
        numBytesConsumed = 2;
        return SequenceValidity.Incomplete;
    }

    byte third = data[2];

    if (!IsUtf8ContinuationByte(third))
    {
        // Valid two-byte prefix that is not continued.
        numBytesConsumed = 2;
        return SequenceValidity.Invalid;
    }

    if (isThreeByteSequence)
    {
        scalarValue = UnicodeScalar.CreateWithoutValidation(partialScalar | ((uint)third & 0x3FU));
        numBytesConsumed = 3;
        return SequenceValidity.WellFormed;
    }

    // Only four-byte sequences reach this point.

    if (data.Length < 4)
    {
        numBytesConsumed = 3;
        return SequenceValidity.Incomplete;
    }

    byte fourth = data[3];

    if (!IsUtf8ContinuationByte(fourth))
    {
        // Valid three-byte prefix that is not continued.
        numBytesConsumed = 3;
        return SequenceValidity.Invalid;
    }

    scalarValue = UnicodeScalar.CreateWithoutValidation(partialScalar | (((uint)third & 0x3FU) << 6) | ((uint)fourth & 0x3FU));
    numBytesConsumed = 4;
    return SequenceValidity.WellFormed;
}
/// <summary>
/// Locates the first invalid UTF-8 sequence in <paramref name="data"/>.
/// </summary>
/// <param name="data">The buffer to scan.</param>
/// <returns>
/// The index of the first byte of the first invalid UTF-8 sequence in <paramref name="data"/>,
/// or -1 if <paramref name="data"/> is a well-formed UTF-8 string.
/// </returns>
public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> data)
{
    // Delegates to the counting overload; the rune and surrogate-pair counts it
    // produces are not needed here and are discarded.
    return Utf8Util.GetIndexOfFirstInvalidUtf8Sequence(data, out _, out _);
}