예제 #1
0
 /// <summary>
 /// Calculates the number of UTF-16 code units (<see cref="char"/>s) that would result from converting
 /// the provided UTF-8 string to UTF-16 representation.
 /// </summary>
 /// <param name="inputBuffer">The buffer containing UTF-8 text.</param>
 /// <param name="charCount">
 /// If this method returns <see langword="true"/>, contains the equivalent <see cref="char"/> count of the input string.
 /// If this method returns <see langword="false"/>, the value is undefined.
 /// </param>
 /// <returns>
 /// <see langword="true"/> on success, <see langword="false"/> if the input is not a well-formed UTF-8 string.
 /// </returns>
 /// <remarks>
 /// NOTE(review): only the success path of this method is visible in this excerpt; the handling of
 /// ill-formed input (and the method's closing brace) should be confirmed against the full source.
 /// </remarks>
 public static bool TryGetUtf16CharCount(ReadOnlySpan <byte> inputBuffer, out int charCount)
 {
     // A negative index from the validator means no invalid sequence was found,
     // i.e. the entire buffer is well-formed UTF-8.
     if (Utf8Util.GetIndexOfFirstInvalidUtf8Sequence(inputBuffer, out int runeCount, out int surrogatePairCount) < 0)
     {
         // The result is the rune count plus the surrogate-pair count reported by the validator.
         // The sum can't overflow because the UTF-16 code unit count is always <= the UTF-8 code unit
         // count for well-formed strings.
         charCount = runeCount + surrogatePairCount;
         return(true);
     }
        /// <summary>
        /// Shared test body: runs <c>Utf8Util.GetIndexOfFirstInvalidUtf8Sequence</c> over <paramref name="input"/>
        /// and verifies the returned index together with both reported counts.
        /// </summary>
        /// <param name="input">The raw bytes to validate.</param>
        /// <param name="expectedRetVal">The index the validator is expected to return.</param>
        /// <param name="expectedRuneCount">The rune count the validator is expected to report.</param>
        /// <param name="expectedSurrogatePairCount">The surrogate-pair count the validator is expected to report.</param>
        private static void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] input, int expectedRetVal, int expectedRuneCount, int expectedSurrogatePairCount)
        {
            // Arrange + Act: the bytes are handed to the validator through the buffer produced by
            // NativeMemory.GetProtectedReadonlyBuffer rather than as the raw array.
            var actualRetVal = Utf8Util.GetIndexOfFirstInvalidUtf8Sequence(
                NativeMemory.GetProtectedReadonlyBuffer(input),
                out int runeCount,
                out int surrogatePairCount);

            // Assert: the return value and both out parameters must match the expectations.
            Assert.Equal(expectedRetVal, actualRetVal);
            Assert.Equal(expectedRuneCount, runeCount);
            Assert.Equal(expectedSurrogatePairCount, surrogatePairCount);
        }
예제 #3
0
        /// <summary>
        /// Inspects the UTF-8 sequence at the start of <paramref name="data"/> and classifies it.
        /// </summary>
        /// <param name="data">The buffer whose leading sequence is examined.</param>
        /// <param name="numBytesConsumed">
        /// Receives the number of UTF-8 code units that were read from <paramref name="data"/> in order
        /// to reach the verdict. This is set in every case, including failures.
        /// </param>
        /// <param name="scalarValue">
        /// Receives the decoded scalar value when the result is <see cref="SequenceValidity.WellFormed"/>;
        /// for every other result it receives <see cref="UnicodeScalar.ReplacementChar"/>.
        /// </param>
        /// <returns>
        /// <see cref="SequenceValidity.Empty"/> when <paramref name="data"/> is empty,
        /// <see cref="SequenceValidity.WellFormed"/> when a complete valid sequence was decoded,
        /// <see cref="SequenceValidity.Incomplete"/> when the buffer ends in the middle of a sequence that is
        /// valid so far, and <see cref="SequenceValidity.Invalid"/> otherwise.
        /// </returns>
        public static SequenceValidity PeekFirstSequence(ReadOnlySpan<byte> data, out int numBytesConsumed, out UnicodeScalar scalarValue)
        {
            // The number of bytes reported for an invalid sequence deliberately mirrors
            // System.Text.Encoding.UTF8:
            //
            // - Bytes that can never appear in UTF-8 ([ C0..C1 ] and [ F5..FF ]) form an invalid
            //   sequence of length 1.
            //
            // - Overlong multi-byte sequences are reported as an invalid sequence of length 2, because
            //   (per the Unicode Standard, Table 3-7) the second byte is always enough to detect them.
            //   Sequences starting with [ C0..C1 ] fall under the previous rule instead (length 1).
            //
            // - A multi-byte sequence that is cut short by a non-continuation byte is reported as
            //   invalid up to and including the last continuation byte that was seen.

            // Pessimistic default; only the well-formed exits overwrite it.
            scalarValue = UnicodeScalar.ReplacementChar;

            if (data.IsEmpty)
            {
                // Nothing to look at.
                numBytesConsumed = 0;
                return SequenceValidity.Empty;
            }

            byte leadByte = data[0];

            if (IsAsciiValue(leadByte))
            {
                // A lone ASCII byte is a complete one-byte sequence.
                scalarValue = UnicodeScalar.CreateWithoutValidation(leadByte);
                numBytesConsumed = 1;
                return SequenceValidity.WellFormed;
            }

            if (!Utf8Util.IsInRangeInclusive(leadByte, (byte)0xC2U, (byte)0xF4U))
            {
                // Stray continuation byte or a byte that is never valid: ill-formed, length 1.
                numBytesConsumed = 1;
                return SequenceValidity.Invalid;
            }

            // From here on the lead byte announces a multi-byte sequence and is itself plausible.

            if (data.Length < 2)
            {
                // The buffer ends right after the lead byte.
                numBytesConsumed = 1;
                return SequenceValidity.Incomplete;
            }

            byte secondByte = data[1];

            if (!IsUtf8ContinuationByte(secondByte))
            {
                // The lead byte is not followed by a continuation byte: only the lead byte is invalid.
                numBytesConsumed = 1;
                return SequenceValidity.Invalid;
            }

            if (leadByte < (byte)0xE0U)
            {
                // Lead byte in [ C2..DF ]: complete two-byte sequence.
                scalarValue = UnicodeScalar.CreateWithoutValidation((((uint)leadByte & 0x1FU) << 6) | ((uint)secondByte & 0x3FU));
                numBytesConsumed = 2;
                return SequenceValidity.WellFormed;
            }

            // Lead byte in [ E0..EF ] starts a three-byte sequence, [ F0..F4 ] a four-byte sequence.
            // The first two bytes already determine whether the sequence is overlong, encodes a
            // surrogate (three-byte case), or is out of range (four-byte case).
            bool isThreeByteSequence = leadByte < (byte)0xF0U;

            uint partialScalar;
            bool isPrefixAcceptable;

            if (isThreeByteSequence)
            {
                partialScalar = (((uint)leadByte & 0x0FU) << 12) | (((uint)secondByte & 0x3FU) << 6);
                isPrefixAcceptable = partialScalar >= 0x800U && !Utf8Util.IsLowWordSurrogate(partialScalar);
            }
            else
            {
                partialScalar = (((uint)leadByte & 0x07U) << 18) | (((uint)secondByte & 0x3FU) << 12);
                isPrefixAcceptable = Utf8Util.IsInRangeInclusive(partialScalar, 0x10000U, 0x10FFFFU);
            }

            if (!isPrefixAcceptable)
            {
                // Overlong, out-of-range, or surrogate sequence: invalid, length 2.
                numBytesConsumed = 2;
                return SequenceValidity.Invalid;
            }

            // The first two bytes are a valid start of a three- or four-byte sequence.

            if (data.Length < 3)
            {
                // The buffer ends after two bytes of a longer sequence.
                numBytesConsumed = 2;
                return SequenceValidity.Incomplete;
            }

            byte thirdByte = data[2];

            if (!IsUtf8ContinuationByte(thirdByte))
            {
                // Improperly terminated after two bytes.
                numBytesConsumed = 2;
                return SequenceValidity.Invalid;
            }

            if (isThreeByteSequence)
            {
                // Complete three-byte sequence.
                scalarValue = UnicodeScalar.CreateWithoutValidation(partialScalar | ((uint)thirdByte & 0x3FU));
                numBytesConsumed = 3;
                return SequenceValidity.WellFormed;
            }

            // Only the four-byte case remains; three valid bytes have been seen so far.

            if (data.Length < 4)
            {
                // The buffer ends after three bytes of a four-byte sequence.
                numBytesConsumed = 3;
                return SequenceValidity.Incomplete;
            }

            byte fourthByte = data[3];

            if (!IsUtf8ContinuationByte(fourthByte))
            {
                // Improperly terminated after three bytes.
                numBytesConsumed = 3;
                return SequenceValidity.Invalid;
            }

            // Complete four-byte sequence.
            scalarValue = UnicodeScalar.CreateWithoutValidation(
                partialScalar | (((uint)thirdByte & 0x3FU) << 6) | ((uint)fourthByte & 0x3FU));
            numBytesConsumed = 4;
            return SequenceValidity.WellFormed;
        }
예제 #4
0
 /// <summary>
 /// Locates the first invalid UTF-8 sequence in <paramref name="data"/>.
 /// </summary>
 /// <param name="data">The buffer to validate.</param>
 /// <returns>
 /// The index of the first byte of the first invalid UTF-8 sequence, or -1 when
 /// <paramref name="data"/> is a well-formed UTF-8 string.
 /// </returns>
 public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> data)
 {
     // Only the index matters to this overload; the two out values produced by the
     // underlying validator are discarded.
     return Utf8Util.GetIndexOfFirstInvalidUtf8Sequence(data, out _, out _);
 }