/// <summary>
/// Scans <paramref name="pInputBuffer"/>, a buffer holding <paramref name="inputLength"/> UTF-16 code units,
/// and locates the first code unit that is not part of well-formed UTF-16 data.
/// </summary>
/// <param name="pInputBuffer">Start of the UTF-16 data. May be null only when <paramref name="inputLength"/> is zero.</param>
/// <param name="inputLength">Number of chars available at <paramref name="pInputBuffer"/>. Must not be negative.</param>
/// <param name="utf8CodeUnitCountAdjustment">
/// Receives the amount to add to the number of UTF-16 code units seen in order to obtain the number of
/// UTF-8 code units required to encode them.
/// </param>
/// <param name="scalarCountAdjustment">
/// Receives the amount to add to the number of UTF-16 code units seen in order to obtain the number of
/// Unicode scalar values they represent.
/// </param>
/// <returns>
/// A pointer to the first invalid char, or a pointer to the end of the buffer
/// (<c>pInputBuffer + inputLength</c>) when the entire buffer is well-formed.
/// </returns>
/// <remarks>
/// The data is optimistically assumed to be mostly ASCII, in which case the UTF-8 code unit count and the
/// scalar count both nearly equal the UTF-16 code unit count. Only the differences ("adjustments") are tracked.
/// </remarks>
public static char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    // Fast path: strip the leading run of ASCII chars. When that run covers the whole
    // buffer there is nothing left to validate and both adjustments are zero.
    int asciiPrefixLength = (int)ASCIIUtility64.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
    Debug.Assert(0 <= asciiPrefixLength && asciiPrefixLength <= inputLength);

    pInputBuffer += (uint)asciiPrefixLength;
    inputLength -= asciiPrefixLength;

    if (inputLength == 0)
    {
        utf8CodeUnitCountAdjustment = 0;
        scalarCountAdjustment = 0;
        return pInputBuffer;
    }

    // Non-ASCII data is present. The vectorized paths below classify every non-surrogate
    // char without branching and only branch once a surrogate shows up. The running
    // adjustments are kept in locals and published to the out parameters on exit.
    long utf8Adjustment = 0;
    int scalarAdjustment = 0;

    if (Sse2.IsSupported)
    {
        if (inputLength >= Vector128<ushort>.Count)
        {
            Vector128<ushort> splat0080 = Vector128.Create((ushort)0x80);
            Vector128<ushort> splatA800 = Vector128.Create((ushort)0xA800);
            Vector128<short> splat8800 = Vector128.Create(unchecked((short)0x8800));
            Vector128<ushort> allZero = Vector128<ushort>.Zero;

            do
            {
                Vector128<ushort> chars = Sse2.LoadVector128((ushort*)pInputBuffer); // unaligned load

                // For each non-ASCII element, set exactly one of the 0x0080 / 0x8000 bits.
                // Which one is irrelevant because only the number of set bits is used later.
                Vector128<ushort> nonAsciiFlags;
                if (Sse41.IsSupported)
                {
                    // min(x, 0x0080) has the 0x0080 bit set iff x >= 0x0080
                    nonAsciiFlags = Sse41.Min(chars, splat0080);
                }
                else
                {
                    // 0 - (x >> 7) has the 0x8000 bit set iff x >= 0x0080; the 0x0080 bit is cleared
                    nonAsciiFlags = Sse2.AndNot(splat0080, Sse2.Subtract(allZero, Sse2.ShiftRightLogical(chars, 7)));
                }

#if DEBUG
                // Sanity check: no element may have both of its byte sign bits set at this point.
                uint nonAsciiByteMask = (uint)Sse2.MoveMask(nonAsciiFlags.AsByte());
                Debug.Assert((nonAsciiByteMask & (nonAsciiByteMask << 1)) == 0, "Two set bits shouldn't occur adjacent to each other in this mask.");
#endif // DEBUG

                // 0 - (x >> 11) has both the 0x8000 and 0x0080 bits set iff x >= 0x0800
                Vector128<ushort> threeByteFlags = Sse2.Subtract(allZero, Sse2.ShiftRightLogical(chars, 11));

                // Byte-wise movemask of the OR yields two bits per char:
                //   one bit set  => char >= 0x0080 (needs 1 extra UTF-8 byte)
                //   both bits set => char >= 0x0800 (needs 2 extra UTF-8 bytes)
                // so the population count equals the number of *additional* UTF-8 bytes for
                // this vector. Surrogates are over-counted here (3 bytes each instead of
                // 4 bytes per pair); that is corrected below. The count is held back until we
                // know the vector contains no unpaired surrogate.
                uint expansionMask = (uint)Sse2.MoveMask(Sse2.Or(nonAsciiFlags, threeByteFlags).AsByte());
                uint extraUtf8Bytes = (uint)BitOperations.PopCount(expansionMask);

                // Surrogate detection: adding 0xA800 maps D800..DFFF onto 8000..87FF, which is
                // exactly the range that compares (signed) less than 0x8800.
                chars = Sse2.Add(chars, splatA800);
                uint surrogateMask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(chars.AsInt16(), splat8800).AsByte());

                if (surrogateMask != 0)
                {
                    // 'surrogateMask' holds bit pairs: 11 for a surrogate char, 00 otherwise.
                    //
                    // After the add above a surrogate looks like [ 10000q## ######## ] where
                    // q = 0 for a high surrogate and q = 1 for a low surrogate. Shifting right by 3
                    // gives [ 00010000 q####### ], so the byte-wise movemask produces the pair
                    // 00 for a high surrogate and 01 for a low surrogate (garbage for
                    // non-surrogates, which is removed by ANDing with 'surrogateMask').
                    uint lowOrGarbageMask = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(chars, 3).AsByte());

                    // 01 per low surrogate, 00 otherwise
                    uint lowSurrogateBits = lowOrGarbageMask & surrogateMask;

                    // 01 per high surrogate, 00 otherwise (flip the even-numbered bits first)
                    uint highSurrogateBits = (lowOrGarbageMask ^ 0b_0101_0101_0101_0101u) & surrogateMask;

                    Debug.Assert((highSurrogateBits & lowSurrogateBits) == 0, "A char cannot simultaneously be both a high and a low surrogate char.");
                    Debug.Assert(((highSurrogateBits | lowSurrogateBits) & 0b_1010_1010_1010_1010u) == 0, "Only even bits (no odd bits) of the masks should be set.");

                    // Every high surrogate must be immediately followed by a low surrogate and
                    // vice versa. Shifting the high mask by one char position lines the two up.
                    // A high surrogate in the final slot lands above bit 15 and is excluded from
                    // the comparison; it is dealt with on the next iteration.
                    highSurrogateBits <<= 2;
                    if ((ushort)highSurrogateBits != lowSurrogateBits)
                    {
                        goto NonVectorizedLoop; // mismatched pair: let the scalar loop pinpoint the failure
                    }

                    if (highSurrogateBits > ushort.MaxValue)
                    {
                        // The vector ends with a lone high surrogate. Treat it as not consumed:
                        // drop it from the pair mask, remove the 2 extra bytes it contributed to
                        // the expansion count, and back up so the next read starts on it.
                        highSurrogateBits = (ushort)highSurrogateBits;
                        extraUtf8Bytes -= 2;
                        pInputBuffer--;
                        inputLength++;
                    }

                    // On 64-bit this assignment performs the zero-extension up front; on 32-bit it
                    // is a no-op and the widening happens in the 'else' branch below.
                    nuint pairCount = (uint)BitOperations.PopCount(highSurrogateBits);

                    // Two UTF-16 code units collapse into one scalar.
                    scalarAdjustment -= (int)pairCount;

                    // Each pair was counted as 6 UTF-8 bytes (3 per code unit) but really takes 4,
                    // so take 2 back per pair.
                    if (PlatformDependent.Is64BitProcess)
                    {
                        // Already zero-extended: two subtractions instead of shift + subtract.
                        utf8Adjustment -= (long)pairCount;
                        utf8Adjustment -= (long)pairCount;
                    }
                    else
                    {
                        // Pay for the 64-bit widening here.
                        utf8Adjustment -= 2 * (uint)pairCount;
                    }
                }

                utf8Adjustment += extraUtf8Bytes;
                pInputBuffer += Vector128<ushort>.Count;
                inputLength -= Vector128<ushort>.Count;
            }
            while (inputLength >= Vector128<ushort>.Count);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (inputLength >= Vector<ushort>.Count)
        {
            Vector<ushort> splat0080 = new Vector<ushort>(0x0080);
            Vector<ushort> splat0400 = new Vector<ushort>(0x0400);
            Vector<ushort> splat0800 = new Vector<ushort>(0x0800);
            Vector<ushort> splatD800 = new Vector<ushort>(0xD800);

            do
            {
                // Each comparison yields 0xFFFF (-1) per matching element. Subtracting both
                // results from zero therefore gives, per element:
                //   0 for 0000..007F, 1 for 0080..07FF, 2 for 0800..FFFF
                // i.e. the number of additional UTF-8 code units, analogous to the popcount
                // in the SSE2 path. Surrogates are over-counted and fixed up below.
                Vector<ushort> chars = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
                Vector<ushort> atLeastTwoBytes = Vector.GreaterThanOrEqual(chars, splat0080);
                Vector<ushort> atLeastThreeBytes = Vector.GreaterThanOrEqual(chars, splat0800);
                Vector<nuint> perElementExtra = (Vector<nuint>)(Vector<ushort>.Zero - atLeastTwoBytes - atLeastThreeBytes);

                // Accumulate a native word at a time, then fold the packed 16-bit partial
                // sums together.
                nuint wideSum = 0;
                for (int lane = 0; lane < Vector<nuint>.Count; lane++)
                {
                    wideSum += perElementExtra[lane];
                }

                uint extraUtf8Bytes = (uint)wideSum;
                if (PlatformDependent.Is64BitProcess)
                {
                    extraUtf8Bytes += (uint)(wideSum >> 32);
                }

                // Held back (not yet added to the running total) until the surrogate check passes.
                extraUtf8Bytes = (ushort)extraUtf8Bytes + (extraUtf8Bytes >> 16);

                // Rebase so that surrogates D800..DFFF occupy 0000..07FF.
                chars -= splatD800;
                Vector<ushort> surrogateLanes = Vector.LessThan(chars, splat0800);
                if (surrogateLanes != Vector<ushort>.Zero)
                {
                    // Elements are 0xFFFF where the original char was a high / low surrogate.
                    Vector<ushort> highSurrogateLanes = Vector.LessThan(chars, splat0400);
                    Vector<ushort> lowSurrogateLanes = Vector.AndNot(surrogateLanes, highSurrogateLanes);

                    // Pairing is verified element by element. The last element is not checked
                    // as a high surrogate here because its partner would be in the next vector.
                    if (lowSurrogateLanes[0] != 0)
                    {
                        goto Error; // the vector starts with a low surrogate that has no preceding high surrogate
                    }

                    ushort pairCount = 0;
                    for (int i = 0; i < Vector<ushort>.Count - 1; i++)
                    {
                        pairCount -= highSurrogateLanes[i]; // subtracting 0xFFFF wraps to +1; subtracting 0 is +0
                        if (highSurrogateLanes[i] != lowSurrogateLanes[i + 1])
                        {
                            goto NonVectorizedLoop; // mismatched pair: let the scalar loop pinpoint the failure
                        }
                    }

                    if (highSurrogateLanes[Vector<ushort>.Count - 1] != 0)
                    {
                        // The vector ends with a lone high surrogate: back up so it is re-read
                        // next time and remove the 2 extra bytes it contributed.
                        pInputBuffer--;
                        inputLength++;
                        extraUtf8Bytes -= 2;
                    }

                    nint pairCountNative = (nint)pairCount; // zero-extend to native int size

                    // Two UTF-16 code units collapse into one scalar.
                    scalarAdjustment -= (int)pairCountNative;

                    // Each surrogate code unit was counted as 3 UTF-8 bytes but contributes
                    // only 2 (4 per pair), so take 2 back per pair.
                    utf8Adjustment -= pairCountNative;
                    utf8Adjustment -= pairCountNative;
                }

                utf8Adjustment += extraUtf8Bytes;
                pInputBuffer += Vector<ushort>.Count;
                inputLength -= Vector<ushort>.Count;
            }
            while (inputLength >= Vector<ushort>.Count);
        }
    }

NonVectorizedLoop:

    // Reached when vectorization is unavailable, when fewer chars than one vector remain,
    // or when a vectorized path detected malformed data and the remaining valid chars must
    // be consumed one at a time before reporting the failure position.
    while (inputLength > 0)
    {
        uint current = pInputBuffer[0];
        if (current > 0x7F)
        {
            // +1 for U+0080..U+07FF, +2 for U+0800..U+FFFF (surrogates handled just below).
            utf8Adjustment += (current + 0x0001_F800u) >> 16;

            if (UnicodeUtility.IsSurrogateCodePoint(current))
            {
                // Undo the +2 applied above, then validate the pair as a whole by its bit
                // pattern only; the scalar value itself is never computed.
                utf8Adjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // not enough data left to hold a surrogate pair
                }

                uint pairBits = Unsafe.ReadUnaligned<uint>(pInputBuffer);
                uint expectedPairBase = BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u;
                if (((pairBits - expectedPairBase) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a high surrogate followed by a low surrogate
                }

                scalarAdjustment--;  // 2 UTF-16 code units -> 1 scalar
                utf8Adjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                // Step over the high surrogate; the shared advance below steps over the low one.
                pInputBuffer++;
                inputLength--;
            }
        }

        pInputBuffer++;
        inputLength--;
    }

Error:

    // Common exit, shared by the failure paths and by normal completion.
    utf8CodeUnitCountAdjustment = utf8Adjustment;
    scalarCountAdjustment = scalarAdjustment;
    return pInputBuffer;
}
// Returns &inputBuffer[inputLength] if the input buffer is valid. /// <summary> /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>, /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>. /// </summary> /// <remarks> /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed. /// </remarks> public static byte *GetPointerToFirstInvalidByte(byte *pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { Debug.Assert(inputLength >= 0, "Input length must not be negative."); Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null."); // First, try to drain off as many ASCII bytes as we can from the beginning. { nuint numAsciiBytesCounted = ASCIIUtility64.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength); pInputBuffer += numAsciiBytesCounted; // Quick check - did we just end up consuming the entire input buffer? // If so, short-circuit the remainder of the method. inputLength -= (int)numAsciiBytesCounted; if (0u >= inputLength) { utf16CodeUnitCountAdjustment = 0; scalarCountAdjustment = 0; return(pInputBuffer); } } #if DEBUG // Keep these around for final validation at the end of the method. byte *pOriginalInputBuffer = pInputBuffer; int originalInputLength = inputLength; #endif // Enregistered locals that we'll eventually out to our caller. int tempUtf16CodeUnitCountAdjustment = 0; int tempScalarCountAdjustment = 0; if (inputLength < sizeof(uint)) { goto ProcessInputOfLessThanDWordSize; } byte *pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint); // Begin the main loop. #if DEBUG byte *pLastBufferPosProcessed = null; // used for invariant checking in debug builds #endif while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) { // Read 32 bits at a time. 
This is enough to hold any possible UTF8-encoded scalar. uint thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); AfterReadDWord: #if DEBUG Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read."); pLastBufferPosProcessed = pInputBuffer; #endif // First, check for the common case of all-ASCII bytes. if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) { // We read an all-ASCII sequence. pInputBuffer += sizeof(uint); // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII. // Below is basically unrolled loops with poor man's vectorization. // Below check is "can I read at least five DWORDs from the input stream?" // n.b. Since we incremented pInputBuffer above the below subtraction may result in a negative value, // hence using nint instead of nuint. if ((nint)(void *)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 4 * sizeof(uint)) { // We want reads in the inner loop to be aligned. So let's perform a quick // ASCII check of the next 32 bits (4 bytes) now, and if that succeeds bump // the read pointer up to the next aligned address. thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) { goto AfterReadDWordSkipAllBytesAsciiCheck; } pInputBuffer = (byte *)((nuint)(pInputBuffer + 4) & ~(nuint)3); // At this point, the input buffer offset points to an aligned DWORD. We also know that there's // enough room to read at least four DWORDs from the buffer. (Heed the comment a few lines above: // the original 'if' check confirmed that there were 5 DWORDs before the alignment check, and // the alignment check consumes at most a single DWORD.) 
byte *pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here uint mask; do { if (Sse2.IsSupported && Bmi1.IsSupported) { // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're // going to perform an unaligned load. We don't necessarily care about aligning // this because we pessimistically assume we'll encounter non-ASCII data at some // point in the not-too-distant future (otherwise we would've stayed entirely // within the all-ASCII vectorized code at the entry to this method). mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte *)pInputBuffer)); if (mask != 0) { goto Sse2LoopTerminatedEarlyDueToNonAsciiData; } } else { if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint *)pInputBuffer)[0] | ((uint *)pInputBuffer)[1])) { goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair; } if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint *)pInputBuffer)[2] | ((uint *)pInputBuffer)[3])) { goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair; } } pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop); continue; // need to perform a bounds check because we might be running out of data Sse2LoopTerminatedEarlyDueToNonAsciiData: Debug.Assert(BitConverter.IsLittleEndian); Debug.Assert(Sse2.IsSupported); Debug.Assert(Bmi1.IsSupported); // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit // for each non-ASCII byte we saw. We can count the number of ASCII bytes, // bump our input counter by that amount, and resume processing from the // "the first byte is no longer ASCII" portion of the main loop. 
Debug.Assert(mask != 0); pInputBuffer += Bmi1.TrailingZeroCount(mask); if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer) { goto ProcessRemainingBytesSlow; } thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); // no longer guaranteed to be aligned goto BeforeProcessTwoByteSequence; LoopTerminatedEarlyDueToNonAsciiDataInSecondPair: pInputBuffer += 2 * sizeof(uint); // consumed 2 DWORDs LoopTerminatedEarlyDueToNonAsciiDataInFirstPair: // We know that there's *at least* two DWORDs of data remaining in the buffer. // We also know that one of them (or both of them) contains non-ASCII data somewhere. // Let's perform a quick check here to bypass the logic at the beginning of the main loop. thisDWord = *(uint *)pInputBuffer; // still aligned here if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)) { pInputBuffer += sizeof(uint); // consumed 1 more DWORD thisDWord = *(uint *)pInputBuffer; // still aligned here } goto AfterReadDWordSkipAllBytesAsciiCheck; } continue; // not enough data remaining to unroll loop - go back to beginning with bounds checks } AfterReadDWordSkipAllBytesAsciiCheck: Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier // Next, try stripping off ASCII bytes one at a time. // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above. { uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(thisDWord); pInputBuffer += numLeadingAsciiBytes; if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer) { goto ProcessRemainingBytesSlow; // Input buffer doesn't contain enough data to read a DWORD } else { // The input buffer at the current offset contains a non-ASCII byte. // Read an entire DWORD and fall through to multi-byte consumption logic. 
thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); } } BeforeProcessTwoByteSequence: // At this point, we suspect we're working with a multi-byte code unit sequence, // but we haven't yet validated it for well-formedness. // The masks and comparands are derived from the Unicode Standard, Table 3-6. // Additionally, we need to check for valid byte sequences per Table 3-7. // Check the 2-byte case. thisDWord -= (BitConverter.IsLittleEndian) ? 0x0000_80C0u : 0xC080_0000u; if (0u >= (thisDWord & (BitConverter.IsLittleEndian ? 0x0000_C0E0u : 0xE0C0_0000u))) { // Per Table 3-7, valid sequences are: // [ C2..DF ] [ 80..BF ] // // Due to our modification of 'thisDWord' above, this becomes: // [ 02..1F ] [ 00..3F ] // // We've already checked that the leading byte was originally in the range [ C0..DF ] // and that the trailing byte was originally in the range [ 80..BF ], so now we only need // to check that the modified leading byte is >= [ 02 ]. if ((BitConverter.IsLittleEndian && (byte)thisDWord < 0x02u) || (!BitConverter.IsLittleEndian && thisDWord < 0x0200_0000u)) { goto Error; // overlong form - leading byte was [ C0 ] or [ C1 ] } ProcessTwoByteSequenceSkipOverlongFormCheck: // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew, // there's a good chance that if we see one two-byte run then there's another two-byte // run immediately after. Let's check that now. // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that // the value isn't overlong using a single comparison. On big-endian platforms, we'll need // to validate the mask and validate that the sequence isn't overlong as two separate comparisons. 
if ((BitConverter.IsLittleEndian && Utf8Utility.UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord)) || (!BitConverter.IsLittleEndian && (Utf8Utility.UInt32EndsWithUtf8TwoByteMask(thisDWord) && !Utf8Utility.UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord)))) { // We have two runs of two bytes each. pInputBuffer += 4; tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 code units -> 2 UTF-16 code units (and 2 scalars) if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) { // Optimization: If we read a long run of two-byte sequences, the next sequence is probably // also two bytes. Check for that first before going back to the beginning of the loop. thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); if (BitConverter.IsLittleEndian) { if (Utf8Utility.UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord)) { // The next sequence is a valid two-byte sequence. goto ProcessTwoByteSequenceSkipOverlongFormCheck; } } else { if (Utf8Utility.UInt32BeginsWithUtf8TwoByteMask(thisDWord)) { if (Utf8Utility.UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord)) { goto Error; // The next sequence purports to be a 2-byte sequence but is overlong. } goto ProcessTwoByteSequenceSkipOverlongFormCheck; } } // If we reached this point, the next sequence is something other than a valid // two-byte sequence, so go back to the beginning of the loop. goto AfterReadDWord; } else { goto ProcessRemainingBytesSlow; // Running out of data - go down slow path } } // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence. // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining // bytes are ASCII? 
tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code units (and 1 scalar) [+ trailing] if (Utf8Utility.UInt32ThirdByteIsAscii(thisDWord)) { if (Utf8Utility.UInt32FourthByteIsAscii(thisDWord)) { pInputBuffer += 4; } else { pInputBuffer += 3; // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte. // Read in the next DWORD and jump directly to the start of the multi-byte processing block. if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) { thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); goto BeforeProcessTwoByteSequence; } } } else { pInputBuffer += 2; } continue; } // Check the 3-byte case. // We need to restore the C0 leading byte we stripped out earlier, then we can strip out the expected E0 byte. thisDWord -= (BitConverter.IsLittleEndian) ? (0x0080_00E0u - 0x0000_00C0u) : (0xE000_8000u - 0xC000_0000u); if (0u >= (thisDWord & (BitConverter.IsLittleEndian ? 0x00C0_C0F0u : 0xF0C0_C000u))) { ProcessThreeByteSequenceWithCheck: // We assume the caller has confirmed that the bit pattern is representative of a three-byte // sequence, but it may still be overlong or surrogate. We need to check for these possibilities. // // Per Table 3-7, valid sequences are: // [ E0 ] [ A0..BF ] [ 80..BF ] // [ E1..EC ] [ 80..BF ] [ 80..BF ] // [ ED ] [ 80..9F ] [ 80..BF ] // [ EE..EF ] [ 80..BF ] [ 80..BF ] // // Big-endian examples of using the above validation table: // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# #### // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# #### // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20), // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000), // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20). 
// // It's ok if the caller has manipulated 'thisDWord' (e.g., by subtracting 0xE0 or 0x80) // as long as they haven't touched the bits we're about to use in our mask checking below. if (BitConverter.IsLittleEndian) { // The "overlong or surrogate" check can be implemented using a single jump, but there's // some overhead to moving the bits into the correct locations in order to perform the // correct comparison, and in practice the processor's branch prediction capability is // good enough that we shouldn't bother. So we'll use two jumps instead. // Can't extract this check into its own helper method because JITter produces suboptimal // assembly, even with aggressive inlining. // Code below becomes 5 instructions: test, jz, lea, test, jz if ((0u >= (thisDWord & 0x0000_200Fu)) || (0u >= ((thisDWord - 0x0000_200Du) & 0x0000_200Fu))) { goto Error; // overlong or surrogate } } else { if ((0u >= (thisDWord & 0x0F20_0000u)) || (0u >= ((thisDWord - 0x0D20_0000u) & 0x0F20_0000u))) { goto Error; // overlong or surrogate } } ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks: // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way // in to the text. If this happens strip it off now before seeing if the next character // consists of three code units. // Branchless: consume a 3-byte UTF-8 sequence and optionally an extra ASCII byte hanging off the end nint asciiAdjustment; if (BitConverter.IsLittleEndian) { asciiAdjustment = (int)thisDWord >> 31; // smear most significant bit across entire value } else { asciiAdjustment = (nint)(sbyte)thisDWord >> 7; // smear most significant bit of least significant byte across entire value } // asciiAdjustment = 0 if fourth byte is ASCII; -1 otherwise // Please *DO NOT* reorder the below two lines. It provides extra defense in depth in case this method // is ever changed such that pInputBuffer becomes a 'ref byte' instead of a simple 'byte*'. 
It's valid // to add 4 before backing up since we already checked previously that the input buffer contains at // least a DWORD's worth of data, so we're not going to run past the end of the buffer where the GC can // no longer track the reference. However, we can't back up before adding 4, since we might back up to // before the start of the buffer, and the GC isn't guaranteed to be able to track this. pInputBuffer += 4; // optimistically, assume consumed a 3-byte UTF-8 sequence plus an extra ASCII byte pInputBuffer += asciiAdjustment; // back up if we didn't actually consume an ASCII byte tempUtf16CodeUnitCountAdjustment -= 2; // 3 (or 4) UTF-8 bytes -> 1 (or 2) UTF-16 code unit (and 1 [or 2] scalar) SuccessfullyProcessedThreeByteSequence: if (PlatformDependent.Is64BitProcess && BitConverter.IsLittleEndian) { // x64 little-endian optimization: A three-byte character could indicate CJK text, // which makes it likely that the character following this one is also CJK. // We'll try to process several three-byte sequences at a time. // The check below is really "can we read 9 bytes from the input buffer?" since 'pFinalPos...' is already offset // n.b. The subtraction below could result in a negative value (since we advanced pInputBuffer above), so // use nint instead of nuint. if ((nint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) >= 5) { ulong thisQWord = Unsafe.ReadUnaligned <ulong>(pInputBuffer); // Stage the next 32 bits into 'thisDWord' so that it's ready for us in case we need to jump backward // to a previous location in the loop. This offers defense against reading main memory again (which may // have been modified and could lead to a race condition). thisDWord = (uint)thisQWord; // Is this three 3-byte sequences in a row? 
// thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ] // ---- CHAR 3 ---- --------- CHAR 2 --------- --------- CHAR 1 --------- -CHAR 3- if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && Utf8Utility.IsUtf8ContinuationByte(in pInputBuffer[8])) { // Saw a proper bitmask for three incoming 3-byte sequences, perform the // overlong and surrogate sequence checking now. // Check the first character. // If the first character is overlong or a surrogate, fail immediately. if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu))) { goto Error; } // Check the second character. // At this point, we now know the first three bytes represent a well-formed sequence. // If there's an error beyond here, we'll jump back to the "process three known good bytes" // logic. thisQWord >>= 24; if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu))) { goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks; } // Check the third character (we already checked that it's followed by a continuation byte). thisQWord >>= 24; if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu))) { goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks; } pInputBuffer += 9; tempUtf16CodeUnitCountAdjustment -= 6; // 9 UTF-8 bytes -> 3 UTF-16 code units (and 3 scalars) goto SuccessfullyProcessedThreeByteSequence; } // Is this two 3-byte sequences in a row? // thisQWord = [ ######## ######## | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] // --------- CHAR 2 --------- --------- CHAR 1 --------- if ((thisQWord & 0xC0C0_F0C0_C0F0ul) == 0x8080_E080_80E0ul) { // Saw a proper bitmask for two incoming 3-byte sequences, perform the // overlong and surrogate sequence checking now. // Check the first character. // If the first character is overlong or a surrogate, fail immediately. 
if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu))) { goto Error; } // Check the second character. // At this point, we now know the first three bytes represent a well-formed sequence. // If there's an error beyond here, we'll jump back to the "process three known good bytes" // logic. thisQWord >>= 24; if ((0u >= ((uint)thisQWord & 0x200Fu)) || (0u >= (((uint)thisQWord - 0x200Du) & 0x200Fu))) { goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks; } pInputBuffer += 6; tempUtf16CodeUnitCountAdjustment -= 4; // 6 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars) // The next byte in the sequence didn't have a 3-byte marker, so it's probably // an ASCII character. Jump back to the beginning of loop processing. continue; } if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord)) { // A single three-byte sequence. goto ProcessThreeByteSequenceWithCheck; } else { // Not a three-byte sequence; perhaps ASCII? goto AfterReadDWord; } } } if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer) { thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); // Optimization: A three-byte character could indicate CJK text, which makes it likely // that the character following this one is also CJK. We'll check for a three-byte sequence // marker now and jump directly to three-byte sequence processing if we see one, skipping // all of the logic at the beginning of the loop. if (Utf8Utility.UInt32BeginsWithUtf8ThreeByteMask(thisDWord)) { goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process } else { goto AfterReadDWord; // Probably ASCII punctuation or whitespace; go back to start of loop } } else { goto ProcessRemainingBytesSlow; // Running out of data } } // Assume the 4-byte case, but we need to validate. 
if (BitConverter.IsLittleEndian) { thisDWord &= 0xC0C0_FFFFu; // After the above modifications earlier in this method, we expect 'thisDWord' // to have the structure [ 10000000 00000000 00uuzzzz 00010uuu ]. We'll now // perform two checks to confirm this. The first will verify the // [ 10000000 00000000 00###### ######## ] structure by taking advantage of two's // complement representation to perform a single *signed* integer check. if ((int)thisDWord > unchecked ((int)0x8000_3FFF)) { goto Error; // didn't have three trailing bytes } // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). thisDWord = BitOperations.RotateRight(thisDWord, 8); // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ]. // The check is now a simple add / cmp / jcc combo. if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1080_0010u, 0x1480_000Fu)) { goto Error; // overlong or out-of-range } } else { thisDWord -= 0x80u; // After the above modifications earlier in this method, we expect 'thisDWord' // to have the structure [ 00010uuu 00uuzzzz 00yyyyyy 00xxxxxx ]. We'll now // perform two checks to confirm this. The first will verify the // [ ######## 00###### 00###### 00###### ] structure. if ((thisDWord & 0x00C0_C0C0u) != 0) { goto Error; // didn't have three trailing bytes } // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). // This is a simple range check. (We don't care about the low two bytes.) if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1010_0000u, 0x140F_FFFFu)) { goto Error; // overlong or out-of-range } } // Validation of 4-byte case complete. 
pInputBuffer += 4; tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 bytes -> 2 UTF-16 code units tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar continue; // go back to beginning of loop for processing } goto ProcessRemainingBytesSlow; ProcessInputOfLessThanDWordSize: Debug.Assert(inputLength < 4); nuint inputBufferRemainingBytes = (uint)inputLength; goto ProcessSmallBufferCommon; ProcessRemainingBytesSlow: inputBufferRemainingBytes = (nuint)(void *)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4; ProcessSmallBufferCommon: Debug.Assert(inputBufferRemainingBytes < 4); while (inputBufferRemainingBytes > 0) { uint firstByte = pInputBuffer[0]; if ((byte)firstByte < 0x80u) { // 1-byte (ASCII) case pInputBuffer++; inputBufferRemainingBytes--; continue; } else if (inputBufferRemainingBytes >= 2) { uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value if ((byte)firstByte < 0xE0u) { // 2-byte case if ((byte)firstByte >= 0xC2u && Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte)) { pInputBuffer += 2; tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar) inputBufferRemainingBytes -= 2; continue; } } else if (inputBufferRemainingBytes >= 3) { if ((byte)firstByte < 0xF0u) { if ((byte)firstByte == 0xE0u) { if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu)) { goto Error; // overlong encoding } } else if ((byte)firstByte == 0xEDu) { if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu)) { goto Error; // would be a UTF-16 surrogate code point } } else { if (!Utf8Utility.IsLowByteUtf8ContinuationByte(secondByte)) { goto Error; // first trailing byte doesn't have proper continuation marker } } if (Utf8Utility.IsUtf8ContinuationByte(in pInputBuffer[2])) { pInputBuffer += 3; tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars) inputBufferRemainingBytes -= 3; 
continue; } } } } // Error - no match. goto Error; } // If we reached this point, we're out of data, and we saw no bad UTF8 sequence. #if DEBUG // Quick check that for the success case we're going to fulfill our contract of returning &inputBuffer[inputLength]. Debug.Assert(pOriginalInputBuffer + originalInputLength == pInputBuffer, "About to return an unexpected value."); #endif Error: // Report back to our caller how far we got before seeing invalid data. // (Also used for normal termination when falling out of the loop above.) utf16CodeUnitCountAdjustment = tempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = tempScalarCountAdjustment; return(pInputBuffer); }