// Scenario: each operand is produced by Sse2.LoadVector128 reading from one of the data table's
// input pointers, and the two loads are passed straight into Sse41.TestZ. The boolean outcome is
// then checked against the same two input buffers.
public void RunBasicScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

    bool testZOutcome = Sse41.TestZ(
        Sse2.LoadVector128((SByte*)(_dataTable.inArray1Ptr)),
        Sse2.LoadVector128((SByte*)(_dataTable.inArray2Ptr))
    );

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, testZOutcome);
}
// Scenario: each operand is materialized with Unsafe.Read<Vector128<SByte>> from one of the data
// table's input pointers and passed straight into Sse41.TestZ. The boolean outcome is then checked
// against the same two input buffers.
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    bool testZOutcome = Sse41.TestZ(
        Unsafe.Read<Vector128<SByte>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<SByte>>(_dataTable.inArray2Ptr)
    );

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, testZOutcome);
}
// Reports whether the mask vector `m` has anything set.
//
// With SSE4.1 this is "not (m AND m is all zero)", i.e. true when any bit of `m` is set.
// Without SSE4.1 the vector is reinterpreted as floats and Sse.MoveMask is compared against zero.
// NOTE(review): the fallback presumably only sees the top bit of each 32-bit lane, so the two paths
// agree only when every lane is all-ones or all-zeros - confirm that m32 values are always such masks.
public static bool AnyMask_bool(m32 m)
{
    if (!Sse41.IsSupported)
    {
        int laneBits = Sse.MoveMask(m.AsSingle());
        return laneBits != 0;
    }

    bool allZero = Sse41.TestZ(m, m);
    return !allZero;
}
// Scenario: the two operands are the class-level variables _clsVar1 and _clsVar2, passed to
// Sse41.TestZ by value; the outcome is validated against those same variables.
public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    bool testZOutcome = Sse41.TestZ(
        _clsVar1,
        _clsVar2
    );

    ValidateResult(_clsVar1, _clsVar2, testZOutcome);
}
// Scenario: a TestStruct is created as a local, and each of its two vector fields is loaded with
// Sse2.LoadVector128 through a pointer taken to that field. The loads feed Sse41.TestZ directly and
// the outcome is validated against the local's fields.
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var test = TestStruct.Create();

    bool testZOutcome = Sse41.TestZ(
        Sse2.LoadVector128((Int64*)(&test._fld1)),
        Sse2.LoadVector128((Int64*)(&test._fld2))
    );

    ValidateResult(test._fld1, test._fld2, testZOutcome);
}
// Scenario: pins this instance's _fld1 and _fld2, loads both through the pinned pointers with
// Sse2.LoadVector128, and runs Sse41.TestZ on the loads. Validation is delegated to the supplied
// testClass instance. (This scenario does not announce itself via BeginScenario.)
public void RunStructFldScenario_Load(BooleanBinaryOpTest__TestZInt64 testClass)
{
    fixed (Vector128<Int64>* pLeft = &_fld1)
    fixed (Vector128<Int64>* pRight = &_fld2)
    {
        bool testZOutcome = Sse41.TestZ(
            Sse2.LoadVector128((Int64*)(pLeft)),
            Sse2.LoadVector128((Int64*)(pRight))
        );

        testClass.ValidateResult(_fld1, _fld2, testZOutcome);
    }
}
// Scenario: pins this instance's _fld1 and _fld2, loads both through the pinned pointers with
// Sse2.LoadVector128, runs Sse41.TestZ on the loads, and validates the outcome against the fields.
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector128<Int64>* pLeft = &_fld1)
    fixed (Vector128<Int64>* pRight = &_fld2)
    {
        bool testZOutcome = Sse41.TestZ(
            Sse2.LoadVector128((Int64*)(pLeft)),
            Sse2.LoadVector128((Int64*)(pRight))
        );

        ValidateResult(_fld1, _fld2, testZOutcome);
    }
}
// Scenario: pins the class-level variables _clsVar1 and _clsVar2, loads both through the pinned
// pointers with Sse2.LoadVector128, runs Sse41.TestZ on the loads, and validates the outcome
// against those variables.
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector128<Int64>* pLeft = &_clsVar1)
    fixed (Vector128<Int64>* pRight = &_clsVar2)
    {
        bool testZOutcome = Sse41.TestZ(
            Sse2.LoadVector128((Int64*)(pLeft)),
            Sse2.LoadVector128((Int64*)(pRight))
        );

        ValidateResult(_clsVar1, _clsVar2, testZOutcome);
    }
}
// Scenario: constructs a fresh BooleanBinaryOpTest__TestZInt64 as a local, pins its _fld1 and
// _fld2, loads both through the pinned pointers with Sse2.LoadVector128, runs Sse41.TestZ on the
// loads, and validates the outcome against the local instance's fields.
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var test = new BooleanBinaryOpTest__TestZInt64();

    fixed (Vector128<Int64>* pLeft = &test._fld1)
    fixed (Vector128<Int64>* pRight = &test._fld2)
    {
        bool testZOutcome = Sse41.TestZ(
            Sse2.LoadVector128((Int64*)(pLeft)),
            Sse2.LoadVector128((Int64*)(pRight))
        );

        ValidateResult(test._fld1, test._fld2, testZOutcome);
    }
}
// Scenario: passes this instance's _fld1 and _fld2 to Sse41.TestZ by value and hands the outcome,
// together with the same two fields, to the supplied testClass for validation.
public void RunStructFldScenario(BooleanBinaryOpTest__TestZInt64 testClass)
{
    bool testZOutcome = Sse41.TestZ(_fld1, _fld2);

    testClass.ValidateResult(_fld1, _fld2, testZOutcome);
}
// Scenario: passes this instance's _fld1 and _fld2 to Sse41.TestZ by value and validates the
// outcome against the same two fields.
public void RunFldScenario()
{
    bool testZOutcome = Sse41.TestZ(_fld1, _fld2);

    ValidateResult(_fld1, _fld2, testZOutcome);
}
// Returns the index (in chars) of the first char in the buffer whose value is above 0x007F, or
// bufferLength when every char is ASCII.
//
//   pBuffer      - start of the UTF-16 data to scan.
//   bufferLength - number of chars (not bytes) readable at pBuffer.
//
// Shape of the method:
//   1. Inputs shorter than one 128-bit vector are drained with 64/32/16-bit scalar reads (end of method).
//   2. Otherwise one unaligned vector is checked, the pointer is rounded up to a 16-byte boundary,
//      and aligned vectors are checked two at a time, then one at a time.
//   3. A final unaligned read, overlapping data that was already checked, covers the tail.
//   4. Any non-ASCII hit jumps to a handler near the bottom that turns the hit into an exact offset.
//
// Both an SSE2-only and an SSE4.1 variant of every check are present, selected by Sse41.IsSupported.
// The statement order is load-bearing (labels, gotos, definite assignment); do not reorder.
private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
{
    // Nothing to scan.
    if (bufferLength == 0)
    {
        return 0;
    }

    // Vector size expressed both ways; derived from Unsafe.SizeOf so nothing is hard-coded.
    uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
    uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);

    Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
    Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");

    Vector128<short> firstVector;
    Vector128<short> secondVector;
    uint currentMask;
    char* pOriginalBuffer = pBuffer;

    if (bufferLength < SizeOfVector128InChars)
    {
        // Too short for even one vector load; use the scalar drain.
        goto InputBufferLessThanOneVectorInLength;
    }

    // The happy path (all ASCII) runs straight down; every non-ASCII discovery branches out to a
    // handler placed after the return.

    // PTEST mask: any bit at or above 0x0080 within a WORD marks that char as non-ASCII.
    Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80));
    // PMINUW operand: min(word, 0x0080) has the 0x0080 bit set exactly when word >= 0x0080.
    Vector128<ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080);
    // PXOR + PCMPGTW operands: the SSE2-only way of doing an unsigned "word > 0x007F" comparison.
    Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000));
    Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F));

    Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));

    // First vector: unaligned load from the very start of the buffer.
    firstVector = Sse2.LoadVector128((short*)pBuffer);

    if (Sse41.IsSupported)
    {
        // SSE4.1: PMINUW leaves 0x0080 in each non-ASCII WORD and clears that bit in ASCII WORDs;
        // viewed as bytes, MoveMask then yields one set bit (low byte) per non-ASCII char.
        currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
    }
    else
    {
        // SSE2: the XOR/compare pair turns each non-ASCII WORD into 0xFFFF and each ASCII WORD into 0;
        // viewed as bytes, MoveMask then yields two set bits per non-ASCII char.
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
    }

    if (currentMask != 0)
    {
        goto FoundNonAsciiDataInCurrentMask;
    }

    // From here on bufferLength counts BYTES rather than chars.
    bufferLength <<= 1;

    // With fewer than two vectors of data in total, skip the aligned loops and go straight to the
    // final (overlapping) unaligned read.
    if (bufferLength < 2 * SizeOfVector128InBytes)
    {
        goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
    }

    // Round the read pointer up to the next vector boundary so later loads can be aligned.
    pBuffer = (char*)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));

#if DEBUG
    long charsSkipped = pBuffer - pOriginalBuffer;
    Debug.Assert(0 < charsSkipped && charsSkipped <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
    Debug.Assert((nuint)charsSkipped <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

    // Remaining byte count = old count - (bytes the pointer just advanced).
    bufferLength += (nuint)pOriginalBuffer;
    bufferLength -= (nuint)pBuffer;

    // Aligned main loop: two vectors per iteration while at least two remain.
    if (bufferLength >= 2 * SizeOfVector128InBytes)
    {
        // Last address at which a two-vector read still fits inside the buffer.
        char* pLastPairReadPos = (char*)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);

        // bufferLength is deliberately NOT updated inside the loop (see the note after it).
        do
        {
            firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);
            secondVector = Sse2.LoadAlignedVector128((short*)pBuffer + SizeOfVector128InChars);
            Vector128<short> bothVectors = Sse2.Or(firstVector, secondVector);

            if (Sse41.IsSupported)
            {
                // Any 0xFF80 bit set in the OR of the two vectors means one of them holds non-ASCII data;
                // the handler works out which one.
                if (!Sse41.TestZ(bothVectors, asciiMaskForPTEST))
                {
                    goto FoundNonAsciiDataInFirstOrSecondVector;
                }
            }
            else
            {
                // Same test via the SSE2 XOR/compare/MoveMask sequence described above.
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(bothVectors, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    goto FoundNonAsciiDataInFirstOrSecondVector;
                }
            }

            pBuffer += 2 * SizeOfVector128InChars;
        } while (pBuffer <= pLastPairReadPos);
    }

    // Between 0 and (2 * vector size) - 1 bytes are left. bufferLength was not decremented in the
    // loop, but each iteration would have removed exactly 2 * SizeOfVector128InBytes, which cannot
    // touch the lower bits. Those low bits therefore still describe the remaining tail.

    // Bit "one vector" clear => less than one full vector remains; skip the aligned single read.
    if ((bufferLength & SizeOfVector128InBytes) == 0)
    {
        goto DoFinalUnalignedVectorRead;
    }

    // One full vector remains and pBuffer is still aligned.
    firstVector = Sse2.LoadAlignedVector128((short*)pBuffer);

    if (Sse41.IsSupported)
    {
        // Non-ASCII bit present: let the handler compute the precise position.
        if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
        {
            goto FoundNonAsciiDataInFirstVector;
        }
    }
    else
    {
        // SSE2 equivalent; the mask computed here is directly usable by the handler.
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
        if (currentMask != 0)
        {
            goto FoundNonAsciiDataInCurrentMask;
        }
    }

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:

    // Step past the vector that was just verified as ASCII.
    pBuffer += SizeOfVector128InChars;

DoFinalUnalignedVectorRead:

    // Low bits of the byte count = size of the partial-vector tail, if any.
    if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
    {
        // Back the pointer up so that one full vector ends exactly at the end of the buffer;
        // part of that vector was already checked and is simply re-read.
        pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
        firstVector = Sse2.LoadVector128((short*)pBuffer); // unaligned

        if (Sse41.IsSupported)
        {
            // Non-ASCII bit present: let the handler compute the precise position.
            if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
            {
                goto FoundNonAsciiDataInFirstVector;
            }
        }
        else
        {
            // SSE2 equivalent; the mask computed here is directly usable by the handler.
            currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
            if (currentMask != 0)
            {
                goto FoundNonAsciiDataInCurrentMask;
            }
        }

        pBuffer += SizeOfVector128InChars;
    }

Finish:

    // Every path ends here; the answer is the distance travelled, converted from bytes to chars.
    Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
    return ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);

FoundNonAsciiDataInFirstOrSecondVector:

    // The two-vector loop saw non-ASCII data but not which vector held it. Test the first one;
    // if it is clean the second must be the offender. Either way firstVector ends up holding the
    // vector that contains the non-ASCII char, with pBuffer pointing at it.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
        {
            goto FoundNonAsciiDataInFirstVector;
        }
    }
    else
    {
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
        if (currentMask != 0)
        {
            goto FoundNonAsciiDataInCurrentMask;
        }
    }

    // First vector was clean: advance to the second and treat it as "first".
    pBuffer += SizeOfVector128InChars;
    firstVector = secondVector;

FoundNonAsciiDataInFirstVector:

    // Build the per-byte mask for firstVector (same two techniques as at the top of the method).
    if (Sse41.IsSupported)
    {
        currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
    }
    else
    {
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
    }

FoundNonAsciiDataInCurrentMask:

    // currentMask has, counting from the least significant bit, one bit per BYTE of the vector:
    // 0 for bytes belonging to ASCII chars, 1 at the first byte of the first non-ASCII char. The
    // count of trailing zeros is therefore the byte offset of that char; Finish converts to chars.
    Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
    pBuffer = (char*)((byte*)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));

    goto Finish;

FoundNonAsciiDataInCurrentDWord:

    // Scalar-path handler: currentDWord holds two chars, at least one of which is non-ASCII.
    // Declared here (before its textual uses below) and assigned on every path that jumps here.
    uint currentDWord;
    Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");

    if (FirstCharInUInt32IsAscii(currentDWord))
    {
        // The first of the two chars is fine, so the second one is the non-ASCII char.
        pBuffer++;
    }

    goto Finish;

InputBufferLessThanOneVectorInLength:

    // Scalar drain for inputs shorter than one vector. bufferLength is still a CHAR count here
    // (the chars-to-bytes shift above was skipped), and its low three bits say which of the
    // 4-char / 2-char / 1-char reads are needed. All reads are unaligned.
    Debug.Assert(bufferLength < SizeOfVector128InChars);

    // 4 chars (one QWORD).
    if ((bufferLength & 4) != 0)
    {
        if (Bmi1.X64.IsSupported)
        {
            // 64-bit tzcnt available: read all four chars at once.
            ulong fourChars = Unsafe.ReadUnaligned<ulong>(pBuffer);
            if (!AllCharsInUInt64AreAscii(fourChars))
            {
                // Keep only the non-ASCII bits of each char and locate the lowest one. Dividing the
                // bit index by 8 gives a byte index; clearing its low bit maps a hit in a char's high
                // byte back to the start of that same char.
                fourChars &= 0xFF80FF80_FF80FF80ul;
                pBuffer = (char*)((byte*)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(fourChars) / 8) & ~(nuint)1));
                goto Finish;
            }
        }
        else
        {
            // No 64-bit tzcnt: read the same four chars as two DWORDs.
            currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);
            uint secondDWord = Unsafe.ReadUnaligned<uint>(pBuffer + 4 / sizeof(char));

            if (!AllCharsInUInt32AreAscii(currentDWord | secondDWord))
            {
                // One of the two DWORDs has non-ASCII data; make currentDWord be that one and
                // leave pBuffer pointing at it.
                if (AllCharsInUInt32AreAscii(currentDWord))
                {
                    currentDWord = secondDWord;
                    pBuffer += 4 / sizeof(char);
                }

                goto FoundNonAsciiDataInCurrentDWord;
            }
        }

        pBuffer += 4; // four ASCII chars consumed
    }

    // 2 chars (one DWORD).
    if ((bufferLength & 2) != 0)
    {
        currentDWord = Unsafe.ReadUnaligned<uint>(pBuffer);

        if (!AllCharsInUInt32AreAscii(currentDWord))
        {
            goto FoundNonAsciiDataInCurrentDWord;
        }

        pBuffer += 2; // two ASCII chars consumed
    }

    // 1 char (one WORD). Nothing smaller exists because the element type is a 16-bit char.
    if ((bufferLength & 1) != 0)
    {
        if (*pBuffer <= 0x007F)
        {
            pBuffer++; // one ASCII char consumed
        }
    }

    goto Finish;
}
// Thin wrapper that exposes Sse41.TestZ under the name _mm_testz_si128.
// Forwards both operands unchanged and returns the intrinsic's boolean result.
public static bool _mm_testz_si128(Vector128<sbyte> left, Vector128<sbyte> right)
    => Sse41.TestZ(left, right);
// Copies `source` into `destination`, replacing every 0x00 byte with the two-byte sequence
// 0x00 0x01 and copying all other bytes unchanged. Returns the number of bytes written.
//
// destination must be able to hold twice the source length; otherwise ThrowException() is called.
//
// Two phases:
//   1. 16 bytes at a time: a chunk that contains no zero byte is stored as-is. The first chunk
//      that does contain one ends this phase.
//   2. Byte at a time for whatever is left (including the chunk that ended phase 1).
//
// NOTE(review): Sse41.TestZ is used without an Sse41.IsSupported check in this method -
// presumably the caller guards it; confirm.
private static unsafe int EncodeBinarySse(
    ReadOnlySpan<byte> source,
    Span<byte> destination)
{
    uint inputLength = (uint)source.Length;
    if ((uint)destination.Length < inputLength * 2)
    {
        ThrowException();
    }

    fixed (byte* src = source)
    fixed (byte* dest = destination)
    {
        byte* readPtr = src;
        byte* writePtr = dest;
        byte* inputEnd = src + inputLength;
        // End of the last whole 16-byte chunk.
        byte* vectorEnd = inputEnd - (inputLength % 16);

        // Phase 1: whole chunks that need no escaping.
        while (readPtr < vectorEnd)
        {
            // Unaligned 16-byte load.
            Vector128<byte> chunk = Sse2.LoadVector128(readPtr);

            // 0xFF in every lane where the input byte is 0x00, 0x00 elsewhere.
            Vector128<byte> zeroLanes = Sse2.CompareEqual(chunk, Vector128<byte>.Zero);

            // TestZ(x, x) is true only when x is entirely zero, i.e. the chunk has no 0x00 byte.
            if (!Sse41.TestZ(zeroLanes, zeroLanes))
            {
                // Needs escaping: leave this chunk to the scalar loop.
                break;
            }

            Sse2.Store(writePtr, chunk);
            readPtr += 16;
            writePtr += 16;
        }

        // Phase 2: remaining bytes, escaping zeros as 0x00 0x01.
        for (; readPtr < inputEnd; readPtr++)
        {
            byte value = *readPtr;
            if (value != 0)
            {
                *writePtr++ = value;
                continue;
            }

            *writePtr++ = 0;
            *writePtr++ = 1;
        }

        return (int)(writePtr - dest);
    }
}
// Narrows leading ASCII chars from a UTF-16 buffer into a byte buffer, stopping at the first
// non-ASCII char it detects or when fewer than a full loop iteration's worth of data remains.
//
//   pUtf16Buffer - source chars.
//   pAsciiBuffer - destination bytes.
//   elementCount - number of elements available in both buffers; asserted to be at least
//                  2 * the 128-bit vector size in bytes.
//
// Returns the number of elements the caller should treat as converted. The value can be smaller
// than elementCount even for all-ASCII input; the caller is expected to drain the rest.
//
// Both an SSE2-only and an SSE4.1 variant of each ASCII check are present, selected by
// Sse41.IsSupported. The happy path runs top to bottom; the non-ASCII handler sits after the return.
private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
    // Vector size in bytes and the matching alignment mask, derived rather than hard-coded.
    uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
    nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

    Debug.Assert(Sse2.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * SizeOfVector128);

    // PTEST mask: any bit at or above 0x0080 within a WORD marks that char as non-ASCII.
    Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80));
    // PXOR + PCMPGTW operands: the SSE2-only way of doing an unsigned "word > 0x007F" comparison.
    Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000));
    Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F));

    // Unaligned load of the first 8 chars.
    Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer);

    // Non-ASCII data within the first 8 chars: nothing is converted.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            return 0;
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            return 0;
        }
    }

    // Pack the 8 chars down to 8 bytes (duplicated in both halves) and store the low 8 bytes.
    // The destination address is not necessarily aligned.
    Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
    Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64());

    nuint currentOffsetInElements = SizeOfVector128 / 2; // 8 elements done

    // The main loop wants aligned WRITES and accepts unaligned reads to get them.
    //
    // If the 0x08 bit of pAsciiBuffer is set, then after the 8 bytes written above that bit is
    // clear at &pAsciiBuffer[8], and the next aligned boundary is at most 8 bytes back: the loop
    // can start there directly. If the 0x08 bit is clear, it is set at &pAsciiBuffer[8], so one
    // more 8-byte write is needed to get past the next aligned boundary.
    if (((uint)pAsciiBuffer & (SizeOfVector128 / 2)) == 0)
    {
        // Second partial step: chars 8..15, again with an unaligned load.
        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements);

        // Non-ASCII here: report only the 8 elements already written.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
            {
                goto Finish;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto Finish;
            }
        }

        // Pack and store these 8 bytes too (destination still possibly unaligned).
        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
        Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64());
    }

    // Restart from the first vector-aligned destination address after pAsciiBuffer. Some of the
    // bytes written above may be rewritten by the loop.
    currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
    Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");

    Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
    Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");

    // Highest offset at which a full 16-element iteration still fits.
    nuint lastLoopStartOffset = elementCount - SizeOfVector128;
    do
    {
        // Two unaligned 8-char loads -> one 16-byte vector -> one aligned store.
        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements);
        Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short));
        Vector128<short> bothVectors = Sse2.Or(utf16VectorFirst, utf16VectorSecond);

        // A non-ASCII bit in the OR means at least one of the two loads has non-ASCII data.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(bothVectors, asciiMaskForPTEST))
            {
                goto FoundNonAsciiDataInLoop;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(bothVectors, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto FoundNonAsciiDataInLoop;
            }
        }

        // Pack the 16 chars into 16 ASCII bytes and store them.
        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);

        Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
        Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector);

        currentOffsetInElements += SizeOfVector128;
    } while (currentOffsetInElements <= lastLoopStartOffset);

Finish:

    // Leftover ASCII data, if any, is the caller's job.
    return currentOffsetInElements;

FoundNonAsciiDataInLoop:

    // One of the two loads from the failing iteration has non-ASCII data. If the first load
    // (the 8 chars at the current offset) is clean, it can still be narrowed before stopping.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            goto Finish; // non-ASCII data is in the first load
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            goto Finish; // non-ASCII data is in the first load
        }
    }

    // First load is all ASCII: pack it and store just the low 8 bytes at the (aligned) offset.
    asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

    Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");

    Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64());
    currentOffsetInElements += SizeOfVector128 / 2;

    goto Finish;
}