Example #1
0
        public void RunBasicScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

            var result = Sse41.TestZ(
                Sse2.LoadVector128((SByte *)(_dataTable.inArray1Ptr)),
                Sse2.LoadVector128((SByte *)(_dataTable.inArray2Ptr))
                );

            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, result);
        }
Example #2
0
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            var result = Sse41.TestZ(
                Unsafe.Read <Vector128 <SByte> >(_dataTable.inArray1Ptr),
                Unsafe.Read <Vector128 <SByte> >(_dataTable.inArray2Ptr)
                );

            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, result);
        }
 public static bool AnyMask_bool(m32 m)
 {
     if (Sse41.IsSupported)
     {
         return(!Sse41.TestZ(m, m));
     }
     else
     {
         return(Sse.MoveMask(m.AsSingle()) != 0);
     }
 }
Example #4
0
        public void RunClsVarScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

            var result = Sse41.TestZ(
                _clsVar1,
                _clsVar2
                );

            ValidateResult(_clsVar1, _clsVar2, result);
        }
Example #5
0
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var test   = TestStruct.Create();
            var result = Sse41.TestZ(
                Sse2.LoadVector128((Int64 *)(&test._fld1)),
                Sse2.LoadVector128((Int64 *)(&test._fld2))
                );

            ValidateResult(test._fld1, test._fld2, result);
        }
Example #6
0
            public void RunStructFldScenario_Load(BooleanBinaryOpTest__TestZInt64 testClass)
            {
                fixed(Vector128 <Int64> *pFld1 = &_fld1)
                fixed(Vector128 <Int64> *pFld2 = &_fld2)
                {
                    var result = Sse41.TestZ(
                        Sse2.LoadVector128((Int64 *)(pFld1)),
                        Sse2.LoadVector128((Int64 *)(pFld2))
                        );

                    testClass.ValidateResult(_fld1, _fld2, result);
                }
            }
Example #7
0
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed(Vector128 <Int64> *pFld1 = &_fld1)
            fixed(Vector128 <Int64> *pFld2 = &_fld2)
            {
                var result = Sse41.TestZ(
                    Sse2.LoadVector128((Int64 *)(pFld1)),
                    Sse2.LoadVector128((Int64 *)(pFld2))
                    );

                ValidateResult(_fld1, _fld2, result);
            }
        }
Example #8
0
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            fixed(Vector128 <Int64> *pClsVar1 = &_clsVar1)
            fixed(Vector128 <Int64> *pClsVar2 = &_clsVar2)
            {
                var result = Sse41.TestZ(
                    Sse2.LoadVector128((Int64 *)(pClsVar1)),
                    Sse2.LoadVector128((Int64 *)(pClsVar2))
                    );

                ValidateResult(_clsVar1, _clsVar2, result);
            }
        }
Example #9
0
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var test = new BooleanBinaryOpTest__TestZInt64();

            fixed(Vector128 <Int64> *pFld1 = &test._fld1)
            fixed(Vector128 <Int64> *pFld2 = &test._fld2)
            {
                var result = Sse41.TestZ(
                    Sse2.LoadVector128((Int64 *)(pFld1)),
                    Sse2.LoadVector128((Int64 *)(pFld2))
                    );

                ValidateResult(test._fld1, test._fld2, result);
            }
        }
Example #10
0
            public void RunStructFldScenario(BooleanBinaryOpTest__TestZInt64 testClass)
            {
                var result = Sse41.TestZ(_fld1, _fld2);

                testClass.ValidateResult(_fld1, _fld2, result);
            }
Example #11
0
        public void RunFldScenario()
        {
            var result = Sse41.TestZ(_fld1, _fld2);

            ValidateResult(_fld1, _fld2, result);
        }
Example #12
0
        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char *pBuffer, nuint bufferLength /* in chars */)
        {
            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
            // will be elided by JIT once we determine which specific ISAs we support.

            // Quick check for empty inputs.

            if (bufferLength == 0)
            {
                return(0);
            }

            // JIT turns the below into constants

            uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf <Vector128 <byte> >();
            uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);

            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");

            Vector128 <short> firstVector, secondVector;
            uint  currentMask;
            char *pOriginalBuffer = pBuffer;

            if (bufferLength < SizeOfVector128InChars)
            {
                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
            }

            // This method is written such that control generally flows top-to-bottom, avoiding
            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
            // data, we jump out of the hot paths to targets at the end of the method.

            Vector128 <short>  asciiMaskForPTEST   = Vector128.Create(unchecked ((short)0xFF80)); // used for PTEST on supported hardware
            Vector128 <ushort> asciiMaskForPMINUW  = Vector128.Create((ushort)0x0080);            // used for PMINUW on supported hardware
            Vector128 <short>  asciiMaskForPXOR    = Vector128.Create(unchecked ((short)0x8000)); // used for PXOR
            Vector128 <short>  asciiMaskForPCMPGTW = Vector128.Create(unchecked ((short)0x807F)); // used for PCMPGTW

            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));

            // Read the first vector unaligned.

            firstVector = Sse2.LoadVector128((short *)pBuffer); // unaligned load

            if (Sse41.IsSupported)
            {
                // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
                // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
                // in order to extract the mask.
                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
            }
            else
            {
                // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
                // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
                // the mask.
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
            }

            if (currentMask != 0)
            {
                goto FoundNonAsciiDataInCurrentMask;
            }

            // If we have less than 32 bytes to process, just go straight to the final unaligned
            // read. There's no need to mess with the loop logic in the middle of this method.

            // Adjust the remaining length to account for what we just read.
            // For the remainder of this code path, bufferLength will be in bytes, not chars.

            bufferLength <<= 1; // chars to bytes

            if (bufferLength < 2 * SizeOfVector128InBytes)
            {
                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
            }

            // Now adjust the read pointer so that future reads are aligned.

            pBuffer = (char *)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));

#if DEBUG
            long numCharsRead = pBuffer - pOriginalBuffer;
            Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
            Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

            // Adjust remaining buffer length.

            bufferLength += (nuint)pOriginalBuffer;
            bufferLength -= (nuint)pBuffer;

            // The buffer is now properly aligned.
            // Read 2 vectors at a time if possible.

            if (bufferLength >= 2 * SizeOfVector128InBytes)
            {
                char *pFinalVectorReadPos = (char *)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);

                // After this point, we no longer need to update the bufferLength value.

                do
                {
                    firstVector  = Sse2.LoadAlignedVector128((short *)pBuffer);
                    secondVector = Sse2.LoadAlignedVector128((short *)pBuffer + SizeOfVector128InChars);
                    Vector128 <short> combinedVector = Sse2.Or(firstVector, secondVector);

                    if (Sse41.IsSupported)
                    {
                        // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
                        // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
                        if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
                        {
                            goto FoundNonAsciiDataInFirstOrSecondVector;
                        }
                    }
                    else
                    {
                        // See comment earlier in the method for an explanation of how the below logic works.
                        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                        {
                            goto FoundNonAsciiDataInFirstOrSecondVector;
                        }
                    }

                    pBuffer += 2 * SizeOfVector128InChars;
                } while (pBuffer <= pFinalVectorReadPos);
            }

            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
            // at what bits of it are set. This works because had we updated it within the loop above,
            // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
            // bits which are less significant than those that the addition would've acted on.

            // If there is fewer than one vector length remaining, skip the next aligned read.
            // Remember, at this point bufferLength is measured in bytes, not chars.

            if ((bufferLength & SizeOfVector128InBytes) == 0)
            {
                goto DoFinalUnalignedVectorRead;
            }

            // At least one full vector's worth of data remains, so we can safely read it.
            // Remember, at this point pBuffer is still aligned.

            firstVector = Sse2.LoadAlignedVector128((short *)pBuffer);

            if (Sse41.IsSupported)
            {
                // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
                // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
                {
                    goto FoundNonAsciiDataInFirstVector;
                }
            }
            else
            {
                // See comment earlier in the method for an explanation of how the below logic works.
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
                if (currentMask != 0)
                {
                    goto FoundNonAsciiDataInCurrentMask;
                }
            }

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:

            pBuffer += SizeOfVector128InChars;

DoFinalUnalignedVectorRead:

            if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
            {
                // Perform an unaligned read of the last vector.
                // We need to adjust the pointer because we're re-reading data.

                pBuffer     = (char *)((byte *)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
                firstVector = Sse2.LoadVector128((short *)pBuffer); // unaligned load

                if (Sse41.IsSupported)
                {
                    // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
                    // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
                    if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
                    {
                        goto FoundNonAsciiDataInFirstVector;
                    }
                }
                else
                {
                    // See comment earlier in the method for an explanation of how the below logic works.
                    currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
                    if (currentMask != 0)
                    {
                        goto FoundNonAsciiDataInCurrentMask;
                    }
                }

                pBuffer += SizeOfVector128InChars;
            }

Finish:

            Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
            return(((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char)); // and we're done! (remember to adjust for char count)

FoundNonAsciiDataInFirstOrSecondVector:

            // We don't know if the first or the second vector contains non-ASCII data. Check the first
            // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
            // we'll make sure the first vector local is the one that contains the non-ASCII data.

            // See comment earlier in the method for an explanation of how the below logic works.
            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
                {
                    goto FoundNonAsciiDataInFirstVector;
                }
            }
            else
            {
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
                if (currentMask != 0)
                {
                    goto FoundNonAsciiDataInCurrentMask;
                }
            }

            // Wasn't the first vector; must be the second.

            pBuffer    += SizeOfVector128InChars;
            firstVector = secondVector;

FoundNonAsciiDataInFirstVector:

            // See comment earlier in the method for an explanation of how the below logic works.
            if (Sse41.IsSupported)
            {
                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
            }
            else
            {
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
            }

FoundNonAsciiDataInCurrentMask:

            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
            // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
            // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
            // masks work on BYTE elements, and we account for this in the final fixup.)

            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
            pBuffer = (char *)((byte *)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));

            goto Finish;

FoundNonAsciiDataInCurrentDWord:

            uint currentDWord;
            Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");

            if (FirstCharInUInt32IsAscii(currentDWord))
            {
                pBuffer++; // skip past the ASCII char
            }

            goto Finish;

InputBufferLessThanOneVectorInLength:

            // These code paths get hit if the original input length was less than one vector in size.
            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
            // directly. Note that all of these reads are unaligned.

            // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
            // We skipped the code path that multiplied the count by sizeof(char).

            Debug.Assert(bufferLength < SizeOfVector128InChars);

            // QWORD drain

            if ((bufferLength & 4) != 0)
            {
                if (Bmi1.X64.IsSupported)
                {
                    // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.

                    ulong candidateUInt64 = Unsafe.ReadUnaligned <ulong>(pBuffer);
                    if (!AllCharsInUInt64AreAscii(candidateUInt64))
                    {
                        // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
                        // Remember the / 8 at the end to convert bit count to byte count,
                        // then the & ~1 at the end to treat a match in the high byte of
                        // any char the same as a match in the low byte of that same char.

                        candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
                        pBuffer          = (char *)((byte *)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
                        goto Finish;
                    }
                }
                else
                {
                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.

                    currentDWord = Unsafe.ReadUnaligned <uint>(pBuffer);
                    uint nextDWord = Unsafe.ReadUnaligned <uint>(pBuffer + 4 / sizeof(char));

                    if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
                    {
                        // At least one of the values wasn't all-ASCII.
                        // We need to figure out which one it was and stick it in the currentMask local.

                        if (AllCharsInUInt32AreAscii(currentDWord))
                        {
                            currentDWord = nextDWord; // this one is the culprit
                            pBuffer     += 4 / sizeof(char);
                        }

                        goto FoundNonAsciiDataInCurrentDWord;
                    }
                }

                pBuffer += 4; // successfully consumed 4 ASCII chars
            }

            // DWORD drain

            if ((bufferLength & 2) != 0)
            {
                currentDWord = Unsafe.ReadUnaligned <uint>(pBuffer);

                if (!AllCharsInUInt32AreAscii(currentDWord))
                {
                    goto FoundNonAsciiDataInCurrentDWord;
                }

                pBuffer += 2; // successfully consumed 2 ASCII chars
            }

            // WORD drain
            // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.

            if ((bufferLength & 1) != 0)
            {
                if (*pBuffer <= 0x007F)
                {
                    pBuffer++; // successfully consumed a single char
                }
            }

            goto Finish;
        }
Example #13
0
 public static bool _mm_testz_si128(Vector128 <sbyte> left, Vector128 <sbyte> right)
 {
     return(Sse41.TestZ(left, right));
 }
Example #14
0
    private static unsafe int EncodeBinarySse(
        ReadOnlySpan <byte> source, Span <byte> destination)
    {
        var length = (uint)source.Length;

        if ((uint)destination.Length < length * 2)
            ThrowException();

        fixed(byte *src = source)
        fixed(byte *dest = destination)
        {
            var srcCurrent  = src;
            var destCurrent = dest;

            var end     = src + length;
            var simdEnd = end - (length % 16);

            while (srcCurrent < simdEnd)
            {
                // Load 16 bytes (unaligned) into the XMM register.
                var data = Sse2.LoadVector128(srcCurrent);

                // Compare each byte of the input with '\0'. This results in a vector
                // where each byte is either \x00 or \xFF, depending on whether the
                // input had a '\x00' in the corresponding position.
                var zeroBytes = Sse2.CompareEqual(data, Vector128 <byte> .Zero);

                // Check whether the resulting vector is all-zero.
                // If it's all zero, we can just store the entire chunk.
                if (Sse41.TestZ(zeroBytes, zeroBytes))
                {
                    Sse2.Store(destCurrent, data);
                }
                else
                {
                    break;
                }

                srcCurrent  += 16;
                destCurrent += 16;
            }

            while (srcCurrent < end)
            {
                byte value = *srcCurrent++;
                if (value == 0)
                {
                    *destCurrent++ = 0;
                    *destCurrent++ = 1;
                }
                else
                {
                    *destCurrent++ = value;
                }
            }

            var written = destCurrent - dest;

            return((int)written);
        }
    }
Example #15
0
        private static unsafe nuint NarrowUtf16ToAscii_Sse2(char *pUtf16Buffer, byte *pAsciiBuffer, nuint elementCount)
        {
            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
            // will be elided by JIT once we determine which specific ISAs we support.

            // JIT turns the below into constants

            uint  SizeOfVector128          = (uint)Unsafe.SizeOf <Vector128 <byte> >();
            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

            // This method is written such that control generally flows top-to-bottom, avoiding
            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
            // data, we jump out of the hot paths to targets at the end of the method.

            Debug.Assert(Sse2.IsSupported);
            Debug.Assert(BitConverter.IsLittleEndian);
            Debug.Assert(elementCount >= 2 * SizeOfVector128);

            Vector128 <short> asciiMaskForPTEST   = Vector128.Create(unchecked ((short)0xFF80)); // used for PTEST on supported hardware
            Vector128 <short> asciiMaskForPXOR    = Vector128.Create(unchecked ((short)0x8000)); // used for PXOR
            Vector128 <short> asciiMaskForPCMPGTW = Vector128.Create(unchecked ((short)0x807F)); // used for PCMPGTW

            // First, perform an unaligned read of the first part of the input buffer.

            Vector128 <short> utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer); // unaligned load

            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.

            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                {
                    return(0);
                }
            }
            else
            {
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    return(0);
                }
            }

            // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.

            Vector128 <byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

            Sse2.StoreScalar((ulong *)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED

            nuint currentOffsetInElements = SizeOfVector128 / 2;             // we processed 8 elements so far

            // We're going to get the best performance when we have aligned writes, so we'll take the
            // hit of potentially unaligned reads in order to hit this sweet spot.

            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
            // that case we can immediately back up to the previous aligned boundary and start the main loop.
            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
            // just past the next aligned boundary address.

            if (0u >= ((uint)pAsciiBuffer & (SizeOfVector128 / 2)))
            {
                // We need to perform one more partial vector write before we can get the alignment we want.

                utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements); // unaligned load

                // See comments earlier in this method for information about how this works.
                if (Sse41.IsSupported)
                {
                    if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                    {
                        goto Finish;
                    }
                }
                else
                {
                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                    {
                        goto Finish;
                    }
                }

                // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
                Sse2.StoreScalar((ulong *)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
            }

            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
            // point, then use that as the base offset going forward.

            currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");

            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");

            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;

            do
            {
                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.

                utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements);                                                      // unaligned load
                Vector128 <short> utf16VectorSecond = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
                Vector128 <short> combinedVector    = Sse2.Or(utf16VectorFirst, utf16VectorSecond);

                // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
                if (Sse41.IsSupported)
                {
                    if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
                    {
                        goto FoundNonAsciiDataInLoop;
                    }
                }
                else
                {
                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                    {
                        goto FoundNonAsciiDataInLoop;
                    }
                }

                // Build up the UTF-8 vector and perform the store.

                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);

                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
                Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned

                currentOffsetInElements += SizeOfVector128;
            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);

Finish:

            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
            return(currentOffsetInElements);

FoundNonAsciiDataInLoop:

            // Can we at least narrow the high vector?
            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                {
                    goto Finish; // found non-ASCII data
                }
            }
            else
            {
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    goto Finish; // found non-ASCII data
                }
            }

            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
            asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");

            Sse2.StoreScalar((ulong *)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
            currentOffsetInElements += SizeOfVector128 / 2;

            goto Finish;
        }