C# (CSharp) Sse41.TestZ Examples

Programming Language: C# (CSharp)

Class/Type: Sse41

Method/Function: TestZ

Examples at hotexamples.com: 15

C# (CSharp) Sse41.TestZ - 15 examples found. These are the top-rated real-world C# (CSharp) examples of Sse41.TestZ extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Min(30)

ConvertToVector128Int32(30)

Max(30)

MultiplyLow(30)

Extract(30)

DotProduct(30)

BlendVariable(30)

RoundCurrentDirectionScalar(28)

Insert(28)

RoundToZeroScalar(28)

TestAllOnes(27)

CompareEqual(26)

RoundToNearestIntegerScalar(25)

TestMixOnesZeros(25)

RoundToPositiveInfinityScalar(25)

Floor(24)

Ceiling(24)

RoundToZero(23)

RoundToNegativeInfinity(23)

RoundCurrentDirection(22)

RoundToPositiveInfinity(21)

TestC(20)

PackUnsignedSaturate(20)

RoundToNearestInteger(18)

TestZ(15)

FloorScalar(14)

Blend(12)

TestNotZAndNotC(12)

ConvertToVector128Int16(12)

CeilingScalar(12)

RoundToNegativeInfinityScalar(12)

MinHorizontal(9)

MultipleSumAbsoluteDifferences(8)

ConvertToVector128Int64(7)

LoadVector128(5)

Multiply(3)

LoadAlignedVector128NonTemporal(3)

TestAllZeros(3)

Add(2)

Store(1)

HorizontalAdd(1)

ConvertToVector128Single(1)

Example #1

0

Show file

File: TestZ.SByte.cs Project: zwei222/coreclr

        // Basic "load" scenario: both operands are read from the data-table buffers with
        // Sse2.LoadVector128, passed to Sse41.TestZ, and the outcome is validated against
        // the same two input buffers.
        public void RunBasicScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

            var left = Sse2.LoadVector128((SByte*)(_dataTable.inArray1Ptr));
            var right = Sse2.LoadVector128((SByte*)(_dataTable.inArray2Ptr));
            var testZResult = Sse41.TestZ(left, right);

            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, testZResult);
        }

Example #2

0

Show file

File: TestZ.SByte.cs Project: zwei222/coreclr

        // Basic "unsafe read" scenario: both operands are materialized from the data-table
        // buffers with Unsafe.Read, passed to Sse41.TestZ, and the outcome is validated
        // against the same two input buffers.
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            var left = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray1Ptr);
            var right = Unsafe.Read<Vector128<SByte>>(_dataTable.inArray2Ptr);
            var testZResult = Sse41.TestZ(left, right);

            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, testZResult);
        }

Example #3

0

Show file

File: Sse2Functions.cs Project: TechnologicalPizza/SharpFastNoise2

 // Reports whether the mask m is "set" anywhere. With SSE4.1 this is a PTEST of m
 // against itself (true when any bit is set); otherwise it falls back to checking
 // the MoveMask of m reinterpreted as single-precision lanes.
 public static bool AnyMask_bool(m32 m)
 {
     return Sse41.IsSupported
         ? !Sse41.TestZ(m, m)
         : Sse.MoveMask(m.AsSingle()) != 0;
 }

Example #4

0

Show file

        // Class-variable scenario: feeds _clsVar1/_clsVar2 directly to Sse41.TestZ and
        // validates the outcome against those same two values.
        public void RunClsVarScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

            var testZResult = Sse41.TestZ(_clsVar1, _clsVar2);

            ValidateResult(_clsVar1, _clsVar2, testZResult);
        }

Example #5

0

Show file

File: TestZ.Int64.cs Project: zwei222/coreclr

        // Local-struct "load" scenario: creates a TestStruct local, loads its _fld1/_fld2
        // through Sse2.LoadVector128 by address, runs Sse41.TestZ on the loaded vectors and
        // validates the outcome against the struct's fields.
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var test = TestStruct.Create();

            var left = Sse2.LoadVector128((Int64*)(&test._fld1));
            var right = Sse2.LoadVector128((Int64*)(&test._fld2));
            var testZResult = Sse41.TestZ(left, right);

            ValidateResult(test._fld1, test._fld2, testZResult);
        }

Example #6

0

Show file

File: TestZ.Int64.cs Project: zwei222/coreclr

            // Instance-field "load" scenario: pins _fld1/_fld2, loads both through
            // Sse2.LoadVector128, runs Sse41.TestZ and asks testClass to validate the
            // outcome against the two fields.
            public void RunStructFldScenario_Load(BooleanBinaryOpTest__TestZInt64 testClass)
            {
                fixed (Vector128<Int64>* pLeft = &_fld1, pRight = &_fld2)
                {
                    var left = Sse2.LoadVector128((Int64*)(pLeft));
                    var right = Sse2.LoadVector128((Int64*)(pRight));
                    var testZResult = Sse41.TestZ(left, right);

                    testClass.ValidateResult(_fld1, _fld2, testZResult);
                }
            }

Example #7

0

Show file

File: TestZ.Int64.cs Project: zwei222/coreclr

        // Class-field "load" scenario: pins the instance fields _fld1/_fld2, loads both
        // through Sse2.LoadVector128, runs Sse41.TestZ and validates the outcome against
        // the two fields.
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed (Vector128<Int64>* pLeft = &_fld1, pRight = &_fld2)
            {
                var left = Sse2.LoadVector128((Int64*)(pLeft));
                var right = Sse2.LoadVector128((Int64*)(pRight));
                var testZResult = Sse41.TestZ(left, right);

                ValidateResult(_fld1, _fld2, testZResult);
            }
        }

Example #8

0

Show file

File: TestZ.Int64.cs Project: zwei222/coreclr

        // Class-variable "load" scenario: pins _clsVar1/_clsVar2, loads both through
        // Sse2.LoadVector128, runs Sse41.TestZ and validates the outcome against the
        // two variables.
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            fixed (Vector128<Int64>* pLeft = &_clsVar1, pRight = &_clsVar2)
            {
                var left = Sse2.LoadVector128((Int64*)(pLeft));
                var right = Sse2.LoadVector128((Int64*)(pRight));
                var testZResult = Sse41.TestZ(left, right);

                ValidateResult(_clsVar1, _clsVar2, testZResult);
            }
        }

Example #9

0

Show file

File: TestZ.Int64.cs Project: zwei222/coreclr

        // Local-class "load" scenario: constructs a fresh BooleanBinaryOpTest__TestZInt64,
        // pins its _fld1/_fld2, loads both through Sse2.LoadVector128, runs Sse41.TestZ
        // and validates the outcome against that instance's fields.
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var test = new BooleanBinaryOpTest__TestZInt64();

            fixed (Vector128<Int64>* pLeft = &test._fld1, pRight = &test._fld2)
            {
                var left = Sse2.LoadVector128((Int64*)(pLeft));
                var right = Sse2.LoadVector128((Int64*)(pRight));
                var testZResult = Sse41.TestZ(left, right);

                ValidateResult(test._fld1, test._fld2, testZResult);
            }
        }

Example #10

0

Show file

File: TestZ.Int64.cs Project: zwei222/coreclr

            // Instance-field scenario: runs Sse41.TestZ directly on _fld1/_fld2 and asks
            // testClass to validate the outcome against the two fields.
            public void RunStructFldScenario(BooleanBinaryOpTest__TestZInt64 testClass)
            {
                var testZResult = Sse41.TestZ(
                    _fld1,
                    _fld2
                    );

                testClass.ValidateResult(_fld1, _fld2, testZResult);
            }

Example #11

0

Show file

        // Field scenario: runs Sse41.TestZ directly on _fld1/_fld2 and validates the
        // outcome against the two fields.
        public void RunFldScenario()
        {
            var testZResult = Sse41.TestZ(
                _fld1,
                _fld2
                );

            ValidateResult(_fld1, _fld2, testZResult);
        }

Example #12

0

Show file

        /// <summary>
        /// Scans <paramref name="pBuffer"/> for the first char whose value is outside the ASCII
        /// range and returns how many chars were consumed before it (the distance, in chars,
        /// between the final read position and the original buffer start).
        /// </summary>
        /// <param name="pBuffer">Pointer to the first char to inspect.</param>
        /// <param name="bufferLength">Number of chars (not bytes) available at <paramref name="pBuffer"/>.</param>
        /// <returns>
        /// 0 for an empty buffer; otherwise the number of chars the read pointer advanced past
        /// the start of the buffer before stopping.
        /// </returns>
        /// <remarks>
        /// The caller is expected to have verified <c>Sse2.IsSupported</c> (asserted below).
        /// Inputs shorter than one vector are handled by the QWORD / DWORD / WORD drain code at
        /// the end of the method. Note that <paramref name="bufferLength"/> is reinterpreted as a
        /// byte count part-way through the vectorized path.
        /// </remarks>
        private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char *pBuffer, nuint bufferLength /* in chars */)
        {
            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
            // will be elided by JIT once we determine which specific ISAs we support.

            // Quick check for empty inputs.

            if (bufferLength == 0)
            {
                return(0);
            }

            // JIT turns the below into constants

            uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf <Vector128 <byte> >();
            uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);

            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");

            Vector128 <short> firstVector, secondVector;
            uint  currentMask;
            char *pOriginalBuffer = pBuffer; // remembered so the final result can be computed as a distance

            if (bufferLength < SizeOfVector128InChars)
            {
                goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
            }

            // This method is written such that control generally flows top-to-bottom, avoiding
            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
            // data, we jump out of the hot paths to targets at the end of the method.

            Vector128 <short>  asciiMaskForPTEST   = Vector128.Create(unchecked ((short)0xFF80)); // used for PTEST on supported hardware
            Vector128 <ushort> asciiMaskForPMINUW  = Vector128.Create((ushort)0x0080);            // used for PMINUW on supported hardware
            Vector128 <short>  asciiMaskForPXOR    = Vector128.Create(unchecked ((short)0x8000)); // used for PXOR
            Vector128 <short>  asciiMaskForPCMPGTW = Vector128.Create(unchecked ((short)0x807F)); // used for PCMPGTW

            Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));

            // Read the first vector unaligned.

            firstVector = Sse2.LoadVector128((short *)pBuffer); // unaligned load

            if (Sse41.IsSupported)
            {
                // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
                // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
                // in order to extract the mask.
                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
            }
            else
            {
                // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
                // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
                // the mask.
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
            }

            if (currentMask != 0)
            {
                goto FoundNonAsciiDataInCurrentMask;
            }

            // If we have less than 32 bytes to process, just go straight to the final unaligned
            // read. There's no need to mess with the loop logic in the middle of this method.

            // Adjust the remaining length to account for what we just read.
            // For the remainder of this code path, bufferLength will be in bytes, not chars.

            bufferLength <<= 1; // chars to bytes

            if (bufferLength < 2 * SizeOfVector128InBytes)
            {
                goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
            }

            // Now adjust the read pointer so that future reads are aligned.
            // (Rounds pBuffer up to the next vector-size boundary strictly after its current value.)

            pBuffer = (char *)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));

#if DEBUG
            long numCharsRead = pBuffer - pOriginalBuffer;
            Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
            Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

            // Adjust remaining buffer length.
            // (Net effect: bufferLength -= number of bytes between pOriginalBuffer and the aligned pBuffer.)

            bufferLength += (nuint)pOriginalBuffer;
            bufferLength -= (nuint)pBuffer;

            // The buffer is now properly aligned.
            // Read 2 vectors at a time if possible.

            if (bufferLength >= 2 * SizeOfVector128InBytes)
            {
                // Last address at which a full two-vector read still fits inside the buffer.
                char *pFinalVectorReadPos = (char *)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);

                // After this point, we no longer need to update the bufferLength value.

                do
                {
                    firstVector  = Sse2.LoadAlignedVector128((short *)pBuffer);
                    secondVector = Sse2.LoadAlignedVector128((short *)pBuffer + SizeOfVector128InChars);
                    Vector128 <short> combinedVector = Sse2.Or(firstVector, secondVector);

                    if (Sse41.IsSupported)
                    {
                        // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
                        // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
                        if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
                        {
                            goto FoundNonAsciiDataInFirstOrSecondVector;
                        }
                    }
                    else
                    {
                        // See comment earlier in the method for an explanation of how the below logic works.
                        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                        {
                            goto FoundNonAsciiDataInFirstOrSecondVector;
                        }
                    }

                    pBuffer += 2 * SizeOfVector128InChars;
                } while (pBuffer <= pFinalVectorReadPos);
            }

            // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
            // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
            // But we _can_ rely on it to tell us how much remaining data must be drained by looking
            // at what bits of it are set. This works because had we updated it within the loop above,
            // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
            // bits which are less significant than those that the addition would've acted on.

            // If there is fewer than one vector length remaining, skip the next aligned read.
            // Remember, at this point bufferLength is measured in bytes, not chars.

            if ((bufferLength & SizeOfVector128InBytes) == 0)
            {
                goto DoFinalUnalignedVectorRead;
            }

            // At least one full vector's worth of data remains, so we can safely read it.
            // Remember, at this point pBuffer is still aligned.

            firstVector = Sse2.LoadAlignedVector128((short *)pBuffer);

            if (Sse41.IsSupported)
            {
                // If a non-ASCII bit is set in any WORD of this vector, we have seen non-ASCII data.
                // Jump to the non-ASCII handler, which computes the mask for this vector.
                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
                {
                    goto FoundNonAsciiDataInFirstVector;
                }
            }
            else
            {
                // See comment earlier in the method for an explanation of how the below logic works.
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
                if (currentMask != 0)
                {
                    goto FoundNonAsciiDataInCurrentMask;
                }
            }

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:

            // Step past the vector that was just verified to be all-ASCII.
            pBuffer += SizeOfVector128InChars;

DoFinalUnalignedVectorRead:

            // Only the low bits of bufferLength (remaining bytes modulo the vector size) matter here.
            if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
            {
                // Perform an unaligned read of the last vector.
                // We need to adjust the pointer because we're re-reading data.

                pBuffer     = (char *)((byte *)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
                firstVector = Sse2.LoadVector128((short *)pBuffer); // unaligned load

                if (Sse41.IsSupported)
                {
                    // If a non-ASCII bit is set in any WORD of this vector, we have seen non-ASCII data.
                    // Jump to the non-ASCII handler, which computes the mask for this vector.
                    if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
                    {
                        goto FoundNonAsciiDataInFirstVector;
                    }
                }
                else
                {
                    // See comment earlier in the method for an explanation of how the below logic works.
                    currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
                    if (currentMask != 0)
                    {
                        goto FoundNonAsciiDataInCurrentMask;
                    }
                }

                pBuffer += SizeOfVector128InChars;
            }

Finish:

            // Common exit: the result is the distance (in chars) between the current and original pointers.
            Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
            return(((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char)); // and we're done! (remember to adjust for char count)

FoundNonAsciiDataInFirstOrSecondVector:

            // We don't know if the first or the second vector contains non-ASCII data. Check the first
            // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
            // we'll make sure the first vector local is the one that contains the non-ASCII data.

            // See comment earlier in the method for an explanation of how the below logic works.
            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
                {
                    goto FoundNonAsciiDataInFirstVector;
                }
            }
            else
            {
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
                if (currentMask != 0)
                {
                    goto FoundNonAsciiDataInCurrentMask;
                }
            }

            // Wasn't the first vector; must be the second.

            pBuffer    += SizeOfVector128InChars;
            firstVector = secondVector;

FoundNonAsciiDataInFirstVector:

            // See comment earlier in the method for an explanation of how the below logic works.
            if (Sse41.IsSupported)
            {
                currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
            }
            else
            {
                currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
            }

FoundNonAsciiDataInCurrentMask:

            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
            // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
            // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
            // masks work on BYTE elements, and we account for this in the final fixup.)

            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
            pBuffer = (char *)((byte *)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));

            goto Finish;

FoundNonAsciiDataInCurrentDWord:

            // currentDWord is declared here but is only ever assigned by the drain code further
            // down, which jumps back up to this label once it has stored a non-ASCII DWORD in it.
            uint currentDWord;
            Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");

            if (FirstCharInUInt32IsAscii(currentDWord))
            {
                pBuffer++; // skip past the ASCII char
            }

            goto Finish;

InputBufferLessThanOneVectorInLength:

            // These code paths get hit if the original input length was less than one vector in size.
            // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
            // directly. Note that all of these reads are unaligned.

            // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
            // We skipped the code path that multiplied the count by sizeof(char).

            Debug.Assert(bufferLength < SizeOfVector128InChars);

            // QWORD drain

            if ((bufferLength & 4) != 0)
            {
                if (Bmi1.X64.IsSupported)
                {
                    // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.

                    ulong candidateUInt64 = Unsafe.ReadUnaligned <ulong>(pBuffer);
                    if (!AllCharsInUInt64AreAscii(candidateUInt64))
                    {
                        // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
                        // Remember the / 8 at the end to convert bit count to byte count,
                        // then the & ~1 at the end to treat a match in the high byte of
                        // any char the same as a match in the low byte of that same char.

                        candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
                        pBuffer          = (char *)((byte *)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
                        goto Finish;
                    }
                }
                else
                {
                    // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.

                    currentDWord = Unsafe.ReadUnaligned <uint>(pBuffer);
                    uint nextDWord = Unsafe.ReadUnaligned <uint>(pBuffer + 4 / sizeof(char));

                    if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
                    {
                        // At least one of the values wasn't all-ASCII.
                        // We need to figure out which one it was and stick it in the currentDWord local.

                        if (AllCharsInUInt32AreAscii(currentDWord))
                        {
                            currentDWord = nextDWord; // this one is the culprit
                            pBuffer     += 4 / sizeof(char);
                        }

                        goto FoundNonAsciiDataInCurrentDWord;
                    }
                }

                pBuffer += 4; // successfully consumed 4 ASCII chars
            }

            // DWORD drain

            if ((bufferLength & 2) != 0)
            {
                currentDWord = Unsafe.ReadUnaligned <uint>(pBuffer);

                if (!AllCharsInUInt32AreAscii(currentDWord))
                {
                    goto FoundNonAsciiDataInCurrentDWord;
                }

                pBuffer += 2; // successfully consumed 2 ASCII chars
            }

            // WORD drain
            // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.

            if ((bufferLength & 1) != 0)
            {
                if (*pBuffer <= 0x007F)
                {
                    pBuffer++; // successfully consumed a single char
                }
            }

            goto Finish;
        }

Example #13

0

Show file

File: SSE41.cs Project: badamczewski/SimpleIntrinsics

 // Thin wrapper that exposes Sse41.TestZ under the native intrinsic name
 // _mm_testz_si128; the result of TestZ is returned unchanged.
 public static bool _mm_testz_si128(Vector128<sbyte> left, Vector128<sbyte> right)
     => Sse41.TestZ(left, right);

Example #14

0

Show file

    // Copies source into destination, writing every 0x00 byte as the two-byte sequence
    // 0x00 0x01 and every other byte unchanged. Returns the number of bytes written.
    //
    // ThrowException() is invoked up front when destination is shorter than twice the
    // source length. Whole 16-byte chunks that contain no zero byte are copied with
    // vector loads/stores; the first chunk containing a zero byte, and everything after
    // it, is handled by the byte-at-a-time loop.
    private static unsafe int EncodeBinarySse(
        ReadOnlySpan<byte> source, Span<byte> destination)
    {
        uint inputLength = (uint)source.Length;

        if ((uint)destination.Length < inputLength * 2)
        {
            ThrowException();
        }

        fixed (byte* src = source)
        fixed (byte* dest = destination)
        {
            byte* input = src;
            byte* output = dest;

            byte* inputEnd = src + inputLength;
            byte* vectorEnd = inputEnd - (inputLength % 16);

            // Vector fast path: 16 bytes per iteration for as long as no chunk holds a zero byte.
            for (; input < vectorEnd; input += 16, output += 16)
            {
                // Unaligned 16-byte load.
                Vector128<byte> chunk = Sse2.LoadVector128(input);

                // Each lane becomes 0xFF where the input byte was 0x00, and 0x00 elsewhere.
                Vector128<byte> zeroLanes = Sse2.CompareEqual(chunk, Vector128<byte>.Zero);

                // Any set bit means this chunk contains a zero byte; leave it to the scalar loop.
                if (!Sse41.TestZ(zeroLanes, zeroLanes))
                {
                    break;
                }

                Sse2.Store(output, chunk);
            }

            // Scalar path: the byte itself is always emitted; a zero byte is followed by 0x01.
            for (; input < inputEnd; input++)
            {
                byte value = *input;
                *output++ = value;

                if (value == 0)
                {
                    *output++ = 1;
                }
            }

            return (int)(output - dest);
        }
    }

Example #15

0

Show file

        /// <summary>
        /// Narrows UTF-16 chars read from <paramref name="pUtf16Buffer"/> into single bytes written to
        /// <paramref name="pAsciiBuffer"/>, stopping as soon as a loaded vector is found to contain
        /// non-ASCII data or fewer than one full vector of elements remains for the main loop.
        /// </summary>
        /// <param name="pUtf16Buffer">Source buffer of 16-bit chars (read with unaligned loads).</param>
        /// <param name="pAsciiBuffer">Destination buffer of bytes.</param>
        /// <param name="elementCount">
        /// Number of elements to process; asserted below to be at least twice the vector size in bytes.
        /// </param>
        /// <returns>
        /// The value of the running element offset when the method stops (0 if the first 8 chars
        /// already contain non-ASCII data). Any remaining data is left for the caller to drain.
        /// </returns>
        private static unsafe nuint NarrowUtf16ToAscii_Sse2(char *pUtf16Buffer, byte *pAsciiBuffer, nuint elementCount)
        {
            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
            // will be elided by JIT once we determine which specific ISAs we support.

            // JIT turns the below into constants

            uint  SizeOfVector128          = (uint)Unsafe.SizeOf <Vector128 <byte> >();
            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

            // This method is written such that control generally flows top-to-bottom, avoiding
            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
            // data, we jump out of the hot paths to targets at the end of the method.

            Debug.Assert(Sse2.IsSupported);
            Debug.Assert(BitConverter.IsLittleEndian);
            Debug.Assert(elementCount >= 2 * SizeOfVector128);

            Vector128 <short> asciiMaskForPTEST   = Vector128.Create(unchecked ((short)0xFF80)); // used for PTEST on supported hardware
            Vector128 <short> asciiMaskForPXOR    = Vector128.Create(unchecked ((short)0x8000)); // used for PXOR
            Vector128 <short> asciiMaskForPCMPGTW = Vector128.Create(unchecked ((short)0x807F)); // used for PCMPGTW

            // First, perform an unaligned read of the first part of the input buffer.

            Vector128 <short> utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer); // unaligned load

            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.

            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                {
                    return(0);
                }
            }
            else
            {
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    return(0);
                }
            }

            // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.

            Vector128 <byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

            Sse2.StoreScalar((ulong *)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED

            nuint currentOffsetInElements = SizeOfVector128 / 2;             // we processed 8 elements so far

            // We're going to get the best performance when we have aligned writes, so we'll take the
            // hit of potentially unaligned reads in order to hit this sweet spot.

            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
            // that case we can immediately back up to the previous aligned boundary and start the main loop.
            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
            // just past the next aligned boundary address.

            // Both operands are unsigned, so "0u >= x" is true exactly when x == 0, i.e. when the
            // (SizeOfVector128 / 2) bit of the destination address is clear.
            if (0u >= ((uint)pAsciiBuffer & (SizeOfVector128 / 2)))
            {
                // We need to perform one more partial vector write before we can get the alignment we want.

                utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements); // unaligned load

                // See comments earlier in this method for information about how this works.
                if (Sse41.IsSupported)
                {
                    if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                    {
                        goto Finish;
                    }
                }
                else
                {
                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                    {
                        goto Finish;
                    }
                }

                // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
                Sse2.StoreScalar((ulong *)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
            }

            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
            // point, then use that as the base offset going forward.

            currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");

            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");

            // Largest offset at which one more full loop iteration (SizeOfVector128 elements) still fits.
            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;

            do
            {
                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.

                utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements);                                                      // unaligned load
                Vector128 <short> utf16VectorSecond = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
                Vector128 <short> combinedVector    = Sse2.Or(utf16VectorFirst, utf16VectorSecond);

                // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
                if (Sse41.IsSupported)
                {
                    if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
                    {
                        goto FoundNonAsciiDataInLoop;
                    }
                }
                else
                {
                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                    {
                        goto FoundNonAsciiDataInLoop;
                    }
                }

                // Build up the ASCII vector and perform the store.

                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);

                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
                Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned

                currentOffsetInElements += SizeOfVector128;
            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);

Finish:

            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
            return(currentOffsetInElements);

FoundNonAsciiDataInLoop:

            // Can we at least narrow the high vector?
            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                {
                    goto Finish; // found non-ASCII data
                }
            }
            else
            {
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    goto Finish; // found non-ASCII data
                }
            }

            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
            asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");

            Sse2.StoreScalar((ulong *)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
            currentOffsetInElements += SizeOfVector128 / 2;

            goto Finish;
        }