// Local-variable scenario: the operand is loaded from _data through an unaligned read into a
// local, passed to Bmi1.TrailingZeroCount, and the operand/result pair is handed to ValidateResult.
public void RunLclVarScenario_UnsafeRead()
        {
            // View the storage of _data as raw bytes so it can be re-read without alignment assumptions.
            ref byte rawBytes = ref Unsafe.As<UInt64, byte>(ref _data);
            UInt64 operand = Unsafe.ReadUnaligned<UInt64>(ref rawBytes);

            ValidateResult(operand, Bmi1.TrailingZeroCount(operand));
        }
        // Class local-field scenario: the operand is the _fld field of a freshly constructed
        // test-class instance held in a local.
        public void RunClassLclFldScenario()
        {
            var instance = new ScalarUnaryOpTest__TrailingZeroCountUInt64();
            var tzcnt = Bmi1.TrailingZeroCount(instance._fld);

            ValidateResult(instance._fld, tzcnt);
        }
        // Struct local-field scenario: the operand is the _fld field of a TestStruct value
        // obtained from TestStruct.Create() and held in a local.
        public void RunStructLclFldScenario()
        {
            var localStruct = TestStruct.Create();
            var tzcnt = Bmi1.TrailingZeroCount(localStruct._fld);

            ValidateResult(localStruct._fld, tzcnt);
        }
        // Class-field scenario: announces itself to the test framework, then runs the intrinsic
        // directly on this object's _fld field and validates the outcome.
        public void RunClassFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

            var tzcnt = Bmi1.TrailingZeroCount(_fld);
            ValidateResult(_fld, tzcnt);
        }
        // Basic scenario: the operand is produced by an unaligned UInt32 read of _data and fed
        // straight into the intrinsic call; _data itself is passed to ValidateResult as the input.
        public void RunBasicScenario_UnsafeRead()
        {
            // View the storage of _data as raw bytes for the unaligned read.
            ref byte rawBytes = ref Unsafe.As<UInt32, byte>(ref _data);
            var tzcnt = Bmi1.TrailingZeroCount(Unsafe.ReadUnaligned<UInt32>(ref rawBytes));

            ValidateResult(_data, tzcnt);
        }
        // Class-variable scenario: the operand is read directly from _clsVar.
        public void RunClsVarScenario()
        {
            var tzcnt = Bmi1.TrailingZeroCount(_clsVar);

            ValidateResult(_clsVar, tzcnt);
        }
        // Local-variable scenario: announces itself to the test framework, loads the operand from
        // _data through an unaligned read into a local, and validates the intrinsic's result for it.
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            // View the storage of _data as raw bytes so it can be re-read without alignment assumptions.
            ref byte rawBytes = ref Unsafe.As<UInt64, byte>(ref _data);
            UInt64 operand = Unsafe.ReadUnaligned<UInt64>(ref rawBytes);

            ValidateResult(operand, Bmi1.TrailingZeroCount(operand));
        }
        // Class local-field scenario: announces itself to the test framework, then uses the _fld
        // field of a freshly constructed test-class instance as the operand.
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            var instance = new ScalarUnaryOpTest__TrailingZeroCountUInt64();
            var tzcnt = Bmi1.TrailingZeroCount(instance._fld);
            ValidateResult(instance._fld, tzcnt);
        }
        // Struct local-field scenario: announces itself to the test framework, then uses the _fld
        // field of a TestStruct value returned by TestStruct.Create() as the operand.
        public void RunStructLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

            var localStruct = TestStruct.Create();
            var tzcnt = Bmi1.TrailingZeroCount(localStruct._fld);
            ValidateResult(localStruct._fld, tzcnt);
        }
Beispiel #10
0
        // Calls Bmi1.TrailingZeroCount on uintValue N times, summing the results, and prints the
        // sum to the console.
        public unsafe void NativeTrailingZeroCountX86()
        {
            uint total = 0;
            int iteration = 0;

            while (iteration < N)
            {
                total += Bmi1.TrailingZeroCount(uintValue);
                iteration++;
            }

            Console.WriteLine($"NativeTrailingZeroCountX86:{total}");
        }
Beispiel #11
0
 /// <summary>
 /// Counts the trailing zero bits of <paramref name="matches"/>, treated as a 32-bit pattern.
 /// </summary>
 /// <param name="matches">The bit pattern to inspect.</param>
 /// <returns>
 /// The number of consecutive zero bits starting at the least significant bit, or 32 when
 /// <paramref name="matches"/> is 0. The hardware and software paths agree on that value.
 /// </returns>
 public static int TrailingZeroCount(int matches)
 {
     if (Bmi1.IsSupported)
     {
         // TZCNT reports the operand size (32) for a zero operand.
         return (int)Bmi1.TrailingZeroCount((uint)matches);
     }

     // Software fallback.
     // For 0 the multiply-and-shift below yields index 0, the same table slot used by every input
     // whose lowest set bit is bit 0, so 0 would be indistinguishable from an odd value.
     // Handle it explicitly so the result does not depend on whether BMI1 is available.
     if (matches == 0)
     {
         return 32;
     }

     // https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup
     // (matches & -matches) isolates the lowest set bit; multiplying by the De Bruijn constant and
     // keeping the top 5 bits gives a table index.
     // A uint shifted right by 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check
     return Unsafe.AddByteOffset(
         ref MemoryMarshal.GetReference(TrailingCountMultiplyDeBruijn),
         ((uint)((matches & -matches) * 0x077CB531U)) >> 27);
 }
Beispiel #12
0
        /// <summary>
        /// Counts the trailing zero bits of <paramref name="value"/>.
        /// </summary>
        /// <param name="value">The value to inspect.</param>
        /// <returns>The trailing zero count; 32 when <paramref name="value"/> is 0.</returns>
        public static int TrailingZeroCount(uint value)
        {
            if (Bmi1.IsSupported)
            {
                // The TZCNT contract maps 0 to 32.
                return (int)Bmi1.TrailingZeroCount(value);
            }

            // The table lookup below would map 0 to slot 0, so 0 is answered up front to agree
            // with the intrinsic path (0 -> 32).
            if (value == 0u)
            {
                return 32;
            }

            // De Bruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u
            // (value & -value) isolates the lowest set bit; the product's top 5 bits select the table entry.
            uint tableOffset = ((uint)((value & -value) * 0x077CB531u)) >> 27;

            // A uint shifted right by 27 is always in range [0 - 31], so Unsafe.AddByteOffset is used to avoid a bounds check.
            return Unsafe.AddByteOffset(
                ref MemoryMarshal.GetReference(s_TrailingZeroCountDeBruijn),
                tableOffset);
        }
Beispiel #13
0
        /// <summary>
        /// Counts the trailing zero bits of <paramref name="value"/>.
        /// </summary>
        /// <param name="value">The value to inspect.</param>
        /// <returns>The trailing zero count; 32 when <paramref name="value"/> is 0.</returns>
        public static int TrailingZeroCount(uint value)
        {
            if (Bmi1.IsSupported)
            {
                // TZCNT contract is 0->32
                return (int)Bmi1.TrailingZeroCount(value);
            }

            // Without this guard the lookup below would treat 0 like slot 0; answer 32 to match TZCNT.
            if (value == 0)
            {
                return 32;
            }

            // Isolate the lowest set bit, then hash it with the De Bruijn sequence
            // (k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u).
            uint lowestSetBit = value & (uint)-(int)value;
            uint hashed = lowestSetBit * 0x077CB531u;

            // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here;
            // going through int first mitigates a redundant conv.u8.
            IntPtr tableOffset = (IntPtr)(int)(hashed >> 27);

            // A uint shifted right by 27 is always in range [0 - 31], so Unsafe.AddByteOffset is used to avoid a bounds check.
            return Unsafe.AddByteOffset(
                ref MemoryMarshal.GetReference(s_TrailingZeroCountDeBruijn),
                tableOffset);
        }
Beispiel #14
0
        /// <summary>
        /// Counts the trailing zero bits of <paramref name="value"/>.
        /// </summary>
        /// <param name="value">The value to inspect.</param>
        /// <returns>The trailing zero count; 32 when <paramref name="value"/> is 0.</returns>
        public static int TrailingZeroCount(uint value)
        {
            if (Bmi1.IsSupported)
            {
                // Note that TZCNT contract specifies 0->32
                return (int)Bmi1.TrailingZeroCount(value);
            }

            // The software lookup would map 0 to slot 0, so special-case it to match the intrinsic path (0 -> 32).
            if (value == 0)
            {
                return 32;
            }

            // Unary minus on a uint widens to long, so the isolated lowest set bit is a long here.
            long lowestSetBit = value & -value;

            // De Bruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u
            // Narrow back to uint before shifting: a shift over long is expensive on 32-bit.
            uint hashed = (uint)(lowestSetBit * 0x077CB531u);

            // long -> IntPtr cast on 32-bit platforms is expensive (it does overflow checks not needed here),
            // so the offset goes through int first.
            IntPtr tableOffset = (IntPtr)(int)(hashed >> 27);

            // A uint shifted right by 27 is always in range [0 - 31], so Unsafe.AddByteOffset is used to avoid a bounds check.
            return Unsafe.AddByteOffset(
                ref MemoryMarshal.GetReference(s_TrailingZeroCountDeBruijn),
                tableOffset);
        }
Beispiel #15
0
        /// <summary>
        /// Repeatedly applies <c>Bmi1.TrailingZeroCount</c> in batches of <c>LENGTH</c> calls until
        /// cancellation is requested.
        /// </summary>
        /// <param name="cancellationToken">Token polled between batches to stop the run.</param>
        /// <returns>The number of completed batches, or 0 when BMI1 is not supported.</returns>
        public override ulong Run(CancellationToken cancellationToken)
        {
            if (!Bmi1.IsSupported)
            {
                return 0uL;
            }

            var completedBatches = 0uL;
            var current = randomInt;

            for (;;)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                // Each call feeds on the previous result, chaining the calls together.
                var step = 0;
                while (step < LENGTH)
                {
                    current = Bmi1.TrailingZeroCount(current);
                    step++;
                }

                completedBatches++;
            }

            // Adding and subtracting the final value leaves the count unchanged while still
            // consuming the chained result.
            return completedBatches + current - current;
        }
        // Returns &inputBuffer[inputLength] if the input buffer is valid.
        /// <summary>
        /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
        /// </summary>
        /// <remarks>
        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
        /// </remarks>
        public static byte *GetPointerToFirstInvalidByte(byte *pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

            // First, try to drain off as many ASCII bytes as we can from the beginning.

            {
                nuint numAsciiBytesCounted = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength);
                pInputBuffer += numAsciiBytesCounted;

                // Quick check - did we just end up consuming the entire input buffer?
                // If so, short-circuit the remainder of the method.

                inputLength -= (int)numAsciiBytesCounted;
                if (inputLength == 0)
                {
                    utf16CodeUnitCountAdjustment = 0;
                    scalarCountAdjustment        = 0;
                    return(pInputBuffer);
                }
            }

#if DEBUG
            // Keep these around for final validation at the end of the method.
            byte *pOriginalInputBuffer = pInputBuffer;
            int   originalInputLength  = inputLength;
#endif

            // Enregistered locals that we'll eventually out to our caller.

            int tempUtf16CodeUnitCountAdjustment = 0;
            int tempScalarCountAdjustment        = 0;

            if (inputLength < sizeof(uint))
            {
                goto ProcessInputOfLessThanDWordSize;
            }

            byte *pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint);

            // Begin the main loop.

#if DEBUG
            byte *pLastBufferPosProcessed = null; // used for invariant checking in debug builds
#endif

            while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
            {
                // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.

                uint thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);

AfterReadDWord:

#if DEBUG
                Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
                pLastBufferPosProcessed = pInputBuffer;
#endif

                // First, check for the common case of all-ASCII bytes.

                if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
                {
                    // We read an all-ASCII sequence.

                    pInputBuffer += sizeof(uint);

                    // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
                    // Below is basically unrolled loops with poor man's vectorization.

                    // Below check is "can I read at least five DWORDs from the input stream?"
                    // n.b. Since we incremented pInputBuffer above the below subtraction may result in a negative value,
                    // hence using nint instead of nuint.

                    if ((nint)(void *)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 4 * sizeof(uint))
                    {
                        // We want reads in the inner loop to be aligned. So let's perform a quick
                        // ASCII check of the next 32 bits (4 bytes) now, and if that succeeds bump
                        // the read pointer up to the next aligned address.

                        thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
                        {
                            goto AfterReadDWordSkipAllBytesAsciiCheck;
                        }

                        pInputBuffer = (byte *)((nuint)(pInputBuffer + 4) & ~(nuint)3);

                        // At this point, the input buffer offset points to an aligned DWORD. We also know that there's
                        // enough room to read at least four DWORDs from the buffer. (Heed the comment a few lines above:
                        // the original 'if' check confirmed that there were 5 DWORDs before the alignment check, and
                        // the alignment check consumes at most a single DWORD.)

                        byte *pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
                        uint  mask;

                        do
                        {
                            if (Sse2.IsSupported && Bmi1.IsSupported)
                            {
                                // pInputBuffer is 32-bit aligned but not necessarily 128-bit aligned, so we're
                                // going to perform an unaligned load. We don't necessarily care about aligning
                                // this because we pessimistically assume we'll encounter non-ASCII data at some
                                // point in the not-too-distant future (otherwise we would've stayed entirely
                                // within the all-ASCII vectorized code at the entry to this method).

                                mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte *)pInputBuffer));
                                if (mask != 0)
                                {
                                    goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
                                }
                            }
                            else
                            {
                                if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint *)pInputBuffer)[0] | ((uint *)pInputBuffer)[1]))
                                {
                                    goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair;
                                }

                                if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint *)pInputBuffer)[2] | ((uint *)pInputBuffer)[3]))
                                {
                                    goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair;
                                }
                            }

                            pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs
                        } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop);

                        continue; // need to perform a bounds check because we might be running out of data

Sse2LoopTerminatedEarlyDueToNonAsciiData:

                        Debug.Assert(BitConverter.IsLittleEndian);
                        Debug.Assert(Sse2.IsSupported);
                        Debug.Assert(Bmi1.IsSupported);

                        // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
                        // for each non-ASCII byte we saw. We can count the number of ASCII bytes,
                        // bump our input counter by that amount, and resume processing from the
                        // "the first byte is no longer ASCII" portion of the main loop.

                        Debug.Assert(mask != 0);

                        pInputBuffer += Bmi1.TrailingZeroCount(mask);
                        if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
                        {
                            goto ProcessRemainingBytesSlow;
                        }

                        thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); // no longer guaranteed to be aligned
                        goto BeforeProcessTwoByteSequence;

LoopTerminatedEarlyDueToNonAsciiDataInSecondPair:

                        pInputBuffer += 2 * sizeof(uint); // consumed 2 DWORDs

LoopTerminatedEarlyDueToNonAsciiDataInFirstPair:

                        // We know that there's *at least* two DWORDs of data remaining in the buffer.
                        // We also know that one of them (or both of them) contains non-ASCII data somewhere.
                        // Let's perform a quick check here to bypass the logic at the beginning of the main loop.

                        thisDWord = *(uint *)pInputBuffer; // still aligned here
                        if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
                        {
                            pInputBuffer += sizeof(uint);          // consumed 1 more DWORD
                            thisDWord     = *(uint *)pInputBuffer; // still aligned here
                        }

                        goto AfterReadDWordSkipAllBytesAsciiCheck;
                    }

                    continue; // not enough data remaining to unroll loop - go back to beginning with bounds checks
                }

AfterReadDWordSkipAllBytesAsciiCheck:

                Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier

                // Next, try stripping off ASCII bytes one at a time.
                // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.

                {
                    uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(thisDWord);
                    pInputBuffer += numLeadingAsciiBytes;

                    if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
                    {
                        goto ProcessRemainingBytesSlow; // Input buffer doesn't contain enough data to read a DWORD
                    }
                    else
                    {
                        // The input buffer at the current offset contains a non-ASCII byte.
                        // Read an entire DWORD and fall through to multi-byte consumption logic.
                        thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                    }
                }

BeforeProcessTwoByteSequence:

                // At this point, we suspect we're working with a multi-byte code unit sequence,
                // but we haven't yet validated it for well-formedness.

                // The masks and comparands are derived from the Unicode Standard, Table 3-6.
                // Additionally, we need to check for valid byte sequences per Table 3-7.

                // Check the 2-byte case.

                thisDWord -= (BitConverter.IsLittleEndian) ? 0x0000_80C0u : 0xC080_0000u;
                if ((thisDWord & (BitConverter.IsLittleEndian ? 0x0000_C0E0u : 0xE0C0_0000u)) == 0)
                {
                    // Per Table 3-7, valid sequences are:
                    // [ C2..DF ] [ 80..BF ]
                    //
                    // Due to our modification of 'thisDWord' above, this becomes:
                    // [ 02..1F ] [ 00..3F ]
                    //
                    // We've already checked that the leading byte was originally in the range [ C0..DF ]
                    // and that the trailing byte was originally in the range [ 80..BF ], so now we only need
                    // to check that the modified leading byte is >= [ 02 ].

                    if ((BitConverter.IsLittleEndian && (byte)thisDWord < 0x02u) ||
                        (!BitConverter.IsLittleEndian && thisDWord < 0x0200_0000u))
                    {
                        goto Error; // overlong form - leading byte was [ C0 ] or [ C1 ]
                    }

ProcessTwoByteSequenceSkipOverlongFormCheck:

                    // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
                    // there's a good chance that if we see one two-byte run then there's another two-byte
                    // run immediately after. Let's check that now.

                    // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
                    // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
                    // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.

                    if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord)) ||
                        (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
                    {
                        // We have two runs of two bytes each.
                        pInputBuffer += 4;
                        tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 code units -> 2 UTF-16 code units (and 2 scalars)

                        if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
                        {
                            // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
                            // also two bytes. Check for that first before going back to the beginning of the loop.

                            thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);

                            if (BitConverter.IsLittleEndian)
                            {
                                if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
                                {
                                    // The next sequence is a valid two-byte sequence.
                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
                                }
                            }
                            else
                            {
                                if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
                                {
                                    if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
                                    {
                                        goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
                                    }

                                    goto ProcessTwoByteSequenceSkipOverlongFormCheck;
                                }
                            }

                            // If we reached this point, the next sequence is something other than a valid
                            // two-byte sequence, so go back to the beginning of the loop.
                            goto AfterReadDWord;
                        }
                        else
                        {
                            goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
                        }
                    }

                    // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
                    // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
                    // bytes are ASCII?

                    tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code units (and 1 scalar) [+ trailing]

                    if (UInt32ThirdByteIsAscii(thisDWord))
                    {
                        if (UInt32FourthByteIsAscii(thisDWord))
                        {
                            pInputBuffer += 4;
                        }
                        else
                        {
                            pInputBuffer += 3;

                            // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
                            // Read in the next DWORD and jump directly to the start of the multi-byte processing block.

                            if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
                            {
                                thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                                goto BeforeProcessTwoByteSequence;
                            }
                        }
                    }
                    else
                    {
                        pInputBuffer += 2;
                    }

                    continue;
                }

                // Check the 3-byte case.
                // We need to restore the C0 leading byte we stripped out earlier, then we can strip out the expected E0 byte.

                thisDWord -= (BitConverter.IsLittleEndian) ? (0x0080_00E0u - 0x0000_00C0u) : (0xE000_8000u - 0xC000_0000u);
                if ((thisDWord & (BitConverter.IsLittleEndian ? 0x00C0_C0F0u : 0xF0C0_C000u)) == 0)
                {
ProcessThreeByteSequenceWithCheck:

                    // We assume the caller has confirmed that the bit pattern is representative of a three-byte
                    // sequence, but it may still be overlong or surrogate. We need to check for these possibilities.
                    //
                    // Per Table 3-7, valid sequences are:
                    // [   E0   ] [ A0..BF ] [ 80..BF ]
                    // [ E1..EC ] [ 80..BF ] [ 80..BF ]
                    // [   ED   ] [ 80..9F ] [ 80..BF ]
                    // [ EE..EF ] [ 80..BF ] [ 80..BF ]
                    //
                    // Big-endian examples of using the above validation table:
                    // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
                    // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
                    // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
                    // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
                    // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
                    //
                    // It's ok if the caller has manipulated 'thisDWord' (e.g., by subtracting 0xE0 or 0x80)
                    // as long as they haven't touched the bits we're about to use in our mask checking below.

                    if (BitConverter.IsLittleEndian)
                    {
                        // The "overlong or surrogate" check can be implemented using a single jump, but there's
                        // some overhead to moving the bits into the correct locations in order to perform the
                        // correct comparison, and in practice the processor's branch prediction capability is
                        // good enough that we shouldn't bother. So we'll use two jumps instead.

                        // Can't extract this check into its own helper method because JITter produces suboptimal
                        // assembly, even with aggressive inlining.

                        // Code below becomes 5 instructions: test, jz, lea, test, jz

                        if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
                        {
                            goto Error; // overlong or surrogate
                        }
                    }
                    else
                    {
                        if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
                        {
                            goto Error; // overlong or surrogate
                        }
                    }

ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks:

                    // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
                    // in to the text. If this happens strip it off now before seeing if the next character
                    // consists of three code units.

                    // Branchless: consume a 3-byte UTF-8 sequence and optionally an extra ASCII byte hanging off the end

                    nint asciiAdjustment;
                    if (BitConverter.IsLittleEndian)
                    {
                        asciiAdjustment = (int)thisDWord >> 31; // smear most significant bit across entire value
                    }
                    else
                    {
                        asciiAdjustment = (nint)(sbyte)thisDWord >> 7; // smear most significant bit of least significant byte across entire value
                    }

                    // asciiAdjustment = 0 if fourth byte is ASCII; -1 otherwise

                    // Please *DO NOT* reorder the below two lines. It provides extra defense in depth in case this method
                    // is ever changed such that pInputBuffer becomes a 'ref byte' instead of a simple 'byte*'. It's valid
                    // to add 4 before backing up since we already checked previously that the input buffer contains at
                    // least a DWORD's worth of data, so we're not going to run past the end of the buffer where the GC can
                    // no longer track the reference. However, we can't back up before adding 4, since we might back up to
                    // before the start of the buffer, and the GC isn't guaranteed to be able to track this.

                    pInputBuffer += 4;                     // optimistically, assume consumed a 3-byte UTF-8 sequence plus an extra ASCII byte
                    pInputBuffer += asciiAdjustment;       // back up if we didn't actually consume an ASCII byte

                    tempUtf16CodeUnitCountAdjustment -= 2; // 3 (or 4) UTF-8 bytes -> 1 (or 2) UTF-16 code unit (and 1 [or 2] scalar)

SuccessfullyProcessedThreeByteSequence:

                    if (IntPtr.Size >= 8 && BitConverter.IsLittleEndian)
                    {
                        // x64 little-endian optimization: A three-byte character could indicate CJK text,
                        // which makes it likely that the character following this one is also CJK.
                        // We'll try to process several three-byte sequences at a time.

                        // The check below is really "can we read 9 bytes from the input buffer?" since 'pFinalPos...' is already offset
                        // n.b. The subtraction below could result in a negative value (since we advanced pInputBuffer above), so
                        // use nint instead of nuint.

                        if ((nint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) >= 5)
                        {
                            ulong thisQWord = Unsafe.ReadUnaligned <ulong>(pInputBuffer);

                            // Stage the next 32 bits into 'thisDWord' so that it's ready for us in case we need to jump backward
                            // to a previous location in the loop. This offers defense against reading main memory again (which may
                            // have been modified and could lead to a race condition).

                            thisDWord = (uint)thisQWord;

                            // Is this three 3-byte sequences in a row?
                            // thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ]
                            //               ---- CHAR 3  ----   --------- CHAR 2 ---------   --------- CHAR 1 ---------     -CHAR 3-
                            if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && IsUtf8ContinuationByte(in pInputBuffer[8]))
                            {
                                // Saw a proper bitmask for three incoming 3-byte sequences, perform the
                                // overlong and surrogate sequence checking now.

                                // Check the first character.
                                // If the first character is overlong or a surrogate, fail immediately.

                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                                {
                                    goto Error;
                                }

                                // Check the second character.
                                // At this point, we now know the first three bytes represent a well-formed sequence.
                                // If there's an error beyond here, we'll jump back to the "process three known good bytes"
                                // logic.

                                thisQWord >>= 24;
                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                                {
                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
                                }

                                // Check the third character (we already checked that it's followed by a continuation byte).

                                thisQWord >>= 24;
                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                                {
                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
                                }

                                pInputBuffer += 9;
                                tempUtf16CodeUnitCountAdjustment -= 6; // 9 UTF-8 bytes -> 3 UTF-16 code units (and 3 scalars)

                                goto SuccessfullyProcessedThreeByteSequence;
                            }

                            // Is this two 3-byte sequences in a row?
                            // thisQWord = [ ######## ######## | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ]
                            //                                   --------- CHAR 2 ---------   --------- CHAR 1 ---------
                            if ((thisQWord & 0xC0C0_F0C0_C0F0ul) == 0x8080_E080_80E0ul)
                            {
                                // Saw a proper bitmask for two incoming 3-byte sequences, perform the
                                // overlong and surrogate sequence checking now.

                                // Check the first character.
                                // If the first character is overlong or a surrogate, fail immediately.

                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                                {
                                    goto Error;
                                }

                                // Check the second character.
                                // At this point, we now know the first three bytes represent a well-formed sequence.
                                // If there's an error beyond here, we'll jump back to the "process three known good bytes"
                                // logic.

                                thisQWord >>= 24;
                                if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                                {
                                    goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
                                }

                                pInputBuffer += 6;
                                tempUtf16CodeUnitCountAdjustment -= 4; // 6 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)

                                // The next byte in the sequence didn't have a 3-byte marker, so it's probably
                                // an ASCII character. Jump back to the beginning of loop processing.

                                continue;
                            }

                            if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                            {
                                // A single three-byte sequence.
                                goto ProcessThreeByteSequenceWithCheck;
                            }
                            else
                            {
                                // Not a three-byte sequence; perhaps ASCII?
                                goto AfterReadDWord;
                            }
                        }
                    }

                    if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
                    {
                        thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);

                        // Optimization: A three-byte character could indicate CJK text, which makes it likely
                        // that the character following this one is also CJK. We'll check for a three-byte sequence
                        // marker now and jump directly to three-byte sequence processing if we see one, skipping
                        // all of the logic at the beginning of the loop.

                        if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                        {
                            goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process
                        }
                        else
                        {
                            goto AfterReadDWord; // Probably ASCII punctuation or whitespace; go back to start of loop
                        }
                    }
                    else
                    {
                        goto ProcessRemainingBytesSlow; // Running out of data
                    }
                }

                // Assume the 4-byte case, but we need to validate.

                if (BitConverter.IsLittleEndian)
                {
                    thisDWord &= 0xC0C0_FFFFu;

                    // After the above modifications earlier in this method, we expect 'thisDWord'
                    // to have the structure [ 10000000 00000000 00uuzzzz 00010uuu ]. We'll now
                    // perform two checks to confirm this. The first will verify the
                    // [ 10000000 00000000 00###### ######## ] structure by taking advantage of two's
                    // complement representation to perform a single *signed* integer check.

                    if ((int)thisDWord > unchecked ((int)0x8000_3FFF))
                    {
                        goto Error; // didn't have three trailing bytes
                    }

                    // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
                    // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).

                    thisDWord = BitOperations.RotateRight(thisDWord, 8);

                    // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ].
                    // The check is now a simple add / cmp / jcc combo.

                    if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1080_0010u, 0x1480_000Fu))
                    {
                        goto Error; // overlong or out-of-range
                    }
                }
                else
                {
                    thisDWord -= 0x80u;

                    // After the above modifications earlier in this method, we expect 'thisDWord'
                    // to have the structure [ 00010uuu 00uuzzzz 00yyyyyy 00xxxxxx ]. We'll now
                    // perform two checks to confirm this. The first will verify the
                    // [ ######## 00###### 00###### 00###### ] structure.

                    if ((thisDWord & 0x00C0_C0C0u) != 0)
                    {
                        goto Error; // didn't have three trailing bytes
                    }

                    // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
                    // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
                    // This is a simple range check. (We don't care about the low two bytes.)

                    if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1010_0000u, 0x140F_FFFFu))
                    {
                        goto Error; // overlong or out-of-range
                    }
                }

                // Validation of 4-byte case complete.

                pInputBuffer += 4;
                tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 bytes -> 2 UTF-16 code units
                tempScalarCountAdjustment--;           // 2 UTF-16 code units -> 1 scalar

                continue;                              // go back to beginning of loop for processing
            }

            goto ProcessRemainingBytesSlow;

ProcessInputOfLessThanDWordSize:

            Debug.Assert(inputLength < 4);
            nuint inputBufferRemainingBytes = (uint)inputLength;
            goto ProcessSmallBufferCommon;

ProcessRemainingBytesSlow:

            inputBufferRemainingBytes = (nuint)(void *)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;

ProcessSmallBufferCommon:

            Debug.Assert(inputBufferRemainingBytes < 4);
            while (inputBufferRemainingBytes > 0)
            {
                uint firstByte = pInputBuffer[0];

                if ((byte)firstByte < 0x80u)
                {
                    // 1-byte (ASCII) case
                    pInputBuffer++;
                    inputBufferRemainingBytes--;
                    continue;
                }
                else if (inputBufferRemainingBytes >= 2)
                {
                    uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value
                    if ((byte)firstByte < 0xE0u)
                    {
                        // 2-byte case
                        if ((byte)firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte))
                        {
                            pInputBuffer += 2;
                            tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
                            inputBufferRemainingBytes -= 2;
                            continue;
                        }
                    }
                    else if (inputBufferRemainingBytes >= 3)
                    {
                        if ((byte)firstByte < 0xF0u)
                        {
                            if ((byte)firstByte == 0xE0u)
                            {
                                if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu))
                                {
                                    goto Error; // overlong encoding
                                }
                            }
                            else if ((byte)firstByte == 0xEDu)
                            {
                                if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu))
                                {
                                    goto Error; // would be a UTF-16 surrogate code point
                                }
                            }
                            else
                            {
                                if (!IsLowByteUtf8ContinuationByte(secondByte))
                                {
                                    goto Error; // first trailing byte doesn't have proper continuation marker
                                }
                            }

                            if (IsUtf8ContinuationByte(in pInputBuffer[2]))
                            {
                                pInputBuffer += 3;
                                tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)
                                inputBufferRemainingBytes        -= 3;
                                continue;
                            }
                        }
                    }
                }

                // Error - no match.

                goto Error;
            }

            // If we reached this point, we're out of data, and we saw no bad UTF8 sequence.

#if DEBUG
            // Quick check that for the success case we're going to fulfill our contract of returning &inputBuffer[inputLength].
            Debug.Assert(pOriginalInputBuffer + originalInputLength == pInputBuffer, "About to return an unexpected value.");
#endif

Error:

            // Report back to our caller how far we got before seeing invalid data.
            // (Also used for normal termination when falling out of the loop above.)

            utf16CodeUnitCountAdjustment = tempUtf16CodeUnitCountAdjustment;
            scalarCountAdjustment        = tempScalarCountAdjustment;
            return(pInputBuffer);
        }
        /// <summary>
        /// Finds the index of the first element of <c>values</c> that is greater than or equal
        /// to <c>limitToFind</c>.
        /// </summary>
        /// <remarks>
        /// When both AVX and BMI1 are available the array is scanned in three phases: a scalar
        /// prologue up to the first 32-byte aligned element, an aligned vector loop that tests
        /// <c>Vector256&lt;float&gt;.Count</c> elements per step, and a scalar epilogue for the
        /// leftover elements. On any other hardware a plain scalar loop is used.
        /// </remarks>
        /// <returns>The zero-based index of the first match, or -1 when no element matches.</returns>
        public int IndexOfFirstElementGreaterOrEqualToLimit_Avx()
        {
            var   values = this.values;
            float limit  = this.limitToFind;

            // The vector path needs Bmi1.TrailingZeroCount in addition to AVX. BMI1 is a separate
            // instruction set, so it has to be checked separately; fall back to the scalar scan
            // when either one is missing.
            if (!Avx.IsSupported || !Bmi1.IsSupported)
            {
                for (int i = 0; i < values.Length; i++)
                {
                    if (values[i] >= limit)
                    {
                        return i;
                    }
                }

                return -1;
            }

            unsafe
            {
                fixed (float* valuesPtr = values)
                {
                    const int VectorSizeInBytes = 32; // width of a 256-bit vector

                    // Number of elements in front of the first 32-byte aligned address.
                    long misalignmentInBytes = (long)((ulong)valuesPtr & (ulong)(VectorSizeInBytes - 1));
                    long alignmentOffset     = ((VectorSizeInBytes - misalignmentInBytes) & (VectorSizeInBytes - 1)) / sizeof(float);

                    // A short array may end before the alignment boundary is reached; never let
                    // the prologue run past the end of the data.
                    alignmentOffset = Math.Min(alignmentOffset, values.Length);

                    // Phase 1: scalar scan up to the alignment boundary.
                    for (long i = 0; i < alignmentOffset; i++)
                    {
                        if (valuesPtr[i] >= limit)
                        {
                            return (int)i;
                        }
                    }

                    // Phase 2: aligned vector scan over as many whole vectors as fit.
                    long remainingLength = values.Length - alignmentOffset;
                    long vectorizableEnd = values.Length - remainingLength % Vector256<float>.Count;
                    var  limitVector     = Vector256.Create(limit);

                    for (long i = alignmentOffset; i < vectorizableEnd; i += Vector256<float>.Count)
                    {
                        var block   = Avx.LoadAlignedVector256(valuesPtr + i);
                        var isMatch = Avx.Compare(block, limitVector, FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling);

                        // One bit per lane; the lowest set bit marks the first matching element.
                        var matchBits = (uint)Avx.MoveMask(isMatch);

                        if (matchBits != 0)
                        {
                            return (int)(i + Bmi1.TrailingZeroCount(matchBits));
                        }
                    }

                    // Phase 3: scalar scan over the elements that did not fill a whole vector.
                    for (int i = (int)vectorizableEnd; i < values.Length; i++)
                    {
                        if (values[i] >= limit)
                        {
                            return i;
                        }
                    }

                    return -1;
                }
            }
        }
            /// <summary>
            /// Runs <c>Bmi1.TrailingZeroCount</c> on this struct's <c>_fld</c> field and hands
            /// the operand and the outcome to <paramref name="testClass"/> for validation.
            /// </summary>
            /// <param name="testClass">Test instance whose <c>ValidateResult</c> checks the outcome.</param>
            public void RunStructFldScenario(ScalarUnaryOpTest__TrailingZeroCountUInt64 testClass)
            {
                // Operand comes straight from the struct field.
                var trailingZeros = Bmi1.TrailingZeroCount(_fld);

                testClass.ValidateResult(_fld, trailingZeros);
            }
        /// <summary>
        /// Runs <c>Bmi1.TrailingZeroCount</c> on the instance field <c>_fld</c> and validates
        /// the outcome with <c>ValidateResult</c>.
        /// </summary>
        public void RunClassFldScenario()
        {
            // Operand comes straight from the instance field.
            var trailingZeros = Bmi1.TrailingZeroCount(_fld);

            ValidateResult(_fld, trailingZeros);
        }
        /// <summary>
        /// Looks for the first entry of <paramref name="costs"/> that is less than or equal to
        /// zero and sits in a row and a column that are both uncovered.
        /// </summary>
        /// <remarks>
        /// Columns are visited in ascending order and, within a column, rows in ascending order.
        /// Element (row, column) is read from <c>ColumnMajorBackingStore[column * RowCount + row]</c>.
        /// When AVX2 and BMI1 are available and the matrix has at least one full vector of rows,
        /// each column is tested <c>Vector256&lt;float&gt;.Count</c> rows at a time.
        /// </remarks>
        /// <param name="costs">The cost matrix to search.</param>
        /// <param name="rowsCovered">Per-row flags; <c>true</c> excludes the row from the search.</param>
        /// <param name="colsCovered">Per-column flags; <c>true</c> excludes the column from the search.</param>
        /// <param name="zeroLocation">
        /// The location that was found, or <c>Location(-1, -1)</c> when the method returns <c>false</c>.
        /// </param>
        /// <returns><c>true</c> when a matching entry was found; otherwise <c>false</c>.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="rowsCovered"/> or <paramref name="colsCovered"/> is <c>null</c>.
        /// </exception>
        private static unsafe bool TryFindZero(Storage <float> costs, [NotNull] bool[] rowsCovered, [NotNull] bool[] colsCovered, out Location zeroLocation)
        {
            if (rowsCovered == null)
            {
                throw new ArgumentNullException(nameof(rowsCovered));
            }

            if (colsCovered == null)
            {
                throw new ArgumentNullException(nameof(colsCovered));
            }

            // The vector path uses Bmi1.TrailingZeroCount, which is a separate instruction set
            // from AVX2 and must be checked for explicitly.
            if (Avx2.IsSupported && Bmi1.IsSupported && costs.RowCount >= Vector256<float>.Count)
            {
                var rowCount        = costs.RowCount;
                var columnCount     = costs.ColumnCount;
                var storage         = costs.ColumnMajorBackingStore;
                var maxVectorOffset = rowCount - rowCount % Vector256<float>.Count;
                var zeroVector      = Vector256<float>.Zero;

                // One bitmask per row batch: bit k is set when row (batchStart + k) is NOT covered.
                var uncoveredMasks = new int[maxVectorOffset / Vector256<float>.Count];
                for (var batchStart = 0; batchStart < maxVectorOffset; batchStart += Vector256<float>.Count)
                {
                    var batchMask = 0;
                    for (var lane = 0; lane < Vector256<float>.Count; lane++)
                    {
                        if (!rowsCovered[batchStart + lane])
                        {
                            batchMask |= 1 << lane;
                        }
                    }

                    uncoveredMasks[batchStart / Vector256<float>.Count] = batchMask;
                }

                fixed (float* storagePtr = storage)
                {
                    for (var column = 0; column < columnCount; column++)
                    {
                        if (colsCovered[column])
                        {
                            continue;
                        }

                        var columnPtr = storagePtr + rowCount * column;

                        // Whole vectors of rows.
                        for (int row = 0, rowBatchIndex = 0; row < maxVectorOffset; row += Vector256<float>.Count, rowBatchIndex++)
                        {
                            var rowVector       = Avx.LoadVector256(columnPtr + row);
                            var nonPositive     = Avx.Compare(rowVector, zeroVector, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                            var nonPositiveBits = (uint)Avx.MoveMask(nonPositive);

                            // Keep only the lanes whose row is uncovered.
                            var candidates = nonPositiveBits & (uint)uncoveredMasks[rowBatchIndex];

                            if (candidates == 0)
                            {
                                continue;
                            }

                            // The lowest set bit is the first qualifying row of this batch.
                            var zeroRow = row + (int)Bmi1.TrailingZeroCount(candidates);
                            zeroLocation = new Location(zeroRow, column);
                            return true;
                        }

                        // Rows that did not fill a whole vector.
                        for (var i = maxVectorOffset; i < rowCount; i++)
                        {
                            if (!rowsCovered[i] && storage[column * rowCount + i] <= 0)
                            {
                                zeroLocation = new Location(i, column);
                                return true;
                            }
                        }
                    }
                }
            }
            else
            {
                // Scalar fallback: same visiting order as the vector path.
                for (var column = 0; column < costs.ColumnCount; column++)
                {
                    if (colsCovered[column])
                    {
                        continue;
                    }

                    for (var row = 0; row < costs.RowCount; row++)
                    {
                        if (!rowsCovered[row] && costs.ColumnMajorBackingStore[column * costs.RowCount + row] <= 0)
                        {
                            zeroLocation = new Location(row, column);
                            return true;
                        }
                    }
                }
            }

            zeroLocation = new Location(-1, -1);
            return false;
        }
        // internal for testing and benchmarking
        //
        // Scans the `len` chars at `strPtr` for '\r', '\n' and FirstChar, plus SecondChar and
        // ThirdChar when Char2Mask / Char3Mask enable them. The first hit found in each chunk
        // is passed to CheckValueSeparatorPresent; the first result other than -1 is returned.
        // Returns -1 when nothing is reported.
        //
        // The input is processed in three stages: whole 256-bit vectors, then a masked load
        // covering the remaining pairs of chars, then at most one final char checked by hand.
        //
        // NOTE(review): only the first hit of each vector is handed to CheckValueSeparatorPresent.
        // Whether later hits in the same vector can be missed when it answers -1 depends on how
        // far that helper looks ahead - confirm against its implementation.
        internal readonly unsafe int Avx2ContainsChar(char *strPtr, int len)
        {
            const int CharsPerInt = sizeof(int) / sizeof(char);
            const int BitsInInt   = sizeof(int) * 8;

            var fullVectors = len / CHARS_PER_VECTOR;
            var tailChars   = len % CHARS_PER_VECTOR;

            short *cursor = (short *)strPtr;

            // Broadcast every char of interest across a vector once, up front.
            var crVec         = Vector256.Create((short)'\r');
            var lfVec         = Vector256.Create((short)'\n');
            var firstVec      = Vector256.Create(FirstChar);
            var secondVec     = Vector256.Create(SecondChar);
            var secondEnabled = Vector256.Create(Char2Mask);
            var thirdVec      = Vector256.Create(ThirdChar);
            var thirdEnabled  = Vector256.Create(Char3Mask);

            // ---- Stage 1: whole vectors ----
            for (var vectorIx = 0; vectorIx < fullVectors; vectorIx++)
            {
                var block = Avx.LoadVector256(cursor);
                cursor += CHARS_PER_VECTOR;

                // '\r', '\n' and FirstChar are always searched for.
                var hits = Avx2.Or(Avx2.CompareEqual(crVec, block), Avx2.CompareEqual(lfVec, block));
                hits = Avx2.Or(hits, Avx2.CompareEqual(firstVec, block));

                // SecondChar and ThirdChar are optional; their masks blank out the comparison
                // when they are not in use.
                hits = Avx2.Or(hits, Avx2.And(secondEnabled, Avx2.CompareEqual(secondVec, block)));
                hits = Avx2.Or(hits, Avx2.And(thirdEnabled, Avx2.CompareEqual(thirdVec, block)));

                // Every matching char is now an all-ones 16-bit lane. MoveMask takes the top bit
                // of each byte, so each char contributes two adjacent bits to the int.
                var hitBits   = Avx2.MoveMask(hits.AsByte());
                var zeroCount = (int)Bmi1.TrailingZeroCount((uint)hitBits);

                // A count equal to the width of the int means no bit was set: no hit here.
                if (zeroCount == BitsInInt)
                {
                    continue;
                }

                // Two trailing zero bits per non-matching char.
                var skipped = zeroCount / sizeof(char);
                var hitIx   = vectorIx * CHARS_PER_VECTOR + skipped;
                var found   = CheckValueSeparatorPresent(hitIx, strPtr + hitIx, len);
                if (found != -1)
                {
                    return(found);
                }
            }

            // ---- Stage 2: remaining chars, two at a time ----
            // The masked load works on ints, so only an even number of chars can be covered
            // here; an odd one is left for stage 3.
            if (tailChars >= 2)
            {
                var tailInts = tailChars / CharsPerInt;

                int *tailPtr = (int *)cursor;

                // Pick the mask that enables exactly `tailInts` ints out of the lookup table.
                Vector256 <short> laneMask;
                fixed(ushort *tablePtr = SUB_VECTOR_MASK)
                {
                    short *maskRow = (short *)tablePtr + CHARS_PER_VECTOR * tailInts;
                    laneMask = Avx.LoadVector256(maskRow);
                }

                // The mask keeps the load from touching memory past the end of the buffer.
                var block = Avx2.MaskLoad(tailPtr, laneMask.AsInt32()).AsInt16();

                // Lanes that were masked out should read as zero, but are treated as garbage.
                var hits = Avx2.Or(Avx2.CompareEqual(crVec, block), Avx2.CompareEqual(lfVec, block));
                hits = Avx2.Or(hits, Avx2.CompareEqual(firstVec, block));
                hits = Avx2.Or(hits, Avx2.And(secondEnabled, Avx2.CompareEqual(secondVec, block)));
                hits = Avx2.Or(hits, Avx2.And(thirdEnabled, Avx2.CompareEqual(thirdVec, block)));

                // Drop anything reported for lanes that were never loaded.
                hits = Avx2.And(hits, laneMask);

                var hitBits   = Avx2.MoveMask(hits.AsByte());
                var zeroCount = (int)Bmi1.TrailingZeroCount((uint)hitBits);

                if (zeroCount != BitsInInt)
                {
                    var skipped = zeroCount / sizeof(char);
                    var hitIx   = fullVectors * CHARS_PER_VECTOR + skipped;
                    var found   = CheckValueSeparatorPresent(hitIx, strPtr + hitIx, len);
                    if (found != -1)
                    {
                        return(found);
                    }
                }

                cursor = (short *)(tailPtr + tailInts);
            }

            // ---- Stage 3: at most one char left ----
            if ((tailChars % CharsPerInt) != 0)
            {
                var lastChar = *cursor;
                var isHit    =
                    lastChar == '\n' ||
                    lastChar == '\r' ||
                    lastChar == FirstChar ||
                    (Char2Mask != 0 && lastChar == SecondChar) ||
                    (Char3Mask != 0 && lastChar == ThirdChar);

                if (isHit)
                {
                    var lastIx = len - 1;
                    return(CheckValueSeparatorPresent(lastIx, strPtr + lastIx, len));
                }
            }

            return(-1);
        }
        // Example #22
        // 0
        /// <summary>
        /// Parses up to 16 numbers from <paramref name="input"/> into the byte lanes of a
        /// 128-bit vector, then repeatedly redistributes the largest lane over the others until
        /// a state that was seen before comes up again.
        /// </summary>
        /// <remarks>
        /// On completion <c>PartA</c> holds the number of redistribution steps performed and
        /// <c>PartB</c> the number of steps since the repeated state was first seen.
        /// Any byte below <c>'0'</c> ends the current number. A number at the very end of the
        /// input that is not followed by such a byte is stored as well.
        /// </remarks>
        /// <param name="input">Raw puzzle input; a <c>null</c> input is ignored.</param>
        protected override unsafe void ExecuteDay(byte[] input)
        {
            if (input == null)
            {
                return;
            }

            // borrowed liberally from https://github.com/Voltara/advent2017-fast/blob/master/src/day06.c

            // 16 bytes of scratch space, viewed either as bytes or as two 64-bit halves. It is
            // used to assemble the vectors that get subtracted from / added to the state.
            var bytes  = stackalloc byte[Vector128<byte>.Count];
            var ulongs = (ulong *)bytes;

            var x   = Vector128<byte>.Zero;
            int n   = 0;
            var ctr = 0;

            // True while digits have been read that are not yet stored in a lane.
            var hasPendingNumber = false;

            for (int i = 0; i < input.Length && ctr < 16; i++)
            {
                if (input[i] < '0')
                {
                    x = x.WithElement(ctr++, (byte)n);
                    n = 0;
                    hasPendingNumber = false;
                }
                else
                {
                    n = n * 10 + (input[i] - '0');
                    hasPendingNumber = true;
                }
            }

            // Input that does not end with a delimiter would otherwise lose its last number.
            if (hasPendingNumber && ctr < 16)
            {
                x = x.WithElement(ctr, (byte)n);
            }

            // Maps every state seen so far to the step at which it first appeared.
            var map = new Dictionary<Vector128<byte>, int>(capacity: PERFORMANCE_NOTE)
            {
                [x] = 0,
            };

            ctr = 0;

            // Shuffle controls for the horizontal max: each one swaps neighbouring groups of
            // 1, 2, 4 and finally 8 bytes.
            var mask1 = Vector128.Create(0x0607040502030001ul, 0x0e0f0c0d0a0b0809ul).AsByte();
            var mask2 = Vector128.Create(0x0405060700010203ul, 0x0c0d0e0f08090a0bul).AsByte();
            var mask3 = Vector128.Create(0x0001020304050607ul, 0x08090a0b0c0d0e0ful).AsByte();
            var mask4 = Vector128.Create(0x08090a0b0c0d0e0ful, 0x0001020304050607ul).AsByte();

            while (true)
            {
                // get max byte
                var tmp = Avx2.Max(x, Avx2.Shuffle(x, mask1));
                tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask2));
                tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask3));
                tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask4));

                // every byte in tmp should be max value
                var max = Avx2.Extract(tmp, 0);

                // where is it in the original? (lowest lane holding the max)
                var idx = (int)Bmi1.TrailingZeroCount((uint)
                                                      Avx2.MoveMask(Avx2.CompareEqual(x, tmp)));

                // subtract it from its original place
                // 'high' is all ones when the lane lives in the upper 64 bits, otherwise zero
                var high  = (ulong)(long)-((idx & 0x08) >> 3);
                var shift = idx << 3;
                ulongs[0] = ((ulong)max << shift) & ~high;
                ulongs[1] = ((ulong)max << shift) & high;
                tmp       = Avx2.Subtract(x, Avx2.LoadVector128(bytes));

                // bit 4 of max set (16 or more)? add 1 to all
                // NOTE(review): only bit 4 is tested, so this presumably relies on max < 32 - confirm.
                high      = (ulong)(long)-((max & 0x10) >> 4);
                ulongs[0] = high & 0x0101010101010101ul;
                ulongs[1] = high & 0x0101010101010101ul;
                tmp       = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

                // spread remainder to all
                // bitmask however many we're adding
                max  &= 0x0f;
                shift = max << 3;
                var isLong   = (ulong)(long)-((max & 0x08) >> 3);
                var mask     = (0x1ul << shift) - 1;
                var lowMask  = isLong | mask;
                var highMask = isLong & mask;

                // rotate our start point (the lane right after the max, wrapping at 16)
                var start = (idx + 1) & 0x0f;
                isLong = (ulong)(long)-((start & 0x08) >> 3);
                var tmpLow  = (~isLong & lowMask) | (isLong & highMask);
                var tmpHigh = (isLong & lowMask) | (~isLong & highMask);

                // 'doShift' is all ones unless the start lane sits on a 64-bit boundary
                var doShift = (ulong)((-(start & 0x07)) >> 4);
                shift   = start << 3;
                lowMask =
                    ((tmpLow << shift | tmpHigh >> (128 - shift)) & doShift) |
                    (~doShift & tmpLow);
                highMask =
                    ((tmpHigh << shift | tmpLow >> (128 - shift)) & doShift) |
                    (~doShift & tmpHigh);

                // build our adders and add values
                ulongs[0] = 0x0101010101010101ul & lowMask;
                ulongs[1] = 0x0101010101010101ul & highMask;
                tmp       = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

                x = tmp;

                ctr++;

                // Single lookup: answers "seen before?" and yields the first-seen step at once.
                if (map.TryGetValue(x, out var firstSeenAt))
                {
                    PartA = ctr.ToString();
                    PartB = (ctr - firstSeenAt).ToString();
                    return;
                }

                map[x] = ctr;
            }
        }
Beispiel #23
0
        /// <summary>
        /// Counts how many consecutive ASCII bytes (high bit clear) appear at the start of
        /// <paramref name="value"/> before the first non-ASCII byte is encountered.
        /// </summary>
        /// <param name="value">
        /// A 32-bit value holding four bytes, at least one of which must be non-ASCII
        /// (checked by a debug assertion only).
        /// </param>
        /// <returns>
        /// The number of leading ASCII bytes. The software fallback inspects only the first
        /// three bytes, so it yields a value from 0 to 3.
        /// </returns>
        internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(uint value)
        {
            Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");

            // Go straight to BMI1 instead of BitOperations: the only win comes from a genuine
            // tzcnt instruction. The generic software fallback behind BitOperations is slower
            // than the specialized one implemented below.
            if (Bmi1.IsSupported)
            {
                Debug.Assert(BitConverter.IsLittleEndian);

                // Keep only the masked bits, locate the lowest one that is set, and convert
                // that bit index into a byte index (divide by 8).
                uint maskedBits = value & UInt32HighBitsOnlyMask;
                return Bmi1.TrailingZeroCount(maskedBits) >> 3;
            }

            // No tzcnt available: specialized software fallback.
            // After inverting, the top bit of each byte is 1 exactly when that byte is ASCII.
            // 'asciiSoFar' is 1 while every byte examined so far was ASCII and drops to 0
            // permanently at the first non-ASCII byte; summing it gives the run length.
            uint flipped = ~value;
            uint asciiSoFar;
            uint asciiRun;

            if (!BitConverter.IsLittleEndian)
            {
                // BinaryPrimitives.ReverseEndianness is an intrinsic only on little-endian
                // platforms, so it would be too costly here. Rotating brings each byte's
                // top bit down to bit 0 instead, starting from the most significant byte.

                // First byte
                flipped    = BitOperations.RotateLeft(flipped, 1);
                asciiSoFar = flipped & 1;
                asciiRun   = asciiSoFar;

                // Second byte
                flipped     = BitOperations.RotateLeft(flipped, 8);
                asciiSoFar &= flipped;
                asciiRun   += asciiSoFar;

                // Third byte
                flipped     = BitOperations.RotateLeft(flipped, 8);
                asciiSoFar &= flipped;
                asciiRun   += asciiSoFar;

                return asciiRun;
            }

            // Little-endian: the first byte is the least significant one, so shifting right
            // moves each byte's top bit down to bit 0 in turn.

            // First byte
            flipped    >>= 7;
            asciiSoFar   = flipped & 1;
            asciiRun     = asciiSoFar;

            // Second byte
            flipped    >>= 8;
            asciiSoFar  &= flipped;
            asciiRun    += asciiSoFar;

            // Third byte
            flipped    >>= 8;
            asciiSoFar  &= flipped;
            asciiRun    += asciiSoFar;

            return asciiRun;
        }