/// <summary>
/// Local-variable scenario: performs an unaligned read of <c>_data</c> into a local,
/// passes that local to <c>Bmi1.TrailingZeroCount</c> and hands both values to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    // Reinterpret the field as raw bytes so the value is obtained through an unaligned load.
    var operand = Unsafe.ReadUnaligned<UInt64>(ref Unsafe.As<UInt64, byte>(ref _data));

    var actual = Bmi1.TrailingZeroCount(operand);

    ValidateResult(operand, actual);
}
/// <summary>
/// Class-local-field scenario: creates a fresh test object and feeds its <c>_fld</c> field
/// directly to <c>Bmi1.TrailingZeroCount</c>, then validates the outcome.
/// </summary>
public void RunClassLclFldScenario()
{
    var instance = new ScalarUnaryOpTest__TrailingZeroCountUInt64();

    // The field is deliberately passed inline (not copied to a local first).
    var actual = Bmi1.TrailingZeroCount(instance._fld);

    ValidateResult(instance._fld, actual);
}
/// <summary>
/// Struct-local-field scenario: obtains a <c>TestStruct</c> from <c>TestStruct.Create()</c> and feeds
/// its <c>_fld</c> field directly to <c>Bmi1.TrailingZeroCount</c>, then validates the outcome.
/// </summary>
public void RunStructLclFldScenario()
{
    var instance = TestStruct.Create();

    // The field is deliberately passed inline (not copied to a local first).
    var actual = Bmi1.TrailingZeroCount(instance._fld);

    ValidateResult(instance._fld, actual);
}
/// <summary>
/// Class-field scenario: announces itself to the test framework, then runs
/// <c>Bmi1.TrailingZeroCount</c> on this instance's <c>_fld</c> field and validates the outcome.
/// </summary>
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    var actual = Bmi1.TrailingZeroCount(_fld);

    ValidateResult(_fld, actual);
}
/// <summary>
/// Basic scenario: the operand is produced by an unaligned read of <c>_data</c> placed directly in the
/// argument position of <c>Bmi1.TrailingZeroCount</c>; the outcome is validated against <c>_data</c>.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The read is kept inline in the call so no intermediate local is introduced.
    var actual = Bmi1.TrailingZeroCount(
        Unsafe.ReadUnaligned<UInt32>(ref Unsafe.As<UInt32, byte>(ref _data)));

    ValidateResult(_data, actual);
}
/// <summary>
/// Class-variable scenario: runs <c>Bmi1.TrailingZeroCount</c> on <c>_clsVar</c> and validates the outcome.
/// </summary>
public void RunClsVarScenario()
{
    var actual = Bmi1.TrailingZeroCount(_clsVar);

    ValidateResult(_clsVar, actual);
}
/// <summary>
/// Local-variable scenario: announces itself to the test framework, performs an unaligned read of
/// <c>_data</c> into a local, passes that local to <c>Bmi1.TrailingZeroCount</c> and validates the outcome.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    // Reinterpret the field as raw bytes so the value is obtained through an unaligned load.
    var operand = Unsafe.ReadUnaligned<UInt64>(ref Unsafe.As<UInt64, byte>(ref _data));

    var actual = Bmi1.TrailingZeroCount(operand);

    ValidateResult(operand, actual);
}
/// <summary>
/// Class-local-field scenario: announces itself to the test framework, creates a fresh test object and
/// feeds its <c>_fld</c> field directly to <c>Bmi1.TrailingZeroCount</c>, then validates the outcome.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    var instance = new ScalarUnaryOpTest__TrailingZeroCountUInt64();

    // The field is deliberately passed inline (not copied to a local first).
    var actual = Bmi1.TrailingZeroCount(instance._fld);

    ValidateResult(instance._fld, actual);
}
/// <summary>
/// Struct-local-field scenario: announces itself to the test framework, obtains a <c>TestStruct</c> from
/// <c>TestStruct.Create()</c> and feeds its <c>_fld</c> field directly to <c>Bmi1.TrailingZeroCount</c>,
/// then validates the outcome.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    var instance = TestStruct.Create();

    // The field is deliberately passed inline (not copied to a local first).
    var actual = Bmi1.TrailingZeroCount(instance._fld);

    ValidateResult(instance._fld, actual);
}
/// <summary>
/// Calls <c>Bmi1.TrailingZeroCount</c> on <c>uintValue</c> <c>N</c> times, accumulating the results,
/// and writes the accumulated total to the console.
/// </summary>
public unsafe void NativeTrailingZeroCountX86()
{
    uint count = 0;

    int iteration = 0;
    while (iteration < N)
    {
        count += Bmi1.TrailingZeroCount(uintValue);
        iteration++;
    }

    // Printing the sum keeps the accumulated value observable.
    Console.WriteLine($"NativeTrailingZeroCountX86:{count}");
}
/// <summary>
/// Counts the consecutive zero bits at the least-significant end of <paramref name="matches"/>.
/// </summary>
/// <param name="matches">The value to inspect; it is treated as a raw 32-bit pattern.</param>
/// <returns>
/// The zero-based position of the lowest set bit (0..31), or 32 when <paramref name="matches"/> is zero.
/// </returns>
public static int TrailingZeroCount(int matches)
{
    if (Bmi1.IsSupported)
    {
        // Note that the TZCNT contract specifies 0->32.
        return (int)Bmi1.TrailingZeroCount((uint)matches);
    }

    // Software fallback.
    // For a zero input the multiply/shift below yields table offset 0, which is the same slot that
    // odd inputs (lowest set bit at position 0) select. Special-case zero so that this path reports
    // 32 exactly like the intrinsic path instead of a hardware-dependent answer.
    if (matches == 0)
    {
        return 32;
    }

    // https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup
    // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check
    return Unsafe.AddByteOffset(
        ref MemoryMarshal.GetReference(TrailingCountMultiplyDeBruijn),
        ((uint)((matches & -matches) * 0x077CB531U)) >> 27);
}
/// <summary>
/// Counts the consecutive zero bits at the least-significant end of <paramref name="value"/>.
/// </summary>
/// <param name="value">The value to inspect.</param>
/// <returns>The number of trailing zero bits; 32 when <paramref name="value"/> is zero.</returns>
public static int TrailingZeroCount(uint value)
{
    if (!Bmi1.IsSupported)
    {
        // The table lookup alone would give 0->0, so zero is special-cased to match the intrinsic's 0->32.
        if (value == 0u)
        {
            return 32;
        }

        // Isolate the lowest set bit. Negating a uint promotes to long, so this arithmetic is done in long.
        long lowestSetBit = value & -value;

        // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u
        // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check
        uint tableOffset = ((uint)(lowestSetBit * 0x077CB531u)) >> 27;

        return Unsafe.AddByteOffset(
            ref MemoryMarshal.GetReference(s_TrailingZeroCountDeBruijn),
            tableOffset);
    }

    // Note that TZCNT contract specifies 0->32
    return (int)Bmi1.TrailingZeroCount(value);
}
/// <summary>
/// Counts the consecutive zero bits at the least-significant end of <paramref name="value"/>.
/// </summary>
/// <param name="value">The value to inspect.</param>
/// <returns>The number of trailing zero bits; 32 when <paramref name="value"/> is zero.</returns>
public static int TrailingZeroCount(uint value)
{
    // TZCNT contract is 0->32
    if (Bmi1.IsSupported)
    {
        return (int)Bmi1.TrailingZeroCount(value);
    }

    // Unguarded fallback contract is 0->0, so zero is answered up front.
    if (value == 0)
    {
        return 32;
    }

    // Isolate the lowest set bit while staying in 32-bit arithmetic.
    uint lowestSetBit = value & (uint)-(int)value;

    // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u
    // Going through int before IntPtr mitigates a redundant conv.u8.
    int tableOffset = (int)((lowestSetBit * 0x077CB531u) >> 27);

    // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check.
    // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here.
    return Unsafe.AddByteOffset(
        ref MemoryMarshal.GetReference(s_TrailingZeroCountDeBruijn),
        (IntPtr)tableOffset);
}
/// <summary>
/// Counts the consecutive zero bits at the least-significant end of <paramref name="value"/>.
/// </summary>
/// <param name="value">The value to inspect.</param>
/// <returns>The number of trailing zero bits; 32 when <paramref name="value"/> is zero.</returns>
public static int TrailingZeroCount(uint value)
{
    if (Bmi1.IsSupported)
    {
        // Note that TZCNT contract specifies 0->32
        return (int)Bmi1.TrailingZeroCount(value);
    }

    // Software fallback has behavior 0->0, so zero is mapped to 32 to match the intrinsic path.
    // Otherwise the isolated lowest set bit is multiplied by a deBruijn sequence
    // (k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u) and the top five bits index the table.
    // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check.
    // long -> IntPtr cast on 32-bit platforms is expensive (it does overflow checks not needed here) and a
    // shift over long is also expensive on 32-bit, hence the narrowing to uint and then int first.
    return value == 0
        ? 32
        : Unsafe.AddByteOffset(
            ref MemoryMarshal.GetReference(s_TrailingZeroCountDeBruijn),
            (IntPtr)(int)(((uint)((value & -value) * 0x077CB531u)) >> 27));
}
/// <summary>
/// Repeatedly applies <c>Bmi1.TrailingZeroCount</c> until cancellation is requested.
/// </summary>
/// <param name="cancellationToken">Token polled between batches to stop the run.</param>
/// <returns>
/// The number of completed batches (each batch is <c>LENGTH</c> intrinsic calls), or 0 when BMI1 is not supported.
/// </returns>
public override ulong Run(CancellationToken cancellationToken)
{
    if (Bmi1.IsSupported)
    {
        var completedBatches = 0uL;
        var current = randomInt;

        while (!cancellationToken.IsCancellationRequested)
        {
            // Each result feeds the next call.
            for (var step = 0; step < LENGTH; step++)
            {
                current = Bmi1.TrailingZeroCount(current);
            }

            completedBatches++;
        }

        // Adding and subtracting the final value leaves the count unchanged but makes the result depend on it.
        return completedBatches + current - current;
    }

    return 0uL;
}
// Returns &inputBuffer[inputLength] if the input buffer is valid.
/// <summary>
/// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
/// </summary>
/// <param name="pInputBuffer">Start of the UTF-8 data to validate; may be null only when <paramref name="inputLength"/> is zero.</param>
/// <param name="inputLength">Number of bytes available at <paramref name="pInputBuffer"/>; must not be negative.</param>
/// <param name="utf16CodeUnitCountAdjustment">
/// Receives a non-positive value accumulated over the data consumed before returning: each validated 2-byte
/// sequence contributes -1 and each validated 3-byte or 4-byte sequence contributes -2, i.e. the difference
/// between the number of UTF-16 code units and the number of UTF-8 bytes. ASCII bytes contribute nothing.
/// </param>
/// <param name="scalarCountAdjustment">
/// Receives a non-positive value accumulated over the data consumed before returning: each validated 4-byte
/// sequence contributes -1, since it occupies two UTF-16 code units but represents a single scalar.
/// </param>
/// <remarks>
/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
/// </remarks>
public static byte *GetPointerToFirstInvalidByte(byte *pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    // First, try to drain off as many ASCII bytes as we can from the beginning.

    {
        nuint numAsciiBytesCounted = ASCIIUtility.GetIndexOfFirstNonAsciiByte(pInputBuffer, (uint)inputLength);
        pInputBuffer += numAsciiBytesCounted;

        // Quick check - did we just end up consuming the entire input buffer?
        // If so, short-circuit the remainder of the method.

        inputLength -= (int)numAsciiBytesCounted;
        if (inputLength == 0)
        {
            utf16CodeUnitCountAdjustment = 0;
            scalarCountAdjustment = 0;
            return(pInputBuffer);
        }
    }

#if DEBUG
    // Keep these around for final validation at the end of the method.
    // Note that they are captured after the leading ASCII run has already been skipped.
    byte *pOriginalInputBuffer = pInputBuffer;
    int originalInputLength = inputLength;
#endif

    // Enregistered locals that we'll eventually out to our caller.

    int tempUtf16CodeUnitCountAdjustment = 0;
    int tempScalarCountAdjustment = 0;

    if (inputLength < sizeof(uint))
    {
        goto ProcessInputOfLessThanDWordSize;
    }

    // Last address from which a full 4-byte read still stays inside the buffer.
    byte *pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint);

    // Begin the main loop.

#if DEBUG
    byte *pLastBufferPosProcessed = null; // used for invariant checking in debug builds
#endif

    while (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
    {
        // Read 32 bits at a time. This is enough to hold any possible UTF8-encoded scalar.

        uint thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);

    AfterReadDWord:

#if DEBUG
        Debug.Assert(pLastBufferPosProcessed < pInputBuffer, "Algorithm should've made forward progress since last read.");
        pLastBufferPosProcessed = pInputBuffer;
#endif

        // First, check for the common case of all-ASCII bytes.

        if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
        {
            // We read an all-ASCII sequence.

            pInputBuffer += sizeof(uint);

            // If we saw a sequence of all ASCII, there's a good chance a significant amount of following data is also ASCII.
            // Below is basically unrolled loops with poor man's vectorization.

            // Below check is "can I read at least five DWORDs from the input stream?"
            // n.b. Since we incremented pInputBuffer above the below subtraction may result in a negative value,
            // hence using nint instead of nuint.

            if ((nint)(void *)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) >= 4 * sizeof(uint))
            {
                // We want reads in the inner loop to be aligned. So let's perform a quick
                // ASCII check of the next 32 bits (4 bytes) now, and if that succeeds bump
                // the read pointer up to the next aligned address.

                thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                if (!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
                {
                    goto AfterReadDWordSkipAllBytesAsciiCheck;
                }

                pInputBuffer = (byte *)((nuint)(pInputBuffer + 4) & ~(nuint)3);

                // At this point, the input buffer offset points to an aligned DWORD. We also know that there's
                // enough room to read at least four DWORDs from the buffer. (Heed the comment a few lines above:
                // the original 'if' check confirmed that there were 5 DWORDs before the alignment check, and
                // the alignment check consumes at most a single DWORD.)

                byte *pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here
                uint mask;

                do
                {
                    // Both ISAs are required: SSE2 for the 128-bit load / movemask, and BMI1 for the
                    // TrailingZeroCount used when the loop terminates early.
                    if (Sse2.IsSupported && Bmi1.IsSupported)
                    {
                        // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're
                        // going to perform an unaligned load. We don't necessarily care about aligning
                        // this because we pessimistically assume we'll encounter non-ASCII data at some
                        // point in the not-too-distant future (otherwise we would've stayed entirely
                        // within the all-ASCII vectorized code at the entry to this method).

                        mask = (uint)Sse2.MoveMask(Sse2.LoadVector128((byte *)pInputBuffer));
                        if (mask != 0)
                        {
                            goto Sse2LoopTerminatedEarlyDueToNonAsciiData;
                        }
                    }
                    else
                    {
                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint *)pInputBuffer)[0] | ((uint *)pInputBuffer)[1]))
                        {
                            goto LoopTerminatedEarlyDueToNonAsciiDataInFirstPair;
                        }

                        if (!ASCIIUtility.AllBytesInUInt32AreAscii(((uint *)pInputBuffer)[2] | ((uint *)pInputBuffer)[3]))
                        {
                            goto LoopTerminatedEarlyDueToNonAsciiDataInSecondPair;
                        }
                    }

                    pInputBuffer += 4 * sizeof(uint); // consumed 4 DWORDs
                } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop);

                continue; // need to perform a bounds check because we might be running out of data

            Sse2LoopTerminatedEarlyDueToNonAsciiData:

                Debug.Assert(BitConverter.IsLittleEndian);
                Debug.Assert(Sse2.IsSupported);
                Debug.Assert(Bmi1.IsSupported);

                // The 'mask' value will have a 0 bit for each ASCII byte we saw and a 1 bit
                // for each non-ASCII byte we saw. We can count the number of ASCII bytes,
                // bump our input counter by that amount, and resume processing from the
                // "the first byte is no longer ASCII" portion of the main loop.

                Debug.Assert(mask != 0);

                pInputBuffer += Bmi1.TrailingZeroCount(mask);
                if (pInputBuffer > pFinalPosWhereCanReadDWordFromInputBuffer)
                {
                    goto ProcessRemainingBytesSlow;
                }

                thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer); // no longer guaranteed to be aligned
                goto BeforeProcessTwoByteSequence;

            LoopTerminatedEarlyDueToNonAsciiDataInSecondPair:

                pInputBuffer += 2 * sizeof(uint); // consumed 2 DWORDs

            LoopTerminatedEarlyDueToNonAsciiDataInFirstPair:

                // We know that there's *at least* two DWORDs of data remaining in the buffer.
                // We also know that one of them (or both of them) contains non-ASCII data somewhere.
                // Let's perform a quick check here to bypass the logic at the beginning of the main loop.

                thisDWord = *(uint *)pInputBuffer; // still aligned here
                if (ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord))
                {
                    pInputBuffer += sizeof(uint); // consumed 1 more DWORD
                    thisDWord = *(uint *)pInputBuffer; // still aligned here
                }

                goto AfterReadDWordSkipAllBytesAsciiCheck;
            }

            continue; // not enough data remaining to unroll loop - go back to beginning with bounds checks
        }

    AfterReadDWordSkipAllBytesAsciiCheck:

        Debug.Assert(!ASCIIUtility.AllBytesInUInt32AreAscii(thisDWord)); // this should have been handled earlier

        // Next, try stripping off ASCII bytes one at a time.
        // We only handle up to three ASCII bytes here since we handled the four ASCII byte case above.

        {
            uint numLeadingAsciiBytes = ASCIIUtility.CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(thisDWord);
            pInputBuffer += numLeadingAsciiBytes;

            if (pFinalPosWhereCanReadDWordFromInputBuffer < pInputBuffer)
            {
                goto ProcessRemainingBytesSlow; // Input buffer doesn't contain enough data to read a DWORD
            }
            else
            {
                // The input buffer at the current offset contains a non-ASCII byte.
                // Read an entire DWORD and fall through to multi-byte consumption logic.
                thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);
            }
        }

    BeforeProcessTwoByteSequence:

        // At this point, we suspect we're working with a multi-byte code unit sequence,
        // but we haven't yet validated it for well-formedness.

        // The masks and comparands are derived from the Unicode Standard, Table 3-6.
        // Additionally, we need to check for valid byte sequences per Table 3-7.

        // Check the 2-byte case.

        thisDWord -= (BitConverter.IsLittleEndian) ? 0x0000_80C0u : 0xC080_0000u;
        if ((thisDWord & (BitConverter.IsLittleEndian ? 0x0000_C0E0u : 0xE0C0_0000u)) == 0)
        {
            // Per Table 3-7, valid sequences are:
            // [ C2..DF ] [ 80..BF ]
            //
            // Due to our modification of 'thisDWord' above, this becomes:
            // [ 02..1F ] [ 00..3F ]
            //
            // We've already checked that the leading byte was originally in the range [ C0..DF ]
            // and that the trailing byte was originally in the range [ 80..BF ], so now we only need
            // to check that the modified leading byte is >= [ 02 ].

            if ((BitConverter.IsLittleEndian && (byte)thisDWord < 0x02u)
                || (!BitConverter.IsLittleEndian && thisDWord < 0x0200_0000u))
            {
                goto Error; // overlong form - leading byte was [ C0 ] or [ C1 ]
            }

        ProcessTwoByteSequenceSkipOverlongFormCheck:

            // Optimization: If this is a two-byte-per-character language like Cyrillic or Hebrew,
            // there's a good chance that if we see one two-byte run then there's another two-byte
            // run immediately after. Let's check that now.

            // On little-endian platforms, we can check for the two-byte UTF8 mask *and* validate that
            // the value isn't overlong using a single comparison. On big-endian platforms, we'll need
            // to validate the mask and validate that the sequence isn't overlong as two separate comparisons.

            if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
                || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord))))
            {
                // We have two runs of two bytes each.
                pInputBuffer += 4;
                tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 code units -> 2 UTF-16 code units (and 2 scalars)

                if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
                {
                    // Optimization: If we read a long run of two-byte sequences, the next sequence is probably
                    // also two bytes. Check for that first before going back to the beginning of the loop.

                    thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);

                    if (BitConverter.IsLittleEndian)
                    {
                        if (UInt32BeginsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord))
                        {
                            // The next sequence is a valid two-byte sequence.
                            goto ProcessTwoByteSequenceSkipOverlongFormCheck;
                        }
                    }
                    else
                    {
                        if (UInt32BeginsWithUtf8TwoByteMask(thisDWord))
                        {
                            if (UInt32BeginsWithOverlongUtf8TwoByteSequence(thisDWord))
                            {
                                goto Error; // The next sequence purports to be a 2-byte sequence but is overlong.
                            }

                            goto ProcessTwoByteSequenceSkipOverlongFormCheck;
                        }
                    }

                    // If we reached this point, the next sequence is something other than a valid
                    // two-byte sequence, so go back to the beginning of the loop.
                    goto AfterReadDWord;
                }
                else
                {
                    goto ProcessRemainingBytesSlow; // Running out of data - go down slow path
                }
            }

            // The buffer contains a 2-byte sequence followed by 2 bytes that aren't a 2-byte sequence.
            // Unlikely that a 3-byte sequence would follow a 2-byte sequence, so perhaps remaining
            // bytes are ASCII?

            tempUtf16CodeUnitCountAdjustment--; // 2-byte sequence + (some number of ASCII bytes) -> 1 UTF-16 code unit (and 1 scalar) [+ trailing]

            if (UInt32ThirdByteIsAscii(thisDWord))
            {
                if (UInt32FourthByteIsAscii(thisDWord))
                {
                    pInputBuffer += 4;
                }
                else
                {
                    pInputBuffer += 3;

                    // A two-byte sequence followed by an ASCII byte followed by a non-ASCII byte.
                    // Read in the next DWORD and jump directly to the start of the multi-byte processing block.

                    if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
                    {
                        thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                        goto BeforeProcessTwoByteSequence;
                    }
                }
            }
            else
            {
                pInputBuffer += 2;
            }

            continue;
        }

        // Check the 3-byte case.
        // We need to restore the C0 leading byte we stripped out earlier, then we can strip out the expected E0 byte.

        thisDWord -= (BitConverter.IsLittleEndian) ? (0x0080_00E0u - 0x0000_00C0u) : (0xE000_8000u - 0xC000_0000u);
        if ((thisDWord & (BitConverter.IsLittleEndian ? 0x00C0_C0F0u : 0xF0C0_C000u)) == 0)
        {
        ProcessThreeByteSequenceWithCheck:

            // We assume the caller has confirmed that the bit pattern is representative of a three-byte
            // sequence, but it may still be overlong or surrogate. We need to check for these possibilities.
            //
            // Per Table 3-7, valid sequences are:
            // [   E0   ] [ A0..BF ] [ 80..BF ]
            // [ E1..EC ] [ 80..BF ] [ 80..BF ]
            // [   ED   ] [ 80..9F ] [ 80..BF ]
            // [ EE..EF ] [ 80..BF ] [ 80..BF ]
            //
            // Big-endian examples of using the above validation table:
            // E0A0 = 1110 0000 1010 0000 => invalid (overlong ) patterns are 1110 0000 100# ####
            // ED9F = 1110 1101 1001 1111 => invalid (surrogate) patterns are 1110 1101 101# ####
            // If using the bitmask ......................................... 0000 1111 0010 0000 (=0F20),
            // Then invalid (overlong) patterns match the comparand ......... 0000 0000 0000 0000 (=0000),
            // And invalid (surrogate) patterns match the comparand ......... 0000 1101 0010 0000 (=0D20).
            //
            // It's ok if the caller has manipulated 'thisDWord' (e.g., by subtracting 0xE0 or 0x80)
            // as long as they haven't touched the bits we're about to use in our mask checking below.

            if (BitConverter.IsLittleEndian)
            {
                // The "overlong or surrogate" check can be implemented using a single jump, but there's
                // some overhead to moving the bits into the correct locations in order to perform the
                // correct comparison, and in practice the processor's branch prediction capability is
                // good enough that we shouldn't bother. So we'll use two jumps instead.

                // Can't extract this check into its own helper method because JITter produces suboptimal
                // assembly, even with aggressive inlining.

                // Code below becomes 5 instructions: test, jz, lea, test, jz

                if (((thisDWord & 0x0000_200Fu) == 0) || (((thisDWord - 0x0000_200Du) & 0x0000_200Fu) == 0))
                {
                    goto Error; // overlong or surrogate
                }
            }
            else
            {
                if (((thisDWord & 0x0F20_0000u) == 0) || (((thisDWord - 0x0D20_0000u) & 0x0F20_0000u) == 0))
                {
                    goto Error; // overlong or surrogate
                }
            }

        ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks:

            // Occasionally one-off ASCII characters like spaces, periods, or newlines will make their way
            // in to the text. If this happens strip it off now before seeing if the next character
            // consists of three code units.

            // Branchless: consume a 3-byte UTF-8 sequence and optionally an extra ASCII byte hanging off the end

            nint asciiAdjustment;
            if (BitConverter.IsLittleEndian)
            {
                asciiAdjustment = (int)thisDWord >> 31; // smear most significant bit across entire value
            }
            else
            {
                asciiAdjustment = (nint)(sbyte)thisDWord >> 7; // smear most significant bit of least significant byte across entire value
            }

            // asciiAdjustment = 0 if fourth byte is ASCII; -1 otherwise

            // Please *DO NOT* reorder the below two lines. It provides extra defense in depth in case this method
            // is ever changed such that pInputBuffer becomes a 'ref byte' instead of a simple 'byte*'. It's valid
            // to add 4 before backing up since we already checked previously that the input buffer contains at
            // least a DWORD's worth of data, so we're not going to run past the end of the buffer where the GC can
            // no longer track the reference. However, we can't back up before adding 4, since we might back up to
            // before the start of the buffer, and the GC isn't guaranteed to be able to track this.

            pInputBuffer += 4; // optimistically, assume consumed a 3-byte UTF-8 sequence plus an extra ASCII byte
            pInputBuffer += asciiAdjustment; // back up if we didn't actually consume an ASCII byte

            tempUtf16CodeUnitCountAdjustment -= 2; // 3 (or 4) UTF-8 bytes -> 1 (or 2) UTF-16 code unit (and 1 [or 2] scalar)

        SuccessfullyProcessedThreeByteSequence:

            if (IntPtr.Size >= 8 && BitConverter.IsLittleEndian)
            {
                // x64 little-endian optimization: A three-byte character could indicate CJK text,
                // which makes it likely that the character following this one is also CJK.
                // We'll try to process several three-byte sequences at a time.

                // The check below is really "can we read 9 bytes from the input buffer?" since 'pFinalPos...' is already offset
                // n.b. The subtraction below could result in a negative value (since we advanced pInputBuffer above), so
                // use nint instead of nuint.

                if ((nint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) >= 5)
                {
                    ulong thisQWord = Unsafe.ReadUnaligned <ulong>(pInputBuffer);

                    // Stage the next 32 bits into 'thisDWord' so that it's ready for us in case we need to jump backward
                    // to a previous location in the loop. This offers defense against reading main memory again (which may
                    // have been modified and could lead to a race condition).

                    thisDWord = (uint)thisQWord;

                    // Is this three 3-byte sequences in a row?
                    // thisQWord = [ 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ] [ 10xxxxxx ]
                    //               ---- CHAR 3  ----   --------- CHAR 2 ---------   --------- CHAR 1 ---------     -CHAR 3-
                    if ((thisQWord & 0xC0F0_C0C0_F0C0_C0F0ul) == 0x80E0_8080_E080_80E0ul && IsUtf8ContinuationByte(in pInputBuffer[8]))
                    {
                        // Saw a proper bitmask for three incoming 3-byte sequences, perform the
                        // overlong and surrogate sequence checking now.

                        // Check the first character.
                        // If the first character is overlong or a surrogate, fail immediately.

                        if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                        {
                            goto Error;
                        }

                        // Check the second character.
                        // At this point, we now know the first three bytes represent a well-formed sequence.
                        // If there's an error beyond here, we'll jump back to the "process three known good bytes"
                        // logic.

                        thisQWord >>= 24;
                        if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                        {
                            goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
                        }

                        // Check the third character (we already checked that it's followed by a continuation byte).

                        thisQWord >>= 24;
                        if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                        {
                            goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
                        }

                        pInputBuffer += 9;
                        tempUtf16CodeUnitCountAdjustment -= 6; // 9 UTF-8 bytes -> 3 UTF-16 code units (and 3 scalars)

                        goto SuccessfullyProcessedThreeByteSequence;
                    }

                    // Is this two 3-byte sequences in a row?
                    // thisQWord = [ ######## ######## | 10xxxxxx 10yyyyyy 1110zzzz | 10xxxxxx 10yyyyyy 1110zzzz ]
                    //                                   --------- CHAR 2 ---------   --------- CHAR 1 ---------
                    if ((thisQWord & 0xC0C0_F0C0_C0F0ul) == 0x8080_E080_80E0ul)
                    {
                        // Saw a proper bitmask for two incoming 3-byte sequences, perform the
                        // overlong and surrogate sequence checking now.

                        // Check the first character.
                        // If the first character is overlong or a surrogate, fail immediately.

                        if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                        {
                            goto Error;
                        }

                        // Check the second character.
                        // At this point, we now know the first three bytes represent a well-formed sequence.
                        // If there's an error beyond here, we'll jump back to the "process three known good bytes"
                        // logic.

                        thisQWord >>= 24;
                        if ((((uint)thisQWord & 0x200Fu) == 0) || ((((uint)thisQWord - 0x200Du) & 0x200Fu) == 0))
                        {
                            goto ProcessSingleThreeByteSequenceSkipOverlongAndSurrogateChecks;
                        }

                        pInputBuffer += 6;
                        tempUtf16CodeUnitCountAdjustment -= 4; // 6 UTF-8 bytes -> 2 UTF-16 code units (and 2 scalars)

                        // The next byte in the sequence didn't have a 3-byte marker, so it's probably
                        // an ASCII character. Jump back to the beginning of loop processing.

                        continue;
                    }

                    if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                    {
                        // A single three-byte sequence.
                        goto ProcessThreeByteSequenceWithCheck;
                    }
                    else
                    {
                        // Not a three-byte sequence; perhaps ASCII?
                        goto AfterReadDWord;
                    }
                }
            }

            if (pInputBuffer <= pFinalPosWhereCanReadDWordFromInputBuffer)
            {
                thisDWord = Unsafe.ReadUnaligned <uint>(pInputBuffer);

                // Optimization: A three-byte character could indicate CJK text, which makes it likely
                // that the character following this one is also CJK. We'll check for a three-byte sequence
                // marker now and jump directly to three-byte sequence processing if we see one, skipping
                // all of the logic at the beginning of the loop.

                if (UInt32BeginsWithUtf8ThreeByteMask(thisDWord))
                {
                    goto ProcessThreeByteSequenceWithCheck; // Found another [not yet validated] three-byte sequence; process
                }
                else
                {
                    goto AfterReadDWord; // Probably ASCII punctuation or whitespace; go back to start of loop
                }
            }
            else
            {
                goto ProcessRemainingBytesSlow; // Running out of data
            }
        }

        // Assume the 4-byte case, but we need to validate.

        if (BitConverter.IsLittleEndian)
        {
            thisDWord &= 0xC0C0_FFFFu;

            // After the above modifications earlier in this method, we expect 'thisDWord'
            // to have the structure [ 10000000 00000000 00uuzzzz 00010uuu ]. We'll now
            // perform two checks to confirm this. The first will verify the
            // [ 10000000 00000000 00###### ######## ] structure by taking advantage of two's
            // complement representation to perform a single *signed* integer check.

            if ((int)thisDWord > unchecked ((int)0x8000_3FFF))
            {
                goto Error; // didn't have three trailing bytes
            }

            // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
            // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).

            thisDWord = BitOperations.RotateRight(thisDWord, 8);

            // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ].
            // The check is now a simple add / cmp / jcc combo.

            if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1080_0010u, 0x1480_000Fu))
            {
                goto Error; // overlong or out-of-range
            }
        }
        else
        {
            thisDWord -= 0x80u;

            // After the above modifications earlier in this method, we expect 'thisDWord'
            // to have the structure [ 00010uuu 00uuzzzz 00yyyyyy 00xxxxxx ]. We'll now
            // perform two checks to confirm this. The first will verify the
            // [ ######## 00###### 00###### 00###### ] structure.

            if ((thisDWord & 0x00C0_C0C0u) != 0)
            {
                goto Error; // didn't have three trailing bytes
            }

            // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding)
            // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding).
            // This is a simple range check. (We don't care about the low two bytes.)

            if (!UnicodeUtility.IsInRangeInclusive(thisDWord, 0x1010_0000u, 0x140F_FFFFu))
            {
                goto Error; // overlong or out-of-range
            }
        }

        // Validation of 4-byte case complete.

        pInputBuffer += 4;
        tempUtf16CodeUnitCountAdjustment -= 2; // 4 UTF-8 bytes -> 2 UTF-16 code units
        tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar

        continue; // go back to beginning of loop for processing
    }

    goto ProcessRemainingBytesSlow;

ProcessInputOfLessThanDWordSize:

    Debug.Assert(inputLength < 4);
    nuint inputBufferRemainingBytes = (uint)inputLength;
    goto ProcessSmallBufferCommon;

ProcessRemainingBytesSlow:

    // 'pFinalPos...' sits 4 bytes before the end of the buffer, hence the "+ 4" to get the bytes left.
    inputBufferRemainingBytes = (nuint)(void *)Unsafe.ByteOffset(ref *pInputBuffer, ref *pFinalPosWhereCanReadDWordFromInputBuffer) + 4;

ProcessSmallBufferCommon:

    Debug.Assert(inputBufferRemainingBytes < 4);
    while (inputBufferRemainingBytes > 0)
    {
        uint firstByte = pInputBuffer[0];

        if ((byte)firstByte < 0x80u)
        {
            // 1-byte (ASCII) case
            pInputBuffer++;
            inputBufferRemainingBytes--;
            continue;
        }
        else if (inputBufferRemainingBytes >= 2)
        {
            uint secondByte = pInputBuffer[1]; // typed as 32-bit since we perform arithmetic (not just comparisons) on this value
            if ((byte)firstByte < 0xE0u)
            {
                // 2-byte case
                if ((byte)firstByte >= 0xC2u && IsLowByteUtf8ContinuationByte(secondByte))
                {
                    pInputBuffer += 2;
                    tempUtf16CodeUnitCountAdjustment--; // 2 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
                    inputBufferRemainingBytes -= 2;
                    continue;
                }
            }
            else if (inputBufferRemainingBytes >= 3)
            {
                // Fewer than 4 bytes remain here, so a lead byte of F0 or above (4-byte sequence)
                // cannot be satisfied and falls through to the error below.
                if ((byte)firstByte < 0xF0u)
                {
                    if ((byte)firstByte == 0xE0u)
                    {
                        if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0xA0u, 0xBFu))
                        {
                            goto Error; // overlong encoding
                        }
                    }
                    else if ((byte)firstByte == 0xEDu)
                    {
                        if (!UnicodeUtility.IsInRangeInclusive(secondByte, 0x80u, 0x9Fu))
                        {
                            goto Error; // would be a UTF-16 surrogate code point
                        }
                    }
                    else
                    {
                        if (!IsLowByteUtf8ContinuationByte(secondByte))
                        {
                            goto Error; // first trailing byte doesn't have proper continuation marker
                        }
                    }

                    if (IsUtf8ContinuationByte(in pInputBuffer[2]))
                    {
                        pInputBuffer += 3;
                        tempUtf16CodeUnitCountAdjustment -= 2; // 3 UTF-8 bytes -> 1 UTF-16 code unit (and 1 scalar)
                        inputBufferRemainingBytes -= 3;
                        continue;
                    }
                }
            }
        }

        // Error - no match.

        goto Error;
    }

    // If we reached this point, we're out of data, and we saw no bad UTF8 sequence.

#if DEBUG
    // Quick check that for the success case we're going to fulfill our contract of returning &inputBuffer[inputLength].
    Debug.Assert(pOriginalInputBuffer + originalInputLength == pInputBuffer, "About to return an unexpected value.");
#endif

Error:

    // Report back to our caller how far we got before seeing invalid data.
    // (Also used for normal termination when falling out of the loop above.)

    utf16CodeUnitCountAdjustment = tempUtf16CodeUnitCountAdjustment;
    scalarCountAdjustment = tempScalarCountAdjustment;
    return(pInputBuffer);
}
/// <summary>
/// Finds the index of the first element of <c>values</c> that is greater than or equal to
/// <c>limitToFind</c>, using 256-bit AVX comparisons when AVX is available and a plain
/// sequential scan otherwise.
/// </summary>
/// <returns>The zero-based index of the first matching element, or -1 when no element matches.</returns>
public int IndexOfFirstElementGreaterOrEqualToLimit_Avx()
{
    var values = this.values;
    float limit = this.limitToFind;

    if (Avx.IsSupported)
    {
        unsafe
        {
            fixed (float* valuesPtr = values)
            {
                const int BytesPerElement = sizeof(float) / sizeof(byte);

                // Number of leading elements that sit in front of the next 256-bit (32-byte) boundary.
                var alignmentOffset = (long)(uint)(-(int)valuesPtr / BytesPerElement) & (Vector256<float>.Count - 1);

                // The prologue below reads through the raw pointer, so it must never run past the
                // end of the array when the array is shorter than the distance to the boundary.
                if (alignmentOffset > values.Length)
                {
                    alignmentOffset = values.Length;
                }

                // handle first values sequentially until we hit the 256bit alignment boundary
                for (long i = 0; i < alignmentOffset; i++)
                {
                    if (*(valuesPtr + i) >= limit)
                    {
                        return (int)i;
                    }
                }

                var remainingLength = values.Length - alignmentOffset;
                var vectorizableLength = values.Length - remainingLength % (long)Vector256<float>.Count;

                // handle vectorizable items
                var limitVector = Vector256.Create(limit);
                for (var i = alignmentOffset; i < vectorizableLength; i += Vector256<float>.Count)
                {
                    var valuesVector = Avx.LoadAlignedVector256(valuesPtr + i);
                    var comparisonResultVector = Avx.Compare(valuesVector, limitVector, FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling);

                    // create int bitmask from vector bitmask
                    // the first bit (right-to-left) that is 1 indicates a comparison yielding true
                    var comparisonResult = (uint)Avx.MoveMask(comparisonResultVector);

                    if (comparisonResult == 0)
                    {
                        // no element of the vector matches the compare criteria
                        continue;
                    }

                    // A match was found. AVX support does not imply BMI1 support, so use the portable
                    // BitOperations helper (which has a software fallback) rather than calling the
                    // BMI1 intrinsic without a support check.
                    var matchedLocation = i + System.Numerics.BitOperations.TrailingZeroCount(comparisonResult);
                    return (int)matchedLocation;
                }

                // handle remaining items
                for (var i = (int)vectorizableLength; i < values.Length; i++)
                {
                    if (values[i] >= limit)
                    {
                        return i;
                    }
                }

                return -1;
            }
        }
    }
    else
    {
        for (int i = 0; i < values.Length; i++)
        {
            if (values[i] >= limit)
            {
                return i;
            }
        }

        return -1;
    }
}
/// <summary>
/// Struct-field scenario: runs <c>Bmi1.TrailingZeroCount</c> on this instance's <c>_fld</c> field and
/// asks <paramref name="testClass"/> to validate the outcome.
/// </summary>
/// <param name="testClass">The test object whose <c>ValidateResult</c> checks the computed value.</param>
public void RunStructFldScenario(ScalarUnaryOpTest__TrailingZeroCountUInt64 testClass)
{
    var actual = Bmi1.TrailingZeroCount(_fld);

    testClass.ValidateResult(_fld, actual);
}
/// <summary>
/// Class-field scenario: runs <c>Bmi1.TrailingZeroCount</c> on this instance's <c>_fld</c> field
/// and validates the outcome.
/// </summary>
public void RunClassFldScenario()
{
    var actual = Bmi1.TrailingZeroCount(_fld);

    ValidateResult(_fld, actual);
}
/// <summary>
/// Searches the cost matrix for an entry that is less than or equal to zero and whose row and
/// column are both uncovered.
/// </summary>
/// <remarks>
/// Columns are scanned in ascending order and rows in ascending order within each column, so the
/// first hit in column-major order is reported. The vectorized path and the scalar path visit
/// entries in the same order and therefore produce the same result.
/// </remarks>
/// <param name="costs">Matrix to search; read through <c>RowCount</c>, <c>ColumnCount</c> and <c>ColumnMajorBackingStore</c>.</param>
/// <param name="rowsCovered">Per-row flags; rows whose flag is <c>true</c> are ignored.</param>
/// <param name="colsCovered">Per-column flags; columns whose flag is <c>true</c> are ignored.</param>
/// <param name="zeroLocation">The (row, column) of the entry found, or <c>(-1, -1)</c> when none exists.</param>
/// <returns><c>true</c> when a qualifying entry was found; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="rowsCovered"/> or <paramref name="colsCovered"/> is null.</exception>
private static unsafe bool TryFindZero(Storage<float> costs, [NotNull] bool[] rowsCovered, [NotNull] bool[] colsCovered, out Location zeroLocation)
{
    if (rowsCovered == null)
    {
        throw new ArgumentNullException(nameof(rowsCovered));
    }

    if (colsCovered == null)
    {
        throw new ArgumentNullException(nameof(colsCovered));
    }

    // The vector path calls Bmi1.TrailingZeroCount, so BMI1 must be confirmed in addition to AVX2;
    // each instruction set is reported (and can be disabled) independently.
    if (Avx2.IsSupported && Bmi1.IsSupported && costs.RowCount >= Vector256<float>.Count)
    {
        var rowCount = costs.RowCount;
        var columnCount = costs.ColumnCount;
        var storage = costs.ColumnMajorBackingStore;

        // Largest row index (exclusive) that can be processed in whole 8-float batches.
        var maxVectorOffset = rowCount - rowCount % Vector256<float>.Count;
        var zeroVector = Vector256<float>.Zero;

        // One bitmask per batch of 8 rows; bit k is set when row (batchStart + k) is NOT covered.
        var coveredMasks = new int[maxVectorOffset / Vector256<float>.Count];
        for (var i = 0; i < maxVectorOffset; i += Vector256<float>.Count)
        {
            coveredMasks[i / Vector256<float>.Count] =
                (rowsCovered[i] ? 0 : 1)
                | (rowsCovered[i + 1] ? 0 : 2)
                | (rowsCovered[i + 2] ? 0 : 4)
                | (rowsCovered[i + 3] ? 0 : 8)
                | (rowsCovered[i + 4] ? 0 : 16)
                | (rowsCovered[i + 5] ? 0 : 32)
                | (rowsCovered[i + 6] ? 0 : 64)
                | (rowsCovered[i + 7] ? 0 : 128);
        }

        fixed (float* storagePtr = storage)
        {
            for (var column = 0; column < columnCount; column++)
            {
                if (!colsCovered[column])
                {
                    var basePtr = storagePtr + rowCount * column;

                    for (int row = 0, rowBatchIndex = 0; row < maxVectorOffset; row += Vector256<float>.Count, rowBatchIndex++)
                    {
                        var rowVector = Avx.LoadVector256(basePtr + row);
                        var comparisonResult = Avx.Compare(rowVector, zeroVector, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

                        // Bit k is set when the entry in lane k is <= 0.
                        var candidates = (uint)Avx.MoveMask(comparisonResult);
                        if (candidates == 0)
                        {
                            continue;
                        }

                        // Drop lanes that belong to covered rows.
                        candidates &= (uint)coveredMasks[rowBatchIndex];
                        if (candidates == 0)
                        {
                            continue;
                        }

                        // Lowest set bit is the first qualifying row of this batch.
                        var zeroRow = row + (int)Bmi1.TrailingZeroCount(candidates);
                        zeroLocation = new Location(zeroRow, column);
                        return true;
                    }

                    // Rows that did not fill a whole batch.
                    for (var i = maxVectorOffset; i < rowCount; i++)
                    {
                        if (!rowsCovered[i] && storage[column * rowCount + i] <= 0)
                        {
                            zeroLocation = new Location(i, column);
                            return true;
                        }
                    }
                }
            }
        }
    }
    else
    {
        for (var column = 0; column < costs.ColumnCount; column++)
        {
            if (colsCovered[column])
            {
                continue;
            }

            for (var row = 0; row < costs.RowCount; row++)
            {
                if (!rowsCovered[row] && costs.ColumnMajorBackingStore[column * costs.RowCount + row] <= 0)
                {
                    zeroLocation = new Location(row, column);
                    return true;
                }
            }
        }
    }

    zeroLocation = new Location(-1, -1);
    return false;
}
// internal for testing and benchmarking
/// <summary>
/// Scans <paramref name="len"/> chars starting at <paramref name="strPtr"/> for '\r', '\n',
/// <c>FirstChar</c> and, when their masks are non-zero, <c>SecondChar</c> / <c>ThirdChar</c>.
/// Each candidate position found is passed to <c>CheckValueSeparatorPresent</c>.
/// </summary>
/// <remarks>
/// Whole vectors of <c>CHARS_PER_VECTOR</c> chars are compared first, then an even-sized remainder
/// through a masked load, and finally a single left-over char with scalar compares.
/// NOTE(review): within one vector only the lowest-indexed candidate is handed to
/// <c>CheckValueSeparatorPresent</c>; presumably that helper accounts for later chars - confirm.
/// </remarks>
/// <param name="strPtr">Pointer to the first char to examine.</param>
/// <param name="len">Number of chars available at <paramref name="strPtr"/>.</param>
/// <returns>
/// The first value returned by <c>CheckValueSeparatorPresent</c> that is not -1, or -1 when no
/// candidate produced such a value.
/// </returns>
internal readonly unsafe int Avx2ContainsChar(char* strPtr, int len)
{
    const int CharsPerInt = sizeof(int) / sizeof(char);
    const int BitsPerInt = sizeof(int) * 8;

    var fullVectorCount = len / CHARS_PER_VECTOR;
    var leftoverChars = len % CHARS_PER_VECTOR;

    short* cursor = (short*)strPtr;

    var carriageReturns = Vector256.Create((short)'\r');
    var lineFeeds = Vector256.Create((short)'\n');
    var firstChars = Vector256.Create(FirstChar);
    var secondChars = Vector256.Create(SecondChar);
    var secondCharsEnabled = Vector256.Create(Char2Mask);
    var thirdChars = Vector256.Create(ThirdChar);
    var thirdCharsEnabled = Vector256.Create(Char3Mask);

    for (var vectorIx = 0; vectorIx < fullVectorCount; vectorIx++)
    {
        var chars = Avx.LoadVector256(cursor);
        cursor += CHARS_PER_VECTOR;

        // '\r', '\n' and FirstChar are always compared; no mask needed for them.
        var hits = Avx2.CompareEqual(carriageReturns, chars);
        hits = Avx2.Or(hits, Avx2.CompareEqual(lineFeeds, chars));
        hits = Avx2.Or(hits, Avx2.CompareEqual(firstChars, chars));

        // SecondChar and ThirdChar are optional: AND-ing with their masks discards the compare
        // when they are not set.
        hits = Avx2.Or(hits, Avx2.And(secondCharsEnabled, Avx2.CompareEqual(secondChars, chars)));
        hits = Avx2.Or(hits, Avx2.And(thirdCharsEnabled, Avx2.CompareEqual(thirdChars, chars)));

        // One bit per byte of hits (two bits per char); a matching char sets both of its bits.
        var byteMask = Avx2.MoveMask(hits.AsByte());
        var leadingMisses = (int)Bmi1.TrailingZeroCount((uint)byteMask);

        // A count of 32 means no bit was set, i.e. nothing in this vector matched.
        if (leadingMisses == BitsPerInt)
        {
            continue;
        }

        // Every 2 trailing zero bits correspond to one char that did not match.
        var charIx = vectorIx * CHARS_PER_VECTOR + leadingMisses / sizeof(char);
        var found = CheckValueSeparatorPresent(charIx, strPtr + charIx, len);
        if (found != -1)
        {
            return found;
        }
    }

    // Handle as much of the remainder as possible in parallel. The masked load works on ints,
    // so only an even number of chars can be taken and one char may be left over.
    if (leftoverChars >= 2)
    {
        var tailIntCount = leftoverChars / CharsPerInt;
        int* tailInts = (int*)cursor;

        // Pick the row of SUB_VECTOR_MASK that selects exactly tailIntCount ints.
        Vector256<short> tailMask;
        fixed (ushort* maskTable = SUB_VECTOR_MASK)
        {
            tailMask = Avx.LoadVector256((short*)maskTable + CHARS_PER_VECTOR * tailIntCount);
        }

        // The mask keeps the load from reading past the end of the buffer; masked-out lanes
        // should be zero but are treated as garbage.
        var chars = Avx2.MaskLoad(tailInts, tailMask.AsInt32()).AsInt16();

        var hits = Avx2.CompareEqual(carriageReturns, chars);
        hits = Avx2.Or(hits, Avx2.CompareEqual(lineFeeds, chars));
        hits = Avx2.Or(hits, Avx2.CompareEqual(firstChars, chars));
        hits = Avx2.Or(hits, Avx2.And(secondCharsEnabled, Avx2.CompareEqual(secondChars, chars)));
        hits = Avx2.Or(hits, Avx2.And(thirdCharsEnabled, Avx2.CompareEqual(thirdChars, chars)));

        // Clear any junk from lanes that were not loaded before inspecting the match bits.
        hits = Avx2.And(hits, tailMask);

        var byteMask = Avx2.MoveMask(hits.AsByte());
        var leadingMisses = (int)Bmi1.TrailingZeroCount((uint)byteMask);

        if (leadingMisses != BitsPerInt)
        {
            var charIx = fullVectorCount * CHARS_PER_VECTOR + leadingMisses / sizeof(char);
            var found = CheckValueSeparatorPresent(charIx, strPtr + charIx, len);
            if (found != -1)
            {
                return found;
            }
        }

        cursor = (short*)(tailInts + tailIntCount);
    }

    // An odd remainder leaves exactly one char that was not covered above.
    if ((leftoverChars % CharsPerInt) == 0)
    {
        return -1;
    }

    var finalChar = *cursor;
    var needEncode =
        finalChar == '\n'
        || finalChar == '\r'
        || finalChar == FirstChar
        || (Char2Mask != 0 && finalChar == SecondChar)
        || (Char3Mask != 0 && finalChar == ThirdChar);

    if (!needEncode)
    {
        return -1;
    }

    var finalIx = len - 1;
    return CheckValueSeparatorPresent(finalIx, strPtr + finalIx, len);
}
/// <summary>
/// Parses up to 16 decimal numbers from <paramref name="input"/> into the lanes of a 128-bit byte
/// vector, then repeatedly redistributes the largest lane over all lanes until a previously seen
/// state recurs.
/// </summary>
/// <remarks>
/// On completion <c>PartA</c> is the number of redistribution steps performed and <c>PartB</c> is
/// the number of steps between the two occurrences of the repeated state. Any byte below '0' is
/// treated as a number separator; a final number without a trailing separator is included too.
/// </remarks>
/// <param name="input">Raw input bytes; when null the method returns without setting results.</param>
protected override unsafe void ExecuteDay(byte[] input)
{
    if (input == null)
    {
        return;
    }

    // borrowed liberally from https://github.com/Voltara/advent2017-fast/blob/master/src/day06.c
    // 16-byte scratch buffer, also addressed as two ulongs, used to build vector operands.
    var bytes = stackalloc byte[Vector128<byte>.Count];
    var ulongs = (ulong*)bytes;

    var x = Vector128<byte>.Zero;
    int n = 0;
    var ctr = 0;
    var pendingDigits = false;
    for (int i = 0; i < input.Length && ctr < 16; i++)
    {
        if (input[i] < '0')
        {
            x = x.WithElement(ctr++, (byte)n);
            n = 0;
            pendingDigits = false;
        }
        else
        {
            n = n * 10 + (input[i] - '0');
            pendingDigits = true;
        }
    }

    // Input that ends in a digit (no trailing newline) would otherwise lose its last number.
    if (pendingDigits && ctr < 16)
    {
        x = x.WithElement(ctr, (byte)n);
    }

    // Maps each state to the step count at which it was first produced.
    var map = new Dictionary<Vector128<byte>, int>(capacity: PERFORMANCE_NOTE)
    {
        [x] = 0,
    };
    ctr = 0;

    // Byte-shuffle patterns used to fold the maximum across all 16 lanes.
    var mask1 = Vector128.Create(0x0607040502030001ul, 0x0e0f0c0d0a0b0809ul).AsByte();
    var mask2 = Vector128.Create(0x0405060700010203ul, 0x0c0d0e0f08090a0bul).AsByte();
    var mask3 = Vector128.Create(0x0001020304050607ul, 0x08090a0b0c0d0e0ful).AsByte();
    var mask4 = Vector128.Create(0x08090a0b0c0d0e0ful, 0x0001020304050607ul).AsByte();

    while (true)
    {
        // get max byte
        var tmp = Avx2.Max(x, Avx2.Shuffle(x, mask1));
        tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask2));
        tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask3));
        tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask4));

        // every byte in tmp should be max value
        var max = Avx2.Extract(tmp, 0);

        // where is it in the original? (lowest lane index holding the max)
        var idx = (int)Bmi1.TrailingZeroCount((uint)Avx2.MoveMask(Avx2.CompareEqual(x, tmp)));

        // subtract it from its original place
        // high is all ones when idx is in the upper 8 lanes, otherwise zero
        var high = (ulong)(long)-((idx & 0x08) >> 3);
        var shift = idx << 3;
        ulongs[0] = ((ulong)max << shift) & ~high;
        ulongs[1] = ((ulong)max << shift) & high;
        tmp = Avx2.Subtract(x, Avx2.LoadVector128(bytes));

        // over 16? add 1 to all
        high = (ulong)(long)-((max & 0x10) >> 4);
        ulongs[0] = high & 0x0101010101010101ul;
        ulongs[1] = high & 0x0101010101010101ul;
        tmp = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

        // spread remainder to all
        // bitmask however many we're adding
        max &= 0x0f;
        shift = max << 3;
        var isLong = (ulong)(long)-((max & 0x08) >> 3);
        var mask = (0x1ul << shift) - 1;
        var lowMask = isLong | mask;
        var highMask = isLong & mask;

        // rotate our start point
        var start = (idx + 1) & 0x0f;
        isLong = (ulong)(long)-((start & 0x08) >> 3);
        var tmpLow = (~isLong & lowMask) | (isLong & highMask);
        var tmpHigh = (isLong & lowMask) | (~isLong & highMask);
        var doShift = (ulong)((-(start & 0x07)) >> 4);
        shift = start << 3;
        lowMask = ((tmpLow << shift | tmpHigh >> (128 - shift)) & doShift) | (~doShift & tmpLow);
        highMask = ((tmpHigh << shift | tmpLow >> (128 - shift)) & doShift) | (~doShift & tmpHigh);

        // build our adders and add values
        ulongs[0] = 0x0101010101010101ul & lowMask;
        ulongs[1] = 0x0101010101010101ul & highMask;
        tmp = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

        x = tmp;
        ctr++;

        // Single lookup: a hit means this state was produced before, at step firstSeen.
        if (map.TryGetValue(x, out var firstSeen))
        {
            PartA = ctr.ToString();
            PartB = (ctr - firstSeen).ToString();
            return;
        }

        map[x] = ctr;
    }
}
/// <summary>
/// Given a 32-bit value that contains at least one non-ASCII byte, counts how many consecutive
/// ASCII bytes (high bit clear) precede the first non-ASCII byte, in memory order.
/// </summary>
/// <param name="value">Four packed bytes; must not be all-ASCII (asserted in debug builds).</param>
/// <returns>The number of leading ASCII bytes, between 0 and 3.</returns>
internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiData(uint value)
{
    Debug.Assert(!AllBytesInUInt32AreAscii(value), "Caller shouldn't provide an all-ASCII value.");

    // BMI1 is used directly instead of BitOperations: this path only pays off when a real tzcnt
    // instruction is emitted, and the specialized fallback below is faster than the generic
    // software fallback BitOperations would use.
    if (Bmi1.IsSupported)
    {
        Debug.Assert(BitConverter.IsLittleEndian);

        // Isolate the high bit of every byte; the position of the lowest one, divided by 8,
        // is the count of ASCII bytes before it.
        return Bmi1.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
    }

    // No tzcnt available. After inversion, a set high bit marks an ASCII byte. Each flag below is
    // 1 only while every byte examined so far has been ASCII; their sum is the answer.
    uint inverted = ~value;
    uint firstIsAscii;
    uint firstTwoAreAscii;
    uint firstThreeAreAscii;

    if (BitConverter.IsLittleEndian)
    {
        // High bits of bytes 0, 1 and 2 sit at bit positions 7, 15 and 23.
        firstIsAscii = (inverted >> 7) & 1;
        firstTwoAreAscii = firstIsAscii & (inverted >> 15);
        firstThreeAreAscii = firstTwoAreAscii & (inverted >> 23);
    }
    else
    {
        // BinaryPrimitives.ReverseEndianness is only an intrinsic on little-endian platforms, so
        // it would be too expensive here; rotate the relevant high bits into bit 0 instead.
        firstIsAscii = BitOperations.RotateLeft(inverted, 1) & 1;
        firstTwoAreAscii = firstIsAscii & BitOperations.RotateLeft(inverted, 9);
        firstThreeAreAscii = firstTwoAreAscii & BitOperations.RotateLeft(inverted, 17);
    }

    return firstIsAscii + firstTwoAreAscii + firstThreeAreAscii;
}