/// <summary>
/// Creates a <c>TestStruct</c> local through <c>TestStruct.Create()</c>, passes its
/// <c>_fld1</c> and <c>_fld2</c> fields straight to <c>Bmi2.ParallelBitDeposit</c>, and
/// forwards the operands together with the result to <c>ValidateResult</c>.
/// </summary>
public void RunStructLclFldScenario()
{
    var localStruct = TestStruct.Create();

    // The operands are deliberately read from the local's fields at the call site.
    var deposited = Bmi2.ParallelBitDeposit(localStruct._fld1, localStruct._fld2);

    ValidateResult(localStruct._fld1, localStruct._fld2, deposited);
}
/// <summary>
/// Instantiates a fresh <c>ScalarBinaryOpTest__ParallelBitDepositUInt32</c>, passes its
/// <c>_fld1</c> and <c>_fld2</c> fields to <c>Bmi2.ParallelBitDeposit</c>, and forwards the
/// operands together with the result to <c>ValidateResult</c>.
/// </summary>
public void RunLclFldScenario()
{
    var localInstance = new ScalarBinaryOpTest__ParallelBitDepositUInt32();

    // The operands are deliberately read from the local instance's fields at the call site.
    var deposited = Bmi2.ParallelBitDeposit(localInstance._fld1, localInstance._fld2);

    ValidateResult(localInstance._fld1, localInstance._fld2, deposited);
}
/// <summary>
/// Copies <c>_data1</c> and <c>_data2</c> into locals using unaligned reads, passes the
/// locals to <c>Bmi2.ParallelBitDeposit</c>, and forwards the locals together with the
/// result to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    // Each field is reinterpreted as a byte reference and read back as a UInt32.
    UInt32 source = Unsafe.ReadUnaligned<UInt32>(ref Unsafe.As<UInt32, byte>(ref _data1));
    UInt32 mask = Unsafe.ReadUnaligned<UInt32>(ref Unsafe.As<UInt32, byte>(ref _data2));

    var deposited = Bmi2.ParallelBitDeposit(source, mask);

    ValidateResult(source, mask, deposited);
}
/// <summary>
/// Announces the scenario by name, passes this instance's <c>_fld1</c> and <c>_fld2</c>
/// fields directly to <c>Bmi2.ParallelBitDeposit</c>, and forwards the operands together
/// with the result to <c>ValidateResult</c>.
/// </summary>
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    // The operands are deliberately the instance fields themselves, not local copies.
    var deposited = Bmi2.ParallelBitDeposit(_fld1, _fld2);

    ValidateResult(_fld1, _fld2, deposited);
}
/// <summary>
/// Performs a parallel bit deposit of <paramref name="x"/> under <paramref name="mask"/>.
/// </summary>
/// <param name="x">Value passed as the first operand.</param>
/// <param name="mask">Value passed as the second (mask) operand.</param>
/// <returns>
/// The result of <c>Bmi2.ParallelBitDeposit</c> when <c>Bmi2.IsSupported</c> is true;
/// otherwise the result of <c>ParallelBitDepositLogic</c>.
/// </returns>
public static uint ParallelBitDeposit(uint x, uint mask)
{
    // Prefer the hardware instruction; fall back to the software routine otherwise.
    return Bmi2.IsSupported
        ? Bmi2.ParallelBitDeposit(x, mask)
        : ParallelBitDepositLogic(x, mask);
}
/// <summary>
/// Feeds unaligned <c>UInt32</c> reads of <c>_data1</c> and <c>_data2</c> directly into
/// <c>Bmi2.ParallelBitDeposit</c> and forwards the two fields together with the result to
/// <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The reads stay nested inside the call; no intermediate locals are introduced.
    var deposited = Bmi2.ParallelBitDeposit(
        Unsafe.ReadUnaligned<UInt32>(ref Unsafe.As<UInt32, byte>(ref _data1)),
        Unsafe.ReadUnaligned<UInt32>(ref Unsafe.As<UInt32, byte>(ref _data2)));

    ValidateResult(_data1, _data2, deposited);
}
/// <summary>
/// Announces the scenario by name, creates a <c>TestStruct</c> local through
/// <c>TestStruct.Create()</c>, passes its <c>_fld1</c> and <c>_fld2</c> fields to
/// <c>Bmi2.ParallelBitDeposit</c>, and forwards the operands together with the result to
/// <c>ValidateResult</c>.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    var localStruct = TestStruct.Create();

    // The operands are deliberately read from the local's fields at the call site.
    var deposited = Bmi2.ParallelBitDeposit(localStruct._fld1, localStruct._fld2);

    ValidateResult(localStruct._fld1, localStruct._fld2, deposited);
}
/// <summary>
/// Passes <c>_clsVar1</c> and <c>_clsVar2</c> directly to <c>Bmi2.ParallelBitDeposit</c>
/// and forwards the same two variables together with the result to <c>ValidateResult</c>.
/// </summary>
public void RunClsVarScenario()
{
    // The operands are deliberately the class variables themselves, not local copies.
    var deposited = Bmi2.ParallelBitDeposit(_clsVar1, _clsVar2);

    ValidateResult(_clsVar1, _clsVar2, deposited);
}
/// <summary>
/// Announces the scenario by name, instantiates a fresh
/// <c>ScalarBinaryOpTest__ParallelBitDepositUInt32</c>, passes its <c>_fld1</c> and
/// <c>_fld2</c> fields to <c>Bmi2.ParallelBitDeposit</c>, and forwards the operands together
/// with the result to <c>ValidateResult</c>.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    var localInstance = new ScalarBinaryOpTest__ParallelBitDepositUInt32();

    // The operands are deliberately read from the local instance's fields at the call site.
    var deposited = Bmi2.ParallelBitDeposit(localInstance._fld1, localInstance._fld2);

    ValidateResult(localInstance._fld1, localInstance._fld2, deposited);
}
/// <summary>
/// Converts the low 16 bits of <paramref name="value"/> into four uppercase hexadecimal
/// ASCII digits, one per byte of the returned value.
/// </summary>
/// <param name="value">
/// Input whose low four nibbles (<c>0x0000WXYZ</c>) are converted; only 16 bits are
/// consumed because the deposit mask has 16 bits set.
/// </param>
/// <returns>
/// A 32-bit value whose bytes, from most to least significant, are the ASCII codes of the
/// hex digits W, X, Y and Z.
/// </returns>
private static uint UInt16ToUpperHexWithBmi2(uint value)
{
    Debug.Assert(Bmi2.IsSupported, "This code path shouldn't have gotten hit unless BMI2 was supported.");

    // Spread 0x0000WXYZ out to 0x0W0X0Y0Z so that every byte holds one nibble (0..15).
    uint nibbles = Bmi2.ParallelBitDeposit(value, 0x0F0F0F0Fu);

    // Per byte: 0x89 - n lands in 0x80..0x89 for n <= 9 and in 0x7A..0x7F for n >= 10, so
    // masking with 0x70 and shifting right by 4 yields 0 for decimal digits and 7 for A..F.
    // No byte ever borrows from its neighbour because 0x89 is larger than any nibble.
    uint letterOffset = ((0x89898989u - nibbles) & 0x70707070u) >> 4;

    // '0' is 0x30, and 'A' is 0x30 + 10 + 7, so adding the offset selects the right range.
    return letterOffset + nibbles + 0x30303030u;
}
/// <summary>
/// Announces the scenario by name, copies <c>_data1</c> and <c>_data2</c> into locals using
/// unaligned reads, passes the locals to <c>Bmi2.ParallelBitDeposit</c>, and forwards the
/// locals together with the result to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    // Each field is reinterpreted as a byte reference and read back as a UInt32.
    UInt32 source = Unsafe.ReadUnaligned<UInt32>(ref Unsafe.As<UInt32, byte>(ref _data1));
    UInt32 mask = Unsafe.ReadUnaligned<UInt32>(ref Unsafe.As<UInt32, byte>(ref _data2));

    var deposited = Bmi2.ParallelBitDeposit(source, mask);

    ValidateResult(source, mask, deposited);
}
/// <summary>
/// Announces the scenario by name, feeds unaligned <c>UInt64</c> reads of <c>_data1</c> and
/// <c>_data2</c> directly into the 64-bit parallel bit deposit intrinsic, and forwards the
/// two fields together with the result to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    // The 64-bit overload lives on the nested Bmi2.X64 class; Bmi2 itself only exposes the
    // (uint, uint) overload, so UInt64 operands cannot bind to Bmi2.ParallelBitDeposit.
    var result = Bmi2.X64.ParallelBitDeposit(
        Unsafe.ReadUnaligned<UInt64>(ref Unsafe.As<UInt64, byte>(ref _data1)),
        Unsafe.ReadUnaligned<UInt64>(ref Unsafe.As<UInt64, byte>(ref _data2))
    );

    ValidateResult(_data1, _data2, result);
}
/// <summary>
/// Announces the scenario by name, passes <c>_clsVar1</c> and <c>_clsVar2</c> directly to
/// <c>Bmi2.ParallelBitDeposit</c>, and forwards the same two variables together with the
/// result to <c>ValidateResult</c>.
/// </summary>
public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    // The operands are deliberately the class variables themselves, not local copies.
    var deposited = Bmi2.ParallelBitDeposit(_clsVar1, _clsVar2);

    ValidateResult(_clsVar1, _clsVar2, deposited);
}
/// <summary>
/// Repeatedly applies <c>Bmi2.ParallelBitDeposit</c> until cancellation is requested and
/// reports how many outer passes completed.
/// </summary>
/// <param name="cancellationToken">Token polled once per outer pass to stop the loop.</param>
/// <returns>
/// <c>0</c> when <c>Bmi2.IsSupported</c> is false; otherwise the number of completed outer
/// passes, each of which performs <c>LENGTH</c> chained deposit operations.
/// </returns>
public override ulong Run(CancellationToken cancellationToken)
{
    if (!Bmi2.IsSupported)
    {
        return 0uL;
    }

    ulong completedPasses = 0uL;
    var accumulator = randomInt;

    while (!cancellationToken.IsCancellationRequested)
    {
        // Each result feeds the next call, so the operations form a dependency chain.
        for (int step = 0; step < LENGTH; step++)
        {
            accumulator = Bmi2.ParallelBitDeposit(accumulator, anotherRandomInt);
        }

        completedPasses++;
    }

    // Adding and subtracting the accumulator leaves the count unchanged while still
    // making the return value depend on the computed result.
    return completedPasses + accumulator - accumulator;
}
/// <summary>
/// Passes this instance's <c>_fld1</c> and <c>_fld2</c> fields directly to
/// <c>Bmi2.ParallelBitDeposit</c> and forwards the operands together with the result to
/// <c>ValidateResult</c>.
/// </summary>
public void RunFldScenario()
{
    // The operands are deliberately the instance fields themselves, not local copies.
    var deposited = Bmi2.ParallelBitDeposit(_fld1, _fld2);

    ValidateResult(_fld1, _fld2, deposited);
}
/// <summary>
/// Widens <paramref name="count"/> bytes starting at <paramref name="input"/> into UTF-16
/// chars written to <paramref name="output"/>, validating every chunk with
/// <c>CheckBytesInAsciiRange</c> before it is written.
/// </summary>
/// <param name="input">Pointer to the first source byte; must not be null.</param>
/// <param name="output">Pointer to the first destination char; must not be null.</param>
/// <param name="count">Number of bytes to convert.</param>
/// <returns>
/// <c>true</c> when every chunk passed <c>CheckBytesInAsciiRange</c>; <c>false</c> as soon
/// as one chunk fails. On failure, chunks processed before the failing one have already
/// been written to <paramref name="output"/>.
/// </returns>
public static unsafe bool TryGetAsciiString(byte* input, char* output, int count)
{
    Debug.Assert(input != null);
    Debug.Assert(output != null);

    byte* inputEnd = input + count;

    // Presumably guards the "inputEnd - N" pointer subtractions below against wrapping.
    Debug.Assert((long)inputEnd >= Vector256<sbyte>.Count);

    if (Sse2.IsSupported)
    {
        // 32 bytes per iteration while a full 256-bit block remains.
        if (Avx2.IsSupported && input <= inputEnd - Vector256<sbyte>.Count)
        {
            Vector256<sbyte> zeroes = Vector256<sbyte>.Zero;

            do
            {
                Vector256<sbyte> block = Avx.LoadVector256(input).AsSByte();
                if (!CheckBytesInAsciiRange(block, zeroes))
                {
                    return false;
                }

                // Interleave with zero bytes to widen each byte to 16 bits.
                Vector256<sbyte> lowInterleaved = Avx2.UnpackLow(block, zeroes);
                Vector256<sbyte> highInterleaved = Avx2.UnpackHigh(block, zeroes);

                // Bring into the right order
                Vector256<sbyte> firstChars = Avx2.Permute2x128(lowInterleaved, highInterleaved, 0x20);
                Vector256<sbyte> secondChars = Avx2.Permute2x128(lowInterleaved, highInterleaved, 0x31);

                Avx.Store((ushort*)output, firstChars.AsUInt16());
                Avx.Store((ushort*)output + Vector256<ushort>.Count, secondChars.AsUInt16());

                input += Vector256<sbyte>.Count;
                output += Vector256<sbyte>.Count;
            }
            while (input <= inputEnd - Vector256<sbyte>.Count);

            if (input == inputEnd)
            {
                return true;
            }
        }

        // 16 bytes per iteration while a full 128-bit block remains.
        if (input <= inputEnd - Vector128<sbyte>.Count)
        {
            Vector128<sbyte> zeroes = Vector128<sbyte>.Zero;

            do
            {
                Vector128<sbyte> block = Sse2.LoadVector128(input).AsSByte();
                if (!CheckBytesInAsciiRange(block, zeroes))
                {
                    return false;
                }

                Vector128<ushort> firstChars = Sse2.UnpackLow(block, zeroes).AsUInt16();
                Vector128<ushort> secondChars = Sse2.UnpackHigh(block, zeroes).AsUInt16();

                Sse2.Store((ushort*)output, firstChars);
                Sse2.Store((ushort*)output + Vector128<ushort>.Count, secondChars);

                input += Vector128<sbyte>.Count;
                output += Vector128<sbyte>.Count;
            }
            while (input <= inputEnd - Vector128<sbyte>.Count);

            if (input == inputEnd)
            {
                return true;
            }
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        // Portable vector path: one Vector<sbyte> of input per iteration.
        while (input <= inputEnd - Vector<sbyte>.Count)
        {
            Vector<sbyte> block = Unsafe.AsRef<Vector<sbyte>>(input);
            if (!CheckBytesInAsciiRange(block))
            {
                return false;
            }

            Vector.Widen(
                block,
                out Unsafe.AsRef<Vector<short>>(output),
                out Unsafe.AsRef<Vector<short>>(output + Vector<short>.Count));

            input += Vector<sbyte>.Count;
            output += Vector<sbyte>.Count;
        }

        if (input == inputEnd)
        {
            return true;
        }
    }

    if (Environment.Is64BitProcess) // Use Intrinsic switch for branch elimination
    {
        // 64-bit: Loop longs by default
        while (input <= inputEnd - sizeof(long))
        {
            long packed = *(long*)input;
            if (!CheckBytesInAsciiRange(packed))
            {
                return false;
            }

            if (Bmi2.X64.IsSupported)
            {
                // BMI2 will work regardless of the processor's endianness.
                // Each deposit spreads four source bytes into the low byte of four chars.
                ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)packed, 0x00FF00FF_00FF00FFul);
                ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(packed >> 32), 0x00FF00FF_00FF00FFul);
            }
            else
            {
                output[0] = (char)input[0];
                output[1] = (char)input[1];
                output[2] = (char)input[2];
                output[3] = (char)input[3];
                output[4] = (char)input[4];
                output[5] = (char)input[5];
                output[6] = (char)input[6];
                output[7] = (char)input[7];
            }

            input += sizeof(long);
            output += sizeof(long);
        }

        // At most one 4-byte group can remain after the 8-byte loop.
        if (input <= inputEnd - sizeof(int))
        {
            int packed = *(int*)input;
            if (!CheckBytesInAsciiRange(packed))
            {
                return false;
            }

            if (Bmi2.IsSupported)
            {
                // BMI2 will work regardless of the processor's endianness.
                ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)packed, 0x00FF00FFu);
                ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(packed >> 16), 0x00FF00FFu);
            }
            else
            {
                output[0] = (char)input[0];
                output[1] = (char)input[1];
                output[2] = (char)input[2];
                output[3] = (char)input[3];
            }

            input += sizeof(int);
            output += sizeof(int);
        }
    }
    else
    {
        // 32-bit: Loop ints by default
        while (input <= inputEnd - sizeof(int))
        {
            int packed = *(int*)input;
            if (!CheckBytesInAsciiRange(packed))
            {
                return false;
            }

            if (Bmi2.IsSupported)
            {
                // BMI2 will work regardless of the processor's endianness.
                ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)packed, 0x00FF00FFu);
                ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(packed >> 16), 0x00FF00FFu);
            }
            else
            {
                output[0] = (char)input[0];
                output[1] = (char)input[1];
                output[2] = (char)input[2];
                output[3] = (char)input[3];
            }

            input += sizeof(int);
            output += sizeof(int);
        }
    }

    // Tail: at most one 2-byte group followed by at most one single byte.
    if (input <= inputEnd - sizeof(short))
    {
        if (!CheckBytesInAsciiRange(((short*)input)[0]))
        {
            return false;
        }

        output[0] = (char)input[0];
        output[1] = (char)input[1];

        input += sizeof(short);
        output += sizeof(short);
    }

    if (input < inputEnd)
    {
        if (!CheckBytesInAsciiRange(((sbyte*)input)[0]))
        {
            return false;
        }

        output[0] = (char)input[0];
    }

    return true;
}
/// <summary>
/// Converts a well-formed UTF-16 surrogate pair packed into a machine-endian 32-bit value
/// into the corresponding four-byte UTF-8 sequence, also packed in machine-endian order.
/// </summary>
/// <param name="value">
/// The packed surrogate pair. On little-endian machines the low surrogate occupies the
/// upper 16 bits and the high surrogate the lower 16 bits; on big-endian machines the
/// high surrogate occupies the upper 16 bits.
/// </param>
/// <returns>
/// The four UTF-8 code units packed so that, once stored in native byte order, they read
/// <c>11110uuu 10uuzzzz 10yyyyyy 10xxxxxx</c> in memory.
/// </returns>
internal static uint ExtractFourUtf8BytesFromSurrogatePair(uint value)
{
    Debug.Assert(IsWellFormedUtf16SurrogatePair(value));

    if (BitConverter.IsLittleEndian)
    {
        // input = [ 110111yyyyxxxxxx 110110wwwwzzzzyy ] = scalar (000uuuuu zzzzyyyy yyxxxxxx)
        // must return [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ], where wwww = uuuuu - 1

        if (Bmi2.IsSupported)
        {
            // Since pdep and pext have high latencies and can only be dispatched to a single execution port, we want
            // to use them conservatively. Here, we'll build up the scalar value (this would normally be pext) via simple
            // logical and arithmetic operations, and use only pdep for the expensive step of exploding the scalar across
            // all four output bytes.

            uint unmaskedScalar = (value << 10) + (value >> 16) + ((0x40u) << 10) /* uuuuu = wwww + 1 */ - 0xDC00u /* remove low surrogate marker */;

            // Now, unmaskedScalar = [ xxxxxx11 011uuuuu zzzzyyyy yyxxxxxx ]. There's a bit of unneeded junk at the beginning
            // that should normally be masked out via an and, but we'll just direct pdep to ignore it.

            uint exploded = Bmi2.ParallelBitDeposit(unmaskedScalar, 0b00000111_00111111_00111111_00111111u); // = [ 00000uuu 00uuzzzz 00yyyyyy 00xxxxxx ]
            return BinaryPrimitives.ReverseEndianness(exploded + 0xF080_8080u); // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
        }
        else
        {
            value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ]

            uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ]
            tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ]

            uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ]
            uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ]
            tempC |= tempB;

            uint tempD = (value & 0x03u) << 20; // = [ 00000000 00yy0000 00000000 00000000 ]
            tempD |= 0x8080_80F0u;

            return tempD | tempA | tempC; // = [ 10xxxxxx 10yyyyyy 10uuzzzz 11110uuu ]
        }
    }
    else
    {
        // input = [ 110110wwwwzzzzyy 110111yyyyxxxxxx ], where wwww = uuuuu - 1
        // must return [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ], where wwww = uuuuu - 1

        value -= 0xD800_DC00u; // = [ 000000wwwwzzzzyy 000000yyyyxxxxxx ]
        value += 0x0040_0000u; // = [ 00000uuuuuzzzzyy 000000yyyyxxxxxx ]

        uint tempA = value & 0x0700_0000u; // = [ 00000uuu 00000000 00000000 00000000 ]
        uint tempB = (value >> 2) & 0x003F_0000u; // = [ 00000000 00uuzzzz 00000000 00000000 ]
        tempB |= tempA;

        uint tempC = (value << 2) & 0x0000_0F00u; // = [ 00000000 00000000 0000yyyy 00000000 ]

        // The two high 'y' bits sit at bits 17..16 of value and belong at bits 13..12 of the
        // result, i.e. a right shift by 4 under mask 0x3000. (Shifting by 6 under mask
        // 0x0003_0000 would instead copy the low two 'u' bits on top of the 'zzzz' byte.)
        uint tempD = (value >> 4) & 0x0000_3000u; // = [ 00000000 00000000 00yy0000 00000000 ]
        tempD |= tempC;

        uint tempE = (value & 0x3Fu) + 0xF080_8080u; // = [ 11110000 10000000 10000000 10xxxxxx ]
        return tempE | tempB | tempD; // = [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
    }
}
/// <summary>
/// Passes this instance's <c>_fld1</c> and <c>_fld2</c> fields directly to
/// <c>Bmi2.ParallelBitDeposit</c> and asks <paramref name="testClass"/> to validate the
/// operands together with the result.
/// </summary>
/// <param name="testClass">Test instance whose <c>ValidateResult</c> checks the outcome.</param>
public void RunStructFldScenario(ScalarBinaryOpTest__ParallelBitDepositUInt64 testClass)
{
    // NOTE(review): the field types are declared outside this view. If they are UInt64, as
    // the parameter's type name suggests, the shipped 64-bit overload is on Bmi2.X64 - confirm.
    var deposited = Bmi2.ParallelBitDeposit(_fld1, _fld2);

    testClass.ValidateResult(_fld1, _fld2, deposited);
}