/// <summary>
/// ANDs two <c>Vector256&lt;Byte&gt;</c> operands that are loaded with
/// <c>Avx.LoadVector256</c> inline as call arguments, writes the result to the
/// output buffer and hands inputs and output to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_Load()
{
    // Both operands are loaded inline; no intermediate locals are introduced.
    var anded = Avx2.And(
        Avx.LoadVector256((Byte*)(_dataTable.inArray1Ptr)),
        Avx.LoadVector256((Byte*)(_dataTable.inArray2Ptr)));

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// ANDs the two vector fields of a locally created <c>TestStruct</c>, writes
/// the result to the output buffer and passes fields and output to
/// <c>ValidateResult</c>.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    // Operands are read straight from the fields of a struct local.
    var local = TestStruct.Create();
    var anded = Avx2.And(local._fld1, local._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(local._fld1, local._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// ANDs two <c>Vector256&lt;Byte&gt;</c> operands that are read with
/// <c>Unsafe.Read</c> inline as call arguments, writes the result to the
/// output buffer and hands inputs and output to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // Both operands are read inline; no intermediate locals are introduced.
    var anded = Avx2.And(
        Unsafe.Read<Vector256<Byte>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector256<Byte>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// ANDs the two vector fields of a freshly constructed
/// <c>SimpleBinaryOpTest__AndInt64</c> instance, writes the result to the
/// output buffer and passes fields and output to <c>ValidateResult</c>.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    // Operands are read straight from the fields of a class-typed local.
    var instance = new SimpleBinaryOpTest__AndInt64();
    var anded = Avx2.And(instance._fld1, instance._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads both <c>UInt32</c> operands into locals with
/// <c>Avx.LoadAlignedVector256</c>, ANDs the locals, writes the result to the
/// output buffer and passes locals and output to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var lhs = Avx.LoadAlignedVector256((UInt32*)(_dataTable.inArray1Ptr));
    var rhs = Avx.LoadAlignedVector256((UInt32*)(_dataTable.inArray2Ptr));
    var anded = Avx2.And(lhs, rhs);

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads both <c>Vector256&lt;Byte&gt;</c> operands into locals with
/// <c>Unsafe.Read</c>, ANDs the locals, writes the result to the output
/// buffer and passes locals and output to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    var first = Unsafe.Read<Vector256<Byte>>(_dataTable.inArray1Ptr);
    var second = Unsafe.Read<Vector256<Byte>>(_dataTable.inArray2Ptr);
    var anded = Avx2.And(first, second);

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(first, second, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads both <c>Vector256&lt;UInt32&gt;</c> operands into locals with
/// <c>Unsafe.Read</c>, ANDs the locals, writes the result to the output
/// buffer and passes locals and output to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    var lhs = Unsafe.Read<Vector256<UInt32>>(_dataTable.inArray1Ptr);
    var rhs = Unsafe.Read<Vector256<UInt32>>(_dataTable.inArray2Ptr);
    var anded = Avx2.And(lhs, rhs);

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads both <c>Int32</c> operands into locals with
/// <c>Avx.LoadVector256</c>, ANDs the locals, writes the result to the output
/// buffer and passes locals and output to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

    var first = Avx.LoadVector256((Int32*)(_dataTable.inArray1Ptr));
    var second = Avx.LoadVector256((Int32*)(_dataTable.inArray2Ptr));
    var anded = Avx2.And(first, second);

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(first, second, _dataTable.outArrayPtr);
}
/// <summary>
/// ANDs two <c>UInt32</c> vectors that are loaded with
/// <c>Avx.LoadVector256</c> inline as call arguments, writes the result to the
/// output buffer and hands inputs and output to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

    // Both operands are loaded inline; no intermediate locals are introduced.
    var anded = Avx2.And(
        Avx.LoadVector256((UInt32*)(_dataTable.inArray1Ptr)),
        Avx.LoadVector256((UInt32*)(_dataTable.inArray2Ptr)));

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// ANDs two <c>Vector256&lt;UInt16&gt;</c> operands that are read with
/// <c>Unsafe.Read</c> inline as call arguments, writes the result to the
/// output buffer and hands inputs and output to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    // Both operands are read inline; no intermediate locals are introduced.
    var anded = Avx2.And(
        Unsafe.Read<Vector256<UInt16>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector256<UInt16>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// ANDs the two class-level vectors <c>_clsVar1</c> and <c>_clsVar2</c>,
/// writes the result to the output buffer and passes both vectors and the
/// output to <c>ValidateResult</c>.
/// </summary>
public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    // Operands are passed directly from the class variables.
    var anded = Avx2.And(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
/// <summary>
/// Decodes 32 input bytes into 24 output bytes: 32 is subtracted from every
/// input byte, the low 6 bits of each result are kept, and every group of four
/// 6-bit values is packed into three consecutive output bytes.
/// </summary>
/// <param name="source">Pointer to at least 32 readable bytes.</param>
/// <param name="dest">
/// Pointer to the output. Two 16-byte stores are issued at <c>dest</c> and
/// <c>dest + 12</c>, so 28 bytes are written; the last 4 of them are zero padding.
/// </param>
public static unsafe void Decode32Bytes(byte* source, byte* dest)
{
    // Remove the +32 bias from every encoded byte (adding -32 per byte).
    Vector256<byte> bias = Vector256.Create((sbyte)-32).AsByte();
    Vector256<byte> encoded = Unsafe.As<byte, Vector256<byte>>(ref source[0]);
    Vector256<byte> unbiased = Avx2.Add(encoded, bias);

    // Isolate the 6-bit payload of each of the four bytes inside every 32-bit element.
    Vector256<uint> sextet0 = Avx2.And(unbiased, Vector256.Create((uint)0x0000_003f).AsByte()).AsUInt32();
    Vector256<uint> sextet1 = Avx2.And(unbiased, Vector256.Create((uint)0x0000_3f00).AsByte()).AsUInt32();
    Vector256<uint> sextet2 = Avx2.And(unbiased, Vector256.Create((uint)0x003f_0000).AsByte()).AsUInt32();
    Vector256<uint> sextet3 = Avx2.And(unbiased, Vector256.Create((uint)0x3f00_0000).AsByte()).AsUInt32();

    // Move each payload to its place in the 24-bit result:
    //   00000000 00000000 00000000 00aaaaaa -> 00000000 aaaaaa00 00000000 00000000
    //   00000000 00000000 00bbbbbb 00000000 -> 00000000 000000bb bbbb0000 00000000
    //   00000000 00cccccc 00000000 00000000 -> 00000000 00000000 0000cccc cc000000
    //   00dddddd 00000000 00000000 00000000 -> 00000000 00000000 00000000 00dddddd
    Vector256<uint> upperHalf = Avx2.Or(
        Avx2.ShiftLeftLogical(sextet0, 18),
        Avx2.ShiftLeftLogical(sextet1, 4));
    Vector256<uint> lowerHalf = Avx2.Or(
        Avx2.ShiftRightLogical(sextet2, 10),
        Avx2.ShiftRightLogical(sextet3, 24));

    // After the OR every 32-bit element holds 00000000 aaaaaabb bbbbcccc ccdddddd.
    Vector256<byte> packed = Avx2.Or(upperHalf, lowerHalf).AsByte();

    // Per 128-bit lane the bytes are [C,B,A,0, F,E,D,0, I,H,G,0, L,K,J,0];
    // reorder them to [A,B,C, D,E,F, G,H,I, J,K,L, 0,0,0,0].
    // An index of 0x80 produces a zero byte.
    var compactingShuffle = Vector256.Create(
        0x02, 0x01, 0x00,
        0x06, 0x05, 0x04,
        0x0a, 0x09, 0x08,
        0x0e, 0x0d, 0x0c,
        0x80, 0x80, 0x80, 0x80,
        0x12, 0x11, 0x10,
        0x16, 0x15, 0x14,
        0x1a, 0x19, 0x18,
        0x1e, 0x1d, 0x1c,
        0x80, 0x80, 0x80, 0x80);
    var compacted = Avx2.Shuffle(packed, compactingShuffle);

    // The second store starts 12 bytes in, overwriting the zero padding of the first.
    Sse2.Store(dest, compacted.GetLower());
    Sse2.Store(dest + 12, compacted.GetUpper());
}
/// <summary>
/// Loads the two vector fields of a locally created <c>TestStruct</c> through
/// their addresses with <c>Avx.LoadVector256</c>, ANDs them, writes the result
/// to the output buffer and passes fields and output to <c>ValidateResult</c>.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var local = TestStruct.Create();

    // Operands are loaded inline from the addresses of the struct local's fields.
    var anded = Avx2.And(
        Avx.LoadVector256((UInt16*)(&local._fld1)),
        Avx.LoadVector256((UInt16*)(&local._fld2)));

    Unsafe.Write(_dataTable.outArrayPtr, anded);

    ValidateResult(local._fld1, local._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Multiplies two byte vectors element-wise, keeping the low 8 bits of each
/// product.
/// </summary>
/// <param name="left">First factor (32 bytes).</param>
/// <param name="right">Second factor (32 bytes).</param>
/// <returns>
/// A vector whose byte <c>i</c> is <c>(byte)(left[i] * right[i])</c>.
/// </returns>
/// <remarks>
/// The work is done on 16-bit elements: the even-indexed bytes are obtained
/// from the low byte of the 16-bit product, the odd-indexed bytes by
/// multiplying the two high bytes and shifting the result back up.
/// </remarks>
public static Vector256<byte> op_Multiply(Vector256<byte> left, Vector256<byte> right)
{
    // Reinterpret as 16-bit elements with the typed helper, matching the
    // AsByte() conversions used below.
    Vector256<ushort> left16 = left.AsUInt16();
    Vector256<ushort> right16 = right.AsUInt16();

    // Low byte of each 16-bit product depends only on the two low bytes.
    Vector256<ushort> lowByteMask = Vector256.Create((ushort)0x00FF);
    Vector256<byte> evenProducts = Avx2.And(lowByteMask, Avx2.MultiplyLow(left16, right16)).AsByte();

    // Bring the high bytes down, multiply, and move the low 8 bits of that
    // product back into the high byte (the shift discards the upper 8 bits).
    Vector256<ushort> highTimesHigh = Avx2.MultiplyLow(
        Avx2.ShiftRightLogical(left16, 8),
        Avx2.ShiftRightLogical(right16, 8));
    Vector256<byte> oddProducts = Avx2.ShiftLeftLogical(highTimesHigh, 8).AsByte();

    return Avx2.Or(evenProducts, oddProducts);
}
/// <summary>
/// Spreads the bits of selected input bytes so that every source bit is
/// followed by a zero bit (bit <c>i</c> moves to bit <c>2*i</c>).
/// </summary>
/// <param name="x">
/// Input bytes. Within each 128-bit lane only bytes 0-3 and 8-11 are used.
/// </param>
/// <returns>
/// Four 64-bit values; each 16-bit group holds the spread form of one selected
/// source byte (low nibble in the low byte, high nibble in the high byte).
/// </returns>
/// <remarks>
/// C# port of the following sequence:
/// <code>
/// x  = _mm256_shuffle_epi8(x, _mm256_set_epi8(-1,11,-1,10,-1,9,-1,8,-1,3,-1,2,-1,1,-1,0, ...));
/// lo = _mm256_shuffle_epi8(lut, _mm256_and_si256(x, _mm256_set1_epi8(0xf)));
/// hi = _mm256_shuffle_epi8(lut, _mm256_srli_epi64(_mm256_and_si256(x, _mm256_set1_epi8(0xf0)), 4));
/// x  = _mm256_or_si256(lo, _mm256_slli_epi64(hi, 8));
/// </code>
/// </remarks>
private static Vector256<ulong> SpreadSIMD(Vector256<byte> x)
{
    // Place each selected source byte in an even position and zero the odd
    // position after it (-1 selects zero). Note that Vector256.Create lists
    // elements lowest-first, the reverse of _mm256_set_epi8.
    x = Avx2.Shuffle(x.AsSByte(), Vector256.Create(
        0, -1, 1, -1, 2, -1, 3, -1,
        8, -1, 9, -1, 10, -1, 11, -1,
        0, -1, 1, -1, 2, -1, 3, -1,
        8, -1, 9, -1, 10, -1, 11, -1)).AsByte();

    // Nibble -> spread byte lookup table (0bABCD -> 0b0A0B0C0D), repeated per lane.
    Vector256<byte> lut = Vector256.Create(
        (byte)0b00000000, 0b00000001, 0b00000100, 0b00000101,
        0b00010000, 0b00010001, 0b00010100, 0b00010101,
        0b01000000, 0b01000001, 0b01000100, 0b01000101,
        0b01010000, 0b01010001, 0b01010100, 0b01010101,
        0b00000000, 0b00000001, 0b00000100, 0b00000101,
        0b00010000, 0b00010001, 0b00010100, 0b00010101,
        0b01000000, 0b01000001, 0b01000100, 0b01000101,
        0b01010000, 0b01010001, 0b01010100, 0b01010101);

    // Spread the low nibble of every byte.
    Vector256<byte> lo = Avx2.And(x, Vector256.Create((byte)0x0f));
    lo = Avx2.Shuffle(lut, lo);

    // Spread the high nibble of every byte (moved down into the low nibble first).
    Vector256<byte> hi = Avx2.And(x, Vector256.Create((byte)0xf0));
    hi = Avx2.Shuffle(lut, Avx2.ShiftRightLogical(hi.AsUInt64(), 4).AsByte());

    // The spread high nibble belongs in the byte ABOVE the spread low nibble,
    // so it must be shifted LEFT by 8 bits (slli). A right shift would move it
    // into the previous byte's slot and drop the first one of each 64-bit lane.
    x = Avx2.Or(lo, Avx2.ShiftLeftLogical(hi.AsUInt64(), 8).AsByte());

    return x.AsUInt64();
}
/// <summary>
/// Pins this struct's <c>_fld1</c> and <c>_fld2</c>, loads them with
/// <c>Avx.LoadVector256</c>, ANDs them, writes the result to
/// <paramref name="testClass"/>'s output buffer and asks it to validate.
/// </summary>
/// <param name="testClass">Test instance that owns the output buffer and <c>ValidateResult</c>.</param>
public void RunStructFldScenario_Load(SimpleBinaryOpTest__AndUInt16 testClass)
{
    fixed (Vector256<UInt16>* pFirst = &_fld1)
    fixed (Vector256<UInt16>* pSecond = &_fld2)
    {
        // Operands are loaded inline from the pinned field addresses.
        var anded = Avx2.And(
            Avx.LoadVector256((UInt16*)(pFirst)),
            Avx.LoadVector256((UInt16*)(pSecond)));

        Unsafe.Write(testClass._dataTable.outArrayPtr, anded);

        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Vectorized approximation of the natural logarithm.
/// Absolute error bounded by 1e-4.
/// </summary>
/// <param name="x">Eight single-precision inputs.</param>
/// <returns>
/// The approximate natural logarithm of each element; elements that compare
/// less than zero yield <c>NaN</c>.
/// </returns>
public static Vector256<float> Log(Vector256<float> x)
{
    // According to BenchmarkDotNet, isolating all the constants up-front
    // yields nearly 10% speed-up.
    const float Offset = -89.970756366f;
    const float NegativeResult = float.NaN; // behavior of MathF.Log() on negative numbers
    const float C1 = 3.529304993f;
    const float C2 = -2.461222105f;
    const float C3 = 1.130626167f;
    const float C4 = -0.288739945f;
    const float C5 = 3.110401639e-2f;
    const float Ln2 = 0.6931471805f;
    const int MantissaMask = 0x7FFFFF;
    const int OneBits = 0x3F800000;

    Vector256<int> bits = x.As<float, int>();

    // Raw IEEE bits shifted right by 23, converted to float.
    Vector256<float> exponent = Avx2.ConvertToVector256Single(Avx2.ShiftRightArithmetic(bits, 23));

    // Additive constant: Offset normally, NaN where x < 0.
    Vector256<float> additive = Avx.BlendVariable(
        Vector256.Create(Offset),
        Vector256.Create(NegativeResult),
        Avx.Compare(x, Vector256<float>.Zero, FloatComparisonMode.OrderedLessThanNonSignaling));

    // Keep the mantissa bits and OR in the bit pattern of 1.0f.
    Vector256<float> m = Avx2.Or(
        Avx2.And(bits, Vector256.Create(MantissaMask)),
        Vector256.Create(OneBits)).As<int, float>();

    // Horner evaluation of
    //   m * (C1 + m * (C2 + m * (C3 + m * (C4 + m * C5))))
    Vector256<float> poly = Avx2.Multiply(m, Vector256.Create(C5));
    poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(C4), poly));
    poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(C3), poly));
    poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(C2), poly));
    poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(C1), poly));

    // ... + (additive + Ln2 * exponent)
    Vector256<float> scaledExponent = Avx2.Multiply(Vector256.Create(Ln2), exponent);
    return Avx2.Add(poly, Avx.Add(additive, scaledExponent));
}
/// <summary>
/// Filters the <paramref name="cb"/> bytes at <paramref name="pcurr"/> in
/// place, 32 bytes at a time, using the bytes at the same offsets in
/// <paramref name="pprev"/> and <paramref name="pnext"/>.
/// </summary>
/// <remarks>
/// A byte is replaced only when it is strictly greater than both neighbours or
/// strictly less than both, and its absolute difference to at least one
/// neighbour does not exceed <c>denoiseThreshold</c>. The replacement is a
/// blend of rounded averages of the current byte and its neighbours.
/// When <paramref name="cb"/> is not a multiple of 32 the final 32 bytes are
/// processed once more as an overlapping block. No check is made that
/// <paramref name="cb"/> is at least 32.
/// </remarks>
private static unsafe void denoiseLineAvx2(byte* pcurr, byte* pprev, byte* pnext, int cb)
{
    nuint step = (nuint)Vector256<byte>.Count;
    nuint offset = 0;
    nuint lastStart = (nuint)cb - step;

    var signFlip = Vector256.Create((byte)0x80);
    var threshold = Vector256.Create(denoiseThreshold);

    while (true)
    {
        do
        {
            var cur = Avx.LoadVector256(pcurr + offset);
            var prev = Avx.LoadVector256(pprev + offset);
            var next = Avx.LoadVector256(pnext + offset);

            // |cur - prev| <= threshold, computed with saturating subtracts.
            var absDiffPrev = Avx2.Or(Avx2.SubtractSaturate(cur, prev), Avx2.SubtractSaturate(prev, cur));
            var nearPrev = Avx2.CompareEqual(Avx2.Max(absDiffPrev, threshold), threshold);

            // |cur - next| <= threshold.
            var absDiffNext = Avx2.Or(Avx2.SubtractSaturate(cur, next), Avx2.SubtractSaturate(next, cur));
            var nearNext = Avx2.CompareEqual(Avx2.Max(absDiffNext, threshold), threshold);

            var avgPrev = Avx2.Average(cur, prev);
            var avgNext = Avx2.Average(cur, next);

            var blended = Avx2.Average(
                Avx2.BlendVariable(avgNext, avgPrev, nearPrev),
                Avx2.BlendVariable(avgPrev, avgNext, nearNext));
            var replaceMask = Avx2.Or(nearPrev, nearNext);

            // Where both neighbours are close, mix in the neighbours' own average.
            blended = Avx2.Average(
                blended,
                Avx2.BlendVariable(blended, Avx2.Average(prev, next), Avx2.And(nearPrev, nearNext)));

            // Flip the top bit so the signed compares order the bytes as unsigned values.
            var curSigned = Avx2.Xor(cur, signFlip).AsSByte();
            var prevSigned = Avx2.Xor(prev, signFlip).AsSByte();
            var nextSigned = Avx2.Xor(next, signFlip).AsSByte();

            var aboveBoth = Avx2.And(
                Avx2.CompareGreaterThan(curSigned, prevSigned),
                Avx2.CompareGreaterThan(curSigned, nextSigned));
            var belowBoth = Avx2.And(
                Avx2.CompareGreaterThan(prevSigned, curSigned),
                Avx2.CompareGreaterThan(nextSigned, curSigned));

            replaceMask = Avx2.And(replaceMask, Avx2.Or(aboveBoth, belowBoth).AsByte());
            blended = Avx2.BlendVariable(cur, blended, replaceMask);

            Avx.Store(pcurr + offset, blended);
            offset += step;
        }
        while (offset <= lastStart);

        // Done once the processed range reaches the end of the line; otherwise
        // run one more block that ends exactly at the last byte.
        if (offset >= lastStart + step)
        {
            break;
        }

        offset = lastStart;
    }
}
/// <summary>
/// Pins the instance fields <c>_fld1</c> and <c>_fld2</c>, loads them with
/// <c>Avx.LoadVector256</c>, ANDs them, writes the result to the output buffer
/// and passes fields and output to <c>ValidateResult</c>.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector256<UInt16>* pFirst = &_fld1)
    fixed (Vector256<UInt16>* pSecond = &_fld2)
    {
        // Operands are loaded inline from the pinned field addresses.
        var anded = Avx2.And(
            Avx.LoadVector256((UInt16*)(pFirst)),
            Avx.LoadVector256((UInt16*)(pSecond)));

        Unsafe.Write(_dataTable.outArrayPtr, anded);

        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Pins the class-level vectors <c>_clsVar1</c> and <c>_clsVar2</c>, loads
/// them with <c>Avx.LoadVector256</c>, ANDs them, writes the result to the
/// output buffer and passes both vectors and the output to <c>ValidateResult</c>.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector256<Int32>* pFirst = &_clsVar1)
    fixed (Vector256<Int32>* pSecond = &_clsVar2)
    {
        // Operands are loaded inline from the pinned class-variable addresses.
        var anded = Avx2.And(
            Avx.LoadVector256((Int32*)(pFirst)),
            Avx.LoadVector256((Int32*)(pSecond)));

        Unsafe.Write(_dataTable.outArrayPtr, anded);

        ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Round-trip test: expands the 32 bits of a <c>uint</c> into 32 bytes holding
/// 0 or 1 (one byte per bit), then packs those bytes back into a <c>uint</c>
/// and asserts that the original value is recovered.
/// </summary>
public unsafe void Test_AVX_BitsToBytes()
{
    uint y = 0b1000_1001__1010_1011__1100_1101__1110_1111u;

    Vector256<byte> mask1, mask2, zero = Vector256<byte>.Zero, one, ff;

    // Shuffle indices: output bytes 8k..8k+7 all take source byte k.
    byte[] mask1_bytes = new byte[]
    {
        0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2,
        3, 3, 3, 3, 3, 3, 3, 3,
    };

    // Bit selectors: byte j within each group of eight isolates bit j.
    byte[] mask2_bytes = new byte[]
    {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    fixed (byte* ptr = mask1_bytes)
    {
        mask1 = Avx2.LoadVector256(ptr);
    }

    fixed (byte* ptr = mask2_bytes)
    {
        mask2 = Avx2.LoadVector256(ptr);
    }

    byte one_byte = 1;
    one = Avx2.BroadcastScalarToVector256(&one_byte);
    byte ff_byte = 0xff;
    ff = Avx2.BroadcastScalarToVector256(&ff_byte);

    // ***** load *****
    // Broadcast y, replicate each of its bytes eight times, isolate one bit
    // per byte and clamp the result to 0 or 1.
    Vector256<uint> broadcast = Avx2.BroadcastScalarToVector256(&y);
    Vector256<byte> replicated = Avx2.Shuffle(broadcast.AsByte(), mask1);
    Vector256<byte> bitBytes = Avx2.And(replicated, mask2);
    bitBytes = Avx2.Min(bitBytes, one);

    // ***** store *****
    // 0xFF where the byte is zero, inverted so that set bits become 0xFF,
    // then the top bit of every byte is gathered into a 32-bit mask.
    Vector256<byte> isZero = Avx2.CompareEqual(bitBytes, zero);
    Vector256<byte> isSet = Avx2.AndNot(isZero, ff);
    uint roundTripped = (uint)Avx2.MoveMask(isSet);

    // Expected value first, actual value second.
    Assert.AreEqual(y, roundTripped);
}
/// <summary>
/// Expands the packed 2-bit indices stored in the 32-bit little-endian word at
/// <paramref name="input"/> offset 4 into 32-bit values looked up in
/// <paramref name="clut"/>, writing them to <paramref name="output"/>.
/// </summary>
/// <param name="clut">Lookup table; entries 0-3 are addressed by the 2-bit indices.</param>
/// <param name="output">Destination bytes, written as 32-bit values.</param>
/// <param name="input">Encoded block; only bytes 4-7 are read here.</param>
/// <remarks>
/// The AVX2 path reads 16 bytes of <paramref name="clut"/> and 4 bytes of
/// <paramref name="input"/> through raw pointers and writes 16 values. The
/// scalar path writes <c>BlockWidth * BlockHeight</c> values.
/// </remarks>
private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
{
    if (!Avx2.IsSupported)
    {
        // Scalar path: consume the index word two bits at a time.
        Span<uint> destination = MemoryMarshal.Cast<byte, uint>(output);
        uint packed = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4));

        int i = 0;
        while (i < BlockWidth * BlockHeight)
        {
            destination[i] = clut[(int)(packed & 3)];
            i++;
            packed >>= 2;
        }

        return;
    }

    Span<Vector256<uint>> destinationVectors = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

    // Per-element shift counts that bring index k down to bits 0-1.
    Vector256<uint> lowShifts = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
    Vector256<uint> highShifts = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
    Vector256<uint> twoBitMask = Vector256.Create(3u);

    // Only the lower four elements are loaded; the indices never exceed 3.
    Vector256<uint> table;
    fixed (uint* pClut = &clut[0])
    {
        table = Sse2.LoadVector128(pClut).ToVector256Unsafe();
    }

    Vector256<uint> packedIndices;
    fixed (byte* pInput = input)
    {
        packedIndices = Avx2.BroadcastScalarToVector256((uint*)(pInput + 4));
    }

    Vector256<uint> firstEight = Avx2.And(Avx2.ShiftRightLogicalVariable(packedIndices, lowShifts), twoBitMask);
    Vector256<uint> lastEight = Avx2.And(Avx2.ShiftRightLogicalVariable(packedIndices, highShifts), twoBitMask);

    destinationVectors[0] = Avx2.PermuteVar8x32(table, firstEight);
    destinationVectors[1] = Avx2.PermuteVar8x32(table, lastEight);
}
/// <summary>
/// Flags bytes that are too large to follow a 0xED or 0xF4 byte.
/// </summary>
/// <param name="current_bytes">Bytes being checked.</param>
/// <param name="off1_current_bytes">For each position, the byte that precedes it.</param>
/// <param name="has_error">Error accumulator; offending positions are OR-ed in as 0xFF.</param>
/// <remarks>
/// When 0xED is found, the next byte must be no larger than 0x9F.
/// When 0xF4 is found, the next byte must be no larger than 0x8F.
/// The next byte must be a continuation byte, i.e. its sign bit is set, so a
/// signed comparison is sufficient.
/// </remarks>
static void avxcheckFirstContinuationMax(Vector256<byte> current_bytes, Vector256<byte> off1_current_bytes, ref Vector256<byte> has_error)
{
    Vector256<sbyte> currentSigned = current_bytes.AsSByte();

    // Positions whose preceding byte is 0xED / 0xF4.
    Vector256<byte> followsED = Avx2.CompareEqual(off1_current_bytes, Vector256.Create((byte)0xED));
    Vector256<byte> followsF4 = Avx2.CompareEqual(off1_current_bytes, Vector256.Create((byte)0xF4));

    // Positions whose byte exceeds the respective limit (signed compare).
    Vector256<byte> above9F = Avx2.CompareGreaterThan(currentSigned, Vector256.Create((byte)0x9F).AsSByte()).AsByte();
    Vector256<byte> above8F = Avx2.CompareGreaterThan(currentSigned, Vector256.Create((byte)0x8F).AsSByte()).AsByte();

    Vector256<byte> violations = Avx2.Or(
        Avx2.And(above9F, followsED),
        Avx2.And(above8F, followsF4));

    has_error = Avx2.Or(has_error, violations);
}
/// <summary>
/// Pins the two vector fields of a freshly constructed
/// <c>SimpleBinaryOpTest__AndInt32</c> instance, loads them with
/// <c>Avx.LoadVector256</c>, ANDs them, writes the result to the output buffer
/// and passes fields and output to <c>ValidateResult</c>.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var instance = new SimpleBinaryOpTest__AndInt32();

    fixed (Vector256<Int32>* pFirst = &instance._fld1)
    fixed (Vector256<Int32>* pSecond = &instance._fld2)
    {
        // Operands are loaded inline from the pinned field addresses.
        var anded = Avx2.And(
            Avx.LoadVector256((Int32*)(pFirst)),
            Avx.LoadVector256((Int32*)(pSecond)));

        Unsafe.Write(_dataTable.outArrayPtr, anded);

        ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Filters the <paramref name="cb"/> bytes at <paramref name="pcurr"/> in
/// place, 32 bytes at a time, using the bytes at the same offsets in
/// <paramref name="pprev"/> and <paramref name="pnext"/>.
/// </summary>
/// <remarks>
/// A byte is replaced only when it lies strictly outside the range spanned by
/// its two neighbours and its absolute difference to at least one neighbour
/// does not exceed <c>denoiseThreshold</c>. The replacement is a blend of
/// rounded averages of the current byte and its neighbours.
/// When <paramref name="cb"/> is not a multiple of 32 the final 32 bytes are
/// processed once more as an overlapping block. No check is made that
/// <paramref name="cb"/> is at least 32.
/// </remarks>
private static void denoiseLineAvx2(byte* pcurr, byte* pprev, byte* pnext, nint cb)
{
    nint step = Vector256<byte>.Count;
    nint offset = 0;
    nint lastStart = cb - step;

    var threshold = Vector256.Create(denoiseThreshold);

    // Comparing a vector with itself yields all bits set.
    var allBits = Avx2.CompareEqual(threshold, threshold);

    while (true)
    {
        do
        {
            var cur = Avx.LoadVector256(pcurr + offset);
            var prev = Avx.LoadVector256(pprev + offset);
            var next = Avx.LoadVector256(pnext + offset);

            // |cur - prev| <= threshold, computed with saturating subtracts.
            var absDiffPrev = Avx2.Or(Avx2.SubtractSaturate(cur, prev), Avx2.SubtractSaturate(prev, cur));
            var nearPrev = Avx2.CompareEqual(Avx2.Max(absDiffPrev, threshold), threshold);

            // |cur - next| <= threshold.
            var absDiffNext = Avx2.Or(Avx2.SubtractSaturate(cur, next), Avx2.SubtractSaturate(next, cur));
            var nearNext = Avx2.CompareEqual(Avx2.Max(absDiffNext, threshold), threshold);

            var avgPrev = Avx2.Average(cur, prev);
            var avgNext = Avx2.Average(cur, next);

            var blended = Avx2.Average(
                Avx2.BlendVariable(avgNext, avgPrev, nearPrev),
                Avx2.BlendVariable(avgPrev, avgNext, nearNext));
            var replaceMask = Avx2.Or(nearPrev, nearNext);

            // Where both neighbours are close, mix in the neighbours' own average.
            blended = Avx2.Average(
                blended,
                Avx2.BlendVariable(blended, Avx2.Average(prev, next), Avx2.And(nearPrev, nearNext)));

            // min(max(prev, next), cur) != cur  <=>  cur > max(prev, next)
            var aboveBoth = Avx2.Xor(
                Avx2.CompareEqual(Avx2.Min(Avx2.Max(prev, next), cur), cur),
                allBits);

            // max(min(prev, next), cur) != cur  <=>  cur < min(prev, next)
            var belowBoth = Avx2.Xor(
                Avx2.CompareEqual(Avx2.Max(Avx2.Min(prev, next), cur), cur),
                allBits);

            replaceMask = Avx2.And(replaceMask, Avx2.Or(aboveBoth, belowBoth));
            blended = Avx2.BlendVariable(cur, blended, replaceMask);

            Avx.Store(pcurr + offset, blended);
            offset += step;
        }
        while (offset <= lastStart);

        // Done once the processed range reaches the end of the line; otherwise
        // run one more block that ends exactly at the last byte.
        if (offset >= lastStart + step)
        {
            break;
        }

        offset = lastStart;
    }
}
/// <summary>
/// Multiplies every vector of <paramref name="v"/> by the scalar
/// <paramref name="n"/> and returns the products split into two arrays.
/// </summary>
/// <param name="v">Input vectors.</param>
/// <param name="n">Scalar multiplier.</param>
/// <returns>
/// <c>lo</c>: each product vector ANDed with <c>lower_mask</c>;
/// <c>hi</c>: each product vector shuffled with <c>MM_PERM_CDAB</c> and then
/// ANDed with <c>lower_mask</c>. Both arrays have the length of <paramref name="v"/>.
/// </returns>
private static unsafe (Vector256<UInt32>[] hi, Vector256<UInt32>[] lo) Mul(Vector256<UInt32>[] v, UInt32 n)
{
    int count = v.Length;
    Vector256<UInt32>[] highParts = new Vector256<UInt32>[count];
    Vector256<UInt32>[] lowParts = new Vector256<UInt32>[count];

    // Widen n to four 64-bit elements, then view them as eight 32-bit elements.
    Vector256<UInt32> multiplier = Avx2.ConvertToVector256Int64(Vector128.Create(n)).AsUInt32();
    Vector256<UInt32> keep = lower_mask;

    fixed (Vector256<UInt32>* src = v, dstHi = highParts, dstLo = lowParts)
    {
        for (int index = 0; index < count; index++)
        {
            Vector256<UInt32> product = Avx2.Multiply(src[index], multiplier).AsUInt32();

            dstHi[index] = Avx2.And(Avx2.Shuffle(product, MM_PERM_CDAB), keep);
            dstLo[index] = Avx2.And(product, keep);
        }
    }

    return (highParts, lowParts);
}
/// <summary>
/// Reverses the bit order of every 32-bit integer in <paramref name="span"/>,
/// in place.
/// </summary>
/// <param name="span">The integers to modify. An empty span is a no-op.</param>
/// <remarks>
/// Works in two steps: first the byte order of every integer is reversed
/// (AVX2 for groups of eight integers when available, scalar
/// <c>BinaryPrimitives.ReverseEndianness</c> for the rest), then the
/// <c>Span&lt;byte&gt;</c> overload of <c>ReverseBits</c> is applied to the
/// same memory.
/// </remarks>
public static unsafe void ReverseBits(this Span<int> span)
{
    var intsReversed = 0;

    if (Avx2.IsSupported)
    {
        // Complementary masks selecting bytes 0 and 2 / bytes 1 and 3 of each int.
        Vector256<int> evenByteMask = Vector256.Create(0x00FF00FF);
        Vector256<int> oddByteMask = Vector256.Create(unchecked((int)0xFF00FF00));

        fixed (int* ptr = span)
        {
            var vectorCount = span.Length / 8;
            for (int i = 0; i < vectorCount; i++)
            {
                var vector = Avx.LoadVector256(ptr + intsReversed);

                // The two halves must be masked independently; chaining both
                // masks on the same value would always produce zero.
                var evenBytes = Avx2.And(vector, evenByteMask);
                var oddBytes = Avx2.And(vector, oddByteMask);

                // Byte swap = rotate-right-8 of the even bytes combined with
                // rotate-left-8 of the odd bytes (same result as ReverseEndianness).
                vector = Avx2.Or(
                    Avx2.Or(
                        Avx2.ShiftRightLogical(evenBytes, 8),
                        Avx2.ShiftLeftLogical(evenBytes, 24)),
                    Avx2.Or(
                        Avx2.ShiftLeftLogical(oddBytes, 8),
                        Avx2.ShiftRightLogical(oddBytes, 24)));

                Avx.Store(ptr + intsReversed, vector);
                intsReversed += 8;
            }
        }
    }

    // Scalar tail (or the whole span when AVX2 is unavailable).
    for (int i = intsReversed; i < span.Length; i++)
    {
        span[i] = BinaryPrimitives.ReverseEndianness(span[i]);
    }

    // Finish by running the byte-span overload over the same memory.
    fixed (void* ptr = span)
    {
        new Span<byte>(ptr, span.Length * 4).ReverseBits();
    }
}
/// <summary>
/// Produces a buffer of the same length as <paramref name="argbBytes"/> in
/// which, for every 4-byte group, the bytes selected by
/// <c>alphaOnlyVector128</c> are copied down into the lower bytes of the group.
/// </summary>
/// <param name="argbBytes">Source pixel bytes.</param>
/// <returns>A new array of the same length holding the converted bytes.</returns>
/// <remarks>
/// Full 16-byte blocks are converted directly. A trailing partial block is
/// converted through a zero-padded 16-byte scratch buffer so that no memory
/// outside either array is read or written.
/// </remarks>
public static unsafe byte[] ArgbToAlpha32(byte[] argbBytes)
{
    const int BlockSize = 16;

    var result = new byte[argbBytes.Length];
    var resultPtr = result.ToPtr();
    var bytePtr = argbBytes.ToPtr();

    // Mask the input, then OR in copies shifted down by 1 and by 2 bytes.
    Vector128<uint> SpreadAlpha(Vector128<uint> pixels)
    {
        pixels = Avx2.And(pixels, alphaOnlyVector128);
        pixels = Avx2.Or(pixels, Avx2.ShiftRightLogical128BitLane(pixels, 1));
        return Avx2.Or(pixels, Avx2.ShiftRightLogical128BitLane(pixels, 2));
    }

    int length = argbBytes.Length;
    int fullBlocksLength = length - (length % BlockSize);

    for (var ptrIndex = 0; ptrIndex < fullBlocksLength; ptrIndex += BlockSize)
    {
        var vector128 = Avx2.LoadVector128(bytePtr + ptrIndex).AsUInt32();
        Avx2.Store(resultPtr + ptrIndex, SpreadAlpha(vector128).AsByte());
    }

    int remaining = length - fullBlocksLength;
    if (remaining > 0)
    {
        // Copy the tail into a padded scratch block, convert it there, and
        // copy back only the bytes that belong to the arrays.
        byte* scratch = stackalloc byte[BlockSize];
        for (int i = 0; i < BlockSize; i++)
        {
            scratch[i] = i < remaining ? bytePtr[fullBlocksLength + i] : (byte)0;
        }

        var tail = Avx2.LoadVector128(scratch).AsUInt32();
        Avx2.Store(scratch, SpreadAlpha(tail).AsByte());

        for (int i = 0; i < remaining; i++)
        {
            resultPtr[fullBlocksLength + i] = scratch[i];
        }
    }

    return result;
}
/// <summary>
/// Vectorized single-precision exponential: the input is clamped to
/// [<c>exp_lo</c>, <c>exp_hi</c>], split into an integer multiple of ln 2 and a
/// remainder, the remainder is evaluated with a polynomial, and the result is
/// scaled by the matching power of two.
/// </summary>
/// <param name="V">Eight single-precision inputs.</param>
/// <returns>The approximate value of e raised to each element of <paramref name="V"/>.</returns>
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static __m256 exp256_ps(__m256 V)
{
    __m256 one = SET(1.0f);

    // Clamp the argument.
    __m256 arg = Avx2.Min(V, exp_hi);
    arg = Avx2.Max(arg, exp_lo);

    // n = floor(arg * log2(e) + 0.5)
    __m256 scaled = Avx2.Multiply(arg, cLOG2EF);
    scaled = Avx2.Add(scaled, SET(0.5f));
    __m256 floored = Avx2.Floor(scaled);

    // Subtract 1 wherever the floored value compares greater than the unfloored one.
    var correction = Avx2.Compare(floored, scaled, FloatComparisonMode.OrderedGreaterThanSignaling);
    correction = Avx2.And(correction, one);
    __m256 n = Avx2.Subtract(floored, correction);

    // Remainder: arg - n * C1 - n * C2.
    __m256 nTimesC1 = Avx2.Multiply(n, cexp_C1);
    __m256 nTimesC2 = Avx2.Multiply(n, cexp_C2);
    arg = Avx2.Subtract(arg, nTimesC1);
    arg = Avx2.Subtract(arg, nTimesC2);
    __m256 argSquared = Avx2.Multiply(arg, arg);

    // Polynomial in the remainder, evaluated with fused multiply-adds.
    __m256 poly = cexp_p0;
    poly = Fma.MultiplyAdd(poly, arg, cexp_p1);
    poly = Fma.MultiplyAdd(poly, arg, cexp_p2);
    poly = Fma.MultiplyAdd(poly, arg, cexp_p3);
    poly = Fma.MultiplyAdd(poly, arg, cexp_p4);
    poly = Fma.MultiplyAdd(poly, arg, cexp_p5);
    poly = Fma.MultiplyAdd(poly, argSquared, arg);
    poly = Avx2.Add(poly, one);

    // Build 2^n by placing (n + 0x7f) in the exponent field of a float.
    var exponentBits = Avx2.ConvertToVector256Int32(n);
    var exponentBias = Vector256.Create((int)0x7f);
    exponentBits = Avx2.Add(exponentBits, exponentBias);
    exponentBits = Avx2.ShiftLeftLogical(exponentBits, 23);
    __m256 twoToN = Vector256.AsSingle(exponentBits);

    return Avx2.Multiply(poly, twoToN);
}
public static void CollectColorBlueTransforms(Span <uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span <int> histo) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported && tileWidth >= 16) { const int span = 16; Span <ushort> values = stackalloc ushort[span]; var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span <uint> srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); for (nint x = 0; x <= tileWidth - span; x += span) { nint input0Idx = x; nint input1Idx = x + (span / 2); Vector256 <byte> input0 = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 <byte> input1 = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); Vector256 <byte> r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); Vector256 <byte> r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); Vector256 <byte> r = Avx2.Or(r0, r1); Vector256 <byte> gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); Vector256 <byte> gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); Vector256 <ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); Vector256 <byte> g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); Vector256 <short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr); Vector256 <short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); Vector256 <byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte()); Vector256 <byte> d = Avx2.Subtract(c, a.AsByte()); Vector256 <byte> e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As <ushort, Vector256 <ushort> >(ref outputRef) = e.AsUInt16(); for (int i = 0; i < span; i++) { ++histo[values[i]]; } } }