public static void CollectColorBlueTransforms(Span <uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span <int> histo) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported && tileWidth >= 16) { const int span = 16; Span <ushort> values = stackalloc ushort[span]; var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span <uint> srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); for (nint x = 0; x <= tileWidth - span; x += span) { nint input0Idx = x; nint input1Idx = x + (span / 2); Vector256 <byte> input0 = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 <byte> input1 = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); Vector256 <byte> r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); Vector256 <byte> r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); Vector256 <byte> r = Avx2.Or(r0, r1); Vector256 <byte> gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); Vector256 <byte> gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); Vector256 <ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); Vector256 <byte> g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); Vector256 <short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr); Vector256 <short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); Vector256 <byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte()); Vector256 <byte> d = Avx2.Subtract(c, a.AsByte()); Vector256 <byte> e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As <ushort, Vector256 <ushort> >(ref outputRef) = e.AsUInt16(); for (int i = 0; i < span; i++) { ++histo[values[i]]; } } }
/// <summary>
/// Returns the proposals whose insurance id equals <c>SearchedInsuranceId</c>, locating the
/// start and end of the matching run by comparing eight ids per iteration with AVX2.
/// </summary>
/// <returns>
/// A copy of the matching slice of <c>Proposals</c>, or an empty array when the computed start
/// position equals the length of the id array.
/// </returns>
/// <remarks>
/// NOTE(review): only full 8-element vectors are inspected; ids after the last full vector are
/// never compared. Presumably the data set is sized to a multiple of eight - confirm with the builder.
/// </remarks>
public unsafe ReadOnlyProposal[] ForVectorized()
{
    ProposalResult result = ProposalBuilder.GetSortedVectorizedInsurances();
    Vector256<int> wanted = Vector256.Create(SearchedInsuranceId);
    int lanes = Vector256<int>.Count;

    fixed (int* ids = result.InsuranceIds)
    {
        // First index at which a full vector can no longer be loaded.
        int vectorEnd = result.InsuranceIds.Length - lanes + 1;
        int position = 0;

        // Phase 1: advance until a vector contains a match. The byte mask is reversed before
        // counting leading zeros, so the count / 4 is taken as the number of lanes that precede
        // the first match (8 means "no match in this vector").
        int lanesBeforeMatch = 8;
        while (lanesBeforeMatch == 8 && position < vectorEnd)
        {
            Vector256<int> equal = Avx2.CompareEqual(Avx2.LoadVector256(ids + position), wanted);
            uint bits = (uint)Avx2.MoveMask(Vector256.AsByte(equal).Reverse());
            lanesBeforeMatch = (int)Lzcnt.LeadingZeroCount(bits) >> 2;
            position += lanes;
        }

        // Step back onto the vector that ended phase 1.
        position -= lanes;
        int first = position + lanesBeforeMatch;
        if (first == result.InsuranceIds.Length)
        {
            return Array.Empty<ReadOnlyProposal>();
        }

        // Phase 2: starting from the same vector, advance while the highest lane still matches.
        // Without the reversal, leading zeros / 4 counts the non-matching lanes at the top of the vector.
        int lanesAfterMatch = 0;
        while (lanesAfterMatch == 0 && position < vectorEnd)
        {
            Vector256<int> equal = Avx2.CompareEqual(Avx2.LoadVector256(ids + position), wanted);
            uint bits = (uint)Avx2.MoveMask(Vector256.AsByte(equal));
            lanesAfterMatch = (int)Lzcnt.LeadingZeroCount(bits) >> 2;
            position += lanes;
        }

        int count = position - lanesAfterMatch - first;
        return result.Proposals.AsSpan(first, count).ToArray();
    }
}
// removed, not testing loop speed
//[Benchmark]
/// <summary>
/// Selects the proposals of the searched insurance whose net premium is greater than
/// <c>SearchedNetPremium</c>, scanning the premium array four 64-bit values at a time with AVX2.
/// </summary>
/// <returns>A copy of the selected slice of <c>Proposals</c>.</returns>
/// <remarks>
/// Only full 4-element vectors are inspected. If no vector contains a greater premium the start
/// index stays 0 and the whole proposal array is returned (fall-through kept from the original).
/// </remarks>
public unsafe ReadOnlyProposal[] PositionalSortedVectorized()
{
    ProposalResult[] proposals = ProposalBuilder.GetPositionalSortedVectorizedInsurances();
    ProposalResult p = proposals[SearchedInsuranceId];
    // Premiums are compared as OLE Automation currency values (64-bit integers).
    var minPremium = Vector256.Create(decimal.ToOACurrency(SearchedNetPremium));
    int lanes = Vector256<long>.Count;

    fixed (long* npp = p.NetPremiums)
    {
        int vectorEnd = p.NetPremiums.Length - lanes + 1;
        int i = 0;
        int initial = 0;

        // Phase 1: find the first vector that contains a premium above the minimum.
        for (; i < vectorEnd; i += lanes)
        {
            Vector256<long> greater = Avx2.CompareGreaterThan(Avx2.LoadVector256(npp + i), minPremium);
            uint bits = (uint)Avx2.MoveMask(Vector256.AsByte(greater).Reverse());

            // MoveMask yields 32 bits (8 per lane), so this value is in the range 0..4.
            int mask = (int)Lzcnt.LeadingZeroCount(bits) >> 3;

            // "No lane matched" is therefore 4 (the lane count). The previous sentinel of 8 could
            // never occur, which made the loop stop at the very first vector unconditionally.
            if (mask != lanes)
            {
                initial = i + mask;
                break;
            }
        }

        // Phase 2: from that vector on, look for a vector whose top lanes are not greater.
        for (; i < vectorEnd; i += lanes)
        {
            Vector256<long> greater = Avx2.CompareGreaterThan(Avx2.LoadVector256(npp + i), minPremium);
            uint bits = (uint)Avx2.MoveMask(Vector256.AsByte(greater));
            int mask = (int)Lzcnt.LeadingZeroCount(bits) >> 3;

            if (mask != 0)
            {
                int length = i + lanes - mask - initial;
                return p.Proposals.AsSpan(initial, length).ToArray();
            }
        }

        return p.Proposals.AsSpan(initial).ToArray();
    }
}
/// <summary>
/// Computes the SHA-512 digest of <paramref name="data"/>.
/// </summary>
/// <param name="data">Message bytes, handed to <c>SHADataContext</c> for block preparation.</param>
/// <returns>A 64-byte array holding the eight 64-bit state words in big-endian byte order.</returns>
public static byte[] SHA512(byte[] data)
{
    SHADataContext context = new SHADataContext(data, SHADataContext.AlgorithmWordSize._64);

    // SHA-512 initial hash values.
    ulong* hashState = stackalloc ulong[8]
    {
        0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
        0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
        0x510e527fade682d1, 0x9b05688c2b3e6c1f,
        0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
    };
    ulong* messageSchedule = stackalloc ulong[80];

    // Each pass loads 16 words into the schedule buffer, then expands and compresses the block.
    do
    {
        context.PrepareBlock((byte*)messageSchedule, sizeof(ulong) * 16);
        InitScheduleSHA512(messageSchedule);
        ProcessBlockSHA512(hashState, messageSchedule);
    }
    while (!context.Complete);

    // On a big-endian host the state is copied out without any byte swapping.
    if (!BitConverter.IsLittleEndian)
    {
        return new Span<byte>(hashState, sizeof(ulong) * 8).ToArray();
    }

    byte[] digest = new byte[8 * sizeof(ulong)];
    if (Avx2.IsSupported)
    {
        // Byte-swap four state words per shuffle and write them straight into the output array.
        Vector256<byte> lowerWords = Avx2.LoadVector256(hashState).AsByte();
        Vector256<byte> upperWords = Avx2.LoadVector256(hashState + 4).AsByte();
        Unsafe.As<byte, Vector256<byte>>(ref digest[0]) = Avx2.Shuffle(lowerWords, ReverseEndianess_64_256);
        Unsafe.As<byte, Vector256<byte>>(ref digest[sizeof(ulong) * 4]) = Avx2.Shuffle(upperWords, ReverseEndianess_64_256);
    }
    else
    {
        fixed (byte* digestPtr = digest)
        {
            ReverseEndianess(hashState, (ulong*)digestPtr, 8);
        }
    }

    return digest;
}
/// <summary>
/// Adds two 256-bit vectors element-wise, selecting the AVX/AVX2 add that matches <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Element type: one of the eight integer primitives, <see cref="float"/> or <see cref="double"/>.</typeparam>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The lane-wise sum, reinterpreted back to <c>Vector256&lt;T&gt;</c>.</returns>
/// <exception cref="NotSupportedException"><typeparamref name="T"/> is not one of the handled element types.</exception>
public static Vector256<T> Vector256Add<T>(Vector256<T> left, Vector256<T> right) where T : struct
{
    // 8-bit lanes
    if (typeof(T) == typeof(byte))
    {
        Vector256<byte> sum = Avx2.Add(left.AsByte(), right.AsByte());
        return sum.As<byte, T>();
    }

    if (typeof(T) == typeof(sbyte))
    {
        Vector256<sbyte> sum = Avx2.Add(left.AsSByte(), right.AsSByte());
        return sum.As<sbyte, T>();
    }

    // 16-bit lanes
    if (typeof(T) == typeof(short))
    {
        Vector256<short> sum = Avx2.Add(left.AsInt16(), right.AsInt16());
        return sum.As<short, T>();
    }

    if (typeof(T) == typeof(ushort))
    {
        Vector256<ushort> sum = Avx2.Add(left.AsUInt16(), right.AsUInt16());
        return sum.As<ushort, T>();
    }

    // 32-bit integer lanes
    if (typeof(T) == typeof(int))
    {
        Vector256<int> sum = Avx2.Add(left.AsInt32(), right.AsInt32());
        return sum.As<int, T>();
    }

    if (typeof(T) == typeof(uint))
    {
        Vector256<uint> sum = Avx2.Add(left.AsUInt32(), right.AsUInt32());
        return sum.As<uint, T>();
    }

    // 64-bit integer lanes
    if (typeof(T) == typeof(long))
    {
        Vector256<long> sum = Avx2.Add(left.AsInt64(), right.AsInt64());
        return sum.As<long, T>();
    }

    if (typeof(T) == typeof(ulong))
    {
        Vector256<ulong> sum = Avx2.Add(left.AsUInt64(), right.AsUInt64());
        return sum.As<ulong, T>();
    }

    // Floating-point lanes use the AVX (not AVX2) add.
    if (typeof(T) == typeof(float))
    {
        Vector256<float> sum = Avx.Add(left.AsSingle(), right.AsSingle());
        return sum.As<float, T>();
    }

    if (typeof(T) == typeof(double))
    {
        Vector256<double> sum = Avx.Add(left.AsDouble(), right.AsDouble());
        return sum.As<double, T>();
    }

    throw new NotSupportedException();
}
/// <summary>
/// Round-trips a 32-bit value through AVX2: every bit is expanded into its own byte (0 or 1),
/// and the 32 bytes are then collapsed back into a 32-bit mask that must equal the input.
/// </summary>
public unsafe void Test_AVX_BitsToBytes()
{
    uint y = 0b1000_1001__1010_1011__1100_1101__1110_1111u;

    Vector256<byte> mask1, mask2, zero = Vector256<byte>.Zero, one, ff;

    // Shuffle control: output bytes 0-7 take source byte 0, bytes 8-15 take byte 1, and so on.
    // The shuffle works per 128-bit lane; indices 2 and 3 in the upper lane still address the
    // right bytes because the source value is broadcast to every 32-bit element.
    byte[] mask1_bytes = new byte[]
    {
        0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2,
        3, 3, 3, 3, 3, 3, 3, 3,
    };

    // Bit selector: within each group of eight bytes, byte k isolates bit k.
    byte[] mask2_bytes = new byte[]
    {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    };

    fixed (byte* ptr = mask1_bytes)
    {
        mask1 = Avx2.LoadVector256(ptr);
    }

    fixed (byte* ptr = mask2_bytes)
    {
        mask2 = Avx2.LoadVector256(ptr);
    }

    byte one_byte = 1;
    one = Avx2.BroadcastScalarToVector256(&one_byte);
    byte ff_byte = 0xff;
    ff = Avx2.BroadcastScalarToVector256(&ff_byte);

    // ***** load ****
    Vector256<uint> ux = Avx2.BroadcastScalarToVector256(&y);
    Vector256<byte> bx = ux.AsByte();
    Vector256<byte> shuffled_x = Avx2.Shuffle(bx, mask1);
    Vector256<byte> result_x = Avx2.And(shuffled_x, mask2);
    // Clamp the isolated bit (1, 2, 4, ... 0x80) down to 0 or 1.
    result_x = Avx2.Min(result_x, one);

    // ***** store ****
    // (byte == 0) gives 0xFF for cleared bits; AndNot with all-ones inverts that, so set bits become 0xFF.
    Vector256<byte> reverse_x = Avx2.CompareEqual(result_x, zero);
    reverse_x = Avx2.AndNot(reverse_x, ff);
    uint reversed_x = (uint)Avx2.MoveMask(reverse_x);

    // Expected value first, actual second, so a failure message reports them the right way round.
    Assert.AreEqual(y, reversed_x);
}
/// <summary>
/// Scales the three low bytes of every pixel in <paramref name="data"/> by that pixel's top
/// (alpha) byte using <c>(channel * alpha + 255) &gt;&gt; 8</c>, leaving the alpha byte unchanged.
/// Eight pixels are processed per iteration with AVX2.
/// </summary>
/// <param name="data">Pixels to modify in place.</param>
internal static unsafe void ProcessTextureAvx2(Span<Color8> data)
{
    uint pixelsPerVector = (uint)Vector256<uint>.Count;
    pixelsPerVector.AssertEqual((uint)(sizeof(Vector256<uint>) / sizeof(Color8)));

    // After widening to 16 bits each pixel occupies 8 bytes with alpha in its fourth word
    // (byte offset 6). The control below copies that alpha byte into the low byte of every
    // word of the pixel; 0xFF zeroes the high byte.
    const byte pixel0Alpha = 6;
    const byte pixel1Alpha = pixel0Alpha + 8;
    const byte pixel2Alpha = pixel1Alpha + 8;
    const byte pixel3Alpha = pixel2Alpha + 8;
    Vector256<byte> spreadAlpha = Vector256.Create(
        pixel0Alpha, 0xFF, pixel0Alpha, 0xFF, pixel0Alpha, 0xFF, pixel0Alpha, 0xFF,
        pixel1Alpha, 0xFF, pixel1Alpha, 0xFF, pixel1Alpha, 0xFF, pixel1Alpha, 0xFF,
        pixel2Alpha, 0xFF, pixel2Alpha, 0xFF, pixel2Alpha, 0xFF, pixel2Alpha, 0xFF,
        pixel3Alpha, 0xFF, pixel3Alpha, 0xFF, pixel3Alpha, 0xFF, pixel3Alpha, 0xFF
    );
    Vector256<uint> alphaBits = Vector256.Create(0xFF000000U);
    Vector256<uint> colorBits = Vector256.Create(0x00FFFFFFU);
    Vector256<ushort> roundingBias = Vector256.Create((ushort)0x00FFU);
    Vector256<byte> zeroBytes = Vector256<byte>.Zero;

    uint offset;
    fixed (Color8* pixels = data)
    {
        uint* words = (uint*)pixels;

        for (offset = 0; offset + (pixelsPerVector - 1U) < data.Length; offset += pixelsPerVector)
        {
            uint* block = words + offset;
            Vector256<uint> source = Avx2.LoadVector256(block);
            Vector256<uint> sourceAlpha = Avx2.And(source, alphaBits);

            // Zero-extend every byte to 16 bits so the products cannot overflow.
            Vector256<ushort> wideLo = Avx2.UnpackLow(source.AsByte(), zeroBytes).AsUInt16();
            Vector256<ushort> wideHi = Avx2.UnpackHigh(source.AsByte(), zeroBytes).AsUInt16();

            Vector256<ushort> factorLo = Avx2.Shuffle(wideLo.AsByte(), spreadAlpha).AsUInt16();
            Vector256<ushort> factorHi = Avx2.Shuffle(wideHi.AsByte(), spreadAlpha).AsUInt16();

            Vector256<ushort> scaledLo = Avx2.ShiftRightLogical(
                Avx2.Add(Avx2.MultiplyLow(wideLo, factorLo), roundingBias), 8);
            Vector256<ushort> scaledHi = Avx2.ShiftRightLogical(
                Avx2.Add(Avx2.MultiplyLow(wideHi, factorHi), roundingBias), 8);

            // Narrow back to bytes, then put the original alpha byte back in place.
            Vector256<uint> result = Avx2.PackUnsignedSaturate(scaledLo.AsInt16(), scaledHi.AsInt16()).AsUInt32();
            result = Avx2.And(result, colorBits);
            result = Avx2.Or(result, sourceAlpha);

            Avx2.Store(block, result);
        }
    }

    // Unlikely, but the pixel count may not be a multiple of the vector width;
    // the remaining pixels go through the scalar routine.
    if (offset < data.Length)
    {
        ProcessTextureScalar(data.SliceUnsafe(offset));
    }
}
/// <summary>
/// Four-way unrolled variant of the AVX2 alpha scaling: every pixel's three low bytes are
/// replaced by <c>(channel * alpha + 255) &gt;&gt; 8</c> while the top (alpha) byte is kept.
/// Each iteration handles four vectors (32 pixels).
/// </summary>
/// <param name="data">Pixels to modify in place.</param>
internal static unsafe void ProcessTextureAvx2Unrolled(Span<Color8> data)
{
    // Check the per-vector pixel count first, then apply the unroll factor. The assertion used to
    // be made on the already multiplied value (Count * 4), which does not equal the size ratio.
    uint vectorElements = (uint)Vector256<uint>.Count;
    vectorElements.AssertEqual((uint)(sizeof(Vector256<uint>) / sizeof(Color8)));
    uint registerElements = vectorElements * 4;

    // After widening to 16 bits each pixel occupies 8 bytes with alpha in its fourth word
    // (byte offset 6). The control copies that byte into the low byte of every word of the
    // pixel; 0xFF zeroes the high byte.
    const byte offset0 = 6;
    const byte offset1 = offset0 + 8;
    const byte offset2 = offset1 + 8;
    const byte offset3 = offset2 + 8;
    Vector256<byte> alphaShuffle = Vector256.Create(
        offset0, 0xFF, offset0, 0xFF, offset0, 0xFF, offset0, 0xFF,
        offset1, 0xFF, offset1, 0xFF, offset1, 0xFF, offset1, 0xFF,
        offset2, 0xFF, offset2, 0xFF, offset2, 0xFF, offset2, 0xFF,
        offset3, 0xFF, offset3, 0xFF, offset3, 0xFF, offset3, 0xFF
    );
    Vector256<uint> alphaMask = Vector256.Create(0xFF000000U);
    Vector256<uint> mask = Vector256.Create(0x00FFFFFFU);
    Vector256<ushort> addend = Vector256.Create((ushort)0x00FFU);

    uint offset;
    fixed (Color8* dataPtr8 = data)
    {
        uint* dataPtr = (uint*)dataPtr8;

        for (offset = 0; offset + (registerElements - 1U) < data.Length; offset += registerElements)
        {
            Vector256<uint> rawColor0 = Avx2.LoadVector256(dataPtr + offset + 0x00);
            Vector256<uint> rawColor1 = Avx2.LoadVector256(dataPtr + offset + 0x08);
            Vector256<uint> rawColor2 = Avx2.LoadVector256(dataPtr + offset + 0x10);
            Vector256<uint> rawColor3 = Avx2.LoadVector256(dataPtr + offset + 0x18);

            // Remember the original alpha bytes so they can be restored at the end.
            Vector256<uint> alpha0 = Avx2.And(rawColor0, alphaMask);
            Vector256<uint> alpha1 = Avx2.And(rawColor1, alphaMask);
            Vector256<uint> alpha2 = Avx2.And(rawColor2, alphaMask);
            Vector256<uint> alpha3 = Avx2.And(rawColor3, alphaMask);

            // Zero-extend every byte to 16 bits so the products cannot overflow.
            Vector256<ushort> lo0 = Avx2.UnpackLow(rawColor0.AsByte(), Vector256<byte>.Zero).AsUInt16();
            Vector256<ushort> lo1 = Avx2.UnpackLow(rawColor1.AsByte(), Vector256<byte>.Zero).AsUInt16();
            Vector256<ushort> lo2 = Avx2.UnpackLow(rawColor2.AsByte(), Vector256<byte>.Zero).AsUInt16();
            Vector256<ushort> lo3 = Avx2.UnpackLow(rawColor3.AsByte(), Vector256<byte>.Zero).AsUInt16();
            Vector256<ushort> hi0 = Avx2.UnpackHigh(rawColor0.AsByte(), Vector256<byte>.Zero).AsUInt16();
            Vector256<ushort> hi1 = Avx2.UnpackHigh(rawColor1.AsByte(), Vector256<byte>.Zero).AsUInt16();
            Vector256<ushort> hi2 = Avx2.UnpackHigh(rawColor2.AsByte(), Vector256<byte>.Zero).AsUInt16();
            Vector256<ushort> hi3 = Avx2.UnpackHigh(rawColor3.AsByte(), Vector256<byte>.Zero).AsUInt16();

            Vector256<uint> alphaLo0 = Avx2.Shuffle(lo0.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaLo1 = Avx2.Shuffle(lo1.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaLo2 = Avx2.Shuffle(lo2.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaLo3 = Avx2.Shuffle(lo3.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaHi0 = Avx2.Shuffle(hi0.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaHi1 = Avx2.Shuffle(hi1.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaHi2 = Avx2.Shuffle(hi2.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaHi3 = Avx2.Shuffle(hi3.AsByte(), alphaShuffle).AsUInt32();

            Vector256<ushort> prodLo0 = Avx2.MultiplyLow(lo0, alphaLo0.AsUInt16());
            Vector256<ushort> prodLo1 = Avx2.MultiplyLow(lo1, alphaLo1.AsUInt16());
            Vector256<ushort> prodLo2 = Avx2.MultiplyLow(lo2, alphaLo2.AsUInt16());
            Vector256<ushort> prodLo3 = Avx2.MultiplyLow(lo3, alphaLo3.AsUInt16());
            Vector256<ushort> prodHi0 = Avx2.MultiplyLow(hi0, alphaHi0.AsUInt16());
            Vector256<ushort> prodHi1 = Avx2.MultiplyLow(hi1, alphaHi1.AsUInt16());
            Vector256<ushort> prodHi2 = Avx2.MultiplyLow(hi2, alphaHi2.AsUInt16());
            Vector256<ushort> prodHi3 = Avx2.MultiplyLow(hi3, alphaHi3.AsUInt16());

            var sumLo0 = Avx2.Add(prodLo0, addend);
            var sumLo1 = Avx2.Add(prodLo1, addend);
            var sumLo2 = Avx2.Add(prodLo2, addend);
            var sumLo3 = Avx2.Add(prodLo3, addend);
            var sumHi0 = Avx2.Add(prodHi0, addend);
            var sumHi1 = Avx2.Add(prodHi1, addend);
            var sumHi2 = Avx2.Add(prodHi2, addend);
            var sumHi3 = Avx2.Add(prodHi3, addend);

            var shiftLo0 = Avx2.ShiftRightLogical(sumLo0, 8);
            var shiftLo1 = Avx2.ShiftRightLogical(sumLo1, 8);
            var shiftLo2 = Avx2.ShiftRightLogical(sumLo2, 8);
            var shiftLo3 = Avx2.ShiftRightLogical(sumLo3, 8);
            var shiftHi0 = Avx2.ShiftRightLogical(sumHi0, 8);
            var shiftHi1 = Avx2.ShiftRightLogical(sumHi1, 8);
            var shiftHi2 = Avx2.ShiftRightLogical(sumHi2, 8);
            var shiftHi3 = Avx2.ShiftRightLogical(sumHi3, 8);

            // Narrow back to bytes.
            var packed0 = Avx2.PackUnsignedSaturate(shiftLo0.AsInt16(), shiftHi0.AsInt16()).AsUInt32();
            var packed1 = Avx2.PackUnsignedSaturate(shiftLo1.AsInt16(), shiftHi1.AsInt16()).AsUInt32();
            var packed2 = Avx2.PackUnsignedSaturate(shiftLo2.AsInt16(), shiftHi2.AsInt16()).AsUInt32();
            var packed3 = Avx2.PackUnsignedSaturate(shiftLo3.AsInt16(), shiftHi3.AsInt16()).AsUInt32();

            // Drop the scaled alpha byte and restore the original one.
            packed0 = Avx2.And(packed0, mask);
            packed1 = Avx2.And(packed1, mask);
            packed2 = Avx2.And(packed2, mask);
            packed3 = Avx2.And(packed3, mask);
            packed0 = Avx2.Or(packed0, alpha0);
            packed1 = Avx2.Or(packed1, alpha1);
            packed2 = Avx2.Or(packed2, alpha2);
            packed3 = Avx2.Or(packed3, alpha3);

            Avx2.Store(dataPtr + offset + 0x00, packed0);
            Avx2.Store(dataPtr + offset + 0x08, packed1);
            Avx2.Store(dataPtr + offset + 0x10, packed2);
            Avx2.Store(dataPtr + offset + 0x18, packed3);
        }
    }

    // Pixels that do not fill a whole unrolled iteration (fewer than registerElements remain)
    // are handled by the scalar routine.
    if (offset < data.Length)
    {
        ProcessTextureScalar(data.SliceUnsafe(offset));
    }
}
/// <summary>
/// Returns the index of <paramref name="item"/> in <paramref name="span"/>, or -1 when it is
/// not present. Implemented using 'Avx2' intrinsics.
/// </summary>
/// <remarks>Without 'Avx2' support will behave as a simple loop.</remarks>
/// <param name="span">Values to search.</param>
/// <param name="item">Value to look for.</param>
/// <returns>Zero-based index of the item, or -1.</returns>
/// <exception cref="InvalidOperationException">
/// The vectorized path found the item more than once inside one 8-element block.
/// </exception>
public static unsafe int GetIndexIntrinsics(ReadOnlySpan<int> span, int item)
{
    // Pin the memory so the garbage collector cannot move it while raw pointers are in use.
    fixed (int* startPointer = span)
    {
        int* endPointer = startPointer + span.Length;
        int* pointer = startPointer;

        if (Avx2.IsSupported)
        {
            // Eight copies of 'item', one per 32-bit lane.
            Vector256<int> itemVector = Vector256.Create(item);

            // Walk the span 8 elements at a time (256 bit / 32 bit = 8). Testing the remaining
            // count also vectorizes the last full block (the former 'pointer + 8 < endPointer'
            // bound left it to the scalar loop) and never forms a pointer past the range.
            for (; endPointer - pointer >= 8; pointer += 8)
            {
                Vector256<int> elements = Avx.LoadVector256(pointer);

                // Each lane becomes 0xFFFFFFFF where the element equals 'item', otherwise 0.
                Vector256<int> elementEquals = Avx2.CompareEqual(elements, itemVector);

                // Collapse the 256-bit result into one int by taking the top bit of every byte:
                // a matching element contributes four consecutive set bits (one hex digit 'F').
                int mask = Avx2.MoveMask(elementEquals.AsByte());

                // Assuming the item exists at most once per block, the mask has exactly one
                // hex digit set, which allows a jump table.
                switch (mask)
                {
                    case 0x0000000F: // At element 0
                        return (int)(pointer - startPointer);
                    case 0x000000F0: // At element 1
                        return (int)(pointer + 1 - startPointer);
                    case 0x00000F00: // At element 2
                        return (int)(pointer + 2 - startPointer);
                    case 0x0000F000: // At element 3
                        return (int)(pointer + 3 - startPointer);
                    case 0x000F0000: // At element 4
                        return (int)(pointer + 4 - startPointer);
                    case 0x00F00000: // At element 5
                        return (int)(pointer + 5 - startPointer);
                    case 0x0F000000: // At element 6
                        return (int)(pointer + 6 - startPointer);
                    case unchecked((int)0xF0000000): // At element 7
                        return (int)(pointer + 7 - startPointer);
                    case 0x00000000: // Not found in this block
                        continue;
                    default:
                        throw new InvalidOperationException("Item found in span multiple times");
                }
            }
        }

        // Handle the remaining items (or everything, without Avx2) with a simple loop.
        for (; pointer < endPointer; pointer++)
        {
            if (*pointer == item)
            {
                return (int)(pointer - startPointer);
            }
        }
    }

    return -1;
}
/// <summary>
/// Shuffles the bytes of <paramref name="value"/> with the <c>Rot16</c> control mask and returns
/// the result as 32-bit elements.
/// </summary>
/// <remarks>
/// Going by the method name, the mask rotates each 32-bit element left by 16 bits; the mask is
/// defined outside this method.
/// </remarks>
public static Vector256<uint> RotateLeftUInt32_16(this Vector256<uint> value)
    => Avx2.Shuffle(value.AsByte(), Rot16).AsUInt32();
/// <summary>
/// Shuffles the bytes of <paramref name="value"/> with the <c>Reverse32_256</c> control mask and
/// returns the raw byte vector.
/// </summary>
/// <remarks>
/// Going by the method name, the mask reverses the byte order inside each 32-bit element; the
/// mask is defined outside this method.
/// </remarks>
public static Vector256<byte> ReverseEndianness32(this Vector256<uint> value)
    => Avx2.Shuffle(value.AsByte(), Reverse32_256);
/// <summary>
/// Shuffles the bytes of <paramref name="value"/> with the <c>VRot24</c> control mask and
/// reinterprets the result as <c>Vector256&lt;T&gt;</c>.
/// </summary>
/// <typeparam name="T">Element type of the vector; only used for reinterpretation.</typeparam>
/// <remarks>
/// Going by the method name, the mask rotates each 32-bit element left by 24 bits; the mask is
/// defined outside this method.
/// </remarks>
public static Vector256<T> RotateLeftUInt32_24<T>(this Vector256<T> value) where T : struct
    => Avx2.Shuffle(value.AsByte(), VRot24).As<byte, T>();
/// <summary>
/// Shuffles the bytes of <paramref name="value"/> with the <c>VReverse32</c> control mask and
/// reinterprets the result as <c>Vector256&lt;T&gt;</c>.
/// </summary>
/// <typeparam name="T">Element type of the vector; only used for reinterpretation.</typeparam>
/// <remarks>
/// Going by the method name, the mask reverses the byte order inside each 32-bit element; the
/// mask is defined outside this method.
/// </remarks>
public static Vector256<T> ReverseEndianness32<T>(this Vector256<T> value) where T : struct
    => Avx2.Shuffle(value.AsByte(), VReverse32).As<byte, T>();
/// <summary>
/// Shuffles the bytes of <paramref name="a"/> with the <c>VReverse128</c> control mask and
/// reinterprets the result as <c>Vector256&lt;T&gt;</c>.
/// </summary>
/// <typeparam name="T">Element type of the vector; only used for reinterpretation.</typeparam>
/// <remarks>
/// Going by the method name, the mask reverses the byte order inside each 128-bit lane; the
/// mask is defined outside this method.
/// </remarks>
public static Vector256<T> ReverseEndianness128<T>(this Vector256<T> a) where T : struct
    => Avx2.Shuffle(a.AsByte(), VReverse128).As<byte, T>();