/// <summary>
/// Adds two 256-bit vectors element by element, selecting the hardware intrinsic from <typeparamref name="T"/>.
/// Integer element types go through <c>Avx2.Add</c>; <see cref="float"/> and <see cref="double"/> go through <c>Avx.Add</c>.
/// </summary>
/// <typeparam name="T">Element type of both vectors.</typeparam>
/// <param name="left">First addend.</param>
/// <param name="right">Second addend.</param>
/// <returns>The element-wise sum, reinterpreted back to <c>Vector256&lt;T&gt;</c>.</returns>
/// <exception cref="NotSupportedException">
/// <typeparamref name="T"/> is not one of byte, sbyte, short, ushort, int, uint, long, ulong, float or double.
/// </exception>
public static Vector256<T> Vector256Add<T>(Vector256<T> left, Vector256<T> right)
    where T : struct
{
    // 8-bit elements
    if (typeof(T) == typeof(byte))
    {
        Vector256<byte> sum = Avx2.Add(left.AsByte(), right.AsByte());
        return sum.As<byte, T>();
    }

    if (typeof(T) == typeof(sbyte))
    {
        Vector256<sbyte> sum = Avx2.Add(left.AsSByte(), right.AsSByte());
        return sum.As<sbyte, T>();
    }

    // 16-bit elements
    if (typeof(T) == typeof(short))
    {
        Vector256<short> sum = Avx2.Add(left.AsInt16(), right.AsInt16());
        return sum.As<short, T>();
    }

    if (typeof(T) == typeof(ushort))
    {
        Vector256<ushort> sum = Avx2.Add(left.AsUInt16(), right.AsUInt16());
        return sum.As<ushort, T>();
    }

    // 32-bit integer elements
    if (typeof(T) == typeof(int))
    {
        Vector256<int> sum = Avx2.Add(left.AsInt32(), right.AsInt32());
        return sum.As<int, T>();
    }

    if (typeof(T) == typeof(uint))
    {
        Vector256<uint> sum = Avx2.Add(left.AsUInt32(), right.AsUInt32());
        return sum.As<uint, T>();
    }

    // 64-bit integer elements
    if (typeof(T) == typeof(long))
    {
        Vector256<long> sum = Avx2.Add(left.AsInt64(), right.AsInt64());
        return sum.As<long, T>();
    }

    if (typeof(T) == typeof(ulong))
    {
        Vector256<ulong> sum = Avx2.Add(left.AsUInt64(), right.AsUInt64());
        return sum.As<ulong, T>();
    }

    // Floating-point elements use the AVX (not AVX2) overloads.
    if (typeof(T) == typeof(float))
    {
        Vector256<float> sum = Avx.Add(left.AsSingle(), right.AsSingle());
        return sum.As<float, T>();
    }

    if (typeof(T) == typeof(double))
    {
        Vector256<double> sum = Avx.Add(left.AsDouble(), right.AsDouble());
        return sum.As<double, T>();
    }

    throw new NotSupportedException();
}
/// <summary>
/// Accumulates a histogram: for each processed pixel of <paramref name="bgra"/> a value is computed and the
/// matching entry of <paramref name="histo"/> is incremented.
/// </summary>
/// <param name="bgra">Source pixels, read as 32-bit values.</param>
/// <param name="stride">Element offset between the starts of consecutive rows (row <c>y</c> begins at <c>y * stride</c>).</param>
/// <param name="tileWidth">Number of pixels considered per row; the AVX2 path is only taken when this is at least 16.</param>
/// <param name="tileHeight">Number of rows to walk.</param>
/// <param name="greenToBlue">Passed through <c>LosslessUtils.Cst5b</c> and broadcast as the multiplier applied to the vector named <c>g</c>.</param>
/// <param name="redToBlue">Passed through <c>LosslessUtils.Cst5b</c> and broadcast as the multiplier applied to the vector named <c>r</c>.</param>
/// <param name="histo">Counters that are incremented in place, indexed by the computed per-pixel value.</param>
/// <remarks>
/// Visible AVX2 path (compiled only under SUPPORTS_RUNTIME_INTRINSICS): each step loads two vectors of eight
/// pixels, derives <c>r</c> via two byte shuffles OR-ed together and <c>gb</c>/<c>g</c> via masks and a saturating
/// pack, computes <c>gb - MultiplyHigh(g, multsg) - MultiplyHigh(r, multsr)</c> with byte-wise subtraction, masks
/// the result, spills the 16 ushort results to a stack buffer and bumps <c>histo</c> once per result.
/// The inner loop runs while <c>x &lt;= tileWidth - 16</c>, so only whole groups of 16 pixels per row are handled here.
/// The shuffle and mask vectors are defined outside this method; their contents are not visible here.
/// NOTE(review): the text of this method is cut off right after the AVX2 row loop. The closing of the
/// <c>if</c>, any remainder or fallback handling, and the <c>#endif</c> are not visible. Confirm against the full file.
/// </remarks>
public static void CollectColorBlueTransforms(Span <uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span <int> histo) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported && tileWidth >= 16) { const int span = 16; Span <ushort> values = stackalloc ushort[span]; var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue)); var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue)); for (int y = 0; y < tileHeight; y++) { Span <uint> srcSpan = bgra.Slice(y * stride); ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan); for (nint x = 0; x <= tileWidth - span; x += span) { nint input0Idx = x; nint input1Idx = x + (span / 2); Vector256 <byte> input0 = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte(); Vector256 <byte> input1 = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte(); Vector256 <byte> r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256); Vector256 <byte> r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256); Vector256 <byte> r = Avx2.Or(r0, r1); Vector256 <byte> gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256); Vector256 <byte> gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256); Vector256 <ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32()); Vector256 <byte> g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256); Vector256 <short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr); Vector256 <short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); Vector256 <byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte()); Vector256 <byte> d = Avx2.Subtract(c, a.AsByte()); Vector256 <byte> e = Avx2.And(d, CollectColorBlueTransformsBlueMask256); ref ushort outputRef = ref MemoryMarshal.GetReference(values); Unsafe.As <ushort, Vector256 <ushort> >(ref outputRef) = e.AsUInt16(); for (int i = 0; i < span; i++) { ++histo[values[i]]; } } }
/// <summary>
/// Rewrites <paramref name="data"/> in place, eight elements per step, using AVX2.
/// Each element is treated as a 32-bit value. Its top byte (bits 24-31, called "alpha" in this code) is kept,
/// and every other byte <c>c</c> is replaced by <c>(c * alpha + 0xFF) &gt;&gt; 8</c>.
/// Elements left over after the last full group of eight are passed to <c>ProcessTextureScalar</c>.
/// </summary>
/// <param name="data">Elements to process; modified in place.</param>
internal static unsafe void ProcessTextureAvx2(Span<Color8> data)
{
    // Number of 32-bit lanes in one register; asserted to equal the number of Color8 values per register.
    uint lanesPerVector = (uint)Vector256<uint>.Count;
    lanesPerVector.AssertEqual((uint)(sizeof(Vector256<uint>) / sizeof(Color8)));

    // After widening bytes to 16 bits, a pixel's alpha sits at byte 6 of its 8-byte group.
    // The shuffle copies that byte into the low byte of all four 16-bit slots of the same pixel;
    // 0xFF selectors produce zero for the high bytes.
    const byte alphaByte0 = 6;
    const byte alphaByte1 = alphaByte0 + 8;
    const byte alphaByte2 = alphaByte1 + 8;
    const byte alphaByte3 = alphaByte2 + 8;
    Vector256<byte> broadcastAlpha = Vector256.Create(
        alphaByte0, 0xFF, alphaByte0, 0xFF, alphaByte0, 0xFF, alphaByte0, 0xFF,
        alphaByte1, 0xFF, alphaByte1, 0xFF, alphaByte1, 0xFF, alphaByte1, 0xFF,
        alphaByte2, 0xFF, alphaByte2, 0xFF, alphaByte2, 0xFF, alphaByte2, 0xFF,
        alphaByte3, 0xFF, alphaByte3, 0xFF, alphaByte3, 0xFF, alphaByte3, 0xFF
    );
    Vector256<byte> zero = Vector256<byte>.Zero;
    Vector256<uint> keepAlpha = Vector256.Create(0xFF000000U);
    Vector256<uint> keepColor = Vector256.Create(0x00FFFFFFU);
    Vector256<ushort> roundUp = Vector256.Create((ushort)0x00FFU);

    uint offset = 0;
    fixed (Color8* pixels = data)
    {
        uint* words = (uint*)pixels;

        while (offset + (lanesPerVector - 1U) < data.Length)
        {
            uint* cursor = words + offset;
            Vector256<uint> source = Avx2.LoadVector256(cursor);

            // Remember the original top byte of every element.
            Vector256<uint> originalAlpha = Avx2.And(source, keepAlpha);

            // Widen each byte to 16 bits by interleaving with zero.
            Vector256<byte> sourceBytes = source.AsByte();
            Vector256<ushort> wideLow = Avx2.UnpackLow(sourceBytes, zero).AsUInt16();
            Vector256<ushort> wideHigh = Avx2.UnpackHigh(sourceBytes, zero).AsUInt16();

            // Replicate each pixel's alpha across its four 16-bit slots.
            Vector256<uint> alphaLow = Avx2.Shuffle(wideLow.AsByte(), broadcastAlpha).AsUInt32();
            Vector256<uint> alphaHigh = Avx2.Shuffle(wideHigh.AsByte(), broadcastAlpha).AsUInt32();

            // (channel * alpha + 0xFF) >> 8
            Vector256<ushort> scaledLow = Avx2.ShiftRightLogical(
                Avx2.Add(Avx2.MultiplyLow(wideLow, alphaLow.AsUInt16()), roundUp), 8);
            Vector256<ushort> scaledHigh = Avx2.ShiftRightLogical(
                Avx2.Add(Avx2.MultiplyLow(wideHigh, alphaHigh.AsUInt16()), roundUp), 8);

            // Narrow back to bytes, then restore the untouched alpha byte.
            Vector256<uint> result = Avx2.PackUnsignedSaturate(scaledLow.AsInt16(), scaledHigh.AsInt16()).AsUInt32();
            result = Avx2.And(result, keepColor);
            result = Avx2.Or(result, originalAlpha);

            Avx2.Store(cursor, result);
            offset += lanesPerVector;
        }
    }

    // Unlikely, but the element count may not be a multiple of the vector width:
    // hand whatever is left to the scalar implementation.
    if (offset < data.Length)
    {
        ProcessTextureScalar(data.SliceUnsafe(offset));
    }
}
/// <summary>
/// Four-way unrolled AVX2 variant: rewrites <paramref name="data"/> in place, 32 elements (four 256-bit
/// registers) per step. Each element is treated as a 32-bit value. Its top byte (bits 24-31, called "alpha"
/// in this code) is kept, and every other byte <c>c</c> is replaced by <c>(c * alpha + 0xFF) &gt;&gt; 8</c>.
/// Elements left over after the last full group of 32 are passed to <c>ProcessTextureScalar</c>.
/// </summary>
/// <param name="data">Elements to process; modified in place.</param>
internal static unsafe void ProcessTextureAvx2Unrolled(Span<Color8> data)
{
    // Number of registers handled per loop iteration.
    const uint unrollFactor = 4;

    uint registerElements = (uint)Vector256<uint>.Count * unrollFactor;
    // Sanity check: one iteration must cover exactly unrollFactor registers' worth of Color8 values.
    // The expected value has to include the unroll factor, otherwise the check compares 32 against 8.
    registerElements.AssertEqual((uint)(sizeof(Vector256<uint>) / sizeof(Color8)) * unrollFactor);

    // After widening bytes to 16 bits, a pixel's alpha sits at byte 6 of its 8-byte group.
    // The shuffle copies that byte into the low byte of all four 16-bit slots of the same pixel;
    // 0xFF selectors produce zero for the high bytes.
    const byte offset0 = 6;
    const byte offset1 = offset0 + 8;
    const byte offset2 = offset1 + 8;
    const byte offset3 = offset2 + 8;
    Vector256<byte> alphaShuffle = Vector256.Create(
        offset0, 0xFF, offset0, 0xFF, offset0, 0xFF, offset0, 0xFF,
        offset1, 0xFF, offset1, 0xFF, offset1, 0xFF, offset1, 0xFF,
        offset2, 0xFF, offset2, 0xFF, offset2, 0xFF, offset2, 0xFF,
        offset3, 0xFF, offset3, 0xFF, offset3, 0xFF, offset3, 0xFF
    );

    // Loop-invariant vectors, created once.
    Vector256<byte> zero = Vector256<byte>.Zero;
    Vector256<uint> alphaMask = Vector256.Create(0xFF000000U);
    Vector256<uint> mask = Vector256.Create(0x00FFFFFFU);
    Vector256<ushort> addend = Vector256.Create((ushort)0x00FFU);

    uint offset;
    fixed (Color8* dataPtr8 = data)
    {
        uint* dataPtr = (uint*)dataPtr8;

        for (offset = 0; offset + (registerElements - 1U) < data.Length; offset += registerElements)
        {
            // The four registers are processed stage by stage (interleaved) on purpose.
            Vector256<uint> rawColor0 = Avx2.LoadVector256(dataPtr + offset + 0x00);
            Vector256<uint> rawColor1 = Avx2.LoadVector256(dataPtr + offset + 0x08);
            Vector256<uint> rawColor2 = Avx2.LoadVector256(dataPtr + offset + 0x10);
            Vector256<uint> rawColor3 = Avx2.LoadVector256(dataPtr + offset + 0x18);

            // Remember the original top byte of every element.
            Vector256<uint> alpha0 = Avx2.And(rawColor0, alphaMask);
            Vector256<uint> alpha1 = Avx2.And(rawColor1, alphaMask);
            Vector256<uint> alpha2 = Avx2.And(rawColor2, alphaMask);
            Vector256<uint> alpha3 = Avx2.And(rawColor3, alphaMask);

            // Widen each byte to 16 bits by interleaving with zero.
            Vector256<ushort> lo0 = Avx2.UnpackLow(rawColor0.AsByte(), zero).AsUInt16();
            Vector256<ushort> lo1 = Avx2.UnpackLow(rawColor1.AsByte(), zero).AsUInt16();
            Vector256<ushort> lo2 = Avx2.UnpackLow(rawColor2.AsByte(), zero).AsUInt16();
            Vector256<ushort> lo3 = Avx2.UnpackLow(rawColor3.AsByte(), zero).AsUInt16();

            Vector256<ushort> hi0 = Avx2.UnpackHigh(rawColor0.AsByte(), zero).AsUInt16();
            Vector256<ushort> hi1 = Avx2.UnpackHigh(rawColor1.AsByte(), zero).AsUInt16();
            Vector256<ushort> hi2 = Avx2.UnpackHigh(rawColor2.AsByte(), zero).AsUInt16();
            Vector256<ushort> hi3 = Avx2.UnpackHigh(rawColor3.AsByte(), zero).AsUInt16();

            // Replicate each pixel's alpha across its four 16-bit slots.
            Vector256<uint> alphaLo0 = Avx2.Shuffle(lo0.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaLo1 = Avx2.Shuffle(lo1.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaLo2 = Avx2.Shuffle(lo2.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaLo3 = Avx2.Shuffle(lo3.AsByte(), alphaShuffle).AsUInt32();

            Vector256<uint> alphaHi0 = Avx2.Shuffle(hi0.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaHi1 = Avx2.Shuffle(hi1.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaHi2 = Avx2.Shuffle(hi2.AsByte(), alphaShuffle).AsUInt32();
            Vector256<uint> alphaHi3 = Avx2.Shuffle(hi3.AsByte(), alphaShuffle).AsUInt32();

            // channel * alpha (fits in 16 bits: at most 255 * 255).
            Vector256<ushort> prodLo0 = Avx2.MultiplyLow(lo0, alphaLo0.AsUInt16());
            Vector256<ushort> prodLo1 = Avx2.MultiplyLow(lo1, alphaLo1.AsUInt16());
            Vector256<ushort> prodLo2 = Avx2.MultiplyLow(lo2, alphaLo2.AsUInt16());
            Vector256<ushort> prodLo3 = Avx2.MultiplyLow(lo3, alphaLo3.AsUInt16());

            Vector256<ushort> prodHi0 = Avx2.MultiplyLow(hi0, alphaHi0.AsUInt16());
            Vector256<ushort> prodHi1 = Avx2.MultiplyLow(hi1, alphaHi1.AsUInt16());
            Vector256<ushort> prodHi2 = Avx2.MultiplyLow(hi2, alphaHi2.AsUInt16());
            Vector256<ushort> prodHi3 = Avx2.MultiplyLow(hi3, alphaHi3.AsUInt16());

            // + 0xFF, then >> 8.
            var sumLo0 = Avx2.Add(prodLo0, addend);
            var sumLo1 = Avx2.Add(prodLo1, addend);
            var sumLo2 = Avx2.Add(prodLo2, addend);
            var sumLo3 = Avx2.Add(prodLo3, addend);

            var sumHi0 = Avx2.Add(prodHi0, addend);
            var sumHi1 = Avx2.Add(prodHi1, addend);
            var sumHi2 = Avx2.Add(prodHi2, addend);
            var sumHi3 = Avx2.Add(prodHi3, addend);

            var shiftLo0 = Avx2.ShiftRightLogical(sumLo0, 8);
            var shiftLo1 = Avx2.ShiftRightLogical(sumLo1, 8);
            var shiftLo2 = Avx2.ShiftRightLogical(sumLo2, 8);
            var shiftLo3 = Avx2.ShiftRightLogical(sumLo3, 8);

            var shiftHi0 = Avx2.ShiftRightLogical(sumHi0, 8);
            var shiftHi1 = Avx2.ShiftRightLogical(sumHi1, 8);
            var shiftHi2 = Avx2.ShiftRightLogical(sumHi2, 8);
            var shiftHi3 = Avx2.ShiftRightLogical(sumHi3, 8);

            // Narrow back to bytes.
            var packed0 = Avx2.PackUnsignedSaturate(shiftLo0.AsInt16(), shiftHi0.AsInt16()).AsUInt32();
            var packed1 = Avx2.PackUnsignedSaturate(shiftLo1.AsInt16(), shiftHi1.AsInt16()).AsUInt32();
            var packed2 = Avx2.PackUnsignedSaturate(shiftLo2.AsInt16(), shiftHi2.AsInt16()).AsUInt32();
            var packed3 = Avx2.PackUnsignedSaturate(shiftLo3.AsInt16(), shiftHi3.AsInt16()).AsUInt32();

            // Drop the computed top byte and restore the original one.
            packed0 = Avx2.And(packed0, mask);
            packed1 = Avx2.And(packed1, mask);
            packed2 = Avx2.And(packed2, mask);
            packed3 = Avx2.And(packed3, mask);

            packed0 = Avx2.Or(packed0, alpha0);
            packed1 = Avx2.Or(packed1, alpha1);
            packed2 = Avx2.Or(packed2, alpha2);
            packed3 = Avx2.Or(packed3, alpha3);

            Avx2.Store(dataPtr + offset + 0x00, packed0);
            Avx2.Store(dataPtr + offset + 0x08, packed1);
            Avx2.Store(dataPtr + offset + 0x10, packed2);
            Avx2.Store(dataPtr + offset + 0x18, packed3);
        }
    }

    // Handle the elements left over when the element count is not a multiple of the
    // per-iteration step (registerElements, i.e. four registers' worth).
    if (offset < data.Length)
    {
        ProcessTextureScalar(data.SliceUnsafe(offset));
    }
}
/// <summary>
/// Fills two fields of <paramref name="answer"/> from <paramref name="bytes"/>:
/// <c>rawbytes</c> receives the input unchanged, and <c>high_nibbles</c> receives, for every byte,
/// its upper four bits moved down into the low four bits.
/// </summary>
/// <param name="bytes">The 32 input bytes.</param>
/// <param name="answer">Structure whose <c>rawbytes</c> and <c>high_nibbles</c> fields are overwritten.</param>
internal static void avx_count_nibbles(Vector256<byte> bytes, ref avx_processed_utf_bytes answer)
{
    answer.rawbytes = bytes;

    // The shift is done on 16-bit lanes, so bits from the neighbouring byte leak into the
    // upper half of each byte; masking with 0x0F removes them.
    Vector256<byte> nibbleMask = Vector256.Create((byte)0x0F);
    Vector256<byte> shiftedDown = Avx2.ShiftRightLogical(bytes.AsUInt16(), 4).AsByte();
    answer.high_nibbles = Avx2.And(shiftedDown, nibbleMask);
}