public static int QuantizeBlock(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx2.IsSupported)
    {
        // Load all inputs.
        Vector256<short> input0 = Unsafe.As<short, Vector256<short>>(ref MemoryMarshal.GetReference(input));
        Vector256<ushort> iq0 = Unsafe.As<ushort, Vector256<ushort>>(ref mtx.IQ[0]);
        Vector256<ushort> q0 = Unsafe.As<ushort, Vector256<ushort>>(ref mtx.Q[0]);

        // coeff = abs(in)
        Vector256<ushort> coeff0 = Avx2.Abs(input0);

        // coeff = abs(in) + sharpen
        Vector256<short> sharpen0 = Unsafe.As<short, Vector256<short>>(ref mtx.Sharpen[0]);
        coeff0 = Avx2.Add(coeff0, sharpen0.AsUInt16());

        // out = (coeff * iQ + B) >> QFIX
        // Doing calculations with 32b precision (QFIX = 17).
        // out = (coeff * iQ)
        Vector256<ushort> coeffiQ0H = Avx2.MultiplyHigh(coeff0, iq0);
        Vector256<ushort> coeffiQ0L = Avx2.MultiplyLow(coeff0, iq0);
        Vector256<ushort> out00 = Avx2.UnpackLow(coeffiQ0L, coeffiQ0H);
        Vector256<ushort> out08 = Avx2.UnpackHigh(coeffiQ0L, coeffiQ0H);

        // out = (coeff * iQ + B)
        Vector256<uint> bias00 = Unsafe.As<uint, Vector256<uint>>(ref mtx.Bias[0]);
        Vector256<uint> bias08 = Unsafe.As<uint, Vector256<uint>>(ref mtx.Bias[8]);
        out00 = Avx2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16();
        out08 = Avx2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16();

        // out = QUANTDIV(coeff, iQ, B, QFIX)
        out00 = Avx2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16();
        out08 = Avx2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16();

        // Pack result as 16b.
        Vector256<short> out0 = Avx2.PackSignedSaturate(out00.AsInt32(), out08.AsInt32());

        // if (coeff > 2047) coeff = 2047
        out0 = Avx2.Min(out0, MaxCoeff2047Vec256);

        // Put the sign back.
        out0 = Avx2.Sign(out0, input0);

        // in = out * Q
        input0 = Avx2.MultiplyLow(out0, q0.AsInt16());
        ref short inputRef = ref MemoryMarshal.GetReference(input);
        Unsafe.As<short, Vector256<short>>(ref inputRef) = input0;

        // Zig-zag the output before storing it.
        Vector256<byte> tmp256 = Avx2.Shuffle(out0.AsByte(), Cst256);
        Vector256<byte> tmp78 = Avx2.Shuffle(out0.AsByte(), Cst78);

        // Reverse the order of the 16-byte lanes.
        Vector256<byte> tmp87 = Avx2.Permute2x128(tmp78, tmp78, 1);
        Vector256<short> outZ = Avx2.Or(tmp256, tmp87).AsInt16();
        ref short outputRef = ref MemoryMarshal.GetReference(output);
        Unsafe.As<short, Vector256<short>>(ref outputRef) = outZ;

        // Reconstructed ending (the original snippet is truncated here): report
        // whether any quantized coefficient is non-zero.
        return Avx2.MoveMask(Avx2.CompareEqual(outZ, Vector256<short>.Zero).AsByte()) != -1 ? 1 : 0;
    }
#endif

    // The scalar fallback path is omitted from the original snippet.
    throw new NotSupportedException();
}
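// For reference, a scalar sketch of the per-coefficient arithmetic that the AVX2
// path above vectorizes: QUANTDIV is (coeff * iQ + B) >> QFIX with QFIX = 17,
// followed by the 2047 clamp and sign restoration. The helper name and parameter
// shapes are illustrative, not part of the original code.
private static int QuantizeCoeffScalar(short input, uint iq, uint bias, uint sharpen)
{
    uint coeff = (uint)Math.Abs((int)input) + sharpen; // coeff = abs(in) + sharpen
    int level = (int)(((coeff * iq) + bias) >> 17);    // QUANTDIV: (coeff * iQ + B) >> QFIX
    if (level > 2047)
    {
        level = 2047;                                  // clamp to the max coefficient value
    }

    return input < 0 ? -level : level;                 // put the sign back
}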
public static Vector256<float> Log(Vector256<float> value)
{
    // Non-positive inputs are invalid; the mask is used to force NaN at the end.
    Vector256<float> invalidMask = Compare(value, Vector256<float>.Zero, FloatComparisonMode.LessThanOrEqualOrderedNonSignaling);
    Vector256<float> x = Max(value, MinNormPos.AsSingle()); // Cut off denormalized values.

    // Extract the exponent and normalize the mantissa into [0.5, 1).
    Vector256<int> ei = Avx2.ShiftRightLogical(x.AsInt32(), 23);
    x = Or(And(x, MantMask.AsSingle()), Point5);
    ei = Avx2.Subtract(ei, Ox7); // Remove the IEEE-754 exponent bias.
    Vector256<float> e = Add(ConvertToVector256Single(ei), One);

    // If x < sqrt(0.5): decrement the exponent and use 2x - 1 instead of x - 1.
    Vector256<float> mask = Compare(x, Sqrthf, FloatComparisonMode.LessThanOrderedNonSignaling);
    Vector256<float> tmp = And(x, mask);
    x = Subtract(x, One);
    e = Subtract(e, And(One, mask));
    x = Add(x, tmp);

    // Evaluate the polynomial approximation via Horner's scheme.
    Vector256<float> z = Multiply(x, x);
    Vector256<float> y = LogP0;
    y = Add(Multiply(y, x), LogP1);
    y = Add(Multiply(y, x), LogP2);
    y = Add(Multiply(y, x), LogP3);
    y = Add(Multiply(y, x), LogP4);
    y = Add(Multiply(y, x), LogP5);
    y = Add(Multiply(y, x), LogP6);
    y = Add(Multiply(y, x), LogP7);
    y = Add(Multiply(y, x), LogP8);
    y = Multiply(Multiply(y, x), z);

    // Add back the exponent contribution: log(x) = p(m) + e * log(2).
    y = Add(y, Multiply(e, LogQ1));
    y = Subtract(y, Multiply(z, Point5));
    x = Add(Add(x, y), Multiply(e, LogQ2));

    // Propagate NaN for inputs <= 0.
    return Or(x, invalidMask);
}
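// A minimal usage sketch, assuming Log is exposed on a static helper class (here
// called SimdMath purely for illustration): evaluate eight lanes at once and
// compare each lane against MathF.Log. Lanes with non-positive inputs come back
// as NaN via invalidMask.
Vector256<float> v = Vector256.Create(0.5f, 1f, 2f, MathF.E, 10f, 100f, 0.001f, 1000f);
Vector256<float> lg = SimdMath.Log(v);
for (int i = 0; i < Vector256<float>.Count; i++)
{
    Debug.Assert(MathF.Abs(lg.GetElement(i) - MathF.Log(v.GetElement(i))) < 1e-4f);
}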
public static unsafe Vector128<int> AsInt(this Vector256<long> l)
{
    // Duplicate the low 32 bits of each long within its 128-bit lane:
    // control 0b10_00_10_00 selects ints (0, 2, 0, 2) per lane. For longs
    // holding (1, 2, 3, 4), the int view is (1,0,2,0 | 3,0,4,0) and the
    // shuffle yields (1,2,1,2 | 3,4,3,4).
    Vector256<int> v = Avx2.Shuffle(l.AsInt32(), 0b10_00_10_00);

    // Spill to the stack and reload the four narrowed values (1, 2, 3, 4),
    // which sit at int offsets 2..5 of the stored vector.
    int* content = stackalloc int[8];
    Avx.Store(content, v);
    return Sse2.LoadVector128(content + 2);
}
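// An alternative sketch (name illustrative) that avoids the stack round-trip by
// keeping the narrowing in registers: PermuteVar8x32 gathers the low 32 bits of
// each long (int indices 0, 2, 4, 6 in little-endian order) into the lower
// 128-bit lane, which GetLower() then returns directly.
public static Vector128<int> AsIntNoSpill(this Vector256<long> l)
{
    Vector256<int> idx = Vector256.Create(0, 2, 4, 6, 0, 0, 0, 0);
    return Avx2.PermuteVar8x32(l.AsInt32(), idx).GetLower();
}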
public static Vector256<T> Vector256Add<T>(Vector256<T> left, Vector256<T> right)
    where T : struct
{
    if (typeof(T) == typeof(byte))
    {
        return Avx2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
    }
    else if (typeof(T) == typeof(sbyte))
    {
        return Avx2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
    }
    else if (typeof(T) == typeof(short))
    {
        return Avx2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
    }
    else if (typeof(T) == typeof(ushort))
    {
        return Avx2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
    }
    else if (typeof(T) == typeof(int))
    {
        return Avx2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
    }
    else if (typeof(T) == typeof(uint))
    {
        return Avx2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
    }
    else if (typeof(T) == typeof(long))
    {
        return Avx2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
    }
    else if (typeof(T) == typeof(ulong))
    {
        return Avx2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
    }
    else if (typeof(T) == typeof(float))
    {
        return Avx.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
    }
    else if (typeof(T) == typeof(double))
    {
        return Avx.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
    }
    else
    {
        throw new NotSupportedException();
    }
}
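// A minimal usage sketch. Because typeof(T) == typeof(...) is a JIT-time constant
// for each generic instantiation, the dead branches are eliminated and each call
// compiles down to a single vector add instruction.
Vector256<int> a = Vector256.Create(1);
Vector256<int> b = Vector256.Create(2);
Vector256<int> sum = Vector256Add(a, b); // all eight lanes hold 3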
public static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx2.IsSupported && tileWidth >= 16)
    {
        const int span = 16;
        Span<ushort> values = stackalloc ushort[span];
        var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
        var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
        for (int y = 0; y < tileHeight; y++)
        {
            Span<uint> srcSpan = bgra.Slice(y * stride);
            ref uint inputRef = ref MemoryMarshal.GetReference(srcSpan);
            for (nint x = 0; x <= tileWidth - span; x += span)
            {
                // Load two blocks of 8 pixels each.
                nint input0Idx = x;
                nint input1Idx = x + (span / 2);
                Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();

                // Extract the red channels.
                Vector256<byte> r0 = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256);
                Vector256<byte> r1 = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256);
                Vector256<byte> r = Avx2.Or(r0, r1);

                // Extract the green and blue channels.
                Vector256<byte> gb0 = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256);
                Vector256<byte> gb1 = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256);
                Vector256<ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
                Vector256<byte> g = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256);

                // Compute the residual: blue minus the predicted red and green contributions.
                Vector256<short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
                Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
                Vector256<byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte());
                Vector256<byte> d = Avx2.Subtract(c, a.AsByte());
                Vector256<byte> e = Avx2.And(d, CollectColorBlueTransformsBlueMask256);

                // Accumulate the residuals into the histogram.
                ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = e.AsUInt16();
                for (int i = 0; i < span; i++)
                {
                    ++histo[values[i]];
                }
            }
        }

        // The remaining (tileWidth % 16) columns and the non-AVX2 fallback are
        // handled by the scalar path, omitted from the original snippet.
    }
#endif
}
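// A per-pixel reference sketch of the same residual computation, following the
// lossless WebP color-transform definition ColorTransformDelta(p, c) = (p * c) >> 5.
// The helper name and the packed layout (blue in the low byte, green in bits 8..15,
// red in bits 16..23) are assumptions here, not taken from the original code.
private static byte TransformColorBlueScalar(sbyte greenToBlue, sbyte redToBlue, uint pixel)
{
    sbyte green = (sbyte)(pixel >> 8);
    sbyte red = (sbyte)(pixel >> 16);
    int newBlue = (int)(pixel & 0xff);
    newBlue -= (greenToBlue * green) >> 5; // ColorTransformDelta(greenToBlue, green)
    newBlue -= (redToBlue * red) >> 5;     // ColorTransformDelta(redToBlue, red)
    return (byte)(newBlue & 0xff);         // the value whose histogram bin is incremented
}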
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block)
{
    DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

    fixed (byte* shuffleVectorsPtr = AvxShuffleMasks)
    {
        Vector256<byte> rowsAB = block.V01.AsByte();
        Vector256<byte> rowsCD = block.V23.AsByte();
        Vector256<byte> rowsEF = block.V45.AsByte();
        Vector256<byte> rowsGH = block.V67.AsByte();

        // rows 0 1
        Vector256<int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
        Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
        row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32)));
        Vector256<int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
        Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
        row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)));
        Vector256<byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
        Vector256<byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32)));
        Vector256<byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);

        // rows 2 3
        Vector256<int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
        Vector256<byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
        Vector256<byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32)));
        Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
        row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32)));
        Vector256<byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32)));
        Vector256<byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
        Vector256<byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)));
        Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));

        // rows 4 5
        Vector256<byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)));
        Vector256<byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
        Vector256<byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)));
        Vector256<int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
        Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
        row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)));
        Vector256<byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)));
        Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));

        // rows 6 7
        Vector256<byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)));
        Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
        row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)));
        Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
        row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)));
        Vector256<byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);

        block.V01 = row01.AsInt16();
        block.V23 = row23.AsInt16();
        block.V45 = row45.AsInt16();
        block.V67 = row67.AsInt16();
    }
}
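// For comparison, a scalar sketch of the same reordering using the standard
// 8x8 zig-zag index table (output position i takes the input at ZigZagOrder[i]).
// This assumes Block8x8 exposes a 64-element short indexer; the table and helper
// name here are illustrative, not taken from the original code.
private static readonly byte[] ZigZagOrder =
{
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

private static void ApplyZigZagOrderingScalar(ref Block8x8 block)
{
    Block8x8 source = block; // copy, so reads are not disturbed by writes
    for (int i = 0; i < 64; i++)
    {
        block[i] = source[ZigZagOrder[i]];
    }
}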
private static void Set38(Vector256<short> vector0, Vector256<short> vector1, ref char output)
{
    // Writes 38 bytes (19 UTF-16 chars): chars 0..15 from vector0, chars 16..17
    // from the first int (two 16-bit lanes) of vector1, and char 18 from its
    // third 16-bit lane.
    Unsafe.As<char, Vector256<short>>(ref output) = vector0;
    Unsafe.Add(ref Unsafe.As<char, int>(ref output), 8) = vector1.AsInt32().GetElement(0);
    Unsafe.Add(ref Unsafe.As<char, short>(ref output), 18) = vector1.GetElement(2);
}
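// A minimal usage sketch: the destination must have room for at least 19 chars,
// since Set38 writes past the 16 chars covered by vector0.
Vector256<short> v0 = Vector256.Create((short)'a');
Vector256<short> v1 = Vector256.Create((short)'b');
Span<char> dest = stackalloc char[19];
Set38(v0, v1, ref MemoryMarshal.GetReference(dest));
// dest now holds sixteen 'a' chars followed by three 'b' chars.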
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics,
/// reading the block as transposed (rows and columns swapped).
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
{
    DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

    fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(AvxShuffleMasks))
    {
        Vector256<byte> rowAB = block.V01.AsByte();
        Vector256<byte> rowCD = block.V23.AsByte();
        Vector256<byte> rowEF = block.V45.AsByte();
        Vector256<byte> rowGH = block.V67.AsByte();

        /* row01 - A0 B0 A1 A2 B1 C0 D0 C1 | B2 A3 A4 B3 C2 D1 E0 F0 */
        Vector256<int> crln_01_AB_CD = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
        Vector256<byte> row01_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_01_AB_CD).AsByte();
        row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32)));
        Vector256<byte> row01_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_AB_CD).AsByte();
        row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)));
        Vector256<int> crln_01_23_EF_23_CD = Avx.LoadVector256(shuffleVectorsPtr + (3 * 32)).AsInt32();
        Vector256<byte> row01_23_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_01_23_EF_23_CD).AsByte();
        Vector256<byte> row01_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32)));
        Vector256<byte> row01 = Avx2.Or(row01_AB, Avx2.Or(row01_CD, row01_EF));

        /* row23 - E1 D2 C3 B4 A5 A6 B5 C4 | D3 E2 F1 G0 H0 G1 F2 E3 */
        Vector256<int> crln_23_AB_23_45_GH = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
        Vector256<byte> row23_45_AB = Avx2.PermuteVar8x32(rowAB.AsInt32(), crln_23_AB_23_45_GH).AsByte();
        Vector256<byte> row23_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32)));
        Vector256<byte> row23_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_01_23_EF_23_CD).AsByte();
        row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32)));
        Vector256<byte> row23_EF = Avx2.Shuffle(row01_23_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32)));
        Vector256<byte> row23_45_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_23_AB_23_45_GH).AsByte();
        Vector256<byte> row23_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)));
        Vector256<byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));

        /* row45 - D4 C5 B6 A7 B7 C6 D5 E4 | F3 G2 H1 H2 G3 F4 E5 D6 */
        Vector256<byte> row45_AB = Avx2.Shuffle(row23_45_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)));
        Vector256<int> crln_45_67_CD_45_EF = Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsInt32();
        Vector256<byte> row45_67_CD = Avx2.PermuteVar8x32(rowCD.AsInt32(), crln_45_67_CD_45_EF).AsByte();
        Vector256<byte> row45_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)));
        Vector256<byte> row45_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_45_67_CD_45_EF).AsByte();
        row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)));
        Vector256<byte> row45_GH = Avx2.Shuffle(row23_45_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)));
        Vector256<byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));

        /* row67 - C7 D7 E6 F5 G4 H3 H4 G5 | F6 E7 F7 G6 H5 H6 G7 H7 */
        Vector256<byte> row67_CD = Avx2.Shuffle(row45_67_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)));
        Vector256<int> crln_67_EF_67_GH = Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsInt32();
        Vector256<byte> row67_EF = Avx2.PermuteVar8x32(rowEF.AsInt32(), crln_67_EF_67_GH).AsByte();
        row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)));
        Vector256<byte> row67_GH = Avx2.PermuteVar8x32(rowGH.AsInt32(), crln_67_EF_67_GH).AsByte();
        row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (18 * 32)));
        Vector256<byte> row67 = Avx2.Or(row67_CD, Avx2.Or(row67_EF, row67_GH));

        block.V01 = row01.AsInt16();
        block.V23 = row23.AsInt16();
        block.V45 = row45.AsInt16();
        block.V67 = row67.AsInt16();
    }
}
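// A minimal dispatch sketch: callers typically guard the AVX2 path behind
// Avx2.IsSupported and fall back to a scalar ordering otherwise. The scalar
// fallback name here is illustrative, not taken from the original code.
if (Avx2.IsSupported)
{
    ApplyTransposingZigZagOrderingAvx2(ref block);
}
else
{
    ApplyTransposingZigZagOrderingScalar(ref block);
}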