/// <summary>
/// Runs <see cref="QuantizeBlock"/> on two consecutive 16-element blocks and packs both results into one value.
/// </summary>
/// <param name="input">The input coefficients; elements [0, 16) form the first block and [16, 32) the second.</param>
/// <param name="output">The destination for the quantized coefficients, laid out like <paramref name="input"/>.</param>
/// <param name="mtx">The quantization matrix that is forwarded to both <see cref="QuantizeBlock"/> calls.</param>
/// <returns>
/// The result for the first block OR-ed with the result for the second block shifted left by one bit.
/// </returns>
public static int Quantize2Blocks(Span<short> input, Span<short> output, ref Vp8Matrix mtx)
{
    const int BlockSize = 16;

    // The first block is always processed before the second one.
    int firstResult = QuantizeBlock(input.Slice(0, BlockSize), output.Slice(0, BlockSize), ref mtx);
    int secondResult = QuantizeBlock(input.Slice(BlockSize, BlockSize), output.Slice(BlockSize, BlockSize), ref mtx);

    // First block occupies bit 0, second block is moved up to bit 1.
    // NOTE(review): presumably QuantizeBlock yields 0 or 1 so the two results do not overlap - confirm.
    return firstResult | (secondResult << 1);
}
public static int QuantizeBlock(Span <short> input, Span <short> output, ref Vp8Matrix mtx) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx2.IsSupported) { // Load all inputs. Vector256 <short> input0 = Unsafe.As <short, Vector256 <short> >(ref MemoryMarshal.GetReference(input)); Vector256 <ushort> iq0 = Unsafe.As <ushort, Vector256 <ushort> >(ref mtx.IQ[0]); Vector256 <ushort> q0 = Unsafe.As <ushort, Vector256 <ushort> >(ref mtx.Q[0]); // coeff = abs(in) Vector256 <ushort> coeff0 = Avx2.Abs(input0); // coeff = abs(in) + sharpen Vector256 <short> sharpen0 = Unsafe.As <short, Vector256 <short> >(ref mtx.Sharpen[0]); Avx2.Add(coeff0.AsInt16(), sharpen0); // out = (coeff * iQ + B) >> QFIX // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) Vector256 <ushort> coeffiQ0H = Avx2.MultiplyHigh(coeff0, iq0); Vector256 <ushort> coeffiQ0L = Avx2.MultiplyLow(coeff0, iq0); Vector256 <ushort> out00 = Avx2.UnpackLow(coeffiQ0L, coeffiQ0H); Vector256 <ushort> out08 = Avx2.UnpackHigh(coeffiQ0L, coeffiQ0H); // out = (coeff * iQ + B) Vector256 <uint> bias00 = Unsafe.As <uint, Vector256 <uint> >(ref mtx.Bias[0]); Vector256 <uint> bias08 = Unsafe.As <uint, Vector256 <uint> >(ref mtx.Bias[8]); out00 = Avx2.Add(out00.AsInt32(), bias00.AsInt32()).AsUInt16(); out08 = Avx2.Add(out08.AsInt32(), bias08.AsInt32()).AsUInt16(); // out = QUANTDIV(coeff, iQ, B, QFIX) out00 = Avx2.ShiftRightArithmetic(out00.AsInt32(), WebpConstants.QFix).AsUInt16(); out08 = Avx2.ShiftRightArithmetic(out08.AsInt32(), WebpConstants.QFix).AsUInt16(); // Pack result as 16b. Vector256 <short> out0 = Avx2.PackSignedSaturate(out00.AsInt32(), out08.AsInt32()); // if (coeff > 2047) coeff = 2047 out0 = Avx2.Min(out0, MaxCoeff2047Vec256); // Put the sign back. out0 = Avx2.Sign(out0, input0); // in = out * Q input0 = Avx2.MultiplyLow(out0, q0.AsInt16()); ref short inputRef = ref MemoryMarshal.GetReference(input); Unsafe.As <short, Vector256 <short> >(ref inputRef) = input0; // zigzag the output before storing it. 
Vector256 <byte> tmp256 = Avx2.Shuffle(out0.AsByte(), Cst256); Vector256 <byte> tmp78 = Avx2.Shuffle(out0.AsByte(), Cst78); // Reverse the order of the 16-byte lanes. Vector256 <byte> tmp87 = Avx2.Permute2x128(tmp78, tmp78, 1); Vector256 <short> outZ = Avx2.Or(tmp256, tmp87).AsInt16(); ref short outputRef = ref MemoryMarshal.GetReference(output);