/// <summary>
/// Runs the inverse DCT pass over the 'L' halves (<c>V0L</c>..<c>V7L</c>) of <paramref name="s"/>
/// and stores the result in the 'L' halves of <paramref name="d"/>.
/// </summary>
/// <param name="s">Source block; only its <c>V0L</c>..<c>V7L</c> members are read.</param>
/// <param name="d">Destination block; only its <c>V0L</c>..<c>V7L</c> members are written.</param>
public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
{
    // Every source row is read before the first destination write.
    Vector4 row0 = s.V0L;
    Vector4 row1 = s.V1L;
    Vector4 row2 = s.V2L;
    Vector4 row3 = s.V3L;
    Vector4 row4 = s.V4L;
    Vector4 row5 = s.V5L;
    Vector4 row6 = s.V6L;
    Vector4 row7 = s.V7L;

    // Contribution of the odd-indexed rows (1, 3, 5, 7).
    Vector4 sum17 = row1 + row7;
    Vector4 sum37 = row3 + row7;
    Vector4 sum35 = row3 + row5;
    Vector4 sum15 = row1 + row5;
    Vector4 oddCommon = (sum17 + sum35) * C_1_175876;

    Vector4 term37 = (sum37 * C_1_961571) + oddCommon;
    Vector4 term15 = (sum15 * C_0_390181) + oddCommon;
    Vector4 term17 = sum17 * C_0_899976;
    Vector4 term35 = sum35 * C_2_562915;

    Vector4 odd3 = (row7 * C_0_298631) + term17 + term37;
    Vector4 odd2 = (row5 * C_2_053120) + term35 + term15;
    Vector4 odd1 = (row3 * C_3_072711) + term35 + term37;
    Vector4 odd0 = (row1 * C_1_501321) + term17 + term15;

    // Contribution of the even-indexed rows (0, 2, 4, 6).
    Vector4 evenCommon = (row2 + row6) * C_0_541196;
    Vector4 sum04 = row0 + row4;
    Vector4 diff04 = row0 - row4;
    Vector4 term6 = evenCommon + (row6 * C_1_847759);
    Vector4 term2 = evenCommon + (row2 * C_0_765367);

    Vector4 even0 = sum04 + term2;
    Vector4 even3 = sum04 - term2;
    Vector4 even1 = diff04 + term6;
    Vector4 even2 = diff04 - term6;

    // Output rows k and 7-k are the sum and difference of the matching even/odd terms.
    d.V0L = even0 + odd0;
    d.V7L = even0 - odd0;
    d.V1L = even1 + odd1;
    d.V6L = even1 - odd1;
    d.V2L = even2 + odd2;
    d.V5L = even2 - odd2;
    d.V3L = even3 + odd3;
    d.V4L = even3 - odd3;
}
/// <summary>
/// Performs the inverse discrete cosine transform pass on an 8x8 matrix.
/// </summary>
/// <remarks>
/// When built with <c>SUPPORTS_RUNTIME_INTRINSICS</c> and AVX is available at runtime the work is
/// handed to <c>IDCT8x8_Avx</c>; otherwise the left and right 8x4 halves are processed one after the other.
/// </remarks>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx.IsSupported)
    {
        IDCT8x8_Avx(ref s, ref d);
        return;
    }
#endif

    // Fallback: two half-width passes.
    IDCT8x4_LeftPart(ref s, ref d);
    IDCT8x4_RightPart(ref s, ref d);
}
/// <summary>
/// Computes the floating point IDCT of <paramref name="src"/> into <paramref name="dest"/>.
/// The caller supplies the scratch block <paramref name="temp"/> so that none has to be created here.
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
/// </summary>
/// <param name="src">Source</param>
/// <param name="dest">Destination</param>
/// <param name="temp">Temporary block provided by the caller; its contents are overwritten.</param>
public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
{
    // First pass: transpose, then transform both 8x4 halves.
    src.TransposeInto(ref temp);
    IDCT8x4_LeftPart(ref temp, ref dest);
    IDCT8x4_RightPart(ref temp, ref dest);

    // Second pass: transpose the intermediate result and transform again.
    dest.TransposeInto(ref temp);
    IDCT8x4_LeftPart(ref temp, ref dest);
    IDCT8x4_RightPart(ref temp, ref dest);

    // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
    dest.MultiplyInPlace(C_0_125);
}
/// <summary>
/// Computes the floating point IDCT of <paramref name="src"/> into <paramref name="dest"/>.
/// The caller supplies the scratch block <paramref name="temp"/> so that none has to be created here.
/// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
/// </summary>
/// <param name="src">Source</param>
/// <param name="dest">Destination</param>
/// <param name="temp">Temporary block provided by the caller; its contents are overwritten.</param>
public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
{
    // TODO: Transpose is a bottleneck now. We need full AVX support to optimize it:
    // https://github.com/dotnet/corefx/issues/22940

    // First pass: transpose, then transform both 8x4 halves.
    src.TransposeInto(ref temp);
    IDCT8x4_LeftPart(ref temp, ref dest);
    IDCT8x4_RightPart(ref temp, ref dest);

    // Second pass: transpose the intermediate result and transform again.
    dest.TransposeInto(ref temp);
    IDCT8x4_LeftPart(ref temp, ref dest);
    IDCT8x4_RightPart(ref temp, ref dest);

    // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
    dest.MultiplyInplace(C_0_125);
}
/// <summary>
/// AVX counterpart of running <see cref="FDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> followed by
/// <see cref="FDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>: each 8-wide row is handled in one vector.
/// </summary>
/// <remarks>
/// The whole body sits inside <c>#if SUPPORTS_RUNTIME_INTRINSICS</c>; without that symbol the method
/// compiles to an empty body and <paramref name="d"/> is left untouched.
/// </remarks>
/// <param name="s">Source</param>
/// <param name="d">Destination</param>
public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");

    // Pair row k with row 7-k. All source rows are consumed here, before any write to d.
    Vector256<float> sum07 = Avx.Add(s.V0, s.V7);
    Vector256<float> diff07 = Avx.Subtract(s.V0, s.V7);
    Vector256<float> sum16 = Avx.Add(s.V1, s.V6);
    Vector256<float> diff16 = Avx.Subtract(s.V1, s.V6);
    Vector256<float> sum25 = Avx.Add(s.V2, s.V5);
    Vector256<float> diff25 = Avx.Subtract(s.V2, s.V5);
    Vector256<float> sum34 = Avx.Add(s.V3, s.V4);
    Vector256<float> diff34 = Avx.Subtract(s.V3, s.V4);

    // Rows 0 and 4
    Vector256<float> outerSum = Avx.Add(sum07, sum34);
    Vector256<float> innerSum = Avx.Add(sum16, sum25);
    d.V0 = Avx.Add(outerSum, innerSum);
    d.V4 = Avx.Subtract(outerSum, innerSum);

    // Rows 2 and 6
    Vector256<float> outerDiff = Avx.Subtract(sum07, sum34);
    Vector256<float> innerDiff = Avx.Subtract(sum16, sum25);
    d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(innerDiff, C_V_0_5411), outerDiff, C_V_1_3065);
    d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(innerDiff, C_V_1_3065), outerDiff, C_V_0_5411);

    // Intermediate terms built from the row differences.
    Vector256<float> mixA = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(diff34, C_V_1_1758), diff07, C_V_0_7856);
    Vector256<float> mixB = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(diff34, C_V_0_7856), diff07, C_V_1_1758);
    Vector256<float> mixC = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(diff25, C_V_1_3870), C_V_0_2758, diff16);
    Vector256<float> mixD = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, diff25), diff16, C_V_1_3870);

    // Rows 3 and 5
    d.V3 = Avx.Subtract(mixB, mixC);
    d.V5 = Avx.Subtract(mixA, mixD);

    // Rows 1 and 7
    Vector256<float> scaledBC = Avx.Multiply(Avx.Add(mixB, mixC), C_V_InvSqrt2);
    Vector256<float> scaledAD = Avx.Multiply(Avx.Add(mixA, mixD), C_V_InvSqrt2);
    d.V1 = Avx.Add(scaledBC, scaledAD);
    d.V7 = Avx.Subtract(scaledBC, scaledAD);
#endif
}
/// <summary>
/// Runs a 1D floating point FDCT over the eight rows of an 8x8 matrix with AVX, overwriting the rows in place.
/// </summary>
/// <remarks>
/// Requires Avx support.
/// </remarks>
/// <param name="block">Input matrix; it also receives the result.</param>
public static void FDCT8x8_Avx(ref Block8x8F block)
{
    DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");

    // Pair row k with row 7-k. Every row is consumed here, before any row is overwritten.
    Vector256<float> sum07 = Avx.Add(block.V0, block.V7);
    Vector256<float> dif07 = Avx.Subtract(block.V0, block.V7);
    Vector256<float> sum16 = Avx.Add(block.V1, block.V6);
    Vector256<float> dif16 = Avx.Subtract(block.V1, block.V6);
    Vector256<float> sum25 = Avx.Add(block.V2, block.V5);
    Vector256<float> dif25 = Avx.Subtract(block.V2, block.V5);
    Vector256<float> sum34 = Avx.Add(block.V3, block.V4);
    Vector256<float> dif34 = Avx.Subtract(block.V3, block.V4);

    // Even part: rows 0, 4, 2 and 6 come from the pairwise sums.
    Vector256<float> outerSum = Avx.Add(sum07, sum34);
    Vector256<float> outerDif = Avx.Subtract(sum07, sum34);
    Vector256<float> innerSum = Avx.Add(sum16, sum25);
    Vector256<float> innerDif = Avx.Subtract(sum16, sum25);

    block.V0 = Avx.Add(outerSum, innerSum);
    block.V4 = Avx.Subtract(outerSum, innerSum);

    Vector256<float> evenTerm = Avx.Multiply(Avx.Add(innerDif, outerDif), mm256_F_0_7071);
    block.V2 = Avx.Add(outerDif, evenTerm);
    block.V6 = Avx.Subtract(outerDif, evenTerm);

    // Odd part: rows 5, 3, 1 and 7 come from the pairwise differences.
    Vector256<float> odd45 = Avx.Add(dif34, dif25);
    Vector256<float> odd56 = Avx.Add(dif25, dif16);
    Vector256<float> odd67 = Avx.Add(dif16, dif07);

    Vector256<float> shared = Avx.Multiply(Avx.Subtract(odd45, odd67), mm256_F_0_3826);
    Vector256<float> termLow = SimdUtils.HwIntrinsics.MultiplyAdd(shared, mm256_F_0_5411, odd45);
    Vector256<float> termHigh = SimdUtils.HwIntrinsics.MultiplyAdd(shared, mm256_F_1_3065, odd67);
    Vector256<float> termMid = Avx.Multiply(odd56, mm256_F_0_7071);

    Vector256<float> basePlus = Avx.Add(dif07, termMid);
    Vector256<float> baseMinus = Avx.Subtract(dif07, termMid);

    block.V5 = Avx.Add(baseMinus, termLow);
    block.V3 = Avx.Subtract(baseMinus, termLow);
    block.V1 = Avx.Add(basePlus, termHigh);
    block.V7 = Avx.Subtract(basePlus, termHigh);
}
/// <summary>
/// Applies the 2D floating point FDCT to <paramref name="block"/> in place, choosing an implementation at runtime.
/// </summary>
/// <remarks>
/// Order of preference: the AVX path (only when built with <c>SUPPORTS_RUNTIME_INTRINSICS</c> and AVX is
/// available), then the <c>Vector4</c> path when <see cref="Vector.IsHardwareAccelerated"/> is true,
/// and finally the scalar path.
/// </remarks>
/// <param name="block">Input matrix; it also receives the result.</param>
public static void TransformFDCT(ref Block8x8F block)
{
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Avx.IsSupported)
    {
        ForwardTransform_Avx(ref block);
        return;
    }
#endif

    if (Vector.IsHardwareAccelerated)
    {
        ForwardTransform_Vector4(ref block);
        return;
    }

    ForwardTransform_Scalar(ref block);
}
// Runs a 1D floating point FDCT over the eight rows of the block with AVX,
// overwriting the rows in place.
static void FDCT8x8_1D_Avx(ref Block8x8F block)
{
    // Snapshot every row first, so the writes below cannot affect later reads.
    Vector256<float> r0 = block.V0;
    Vector256<float> r1 = block.V1;
    Vector256<float> r2 = block.V2;
    Vector256<float> r3 = block.V3;
    Vector256<float> r4 = block.V4;
    Vector256<float> r5 = block.V5;
    Vector256<float> r6 = block.V6;
    Vector256<float> r7 = block.V7;

    // Pair row k with row 7-k.
    Vector256<float> add07 = Avx.Add(r0, r7);
    Vector256<float> sub07 = Avx.Subtract(r0, r7);
    Vector256<float> add16 = Avx.Add(r1, r6);
    Vector256<float> sub16 = Avx.Subtract(r1, r6);
    Vector256<float> add25 = Avx.Add(r2, r5);
    Vector256<float> sub25 = Avx.Subtract(r2, r5);
    Vector256<float> add34 = Avx.Add(r3, r4);
    Vector256<float> sub34 = Avx.Subtract(r3, r4);

    // Even part
    Vector256<float> e10 = Avx.Add(add07, add34);
    Vector256<float> e13 = Avx.Subtract(add07, add34);
    Vector256<float> e11 = Avx.Add(add16, add25);
    Vector256<float> e12 = Avx.Subtract(add16, add25);

    block.V0 = Avx.Add(e10, e11);
    block.V4 = Avx.Subtract(e10, e11);

    Vector256<float> evenScaled = Avx.Multiply(Avx.Add(e12, e13), mm256_F_0_7071);
    block.V2 = Avx.Add(e13, evenScaled);
    block.V6 = Avx.Subtract(e13, evenScaled);

    // Odd part
    Vector256<float> o10 = Avx.Add(sub34, sub25);
    Vector256<float> o11 = Avx.Add(sub25, sub16);
    Vector256<float> o12 = Avx.Add(sub16, sub07);

    Vector256<float> common = Avx.Multiply(Avx.Subtract(o10, o12), mm256_F_0_3826);
    Vector256<float> partA = SimdUtils.HwIntrinsics.MultiplyAdd(common, mm256_F_0_5411, o10);
    Vector256<float> partB = SimdUtils.HwIntrinsics.MultiplyAdd(common, mm256_F_1_3065, o12);
    Vector256<float> partC = Avx.Multiply(o11, mm256_F_0_7071);

    Vector256<float> upper = Avx.Add(sub07, partC);
    Vector256<float> lower = Avx.Subtract(sub07, partC);

    block.V5 = Avx.Add(lower, partA);
    block.V3 = Avx.Subtract(lower, partA);
    block.V1 = Avx.Add(upper, partB);
    block.V7 = Avx.Subtract(upper, partB);
}
/// <summary>
/// Computes the floating point FDCT of <paramref name="src"/> and stores it in <paramref name="dest"/>.
/// </summary>
/// <param name="src">Source</param>
/// <param name="dest">Destination</param>
/// <param name="temp">Scratch block supplied by the caller; its contents are overwritten.</param>
/// <param name="offsetSourceByNeg128">If true, -128.0 is added to every value before the FDCT runs.</param>
public static void TransformFDCT(
    ref Block8x8F src,
    ref Block8x8F dest,
    ref Block8x8F temp,
    bool offsetSourceByNeg128 = true)
{
    const float SourceOffset = -128F;

    // First pass works on the transposed (and optionally shifted) source.
    src.TransposeInto(ref temp);
    if (offsetSourceByNeg128)
    {
        temp.AddInPlace(SourceOffset);
    }

    FDCT8x8(ref temp, ref dest);

    // Second pass works on the transposed intermediate result.
    dest.TransposeInto(ref temp);
    FDCT8x8(ref temp, ref dest);

    dest.MultiplyInPlace(C_0_125);
}
/// <summary>
/// Computes the floating point FDCT of <paramref name="src"/> and stores it in <paramref name="dest"/>,
/// using a temporary block 'temp' provided by the caller (optimization).
/// </summary>
/// <param name="src">Source</param>
/// <param name="dest">Destination</param>
/// <param name="temp">Temporary block provided by the caller; its contents are overwritten.</param>
/// <param name="offsetSourceByNeg128">If true, -128.0 is added to every value before the FDCT runs.</param>
public static void TransformFDCT(
    ref Block8x8F src,
    ref Block8x8F dest,
    ref Block8x8F temp,
    bool offsetSourceByNeg128 = true)
{
    // First pass works on the transposed (and optionally shifted) source.
    src.TransposeInto(ref temp);
    if (offsetSourceByNeg128)
    {
        Vector4 sourceOffset = new Vector4(-128);
        temp.AddToAllInplace(sourceOffset);
    }

    FDCT8x4_LeftPart(ref temp, ref dest);
    FDCT8x4_RightPart(ref temp, ref dest);

    // Second pass works on the transposed intermediate result.
    dest.TransposeInto(ref temp);
    FDCT8x4_LeftPart(ref temp, ref dest);
    FDCT8x4_RightPart(ref temp, ref dest);

    dest.MultiplyInplace(C_0_125);
}
/// <summary>
/// Writes the transpose of this block into <paramref name="d"/>, one scalar element at a time.
/// </summary>
/// <remarks>
/// Judging by the assignments, row n is held in <c>VnL</c> (components X, Y, Z, W = columns 0..3) followed by
/// <c>VnR</c> (columns 4..7): element (row, column) of this block lands at (column, row) of the destination.
/// The copies run in a fixed order without an intermediate buffer, so passing this same block as
/// <paramref name="d"/> would overwrite elements before they are read.
/// </remarks>
/// <param name="d">Destination block; all 64 elements are overwritten.</param>
public void TransposeInto(ref Block8x8F d) { d.V0L.X = V0L.X; d.V1L.X = V0L.Y; d.V2L.X = V0L.Z; d.V3L.X = V0L.W; d.V4L.X = V0R.X; d.V5L.X = V0R.Y; d.V6L.X = V0R.Z; d.V7L.X = V0R.W; d.V0L.Y = V1L.X; d.V1L.Y = V1L.Y; d.V2L.Y = V1L.Z; d.V3L.Y = V1L.W; d.V4L.Y = V1R.X; d.V5L.Y = V1R.Y; d.V6L.Y = V1R.Z; d.V7L.Y = V1R.W; d.V0L.Z = V2L.X; d.V1L.Z = V2L.Y; d.V2L.Z = V2L.Z; d.V3L.Z = V2L.W; d.V4L.Z = V2R.X; d.V5L.Z = V2R.Y; d.V6L.Z = V2R.Z; d.V7L.Z = V2R.W; d.V0L.W = V3L.X; d.V1L.W = V3L.Y; d.V2L.W = V3L.Z; d.V3L.W = V3L.W; d.V4L.W = V3R.X; d.V5L.W = V3R.Y; d.V6L.W = V3R.Z; d.V7L.W = V3R.W; d.V0R.X = V4L.X; d.V1R.X = V4L.Y; d.V2R.X = V4L.Z; d.V3R.X = V4L.W; d.V4R.X = V4R.X; d.V5R.X = V4R.Y; d.V6R.X = V4R.Z; d.V7R.X = V4R.W; d.V0R.Y = V5L.X; d.V1R.Y = V5L.Y; d.V2R.Y = V5L.Z; d.V3R.Y = V5L.W; d.V4R.Y = V5R.X; d.V5R.Y = V5R.Y; d.V6R.Y = V5R.Z; d.V7R.Y = V5R.W; d.V0R.Z = V6L.X; d.V1R.Z = V6L.Y; d.V2R.Z = V6L.Z; d.V3R.Z = V6L.W; d.V4R.Z = V6R.X; d.V5R.Z = V6R.Y; d.V6R.Z = V6R.Z; d.V7R.Z = V6R.W; d.V0R.W = V7L.X; d.V1R.W = V7L.Y; d.V2R.W = V7L.Z; d.V3R.W = V7L.W; d.V4R.W = V7R.X; d.V5R.W = V7R.Y; d.V6R.W = V7R.Z; d.V7R.W = V7R.W; }
/// <summary>
/// Estimates a quality value for a chrominance quantization table by handing it to
/// <c>EstimateQuality</c> together with <c>UnscaledQuant_Chrominance</c>.
/// </summary>
/// <param name="chrominanceTable">The chrominance table to evaluate.</param>
/// <returns>The value produced by <c>EstimateQuality</c>.</returns>
public static int EstimateChrominanceQuality(ref Block8x8F chrominanceTable)
{
    return EstimateQuality(ref chrominanceTable, UnscaledQuant_Chrominance);
}
/// <summary>
/// Estimates a quality value for a luminance quantization table by handing it to
/// <c>EstimateQuality</c> together with <c>UnscaledQuant_Luminance</c>.
/// </summary>
/// <param name="luminanceTable">The luminance table to evaluate.</param>
/// <returns>The value produced by <c>EstimateQuality</c>.</returns>
public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable)
{
    return EstimateQuality(ref luminanceTable, UnscaledQuant_Luminance);
}
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); ref Vector256 <float> aBase = ref a.V0;
/// <summary>
/// Estimates a quality value for a chrominance quantization table by handing it to
/// <c>EstimateQuality</c> together with <c>ChrominanceTable</c>.
/// </summary>
/// <param name="chrominanceTable">The chrominance table to evaluate.</param>
/// <returns>The value produced by <c>EstimateQuality</c>.</returns>
public static int EstimateChrominanceQuality(ref Block8x8F chrominanceTable)
{
    return EstimateQuality(ref chrominanceTable, ChrominanceTable);
}
/// <summary>
/// Estimates a quality value for a luminance quantization table by handing it to
/// <c>EstimateQuality</c> together with <c>LuminanceTable</c>.
/// </summary>
/// <param name="luminanceTable">The luminance table to evaluate.</param>
/// <returns>The value produced by <c>EstimateQuality</c>.</returns>
public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable)
{
    return EstimateQuality(ref luminanceTable, LuminanceTable);
}