/// <summary>
/// Componentwise multiply with alternating subtract/add of c:
/// returns (a.x * b.x - c.x, a.y * b.y + c.y) — c is subtracted in the even lane
/// and added in the odd lane (see the scalar fallback).
/// </summary>
public static float2 msubadd(float2 a, float2 b, float2 c)
{
    if (Fma.IsFmaSupported)
    {
        // vfmaddsub: even lanes compute a*b - c, odd lanes a*b + c.
        // Upper two lanes of the 128-bit register carry garbage but are discarded by the cast back to float2.
        v128 temp = Fma.fmaddsub_ps(*(v128 *)&a, *(v128 *)&b, *(v128 *)&c);
        return(*(float2 *)&temp);
    }
    else if (Sse.IsSseSupported)
    {
        // Flip only the sign bit of c.x, then one fused mad yields both lanes at once.
        v128 negate = Sse.xor_ps(*(v128 *)&c, new v128(1 << 31, 0, 0, 0));
        return(math.mad(a, b, *(float2 *)&negate));
    }
    else
    {
        return(new float2(a.x * b.x - c.x, a.y * b.y + c.y));
    }
}
/// <summary>
/// Componentwise multiply with alternating subtract/add of c:
/// returns (a.x*b.x - c.x, a.y*b.y + c.y, a.z*b.z - c.z, a.w*b.w + c.w) —
/// c is subtracted in even lanes and added in odd lanes (see the scalar fallback).
/// </summary>
public static float4 msubadd(float4 a, float4 b, float4 c)
{
    if (Fma.IsFmaSupported)
    {
        // vfmaddsub handles all four lanes in one instruction: even lanes a*b - c, odd lanes a*b + c.
        v128 temp = Fma.fmaddsub_ps(*(v128 *)&a, *(v128 *)&b, *(v128 *)&c);
        return(*(float4 *)&temp);
    }
    else if (Sse.IsSseSupported)
    {
        // Flip the sign bits of c.x and c.z (the even lanes), then one mad covers all lanes.
        v128 negate = Sse.xor_ps(*(v128 *)&c, new v128(1 << 31, 0, 1 << 31, 0));
        return(math.mad(a, b, *(float4 *)&negate));
    }
    else
    {
        return(new float4(a.x * b.x - c.x, a.y * b.y + c.y, a.z * b.z - c.z, a.w * b.w + c.w));
    }
}
/// <summary>
/// For each element of <paramref name="dst"/>, computes dst[i] = dst[i] * s + s,
/// where s is the first element of <paramref name="scalar"/>.
/// </summary>
/// <remarks>
/// Fixes two defects in the original:
/// 1. Sse.LoadScalarVector128 loads s into lane 0 and ZEROES lanes 1..3, so the
///    MultiplyAdd computed dst*0+0 = 0 for three of every four elements, destroying data.
///    The scalar is now broadcast to all lanes.
/// 2. The loop compared against pDstEnd while stepping by 4, reading/writing up to three
///    floats past the end of dst when dst.Length was not a multiple of 4. A scalar tail
///    now handles the remainder.
/// </remarks>
private unsafe void AddMulScalarU(Span <float> scalar, Span <float> dst)
{
    fixed(float *pdst = dst)
    fixed(float *psrc = scalar)
    {
        var pDstEnd = pdst + dst.Length;
        var pDstCurrent = pdst;
        // Broadcast s to all four lanes instead of loading it into lane 0 only.
        var scalarVector128 = Vector128.Create(*psrc);
        // Only enter the vector loop while a full 4-float vector still fits.
        while (pDstCurrent + 4 <= pDstEnd)
        {
            var dstVector = Sse.LoadVector128(pDstCurrent);
            dstVector = Fma.MultiplyAdd(dstVector, scalarVector128, scalarVector128);
            Sse.Store(pDstCurrent, dstVector);
            pDstCurrent += 4;
        }
        // Scalar tail for lengths that are not a multiple of 4.
        while (pDstCurrent < pDstEnd)
        {
            *pDstCurrent = *pDstCurrent * *psrc + *psrc;
            pDstCurrent++;
        }
    }
}
/// <summary>
/// Scenario: validates Fma.MultiplySubtractNegated when its three Vector256 operands are
/// loaded via Avx.LoadVector256 from pinned fields of a locally-created test instance.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));
    var test = new SimpleTernaryOpTest__MultiplySubtractNegatedSingle();
    // Pin all three fields so their addresses stay stable for the vector loads below.
    fixed(Vector256 <Single> *pFld1 = &test._fld1)
    fixed(Vector256 <Single> *pFld2 = &test._fld2)
    fixed(Vector256 <Single> *pFld3 = &test._fld3)
    {
        var result = Fma.MultiplySubtractNegated(
            Avx.LoadVector256((Single *)(pFld1)),
            Avx.LoadVector256((Single *)(pFld2)),
            Avx.LoadVector256((Single *)(pFld3))
        );
        // Write the intrinsic's output, then compare against the managed reference implementation.
        Unsafe.Write(_dataTable.outArrayPtr, result);
        ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Componentwise divide with alternating subtract/add of c:
/// returns (a.x/b.x - c.x, a.y/b.y + c.y, a.z/b.z - c.z).
/// NOTE: the SIMD paths approximate the division with a hardware reciprocal estimate
/// (rcp_ps / math.rcp), so their results are slightly less precise than the scalar
/// fallback, which divides exactly.
/// </summary>
public static float3 dsubadd(float3 a, float3 b, float3 c)
{
    if (Fma.IsFmaSupported)
    {
        // a * rcp(b) with vfmaddsub: even lanes subtract c, odd lanes add c.
        // The unused fourth lane is discarded by the cast back to float3.
        v128 temp = Fma.fmaddsub_ps(*(v128 *)&a, Sse.rcp_ps(*(v128 *)&b), *(v128 *)&c);
        return(*(float3 *)&temp);
    }
    else if (Sse.IsSseSupported)
    {
        b = math.rcp(b);
        // Flip the sign bits of c.x and c.z so a single mad realizes the -, +, - pattern.
        v128 negate = Sse.xor_ps(*(v128 *)&c, new v128(1 << 31, 0, 1 << 31, 0));
        return(math.mad(a, b, *(float3 *)&negate));
    }
    else
    {
        return(new float3(a.x / b.x - c.x, a.y / b.y + c.y, a.z / b.z - c.z));
    }
}
/// <summary>
/// Componentwise divide with alternating add/subtract of c:
/// returns (a.x/b.x + c.x, a.y/b.y - c.y, a.z/b.z + c.z, a.w/b.w - c.w).
/// NOTE: the SIMD paths approximate the division with a hardware reciprocal estimate
/// (rcp_ps / math.rcp), so their results are slightly less precise than the scalar
/// fallback, which divides exactly.
/// </summary>
public static float4 dadsub(float4 a, float4 b, float4 c)
{
    if (Fma.IsFmaSupported)
    {
        // a * rcp(b) with vfmsubadd: even lanes add c, odd lanes subtract c.
        v128 temp = Fma.fmsubadd_ps(*(v128 *)&a, Sse.rcp_ps(*(v128 *)&b), *(v128 *)&c);
        return(*(float4 *)&temp);
    }
    else if (Sse.IsSseSupported)
    {
        b = math.rcp(b);
        // Flip the sign bits of c.y and c.w (the odd lanes) so one mad realizes +, -, +, -.
        v128 negate = Sse.xor_ps(*(v128 *)&c, new v128(0, 1 << 31, 0, 1 << 31));
        return(math.mad(a, b, *(float4 *)&negate));
    }
    else
    {
        return(new float4(a.x / b.x + c.x, a.y / b.y - c.y, a.z / b.z + c.z, a.w / b.w - c.w));
    }
}
/// <summary>
/// Scenario: validates Fma.MultiplySubtractAdd on Vector128&lt;Double&gt; when its three
/// operands are loaded via Sse2.LoadVector128 from pinned fields of a locally-created
/// test instance.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));
    var test = new AlternatingTernaryOpTest__MultiplySubtractAddDouble();
    // Pin all three fields so their addresses stay stable for the vector loads below.
    fixed(Vector128 <Double> *pFld1 = &test._fld1)
    fixed(Vector128 <Double> *pFld2 = &test._fld2)
    fixed(Vector128 <Double> *pFld3 = &test._fld3)
    {
        var result = Fma.MultiplySubtractAdd(
            Sse2.LoadVector128((Double *)(pFld1)),
            Sse2.LoadVector128((Double *)(pFld2)),
            Sse2.LoadVector128((Double *)(pFld3))
        );
        // Write the intrinsic's output, then compare against the managed reference implementation.
        Unsafe.Write(_dataTable.outArrayPtr, result);
        ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Componentwise e^x for eight floats — an AVX2/FMA port of the classic avx_mathfun
/// exp256_ps (Cephes-style). The constants (exp_hi, exp_lo, cLOG2EF, cexp_C1/C2,
/// cexp_p0..p5) are defined elsewhere in this type; their names suggest the standard
/// Cephes values — confirm at their definitions.
/// </summary>
//[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static __m256 exp256_ps(__m256 V)
{
    __m256 x = V;
    __m256 tmp = __m256.Zero;
    __m256 one = SET(1.0f);
    // Clamp the input range to avoid overflow/underflow in the exponent reconstruction.
    x = Avx2.Min(x, exp_hi);
    x = Avx2.Max(x, exp_lo);
    // fx = round-to-nearest of x * cLOG2EF, built as floor(x * cLOG2EF + 0.5) ...
    __m256 fx = Avx2.Multiply(x, cLOG2EF);
    fx = Avx2.Add(fx, SET(0.5f));
    tmp = Avx2.Floor(fx);
    // ... with a correction subtracting 1 where the floored value overshot fx.
    var mask = Avx2.Compare(tmp, fx, FloatComparisonMode.OrderedGreaterThanSignaling);
    mask = Avx2.And(mask, one);
    fx = Avx2.Subtract(tmp, mask);
    // Argument reduction: x -= fx * C1 and x -= fx * C2 (the constant is split in two
    // pieces so the subtraction loses less precision).
    tmp = Avx2.Multiply(fx, cexp_C1);
    __m256 z = Avx2.Multiply(fx, cexp_C2);
    x = Avx2.Subtract(x, tmp);
    x = Avx2.Subtract(x, z);
    z = Avx2.Multiply(x, x);
    // Polynomial approximation on the reduced interval, evaluated in Horner form via FMA.
    __m256 y = cexp_p0;
    y = Fma.MultiplyAdd(y, x, cexp_p1);
    y = Fma.MultiplyAdd(y, x, cexp_p2);
    y = Fma.MultiplyAdd(y, x, cexp_p3);
    y = Fma.MultiplyAdd(y, x, cexp_p4);
    y = Fma.MultiplyAdd(y, x, cexp_p5);
    y = Fma.MultiplyAdd(y, z, x);
    y = Avx2.Add(y, one);
    // Reconstruct 2^fx by writing (fx + 127) into the float exponent bit field directly.
    var imm0 = Avx2.ConvertToVector256Int32(fx);
    var F7 = Vector256.Create((int)0x7f);
    imm0 = Avx2.Add(imm0, F7);
    imm0 = Avx2.ShiftLeftLogical(imm0, 23);
    __m256 pow2n = Vector256.AsSingle(imm0);
    // Final result: polynomial * 2^fx.
    y = Avx2.Multiply(y, pow2n);
    return(y);
}
/// <summary>
/// AVX path of elementwise ushort division, computed in single precision via the
/// hardware reciprocal estimate plus one compensation multiply. All divisor lanes
/// must be nonzero (asserted below). Returns the quotients as float8.
/// </summary>
private static float8 vdiv_ushort_AVX(ushort8 dividend, ushort8 divisor)
{
    Assert.AreNotEqual(divisor.x0, 0);
    Assert.AreNotEqual(divisor.x1, 0);
    Assert.AreNotEqual(divisor.x2, 0);
    Assert.AreNotEqual(divisor.x3, 0);
    Assert.AreNotEqual(divisor.x4, 0);
    Assert.AreNotEqual(divisor.x5, 0);
    Assert.AreNotEqual(divisor.x6, 0);
    Assert.AreNotEqual(divisor.x7, 0);
    if (Avx.IsAvxSupported)
    {
        float8 dividend_f32 = dividend;
        float8 divisor_f32 = divisor;
        // Fast but imprecise reciprocal estimate of the divisors.
        float8 divisor_f32_rcp = Avx.mm256_rcp_ps(divisor_f32);
        float8 precisionLossCompensation;
        if (Fma.IsFmaSupported)
        {
            // ADJ - rcp(b) * b: corrects the error of the reciprocal estimate.
            // PRECISION_ADJUSTMENT_FACTOR is a raw bit pattern reinterpreted as float
            // (see math.asfloat in the fallback) — presumably ≈ 2.0f for a Newton-style
            // step; confirm at its definition.
            precisionLossCompensation = Fma.mm256_fnmadd_ps(divisor_f32_rcp, divisor_f32, new v256(PRECISION_ADJUSTMENT_FACTOR));
        }
        else
        {
            precisionLossCompensation = maxmath.mad(-divisor_f32_rcp, divisor_f32, math.asfloat(PRECISION_ADJUSTMENT_FACTOR));
        }
        // dividend * rcp(divisor) * compensation ≈ dividend / divisor.
        precisionLossCompensation *= divisor_f32_rcp;
        precisionLossCompensation *= dividend_f32;
        return(precisionLossCompensation);
    }
    else
    {
        throw new CPUFeatureCheckException();
    }
}
// Modified from the version above.
// Sums the squares of all bytes, partitioning the array so the per-partition
// Vector256<float> accumulator never exceeds exact float integer range.
// Intrinsics FMA MultiplyAdd, float, multi-threaded.
private unsafe long Test23_Intrinsics_FMA_MultiplyAdd_float_MT_Kai(byte[] vs)
{
    long total = 0;
    int simdLength = Vector256 <int> .Count;
    // Maximum element count the Vector256<float> accumulator can absorb exactly = 2064;
    // use that as the per-partition range size:
    // float has a 24-bit mantissa (16777215); 16777215 * 8 lanes / (255 * 255) = 2064.0941
    int rangeSize = ((1 << 24) - 1) * Vector256 <float> .Count / (byte.MaxValue * byte.MaxValue);
    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        long subtotal = 0;
        // Last index reachable by whole SIMD steps within this partition.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
        Vector256 <float> vTotal = Vector256.Create(0f); // per-partition accumulator
        fixed(byte *p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Widen 8 bytes to int32, convert to float, then vTotal += f * f.
                Vector256 <int> v = Avx2.ConvertToVector256Int32(p + i);
                Vector256 <float> f = Avx.ConvertToVector256Single(v);
                vTotal = Fma.MultiplyAdd(f, f, vTotal); //float
            }
        }
        // Horizontal fold of the 8 accumulator lanes into the scalar subtotal.
        float *pp = stackalloc float[Vector256 <float> .Count];
        Avx.Store(pp, vTotal);
        for (int i = 0; i < Vector256 <float> .Count; i++)
        {
            subtotal += (long)pp[i];
        }
        // Scalar tail for this partition's remainder.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }
        System.Threading.Interlocked.Add(ref total, subtotal);
    });
    return(total);
}
/// <summary>
/// Stress loop: repeatedly applies Fma.MultiplyAdd to a 256-bit float vector until the
/// token is cancelled. Returns the number of completed outer iterations, or 0 when the
/// required instruction sets are unavailable.
/// </summary>
public override ulong Run(CancellationToken cancellationToken)
{
    // The kernel below needs BOTH AVX (vector load/store) and FMA (MultiplyAdd).
    // The original guard `!Fma.IsSupported && Avx.IsSupported` only bailed out when FMA
    // was missing but AVX was present; a CPU with neither feature fell through into the
    // intrinsics and crashed. Bail out if either one is missing.
    if (!Fma.IsSupported || !Avx.IsSupported)
    {
        return(0uL);
    }
    var randomFloatingSpan = new Span <float>(new[] { RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT });
    var dst = new Span <float>(Enumerable.Repeat(1f, 8).ToArray());
    var iterations = 0uL;
    unsafe
    {
        fixed(float *pdst = dst)
        fixed(float *psrc = randomFloatingSpan)
        {
            var srcVector = Avx.LoadVector256(psrc);
            var dstVector = Avx.LoadVector256(pdst);
            while (!cancellationToken.IsCancellationRequested)
            {
                // Tight dependent FMA chain; LENGTH iterations per outer step.
                for (var j = 0; j < LENGTH; j++)
                {
                    dstVector = Fma.MultiplyAdd(dstVector, srcVector, srcVector);
                }
                Avx.Store(pdst, dstVector);
                iterations++;
            }
        }
    }
    return(iterations);
}
/// <summary>
/// Elementwise ushort division computed in single precision via the hardware reciprocal
/// estimate plus one compensation multiply. All divisor lanes must be nonzero (asserted).
/// </summary>
internal static ushort4 vdiv_ushort(ushort4 dividend, ushort4 divisor)
{
    Assert.AreNotEqual(divisor.x, 0);
    Assert.AreNotEqual(divisor.y, 0);
    Assert.AreNotEqual(divisor.z, 0);
    Assert.AreNotEqual(divisor.w, 0);
    if (Sse2.IsSse2Supported)
    {
        float4 dividend_f32 = dividend;
        float4 divisor_f32 = divisor;
        // Fast but imprecise reciprocal estimate of the divisors.
        v128 divisor_f32_rcp = Sse.rcp_ps(*(v128 *)&divisor_f32);
        v128 precisionLossCompensation;
        if (Fma.IsFmaSupported)
        {
            // ADJ - rcp(b) * b: corrects the error of the reciprocal estimate.
            precisionLossCompensation = Fma.fnmadd_ps(divisor_f32_rcp, *(v128 *)&divisor_f32, new v128(PRECISION_ADJUSTMENT_FACTOR));
        }
        else
        {
            float4 temp = math.mad(*(float4 *)&divisor_f32_rcp, -divisor_f32, math.asfloat(PRECISION_ADJUSTMENT_FACTOR));
            precisionLossCompensation = *(v128 *)&temp;
        }
        precisionLossCompensation = Sse.mul_ps(precisionLossCompensation, divisor_f32_rcp);
        // FIX: this operand had been corrupted to `*(v128 *)÷nd_f32` — the "&divide"
        // prefix of "&dividend_f32" was mangled into the '÷' character, which does not
        // compile. Restored to take the address of dividend_f32.
        precisionLossCompensation = Sse.mul_ps(precisionLossCompensation, *(v128 *)&dividend_f32);
        return((ushort4)(*(float4 *)&precisionLossCompensation));
    }
    else
    {
        throw new CPUFeatureCheckException();
    }
}
/// <summary>
/// Per-lane linear interpolation a + (b - a) * t, following the DirectX Math Library's
/// XMVectorLerp (https://github.com/microsoft/DirectXMath/blob/master/Inc/DirectXMathVector.inl).
/// </summary>
public static Vector128 <float> Lerp(Vector128 <float> a, Vector128 <float> b, Vector128 <float> t)
{
    // Prefer a fused multiply-add where the hardware offers one.
    if (AdvSimd.IsSupported)
    {
        Vector128 <float> delta = AdvSimd.Subtract(b, a);
        return AdvSimd.FusedMultiplyAdd(a, delta, t);
    }

    if (Fma.IsSupported)
    {
        Vector128 <float> delta = Sse.Subtract(b, a);
        return Fma.MultiplyAdd(delta, t, a);
    }

    if (Sse.IsSupported)
    {
        // Weighted-sum form: a * (1 - t) + b * t.
        Vector128 <float> oneMinusT = Sse.Subtract(Vector128.Create(1.0f), t);
        return Sse.Add(Sse.Multiply(a, oneMinusT), Sse.Multiply(b, t));
    }

    // Redundant test so we won't prejit remainder of this method on platforms without AdvSimd.
    throw new PlatformNotSupportedException();
}
// Entry point of the AOT HW-intrinsics capability test. For each ISA class, compares the
// compile-time expectation (set by the build defines below) against the runtime
// IsSupported value and, where a lambda is given, actually executes one instruction from
// that ISA. Returns 100 on success, 1 on failure.
static int Main()
{
    s_success = true;

    // We expect the AOT compiler generated HW intrinsics with the following characteristics:
    //
    // * TRUE = IsSupported assumed to be true, no runtime check
    // * NULL = IsSupported is a runtime check, code should be behind the check or bad things happen
    // * FALSE = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used
    //
    // The test is compiled with multiple defines to test this.

#if BASELINE_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 16;
    bool?Sse2AndBelow = true;
    bool?Sse3Group = null;
    bool?AesLzPcl = null;
    bool?Sse4142 = null;
    bool?PopCnt = null;
    bool?Avx12 = false;
    bool?FmaBmi12 = false;
    bool?Avxvnni = false;
#elif NON_VEX_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 16;
    bool?Sse2AndBelow = true;
    bool?Sse3Group = true;
    bool?AesLzPcl = null;
    bool?Sse4142 = true;
    bool?PopCnt = null;
    bool?Avx12 = false;
    bool?FmaBmi12 = false;
    bool?Avxvnni = false;
#elif VEX_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 32;
    bool?Sse2AndBelow = true;
    bool?Sse3Group = true;
    bool?AesLzPcl = null;
    bool?Sse4142 = true;
    bool?PopCnt = null;
    bool?Avx12 = true;
    bool?FmaBmi12 = null;
    bool?Avxvnni = null;
#else
#error Who dis?
#endif
    // Sanity checks on Vector<T> acceleration and width before the per-ISA checks.
    if (vectorsAccelerated != Vector.IsHardwareAccelerated)
    {
        throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}");
    }
    if (byteVectorLength != Vector <byte> .Count)
    {
        throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}");
    }
    // Each Check pairs the expected tri-state with the runtime IsSupported value and,
    // where non-null, a smoke-test lambda that executes an instruction from that ISA.
    Check("Sse", Sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
    Check("Sse.X64", Sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128 <float> .Zero) == 0);
    Check("Sse2", Sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128 <ushort> .Zero, 0) == 0);
    Check("Sse2.X64", Sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128 <double> .Zero) == 0);
    Check("Sse3", Sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
    Check("Sse3.X64", Sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);
    Check("Ssse3", Sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128 <short> .Zero).Equals(Vector128 <ushort> .Zero));
    Check("Ssse3.X64", Sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);
    Check("Sse41", Sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128 <int> .Zero, Vector128 <int> .Zero).Equals(Vector128 <int> .Zero));
    Check("Sse41.X64", Sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128 <long> .Zero, 0) == 0);
    Check("Sse42", Sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
    Check("Sse42.X64", Sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);
    Check("Aes", AesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128 <byte> .Zero, 0).Equals(Vector128.Create((byte)99)));
    Check("Aes.X64", AesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);
    Check("Avx", Avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256 <double> .Zero, Vector256 <double> .Zero).Equals(Vector256 <double> .Zero));
    Check("Avx.X64", Avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);
    Check("Avx2", Avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256 <int> .Zero).Equals(Vector256 <uint> .Zero));
    Check("Avx2.X64", Avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);
    Check("Bmi1", FmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
    Check("Bmi1.X64", FmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);
    Check("Bmi2", FmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
    Check("Bmi2.X64", FmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);
    Check("Fma", FmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128 <float> .Zero, Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
    Check("Fma.X64", FmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);
    Check("Lzcnt", AesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
    Check("Lzcnt.X64", AesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);
    Check("Pclmulqdq", AesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128 <long> .Zero, Vector128 <long> .Zero, 0).Equals(Vector128 <long> .Zero));
    Check("Pclmulqdq.X64", AesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);
    Check("Popcnt", PopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
    Check("Popcnt.X64", PopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);
    Check("AvxVnni", Avxvnni, &AvxVnniIsSupported, AvxVnni.IsSupported, () => AvxVnni.MultiplyWideningAndAdd(Vector128 <int> .Zero, Vector128 <byte> .Zero, Vector128 <sbyte> .Zero).Equals(Vector128 <int> .Zero));
    Check("AvxVnni.X64", Avxvnni, &AvxVnniX64IsSupported, AvxVnni.X64.IsSupported, null);
    return(s_success ? 100 : 1);
}
// Exercises Fma.MultiplyAdd with the same static Vector128<double> field (_c64) as all
// three operands; the scalar reference implementation supplies the expected value.
// The ref/by-value parameters are unused here — the signature matches its siblings.
static void TestExplicitFmaUsage3(ref Vector128 <double> a, double b)
{
    double c = _c64.ToScalar();
    double expected = ReferenceMultiplyAdd(c, c, c);
    double actual = Fma.MultiplyAdd(_c64, _c64, _c64).ToScalar();
    CompareDoubles(expected, actual);
}
// Exercises Fma.MultiplyAdd with the same Vector128<double> argument as all three
// operands; the scalar reference implementation supplies the expected value.
static void TestExplicitFmaUsage2(ref Vector128 <double> a, double b)
{
    double s = a.ToScalar();
    double expected = ReferenceMultiplyAdd(s, s, s);
    double actual = Fma.MultiplyAdd(a, a, a).ToScalar();
    CompareDoubles(expected, actual);
}
// Exercises Fma.MultiplyAdd mixing a vector argument, a scalar lifted into a vector via
// CreateScalarUnsafe, and the static field _c64; compares against the scalar reference.
static void TestExplicitFmaUsage1(ref Vector128 <double> a, double b)
{
    double expected = ReferenceMultiplyAdd(a.ToScalar(), b, _c64.ToScalar());
    Vector128 <double> bVec = Vector128.CreateScalarUnsafe(b);
    double actual = Fma.MultiplyAdd(a, bVec, _c64).ToScalar();
    CompareDoubles(expected, actual);
}
// Exercises Fma.MultiplyAdd with the same static Vector128<float> field (_c32) as all
// three operands; the scalar reference implementation supplies the expected value.
// The ref/by-value parameters are unused here — the signature matches its siblings.
static void TestExplicitFmaUsage3(ref Vector128 <float> a, float b)
{
    float c = _c32.ToScalar();
    float expected = ReferenceMultiplyAdd(c, c, c);
    float actual = Fma.MultiplyAdd(_c32, _c32, _c32).ToScalar();
    CompareFloats(expected, actual);
}
// Exercises Fma.MultiplyAdd with the same Vector128<float> argument as all three
// operands; the scalar reference implementation supplies the expected value.
static void TestExplicitFmaUsage2(ref Vector128 <float> a, float b)
{
    float s = a.ToScalar();
    float expected = ReferenceMultiplyAdd(s, s, s);
    float actual = Fma.MultiplyAdd(a, a, a).ToScalar();
    CompareFloats(expected, actual);
}
// Dot product of two Vector instances; throws when their lengths differ.
// Strategy by length: plain scalar for < 8 elements; a single AVX/FMA accumulator for
// < 64; otherwise 8 interleaved FMA accumulators that are periodically flushed into a
// double to limit single-precision round-off over long inputs.
static public float Dot(Vector v0, Vector v1)
{
    if (v0.lng != v1.lng) { throw new Exception(); }
    int lng = v0.lng;
    float *p0 = v0.ptr;
    float *p1 = v1.ptr;
    // Scratch buffer for horizontal sums of 256-bit accumulators.
    float *tmp = stackalloc float[8];
    if (lng < 8)
    {
        // Too short for one 256-bit vector: scalar loop.
        float sum = 0;
        for (int i = 0; i < lng; i++) { sum += p0[i] * p1[i]; }
        return(sum);
    }
    if (lng < 64)
    {
        // One FMA accumulator plus a scalar tail for the last lng % 8 elements.
        var sum0 = Vector256 <float> .Zero;
        for (int i = 0; i <= lng - 8; i += 8)
        {
            sum0 = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum0);
        }
        Avx.Store(tmp, sum0);
        float sum = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
        for (int i = lng / 8 * 8; i < lng; i++) { sum += p0[i] * p1[i]; }
        return(sum);
    }
    else
    {
        // Eight independent accumulators hide FMA latency; 64 elements per iteration.
        var sum = Vector256 <float> .Zero;
        var sum0 = Vector256 <float> .Zero;
        var sum1 = Vector256 <float> .Zero;
        var sum2 = Vector256 <float> .Zero;
        var sum3 = Vector256 <float> .Zero;
        var sum4 = Vector256 <float> .Zero;
        var sum5 = Vector256 <float> .Zero;
        var sum6 = Vector256 <float> .Zero;
        var sum7 = Vector256 <float> .Zero;
        float *pp1 = p0;
        float *pp2 = p1;
        double dsum = 0;
        for (int i = 0; i < lng / 64; i++)
        {
            sum0 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 00), Avx.LoadVector256(pp2 + 00), sum0);
            sum1 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 08), Avx.LoadVector256(pp2 + 08), sum1);
            sum2 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 16), Avx.LoadVector256(pp2 + 16), sum2);
            sum3 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 24), Avx.LoadVector256(pp2 + 24), sum3);
            sum4 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 32), Avx.LoadVector256(pp2 + 32), sum4);
            sum5 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 40), Avx.LoadVector256(pp2 + 40), sum5);
            sum6 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 48), Avx.LoadVector256(pp2 + 48), sum6);
            sum7 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 56), Avx.LoadVector256(pp2 + 56), sum7);
            pp1 += 64;
            pp2 += 64;
            // Periodically flush the float accumulators into a double to improve precision.
            if (i % 1024 == 1023)
            {
                sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));
                Avx.Store(tmp, sum);
                dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
                sum0 = Vector256 <float> .Zero;
                sum1 = Vector256 <float> .Zero;
                sum2 = Vector256 <float> .Zero;
                sum3 = Vector256 <float> .Zero;
                sum4 = Vector256 <float> .Zero;
                sum5 = Vector256 <float> .Zero;
                sum6 = Vector256 <float> .Zero;
                sum7 = Vector256 <float> .Zero;
            }
        }
        // Combine the eight accumulators, then mop up remaining whole 8-element chunks.
        sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));
        for (int i = lng / 64 * 64; i <= lng - 8; i += 8)
        {
            sum = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum);
        }
        Avx.Store(tmp, sum);
        dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
        // Scalar tail for the final lng % 8 elements, accumulated in double.
        for (int i = lng / 8 * 8; i < lng; i++) { dsum += p0[i] * p1[i]; }
        return((float)dsum);
    }
}
// Integrates velocity into position (SIMD FMA path when AVX+FMA are available) and
// angular velocity into rotation, but only for entities whose position/rotation have not
// been changed externally since the previous step (compared against RigidBody.lastPosition/
// lastRotation).
public override void ProcessBlock(float deltaTime, BlockAccessor block)
{
    var rbs = block.GetComponentData <RigidBody>();
    var pos = block.GetComponentData <Position>();
    var rot = block.GetComponentData <Rotation>();
    var vel = block.GetReadOnlyComponentData <Velocity>();
    var avel = block.GetReadOnlyComponentData <AngularVelocity>();
    if (Fma.IsSupported && Avx.IsSupported)
    {
        unsafe
        {
            Vector128 <float> deltaF = Vector128.Create(deltaTime);
            // NOTE(review): oldPosFloats and posFloats are both pinned from the SAME
            // pos.Cast<Position, float>() source, so the CompareEqual below always sees
            // equal values and MaskStore stores every lane. Presumably one of these was
            // meant to come from a snapshot (e.g. lastPosition) — verify intent.
            fixed(float *oldPosFloats = pos.Cast <Position, float>())
            fixed(float *posFloats = pos.Cast <Position, float>())
            fixed(float *velFloats = vel.Cast <Velocity, float>())
            {
                int i = 0;
                // NOTE(review): steps by 4 while i < block.length, so the final iteration
                // can read/write past block.length when it isn't a multiple of 4 —
                // confirm the component buffers are padded.
                for (; i < block.length; i += 4)
                {
                    var op = Sse.LoadAlignedVector128(&oldPosFloats[i]);
                    var p = Sse.LoadAlignedVector128(&posFloats[i]);
                    var v = Sse.LoadAlignedVector128(&velFloats[i]);
                    // pos = deltaTime * vel + pos, stored only in lanes where old == new.
                    var result = Fma.MultiplyAdd(deltaF, v, p);
                    var bools = Sse.CompareEqual(op, p);
                    Avx.MaskStore(&posFloats[i], bools, result);
                }
                // Scalar pass re-covering the last partial group.
                for (i -= 4; i < block.length; i++)
                {
                    if (oldPosFloats[i] == posFloats[i])
                    {
                        posFloats[i] = posFloats[i] + velFloats[i] * deltaTime;
                    }
                }
            }
        }
        // Rotation integration (scalar): compose per-axis incremental rotations, then
        // remember the values we wrote so external changes can be detected next step.
        for (int i = 0; i < block.length; i++)
        {
            if (pos[i].value == rbs[i].lastPosition && rot[i].value == rbs[i].lastRotation)
            {
                quat x = quat.FromAxisAngle(avel[i].value.x * deltaTime, vec3.UnitX);
                quat y = quat.FromAxisAngle(avel[i].value.y * deltaTime, vec3.UnitY);
                quat z = quat.FromAxisAngle(avel[i].value.z * deltaTime, vec3.UnitZ);
                rot[i].value = rot[i].value * x * y * z;
                rbs[i].lastPosition = pos[i].value;
                rbs[i].lastRotation = rot[i].value;
            }
        }
    }
    else
    {
        // Scalar fallback: integrate position and rotation together.
        for (int i = 0; i < block.length; i++)
        {
            if (pos[i].value == rbs[i].lastPosition && rot[i].value == rbs[i].lastRotation)
            {
                pos[i].value += vel[i].value * deltaTime;
                quat x = quat.FromAxisAngle(avel[i].value.x * deltaTime, vec3.UnitX);
                quat y = quat.FromAxisAngle(avel[i].value.y * deltaTime, vec3.UnitY);
                quat z = quat.FromAxisAngle(avel[i].value.z * deltaTime, vec3.UnitZ);
                rot[i].value = rot[i].value * x * y * z;
                rbs[i].lastPosition = pos[i].value;
                rbs[i].lastRotation = rot[i].value;
            }
        }
    }
}
/// <summary>
/// Fast exponential approximation for eight floats: one fused multiply-add
/// (EXP_C2 * V + EXP_C1) feeds a float->int truncation whose bit pattern is then
/// reinterpreted as floats, building the result directly in the exponent field.
/// </summary>
public static __m256 fast_exp256_ps(__m256 V)
{
    var scaled = Fma.MultiplyAdd(EXP_C2, V, EXP_C1);
    var bits = Avx2.ConvertToVector256Int32WithTruncation(scaled);
    return Vector256.AsSingle(bits);
}
/// <summary>
/// Dot product of two float vectors using AVX loads with FMA accumulation and a scalar
/// tail. When the inputs differ in length, the excess elements of the longer input are
/// ADDED (not multiplied) into the total — this preserves the original's behavior.
/// </summary>
/// <remarks>
/// Fix: the accumulator was seeded with Vector256.CreateScalarUnsafe(0f), which only
/// initializes lane 0 and leaves lanes 1..7 in an unspecified state; since all eight
/// lanes are summed below, garbage could be folded into the result. It now starts from
/// Vector256&lt;float&gt;.Zero.
/// </remarks>
public static float DotMultiplyIntrinsicWFmaWSpanPtr(ref Memory <float> vector1, ref Memory <float> vector2)
{
    var span1 = vector1.Span;
    var span2 = vector2.Span;
    var cnt = Math.Min(span1.Length, span2.Length);
    // Fully-zeroed accumulator (see remarks).
    var v3 = Vector256 <float> .Zero;
    var vectLen = Vector256 <float> .Count;
    var vectCnt = cnt / vectLen;
    var total = 0f;
#if TEST
    var file = Path.GetTempFileName();
    using var writer = new StreamWriter(file);
    Console.WriteLine($"Intrinsic with FmaWPtr Mult. results will be written into {file}");
#endif
    unsafe
    {
        int i;
        var ptr1 = (float *)Unsafe.AsPointer(ref span1[0]);
        var ptr2 = (float *)Unsafe.AsPointer(ref span2[0]);
        // Main loop: 8 elements per iteration, v3 += v1 * v2.
        for (i = 0; i < vectCnt; i++)
        {
            var v1 = Avx.LoadVector256(ptr1);
            var v2 = Avx.LoadVector256(ptr2);
            v3 = Fma.MultiplyAdd(v1, v2, v3);
            ptr1 += vectLen;
            ptr2 += vectLen;
#if TEST
            writer.WriteLine($"{v1.ToString()}\t{v2.ToString()}\t{v3.ToString()}");
#endif
        }
        // Horizontal sum of the eight accumulator lanes.
        for (i = 0; i < vectLen; i++)
        {
            total += v3.GetElement(i);
        }
        // Scalar tail for cnt % 8 remaining element pairs.
        i = vectCnt * vectLen;
        if (cnt % vectLen > 0)
        {
            ptr1 = (float *)Unsafe.AsPointer(ref span1[i]);
            ptr2 = (float *)Unsafe.AsPointer(ref span2[i]);
            for (; i < cnt; i++)
            {
                total += *ptr1++ **ptr2++;
            }
        }
    }
    // Length-mismatch quirk kept from the original: leftover elements of the longer
    // vector are added unweighted.
    if (vector1.Length != vector2.Length)
    {
        var h = vector1.Length > vector2.Length ? span1 : span2;
        for (var j = cnt; j < h.Length; j++)
        {
            total += h[j];
        }
    }
    return(total);
}
// Converts a line of cb 8-bit samples to float: op[i] = (ip[i] - offset) * scale.
// Uses the widest available SIMD path (AVX2 with optional FMA, else SSE4.1, or the
// portable Vector<T> path under VECTOR_CONVERT), then finishes with the precomputed
// lookup table `valueTable` for the scalar remainder.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
{
    fixed(float *atstart = &valueTable[0])
    {
        byte * ip = ipstart, ipe = ipstart + cb;
        float *op = (float *)opstart, at = atstart;
#if HWINTRINSICS
        if (Avx2.IsSupported)
        {
            var vscal = Vector256.Create(scale);
            // With FMA the offset is pre-multiplied by scale so one multiply-subtract
            // computes v * scale - offset * scale == (v - offset) * scale.
            var voffs = Fma.IsSupported ? Vector256.Create(offset * scale) : Vector256.Create(offset);
            // Pull ipe back so the loop condition guarantees a full 32-byte read.
            ipe -= Vector256 <byte> .Count;
            while (ip <= ipe)
            {
                // Widen 4 groups of 8 bytes to int32, then to float.
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 2);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3);
                ip += Vector256 <byte> .Count;
                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);
                if (Fma.IsSupported)
                {
                    vf0 = Fma.MultiplySubtract(vf0, vscal, voffs);
                    vf1 = Fma.MultiplySubtract(vf1, vscal, voffs);
                    vf2 = Fma.MultiplySubtract(vf2, vscal, voffs);
                    vf3 = Fma.MultiplySubtract(vf3, vscal, voffs);
                }
                else
                {
                    vf0 = Avx.Multiply(Avx.Subtract(vf0, voffs), vscal);
                    vf1 = Avx.Multiply(Avx.Subtract(vf1, voffs), vscal);
                    vf2 = Avx.Multiply(Avx.Subtract(vf2, voffs), vscal);
                    vf3 = Avx.Multiply(Avx.Subtract(vf3, voffs), vscal);
                }
                Avx.Store(op, vf0);
                Avx.Store(op + Vector256 <int> .Count, vf1);
                Avx.Store(op + Vector256 <int> .Count * 2, vf2);
                Avx.Store(op + Vector256 <int> .Count * 3, vf3);
                op += Vector256 <byte> .Count;
            }
            // Restore ipe for the scalar tail below.
            ipe += Vector256 <byte> .Count;
        }
        else if (Sse41.IsSupported)
        {
            var vscal = Vector128.Create(scale);
            var voffs = Vector128.Create(offset);
            ipe -= Vector128 <byte> .Count;
            while (ip <= ipe)
            {
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 2);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3);
                ip += Vector128 <byte> .Count;
                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);
                vf0 = Sse.Multiply(Sse.Subtract(vf0, voffs), vscal);
                vf1 = Sse.Multiply(Sse.Subtract(vf1, voffs), vscal);
                vf2 = Sse.Multiply(Sse.Subtract(vf2, voffs), vscal);
                vf3 = Sse.Multiply(Sse.Subtract(vf3, voffs), vscal);
                Sse.Store(op, vf0);
                Sse.Store(op + Vector128 <int> .Count, vf1);
                Sse.Store(op + Vector128 <int> .Count * 2, vf2);
                Sse.Store(op + Vector128 <int> .Count * 3, vf3);
                op += Vector128 <byte> .Count;
            }
            ipe += Vector128 <byte> .Count;
        }
#elif VECTOR_CONVERT
        // Portable Vector<T> path: widen bytes -> ushorts -> uints, convert, transform.
        var vscal = new VectorF(scale);
        var voffs = new VectorF(offset);
        ipe -= Vector <byte> .Count;
        while (ip <= ipe)
        {
            var vb = Unsafe.ReadUnaligned <Vector <byte> >(ip);
            Vector.Widen(vb, out var vs0, out var vs1);
            Vector.Widen(vs0, out var vi0, out var vi1);
            Vector.Widen(vs1, out var vi2, out var vi3);
            ip += Vector <byte> .Count;
            var vf0 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi0));
            var vf1 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi1));
            var vf2 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi2));
            var vf3 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi3));
            vf0 = (vf0 - voffs) * vscal;
            vf1 = (vf1 - voffs) * vscal;
            vf2 = (vf2 - voffs) * vscal;
            vf3 = (vf3 - voffs) * vscal;
            Unsafe.WriteUnaligned(op, vf0);
            Unsafe.WriteUnaligned(op + VectorF.Count, vf1);
            Unsafe.WriteUnaligned(op + VectorF.Count * 2, vf2);
            Unsafe.WriteUnaligned(op + VectorF.Count * 3, vf3);
            op += Vector <byte> .Count;
        }
        ipe += Vector <byte> .Count;
#endif
        // Scalar path: table lookups 8 at a time, then the final stragglers one by one.
        ipe -= 8;
        while (ip <= ipe)
        {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            float o3 = at[(uint)ip[3]];
            float o4 = at[(uint)ip[4]];
            float o5 = at[(uint)ip[5]];
            float o6 = at[(uint)ip[6]];
            float o7 = at[(uint)ip[7]];
            ip += 8;
            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op[3] = o3;
            op[4] = o4;
            op[5] = o5;
            op[6] = o6;
            op[7] = o7;
            op += 8;
        }
        ipe += 8;
        while (ip < ipe)
        {
            op[0] = at[(uint)ip[0]];
            ip++;
            op++;
        }
    }
}
/// <summary>
/// Lane-wise fused multiply-add (left * right + add) over a 512-bit float vector,
/// computed as two independent 256-bit FMA operations on the V1/V2 halves.
/// </summary>
public static Vector512 <float> MultiplyAdd(Vector512 <float> left, Vector512 <float> right, Vector512 <float> add)
{
    var lowerHalf = Fma.MultiplyAdd(left.V1, right.V1, add.V1);
    var upperHalf = Fma.MultiplyAdd(left.V2, right.V2, add.V2);
    return new Vector512 <float>(lowerHalf, upperHalf);
}
// Multiply-Add: lazily yields a * @this[i] + c[i] for each paired element of the two
// sequences (enumeration stops with the shorter input, as Zip does).
public static IEnumerable <Vector256 <double> > MulAdd(
    this IEnumerable <Vector256 <double> > @this,
    Vector256 <double> a,
    IEnumerable <Vector256 <double> > c)
{
    foreach (var (value, addend) in @this.Zip(c))
    {
        yield return Fma.MultiplyAdd(a, value, addend);
    }
}
/// <summary>
/// Element-wise signed 16-bit division (truncated toward zero, like C# integer division).
/// Based on https://stackoverflow.com/a/51458507/347870: each vector is split into
/// sign-extended high/low 16-bit halves widened to int32, divided in single precision via
/// a compensated reciprocal, truncated back to int32, and re-interleaved.
/// </summary>
public static Vector128 <short> Divide(this Vector128 <short> dividend, Vector128 <short> divisor)
{
    // Convert to two 32-bit integer vectors: an arithmetic right shift sign-extends the
    // high halves directly; the (shift-left, arithmetic shift-right) pair sign-extends
    // the low halves.
    Vector128 <int> a_hi_epi32 = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
    Vector128 <int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    Vector128 <int> a_lo_epi32 = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);
    Vector128 <int> b_hi_epi32 = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
    Vector128 <int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
    Vector128 <int> b_lo_epi32 = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);
    // Convert to 32-bit floats.
    Vector128 <float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
    Vector128 <float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
    Vector128 <float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
    Vector128 <float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);
    // Approximate reciprocals of the divisors.
    Vector128 <float> b_hi_rcp = Sse.Reciprocal(b_hi);
    Vector128 <float> b_lo_rcp = Sse.Reciprocal(b_lo);
    // One correction step: inv_1 = 2.0000005 - rcp * b (FMA when available).
    Vector128 <float> b_hi_inv_1;
    Vector128 <float> b_lo_inv_1;
    Vector128 <float> two = Vector128.Create(2.00000051757f);
    if (Fma.IsSupported)
    {
        b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
        b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
    }
    else
    {
        Vector128 <float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
        Vector128 <float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
        b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
        b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
    }
    // Compensate for the reciprocal-estimate loss.
    Vector128 <float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
    Vector128 <float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);
    // Perform the division by multiplication.
    Vector128 <float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
    Vector128 <float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);
    // Convert back to integers; truncation matches C# integer-division rounding.
    Vector128 <int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
    Vector128 <int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);
    // Move the high-half quotients into the upper 16 bits of each 32-bit lane.
    Vector128 <int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);
    // Blend the bits, and return.
    if (Sse41.IsSupported)
    {
        return(Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA));
    }
    else
    {
        // FIX: each lane must be masked to its low 16 bits before OR-ing, i.e.
        // _mm_set1_epi32(0xFFFF) in the reference C. The original port built the mask
        // from Vector128.Create((ushort)0xFFFF), which is all-ones across the whole
        // register, making the AND a no-op: sign-extension bits of negative low-half
        // quotients then leaked into the high-half result lanes.
        Vector128 <int> lo_epi32_mask = Sse2.And(lo_epi32, Vector128.Create(0x0000FFFF));
        return(Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16());
    }
}
// Exercises Fma.MultiplyAdd mixing a vector argument, a scalar lifted into a vector via
// CreateScalarUnsafe, and the static field _c32; compares against the scalar reference.
static void TestExplicitFmaUsage1(ref Vector128 <float> a, float b)
{
    float expected = ReferenceMultiplyAdd(a.ToScalar(), b, _c32.ToScalar());
    Vector128 <float> bVec = Vector128.CreateScalarUnsafe(b);
    float actual = Fma.MultiplyAdd(a, bVec, _c32).ToScalar();
    CompareFloats(expected, actual);
}
// Mandelbrot escape-time computation in single precision, 8 pixels per pass using AVX
// lanes (z -> z^2 + c). Processes every `increment`-th scanline starting at
// `startScanline`; `cancel` is polled between strides. Counts that reach maxIterations
// wrap to 0 via the modulo when stored.
public static unsafe void ComputeSingle(
    uint[,] iterations, int startScanline, int increment,
    double offsetX, double offsetY, double zoom, uint maxIterations, ref bool cancel)
{
    const int stride = 8;
    int height = iterations.GetLength(0);
    int width = iterations.GetLength(1);
    var maxIter = Vector256.Create((float)maxIterations);
    var limit = Vector256.Create(4.0f);
    var one = Vector256.Create(1.0f);
    var two = Vector256.Create(2.0f);
    float *results = stackalloc float[stride];
    for (int i = startScanline; i < height && !cancel; i += increment)
    {
        for (int j = 0; j < width && !cancel; j += stride)
        {
            // Map this pass's 8 pixel coordinates to points in the complex plane.
            var c0 = Impl.GetPointCoordinate(j + 0, i, width, height, offsetX, offsetY, zoom);
            var c1 = Impl.GetPointCoordinate(j + 1, i, width, height, offsetX, offsetY, zoom);
            var c2 = Impl.GetPointCoordinate(j + 2, i, width, height, offsetX, offsetY, zoom);
            var c3 = Impl.GetPointCoordinate(j + 3, i, width, height, offsetX, offsetY, zoom);
            var c4 = Impl.GetPointCoordinate(j + 4, i, width, height, offsetX, offsetY, zoom);
            var c5 = Impl.GetPointCoordinate(j + 5, i, width, height, offsetX, offsetY, zoom);
            var c6 = Impl.GetPointCoordinate(j + 6, i, width, height, offsetX, offsetY, zoom);
            var c7 = Impl.GetPointCoordinate(j + 7, i, width, height, offsetX, offsetY, zoom);
            var cr = Vector256.Create((float)c0.X, (float)c1.X, (float)c2.X, (float)c3.X, (float)c4.X, (float)c5.X, (float)c6.X, (float)c7.X);
            var ci = Vector256.Create((float)c0.Y, (float)c1.Y, (float)c2.Y, (float)c3.Y, (float)c4.Y, (float)c5.Y, (float)c6.Y, (float)c7.Y);
            var zr = cr;
            var zi = ci;
            var it = Vector256.Create(0f); // per-lane iteration counters (kept as floats)
            for (;;)
            {
                var zr2 = Avx.Multiply(zr, zr);
                var zi2 = Avx.Multiply(zi, zi);
                var squaredMagnitude = Avx.Add(zr2, zi2);
                // A lane stays live while |z|^2 <= 4 AND its counter <= maxIterations.
                var cond = Avx.And(
                    Avx.Compare(squaredMagnitude, limit, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling),
                    Avx.Compare(it, maxIter, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling));
                if (Avx.MoveMask(cond) == 0)
                {
                    // Every lane escaped or maxed out: store the 8 counters (modulo
                    // maxIterations, so maxed-out lanes record 0), guarding the right edge.
                    Avx.Store(results, it);
                    if (j + 0 < width) { iterations[i, j + 0] = (uint)results[0] % maxIterations; }
                    if (j + 1 < width) { iterations[i, j + 1] = (uint)results[1] % maxIterations; }
                    if (j + 2 < width) { iterations[i, j + 2] = (uint)results[2] % maxIterations; }
                    if (j + 3 < width) { iterations[i, j + 3] = (uint)results[3] % maxIterations; }
                    if (j + 4 < width) { iterations[i, j + 4] = (uint)results[4] % maxIterations; }
                    if (j + 5 < width) { iterations[i, j + 5] = (uint)results[5] % maxIterations; }
                    if (j + 6 < width) { iterations[i, j + 6] = (uint)results[6] % maxIterations; }
                    if (j + 7 < width) { iterations[i, j + 7] = (uint)results[7] % maxIterations; }
                    break;
                }
                // z = z^2 + c; the imaginary part uses one FMA: 2 * zr * zi + ci.
                // (zr2/zi2 were computed from the pre-update values above.)
                zi = Fma.MultiplyAdd(two, Avx.Multiply(zr, zi), ci);
                zr = Avx.Add(Avx.Subtract(zr2, zi2), cr);
                // Increment counters only in still-live lanes (cond is all-ones there).
                it = Avx.Add(it, Avx.And(one, cond));
            }
        }
    }
}
// Vertical convolution pass: for each of the ow output pixels starting at ox, accumulates
// smapy weighted values (weights at pmapy) from the working buffer tstart, preferring
// 256-bit AVX/FMA accumulation, then 128-bit SSE, and writes the horizontal sum of the
// accumulator to the destination line.
// NOTE(review): vcnt = smapy / 4 drops any smapy % 4 remainder — presumably smapy is
// always a multiple of the vector width upstream; confirm.
unsafe void IConvolver.WriteDestLine(byte *tstart, byte *ostart, int ox, int ow, byte *pmapy, int smapy)
{
    float *op = (float *)ostart;
    int xc = ox + ow, tstride = smapy;
    int vcnt = smapy / Vector128 <float> .Count; // number of 128-bit chunks per pixel
    while (ox < xc)
    {
        int lcnt = vcnt;
        float *tp = (float *)tstart + ox * tstride;
        float *mp = (float *)pmapy;
        Vector128 <float> av0;
        if (Avx.IsSupported && lcnt >= 2)
        {
            // 256-bit accumulation: two vectors per step while at least four 128-bit
            // chunks remain, then one more single 256-bit step if possible.
            var ax0 = Vector256 <float> .Zero;
            for (; lcnt >= 4; lcnt -= 4)
            {
                var iv0 = Avx.LoadVector256(tp);
                var iv1 = Avx.LoadVector256(tp + Vector256 <float> .Count);
                tp += Vector256 <float> .Count * 2;
                if (Fma.IsSupported)
                {
                    ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                    ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax0);
                }
                else
                {
                    ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                    ax0 = Avx.Add(ax0, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                }
                mp += Vector256 <float> .Count * 2;
            }
            if (lcnt >= 2)
            {
                lcnt -= 2;
                var iv0 = Avx.LoadVector256(tp);
                tp += Vector256 <float> .Count;
                if (Fma.IsSupported)
                {
                    ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                }
                else
                {
                    ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                }
                mp += Vector256 <float> .Count;
            }
            // Fold the 256-bit accumulator down to 128 bits.
            av0 = Sse.Add(ax0.GetLower(), ax0.GetUpper());
        }
        else
        {
            av0 = Vector128 <float> .Zero;
        }
        // Remaining 128-bit chunks (also the whole loop when AVX is unavailable).
        for (; lcnt != 0; lcnt--)
        {
            var iv0 = Sse.LoadVector128(tp);
            tp += Vector128 <float> .Count;
            if (Fma.IsSupported)
            {
                av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0);
            }
            else
            {
                av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp)));
            }
            mp += Vector128 <float> .Count;
        }
        // One output sample per pixel: horizontal sum of the four accumulator lanes.
        *op++ = av0.HorizontalAdd();
        ox++;
    }
}