Ejemplo n.º 1
0
        /// <summary>
        /// Multiplies <paramref name="a"/> by <paramref name="b"/> lane-wise, then subtracts
        /// <paramref name="c"/> in the x lane and adds it in the y lane.
        /// </summary>
        /// <param name="a">First factor.</param>
        /// <param name="b">Second factor.</param>
        /// <param name="c">Value subtracted in x and added in y.</param>
        /// <returns><c>(a.x * b.x - c.x, a.y * b.y + c.y)</c>.</returns>
        public static float2 msubadd(float2 a, float2 b, float2 c)
        {
            if (Fma.IsFmaSupported)
            {
                // Single hardware instruction doing the multiply and the alternating subtract/add.
                v128 fused = Fma.fmaddsub_ps(*(v128 *)&a, *(v128 *)&b, *(v128 *)&c);

                return *(float2 *)&fused;
            }

            if (Sse.IsSseSupported)
            {
                // Flip only the sign bit of c.x; a plain multiply-add then subtracts in x and adds in y.
                v128 signMaskX = new v128(1 << 31, 0, 0, 0);
                v128 addend    = Sse.xor_ps(*(v128 *)&c, signMaskX);

                return math.mad(a, b, *(float2 *)&addend);
            }

            // Portable scalar fallback.
            return new float2(
                a.x * b.x - c.x,
                a.y * b.y + c.y);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Multiplies <paramref name="a"/> by <paramref name="b"/> lane-wise, then alternately
        /// subtracts and adds <paramref name="c"/>: subtract in x and z, add in y and w.
        /// </summary>
        /// <param name="a">First factor.</param>
        /// <param name="b">Second factor.</param>
        /// <param name="c">Value subtracted in the x/z lanes and added in the y/w lanes.</param>
        /// <returns><c>(a.x*b.x - c.x, a.y*b.y + c.y, a.z*b.z - c.z, a.w*b.w + c.w)</c>.</returns>
        public static float4 msubadd(float4 a, float4 b, float4 c)
        {
            if (Fma.IsFmaSupported)
            {
                // Single hardware instruction doing the multiply and the alternating subtract/add.
                v128 fused = Fma.fmaddsub_ps(*(v128 *)&a, *(v128 *)&b, *(v128 *)&c);

                return *(float4 *)&fused;
            }

            if (Sse.IsSseSupported)
            {
                // Flip the sign bits of c.x and c.z so a plain multiply-add subtracts in those lanes.
                v128 signMaskXZ = new v128(1 << 31, 0, 1 << 31, 0);
                v128 addend     = Sse.xor_ps(*(v128 *)&c, signMaskXZ);

                return math.mad(a, b, *(float4 *)&addend);
            }

            // Portable scalar fallback.
            return new float4(
                a.x * b.x - c.x,
                a.y * b.y + c.y,
                a.z * b.z - c.z,
                a.w * b.w + c.w);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Updates <paramref name="dst"/> in place with <c>dst * s + s</c>, four floats at a time,
        /// where <c>s</c> is the vector returned by <c>Sse.LoadScalarVector128</c> for the first
        /// element of <paramref name="scalar"/>.
        /// </summary>
        /// <param name="scalar">Span whose first element supplies the scalar; must not be empty.</param>
        /// <param name="dst">Destination span, modified in place. Any length is accepted.</param>
        /// <exception cref="ArgumentException"><paramref name="scalar"/> is empty.</exception>
        /// <remarks>
        /// NOTE(review): <c>LoadScalarVector128</c> only populates the lowest lane; the other three lanes
        /// of the multiplier/addend are zero, so only every fourth element is actually scaled by the
        /// scalar. If a broadcast was intended, confirm with the caller before changing it.
        /// </remarks>
        private unsafe void AddMulScalarU(Span <float> scalar, Span <float> dst)
        {
            const int VectorWidth = 4;

            if (dst.IsEmpty)
            {
                return;
            }

            if (scalar.IsEmpty)
            {
                // Pinning an empty span yields a null pointer; fail with a clear message instead of faulting.
                throw new ArgumentException("At least one scalar value is required.", nameof(scalar));
            }

            fixed(float *pdst = dst)
            fixed(float *psrc = scalar)
            {
                var pDstEnd     = pdst + dst.Length;
                var pDstCurrent = pdst;

                var scalarVector128 = Sse.LoadScalarVector128(psrc);

                // Full vectors only: never load or store past the end of dst.
                while (pDstEnd - pDstCurrent >= VectorWidth)
                {
                    var dstVector = Sse.LoadVector128(pDstCurrent);
                    dstVector = Fma.MultiplyAdd(dstVector, scalarVector128, scalarVector128);
                    Sse.Store(pDstCurrent, dstVector);

                    pDstCurrent += VectorWidth;
                }

                // Tail (< 4 elements): run the same vector operation on a padded scratch buffer
                // and copy back only the elements that belong to dst.
                int remaining = (int)(pDstEnd - pDstCurrent);
                if (remaining > 0)
                {
                    float *scratch = stackalloc float[VectorWidth];
                    for (int k = 0; k < VectorWidth; k++)
                    {
                        scratch[k] = k < remaining ? pDstCurrent[k] : 0f;
                    }

                    var tailVector = Sse.LoadVector128(scratch);
                    tailVector = Fma.MultiplyAdd(tailVector, scalarVector128, scalarVector128);
                    Sse.Store(scratch, tailVector);

                    for (int k = 0; k < remaining; k++)
                    {
                        pDstCurrent[k] = scratch[k];
                    }
                }
            }
        }
        /// <summary>
        /// Scenario: creates a local test-class instance, loads its three vector fields through
        /// pinned pointers, runs <c>Fma.MultiplySubtractNegated</c> on them and validates the output.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var instance = new SimpleTernaryOpTest__MultiplySubtractNegatedSingle();

            fixed(Vector256 <Single> *firstPtr = &instance._fld1)
            fixed(Vector256 <Single> *secondPtr = &instance._fld2)
            fixed(Vector256 <Single> *thirdPtr = &instance._fld3)
            {
                // Load each operand explicitly from memory before invoking the intrinsic.
                var operand1 = Avx.LoadVector256((Single *)(firstPtr));
                var operand2 = Avx.LoadVector256((Single *)(secondPtr));
                var operand3 = Avx.LoadVector256((Single *)(thirdPtr));

                var computed = Fma.MultiplySubtractNegated(operand1, operand2, operand3);

                Unsafe.Write(_dataTable.outArrayPtr, computed);
                ValidateResult(instance._fld1, instance._fld2, instance._fld3, _dataTable.outArrayPtr);
            }
        }
        /// <summary>
        /// Divides <paramref name="a"/> by <paramref name="b"/> lane-wise, then alternately subtracts
        /// and adds <paramref name="c"/>: subtract in x and z, add in y.
        /// </summary>
        /// <param name="a">Dividend.</param>
        /// <param name="b">Divisor.</param>
        /// <param name="c">Value subtracted in the x/z lanes and added in the y lane.</param>
        /// <returns><c>(a.x/b.x - c.x, a.y/b.y + c.y, a.z/b.z - c.z)</c>.</returns>
        /// <remarks>
        /// NOTE(review): the FMA path multiplies by <c>Sse.rcp_ps(b)</c>, the SSE path by
        /// <c>math.rcp(b)</c>, and the fallback divides directly, so results may differ slightly
        /// between paths - confirm this is acceptable for callers.
        /// </remarks>
        public static float3 dsubadd(float3 a, float3 b, float3 c)
        {
            if (Fma.IsFmaSupported)
            {
                v128 reciprocal = Sse.rcp_ps(*(v128 *)&b);
                v128 fused      = Fma.fmaddsub_ps(*(v128 *)&a, reciprocal, *(v128 *)&c);

                return *(float3 *)&fused;
            }

            if (Sse.IsSseSupported)
            {
                float3 reciprocal = math.rcp(b);

                // Flip the sign bits of c.x and c.z so a plain multiply-add subtracts in those lanes.
                v128 signMaskXZ = new v128(1 << 31, 0, 1 << 31, 0);
                v128 addend     = Sse.xor_ps(*(v128 *)&c, signMaskXZ);

                return math.mad(a, reciprocal, *(float3 *)&addend);
            }

            // Portable scalar fallback.
            return new float3(
                a.x / b.x - c.x,
                a.y / b.y + c.y,
                a.z / b.z - c.z);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Divides <paramref name="a"/> by <paramref name="b"/> lane-wise, then alternately adds
        /// and subtracts <paramref name="c"/>: add in x and z, subtract in y and w.
        /// </summary>
        /// <param name="a">Dividend.</param>
        /// <param name="b">Divisor.</param>
        /// <param name="c">Value added in the x/z lanes and subtracted in the y/w lanes.</param>
        /// <returns><c>(a.x/b.x + c.x, a.y/b.y - c.y, a.z/b.z + c.z, a.w/b.w - c.w)</c>.</returns>
        /// <remarks>
        /// NOTE(review): the FMA path multiplies by <c>Sse.rcp_ps(b)</c>, the SSE path by
        /// <c>math.rcp(b)</c>, and the fallback divides directly, so results may differ slightly
        /// between paths - confirm this is acceptable for callers.
        /// </remarks>
        public static float4 dadsub(float4 a, float4 b, float4 c)
        {
            if (Fma.IsFmaSupported)
            {
                v128 reciprocal = Sse.rcp_ps(*(v128 *)&b);
                v128 fused      = Fma.fmsubadd_ps(*(v128 *)&a, reciprocal, *(v128 *)&c);

                return *(float4 *)&fused;
            }

            if (Sse.IsSseSupported)
            {
                float4 reciprocal = math.rcp(b);

                // Flip the sign bits of c.y and c.w so a plain multiply-add subtracts in those lanes.
                v128 signMaskYW = new v128(0, 1 << 31, 0, 1 << 31);
                v128 addend     = Sse.xor_ps(*(v128 *)&c, signMaskYW);

                return math.mad(a, reciprocal, *(float4 *)&addend);
            }

            // Portable scalar fallback.
            return new float4(
                a.x / b.x + c.x,
                a.y / b.y - c.y,
                a.z / b.z + c.z,
                a.w / b.w - c.w);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Scenario: creates a local test-class instance, loads its three vector fields through
        /// pinned pointers, runs <c>Fma.MultiplySubtractAdd</c> on them and validates the output.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var instance = new AlternatingTernaryOpTest__MultiplySubtractAddDouble();

            fixed(Vector128 <Double> *firstPtr = &instance._fld1)
            fixed(Vector128 <Double> *secondPtr = &instance._fld2)
            fixed(Vector128 <Double> *thirdPtr = &instance._fld3)
            {
                // Load each operand explicitly from memory before invoking the intrinsic.
                var operand1 = Sse2.LoadVector128((Double *)(firstPtr));
                var operand2 = Sse2.LoadVector128((Double *)(secondPtr));
                var operand3 = Sse2.LoadVector128((Double *)(thirdPtr));

                var computed = Fma.MultiplySubtractAdd(operand1, operand2, operand3);

                Unsafe.Write(_dataTable.outArrayPtr, computed);
                ValidateResult(instance._fld1, instance._fld2, instance._fld3, _dataTable.outArrayPtr);
            }
        }
Ejemplo n.º 8
0
        //[MethodImpl(MethodImplOptions.AggressiveInlining)]
        /// <summary>
        /// Computes an approximation of e^x for each of the eight float lanes of <paramref name="V"/>
        /// using range reduction, a polynomial in the reduced argument, and a power-of-two scale.
        /// </summary>
        /// <param name="V">Input exponents.</param>
        /// <returns>The per-lane result <c>polynomial(x) * 2^n</c>.</returns>
        /// <remarks>
        /// The constants (exp_hi, exp_lo, cLOG2EF, cexp_C1, cexp_C2, cexp_p0..cexp_p5) and the SET helper
        /// are defined outside this method; presumably cLOG2EF is log2(e) and C1 + C2 is ln(2) - TODO confirm.
        /// </remarks>
        public static __m256 exp256_ps(__m256 V)
        {
            __m256 x   = V;
            __m256 tmp = __m256.Zero;
            __m256 one = SET(1.0f);

            // Clamp the input to [exp_lo, exp_hi].
            x = Avx2.Min(x, exp_hi);
            x = Avx2.Max(x, exp_lo);
            __m256 fx = Avx2.Multiply(x, cLOG2EF);

            // fx = floor(x * cLOG2EF + 0.5): add 0.5, floor, then subtract 1 in any lane
            // where the floored value compares greater than the unfloored one.
            fx  = Avx2.Add(fx, SET(0.5f));
            tmp = Avx2.Floor(fx);
            var mask = Avx2.Compare(tmp, fx, FloatComparisonMode.OrderedGreaterThanSignaling);

            mask = Avx2.And(mask, one);
            fx   = Avx2.Subtract(tmp, mask);
            // Range reduction: x -= fx * C1, then x -= fx * C2 (two-step subtraction).
            tmp  = Avx2.Multiply(fx, cexp_C1);
            __m256 z = Avx2.Multiply(fx, cexp_C2);

            x = Avx2.Subtract(x, tmp);
            x = Avx2.Subtract(x, z);
            z = Avx2.Multiply(x, x);
            __m256 y = cexp_p0;

            // Horner evaluation of the polynomial p0..p5 in x, then y = y * x^2 + x + 1.
            y = Fma.MultiplyAdd(y, x, cexp_p1);
            y = Fma.MultiplyAdd(y, x, cexp_p2);
            y = Fma.MultiplyAdd(y, x, cexp_p3);
            y = Fma.MultiplyAdd(y, x, cexp_p4);
            y = Fma.MultiplyAdd(y, x, cexp_p5);
            y = Fma.MultiplyAdd(y, z, x);
            y = Avx2.Add(y, one);
            // Build 2^n as a float: convert fx to int, add 0x7f, shift into bits 23 and up,
            // and reinterpret the integer bits as single-precision values.
            var imm0 = Avx2.ConvertToVector256Int32(fx);
            var F7   = Vector256.Create((int)0x7f);

            imm0 = Avx2.Add(imm0, F7);
            imm0 = Avx2.ShiftLeftLogical(imm0, 23);
            __m256 pow2n = Vector256.AsSingle(imm0);

            // Scale the polynomial result by 2^n.
            y = Avx2.Multiply(y, pow2n);
            return(y);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Divides eight unsigned 16-bit lanes by computing, in single precision,
        /// <c>(PRECISION_ADJUSTMENT_FACTOR - rcp * divisor) * rcp * dividend</c>, where <c>rcp</c> is
        /// the result of <c>Avx.mm256_rcp_ps</c> on the divisor.
        /// </summary>
        /// <param name="dividend">Numerators.</param>
        /// <param name="divisor">Denominators; every lane is asserted to be non-zero.</param>
        /// <returns>The per-lane quotient as floats.</returns>
        /// <exception cref="CPUFeatureCheckException">AVX is not supported.</exception>
        private static float8 vdiv_ushort_AVX(ushort8 dividend, ushort8 divisor)
        {
            Assert.AreNotEqual(divisor.x0, 0);
            Assert.AreNotEqual(divisor.x1, 0);
            Assert.AreNotEqual(divisor.x2, 0);
            Assert.AreNotEqual(divisor.x3, 0);
            Assert.AreNotEqual(divisor.x4, 0);
            Assert.AreNotEqual(divisor.x5, 0);
            Assert.AreNotEqual(divisor.x6, 0);
            Assert.AreNotEqual(divisor.x7, 0);

            if (!Avx.IsAvxSupported)
            {
                throw new CPUFeatureCheckException();
            }

            float8 numerator   = dividend;
            float8 denominator = divisor;

            float8 reciprocal = Avx.mm256_rcp_ps(denominator);

            // Correction term: PRECISION_ADJUSTMENT_FACTOR - reciprocal * denominator.
            float8 quotient;

            if (Fma.IsFmaSupported)
            {
                quotient = Fma.mm256_fnmadd_ps(reciprocal, denominator, new v256(PRECISION_ADJUSTMENT_FACTOR));
            }
            else
            {
                quotient = maxmath.mad(-reciprocal, denominator, math.asfloat(PRECISION_ADJUSTMENT_FACTOR));
            }

            // Apply the correction to the reciprocal, then scale by the numerator.
            quotient *= reciprocal;
            quotient *= numerator;

            return quotient;
        }
Ejemplo n.º 10
0
        // Modified variant of the preceding test.
        // The array is split into chunks so that the Vector256<float> accumulator never loses precision.
        // Intrinsics FMA MultiplyAdd float
        /// <summary>
        /// Computes the sum of squares of all bytes in <paramref name="vs"/> in parallel, using
        /// FMA on <c>Vector256&lt;float&gt;</c> within each chunk and a <c>long</c> total across chunks.
        /// </summary>
        /// <param name="vs">Input bytes. An empty array yields 0.</param>
        /// <returns>The sum of <c>vs[i] * vs[i]</c> over the whole array.</returns>
        private unsafe long Test23_Intrinsics_FMA_MultiplyAdd_float_MT_Kai(byte[] vs)
        {
            // Partitioner.Create rejects an empty range (toExclusive <= fromInclusive),
            // so handle the empty array here instead of throwing.
            if (vs.Length == 0)
            {
                return(0);
            }

            long total      = 0;
            int  simdLength = Vector256 <int> .Count;
            // Maximum number of elements one Vector256<float> accumulator can handle = 2064.
            // Use that as the number of elements per partition (chunk size).
            // float mantissa 24 bits (16777215) * 8 / (255 * 255) = 2064.0941
            int rangeSize = ((1 << 24) - 1)
                            * Vector256 <float> .Count
                            / (byte.MaxValue * byte.MaxValue);

            Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize),
                             (range) =>
            {
                long subtotal            = 0;
                // Last index (exclusive) of this range that is covered by whole SIMD groups.
                int lastIndex            = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                Vector256 <float> vTotal = Vector256.Create(0f);   // accumulator
                fixed(byte *p            = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        // Widen 8 bytes to int, convert to float, then accumulate f * f.
                        Vector256 <int> v   = Avx2.ConvertToVector256Int32(p + i);
                        Vector256 <float> f = Avx.ConvertToVector256Single(v);
                        vTotal = Fma.MultiplyAdd(f, f, vTotal);    //float
                    }
                }
                // Horizontal sum of the accumulator lanes into the integer subtotal.
                float *pp = stackalloc float[Vector256 <float> .Count];
                Avx.Store(pp, vTotal);
                for (int i = 0; i < Vector256 <float> .Count; i++)
                {
                    subtotal += (long)pp[i];
                }
                // Scalar tail for the elements that did not fill a whole SIMD group.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return(total);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Repeatedly runs batches of LENGTH 256-bit fused multiply-add operations until
        /// <paramref name="cancellationToken"/> is cancelled.
        /// </summary>
        /// <param name="cancellationToken">Token that stops the loop when cancellation is requested.</param>
        /// <returns>
        /// The number of completed batches, or 0 when FMA or AVX is not supported on this machine.
        /// </returns>
        public override ulong Run(CancellationToken cancellationToken)
        {
            // The loop below uses both Avx.* and Fma.* intrinsics, so bail out unless BOTH are available.
            // (The previous condition "!Fma && Avx" let machines without either ISA fall through.)
            if (!Fma.IsSupported || !Avx.IsSupported)
            {
                return(0uL);
            }

            var randomFloatingSpan = new Span <float>(new[]
            {
                RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT,
                RANDOM_FLOAT
            });
            var dst        = new Span <float>(Enumerable.Repeat(1f, 8).ToArray());
            var iterations = 0uL;

            unsafe
            {
                fixed(float *pdst = dst)
                fixed(float *psrc = randomFloatingSpan)
                {
                    var srcVector = Avx.LoadVector256(psrc);
                    var dstVector = Avx.LoadVector256(pdst);

                    while (!cancellationToken.IsCancellationRequested)
                    {
                        for (var j = 0; j < LENGTH; j++)
                        {
                            dstVector = Fma.MultiplyAdd(dstVector, srcVector, srcVector);
                        }

                        // Store the result after each batch so the computed value is written back to dst.
                        Avx.Store(pdst, dstVector);

                        iterations++;
                    }
                }
            }

            return(iterations);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Divides four unsigned 16-bit lanes by computing, in single precision,
        /// <c>(PRECISION_ADJUSTMENT_FACTOR - rcp * divisor) * rcp * dividend</c>, where <c>rcp</c> is
        /// the result of <c>Sse.rcp_ps</c> on the divisor, and converting the result back to <c>ushort4</c>.
        /// </summary>
        /// <param name="dividend">Numerators.</param>
        /// <param name="divisor">Denominators; every lane is asserted to be non-zero.</param>
        /// <returns>The per-lane quotient.</returns>
        /// <exception cref="CPUFeatureCheckException">SSE2 is not supported.</exception>
        internal static ushort4 vdiv_ushort(ushort4 dividend, ushort4 divisor)
        {
            Assert.AreNotEqual(divisor.x, 0);
            Assert.AreNotEqual(divisor.y, 0);
            Assert.AreNotEqual(divisor.z, 0);
            Assert.AreNotEqual(divisor.w, 0);

            if (!Sse2.IsSse2Supported)
            {
                throw new CPUFeatureCheckException();
            }

            float4 numerator   = dividend;
            float4 denominator = divisor;

            v128 reciprocal = Sse.rcp_ps(*(v128 *)&denominator);

            // Correction term: PRECISION_ADJUSTMENT_FACTOR - reciprocal * denominator.
            v128 quotient;

            if (Fma.IsFmaSupported)
            {
                quotient = Fma.fnmadd_ps(reciprocal, *(v128 *)&denominator, new v128(PRECISION_ADJUSTMENT_FACTOR));
            }
            else
            {
                float4 correction = math.mad(*(float4 *)&reciprocal, -denominator, math.asfloat(PRECISION_ADJUSTMENT_FACTOR));

                quotient = *(v128 *)&correction;
            }

            // Apply the correction to the reciprocal, then scale by the numerator.
            quotient = Sse.mul_ps(quotient, reciprocal);
            quotient = Sse.mul_ps(quotient, *(v128 *)&numerator);

            return (ushort4)(*(float4 *)&quotient);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Linearly interpolates between <paramref name="a"/> and <paramref name="b"/> per lane
        /// using the weights in <paramref name="t"/>.
        /// </summary>
        /// <param name="a">Value at <c>t = 0</c>.</param>
        /// <param name="b">Value at <c>t = 1</c>.</param>
        /// <param name="t">Per-lane interpolation weight.</param>
        /// <returns><c>a + (b - a) * t</c> (computed as <c>a * (1 - t) + b * t</c> on the plain SSE path).</returns>
        /// <exception cref="PlatformNotSupportedException">None of AdvSimd, FMA or SSE is available.</exception>
        public static Vector128 <float> Lerp(Vector128 <float> a, Vector128 <float> b, Vector128 <float> t)
        {
            // This implementation is based on the DirectX Math Library XMVectorLerp method
            // https://github.com/microsoft/DirectXMath/blob/master/Inc/DirectXMathVector.inl

            if (AdvSimd.IsSupported)
            {
                Vector128 <float> delta = AdvSimd.Subtract(b, a);

                return AdvSimd.FusedMultiplyAdd(a, delta, t);
            }

            if (Fma.IsSupported)
            {
                Vector128 <float> delta = Sse.Subtract(b, a);

                return Fma.MultiplyAdd(delta, t, a);
            }

            if (Sse.IsSupported)
            {
                Vector128 <float> oneMinusT = Sse.Subtract(Vector128.Create(1.0f), t);
                Vector128 <float> fromA     = Sse.Multiply(a, oneMinusT);
                Vector128 <float> fromB     = Sse.Multiply(b, t);

                return Sse.Add(fromA, fromB);
            }

            // Redundant test so we won't prejit remainder of this method on platforms without AdvSimd.
            throw new PlatformNotSupportedException();
        }
Ejemplo n.º 14
0
    /// <summary>
    /// Test entry point: verifies that the observed vector acceleration state, vector width and each
    /// ISA's IsSupported value match what is expected for the compile-time configuration selected
    /// by BASELINE_INTRINSICS, NON_VEX_INTRINSICS or VEX_INTRINSICS.
    /// </summary>
    /// <returns>100 when <c>s_success</c> is still true after all checks, otherwise 1.</returns>
    static int Main()
    {
        s_success = true;

        // We expect the AOT compiler generated HW intrinsics with the following characteristics:
        //
        // * TRUE = IsSupported assumed to be true, no runtime check
        // * NULL = IsSupported is a runtime check, code should be behind the check or bad things happen
        // * FALSE = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used
        //
        // The test is compiled with multiple defines to test this.

        // Expected state per ISA group for each build configuration; exactly one branch is compiled.
#if BASELINE_INTRINSICS
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 16;
        bool?Sse2AndBelow       = true;
        bool?Sse3Group          = null;
        bool?AesLzPcl           = null;
        bool?Sse4142            = null;
        bool?PopCnt             = null;
        bool?Avx12    = false;
        bool?FmaBmi12 = false;
        bool?Avxvnni  = false;
#elif NON_VEX_INTRINSICS
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 16;
        bool?Sse2AndBelow       = true;
        bool?Sse3Group          = true;
        bool?AesLzPcl           = null;
        bool?Sse4142            = true;
        bool?PopCnt             = null;
        bool?Avx12    = false;
        bool?FmaBmi12 = false;
        bool?Avxvnni  = false;
#elif VEX_INTRINSICS
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 32;
        bool?Sse2AndBelow       = true;
        bool?Sse3Group          = true;
        bool?AesLzPcl           = null;
        bool?Sse4142            = true;
        bool?PopCnt             = null;
        bool?Avx12    = true;
        bool?FmaBmi12 = null;
        bool?Avxvnni  = null;
#else
#error Who dis?
#endif

        // Fail fast if Vector<T> acceleration does not match the expectation.
        if (vectorsAccelerated != Vector.IsHardwareAccelerated)
        {
            throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}");
        }

        // Fail fast if Vector<byte> does not have the expected width.
        if (byteVectorLength != Vector <byte> .Count)
        {
            throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}");
        }

        // Each Check call receives: the ISA name, the expected state for its group, the address of a
        // helper declared outside this method, the IsSupported value observed here, and an optional
        // smoke-test lambda (null where none is supplied).
        // NOTE(review): Check and the *IsSupported helpers are not visible here; their exact contract
        // should be confirmed against their definitions.
        Check("Sse", Sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Sse.X64", Sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128 <float> .Zero) == 0);

        Check("Sse2", Sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128 <ushort> .Zero, 0) == 0);
        Check("Sse2.X64", Sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128 <double> .Zero) == 0);

        Check("Sse3", Sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Sse3.X64", Sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);

        Check("Ssse3", Sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128 <short> .Zero).Equals(Vector128 <ushort> .Zero));
        Check("Ssse3.X64", Sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);

        Check("Sse41", Sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128 <int> .Zero, Vector128 <int> .Zero).Equals(Vector128 <int> .Zero));
        Check("Sse41.X64", Sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128 <long> .Zero, 0) == 0);

        Check("Sse42", Sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
        Check("Sse42.X64", Sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);

        Check("Aes", AesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128 <byte> .Zero, 0).Equals(Vector128.Create((byte)99)));
        Check("Aes.X64", AesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);

        Check("Avx", Avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256 <double> .Zero, Vector256 <double> .Zero).Equals(Vector256 <double> .Zero));
        Check("Avx.X64", Avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);

        Check("Avx2", Avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256 <int> .Zero).Equals(Vector256 <uint> .Zero));
        Check("Avx2.X64", Avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);

        Check("Bmi1", FmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
        Check("Bmi1.X64", FmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);

        Check("Bmi2", FmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
        Check("Bmi2.X64", FmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);

        Check("Fma", FmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128 <float> .Zero, Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Fma.X64", FmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);

        Check("Lzcnt", AesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
        Check("Lzcnt.X64", AesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);

        Check("Pclmulqdq", AesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128 <long> .Zero, Vector128 <long> .Zero, 0).Equals(Vector128 <long> .Zero));
        Check("Pclmulqdq.X64", AesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);

        Check("Popcnt", PopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
        Check("Popcnt.X64", PopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);

        Check("AvxVnni", Avxvnni, &AvxVnniIsSupported, AvxVnni.IsSupported, () => AvxVnni.MultiplyWideningAndAdd(Vector128 <int> .Zero, Vector128 <byte> .Zero, Vector128 <sbyte> .Zero).Equals(Vector128 <int> .Zero));
        Check("AvxVnni.X64", Avxvnni, &AvxVnniX64IsSupported, AvxVnni.X64.IsSupported, null);

        // 100 is returned on success, 1 on failure.
        return(s_success ? 100 : 1);
    }
Ejemplo n.º 15
0
 /// <summary>
 /// Compares the reference multiply-add with <c>Fma.MultiplyAdd</c> when all three operands
 /// are the <c>_c64</c> field. The parameters are not used.
 /// </summary>
 static void TestExplicitFmaUsage3(ref Vector128 <double> a, double b)
 {
     var expected = ReferenceMultiplyAdd(_c64.ToScalar(), _c64.ToScalar(), _c64.ToScalar());
     var actual   = Fma.MultiplyAdd(_c64, _c64, _c64).ToScalar();

     CompareDoubles(expected, actual);
 }
Ejemplo n.º 16
0
 /// <summary>
 /// Compares the reference multiply-add with <c>Fma.MultiplyAdd</c> when all three operands
 /// are the by-ref vector <paramref name="a"/>. <paramref name="b"/> is not used.
 /// </summary>
 static void TestExplicitFmaUsage2(ref Vector128 <double> a, double b)
 {
     var expected = ReferenceMultiplyAdd(a.ToScalar(), a.ToScalar(), a.ToScalar());
     var actual   = Fma.MultiplyAdd(a, a, a).ToScalar();

     CompareDoubles(expected, actual);
 }
Ejemplo n.º 17
0
 /// <summary>
 /// Compares the reference multiply-add with <c>Fma.MultiplyAdd</c> for the operands
 /// <paramref name="a"/>, <paramref name="b"/> (wrapped in a scalar vector) and the <c>_c64</c> field.
 /// </summary>
 static void TestExplicitFmaUsage1(ref Vector128 <double> a, double b)
 {
     var expected = ReferenceMultiplyAdd(a.ToScalar(), b, _c64.ToScalar());
     var actual   = Fma.MultiplyAdd(a, Vector128.CreateScalarUnsafe(b), _c64).ToScalar();

     CompareDoubles(expected, actual);
 }
Ejemplo n.º 18
0
 /// <summary>
 /// Compares the reference multiply-add with <c>Fma.MultiplyAdd</c> when all three operands
 /// are the <c>_c32</c> field. The parameters are not used.
 /// </summary>
 static void TestExplicitFmaUsage3(ref Vector128 <float> a, float b)
 {
     var expected = ReferenceMultiplyAdd(_c32.ToScalar(), _c32.ToScalar(), _c32.ToScalar());
     var actual   = Fma.MultiplyAdd(_c32, _c32, _c32).ToScalar();

     CompareFloats(expected, actual);
 }
Ejemplo n.º 19
0
 /// <summary>
 /// Compares the reference multiply-add with <c>Fma.MultiplyAdd</c> when all three operands
 /// are the by-ref vector <paramref name="a"/>. <paramref name="b"/> is not used.
 /// </summary>
 static void TestExplicitFmaUsage2(ref Vector128 <float> a, float b)
 {
     var expected = ReferenceMultiplyAdd(a.ToScalar(), a.ToScalar(), a.ToScalar());
     var actual   = Fma.MultiplyAdd(a, a, a).ToScalar();

     CompareFloats(expected, actual);
 }
Ejemplo n.º 20
0
        /// <summary>
        /// Computes the dot product of two equally long vectors using 256-bit FMA, with three code
        /// paths chosen by length: scalar (&lt; 8), single accumulator (&lt; 64) and eight accumulators
        /// that are periodically flushed into a <c>double</c> (&gt;= 64).
        /// </summary>
        /// <param name="v0">First vector (read through <c>ptr</c> / <c>lng</c>).</param>
        /// <param name="v1">Second vector; must have the same <c>lng</c> as <paramref name="v0"/>.</param>
        /// <returns>The dot product as a <c>float</c>.</returns>
        /// <exception cref="ArgumentException">The two vectors have different lengths.</exception>
        public static float Dot(Vector v0, Vector v1)
        {
            if (v0.lng != v1.lng)
            {
                // Specific exception type with a message; still an Exception for existing catch blocks.
                throw new ArgumentException("Both vectors must have the same length.", nameof(v1));
            }
            int lng = v0.lng;

            float *p0  = v0.ptr;
            float *p1  = v1.ptr;
            // Scratch buffer used to spill a Vector256<float> for horizontal summation.
            float *tmp = stackalloc float[8];

            // Short vectors: plain scalar loop.
            if (lng < 8)
            {
                float sum = 0;
                for (int i = 0; i < lng; i++)
                {
                    sum += p0[i] * p1[i];
                }
                return(sum);
            }
            // Medium vectors: one vector accumulator plus a scalar tail.
            if (lng < 64)
            {
                var sum0 = Vector256 <float> .Zero;

                for (int i = 0; i <= lng - 8; i += 8)
                {
                    sum0 = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum0);
                }

                Avx.Store(tmp, sum0);
                float sum = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

                for (int i = lng / 8 * 8; i < lng; i++)
                {
                    sum += p0[i] * p1[i];
                }
                return(sum);
            }
            else
            {
                // Long vectors: 64 floats per iteration spread over eight independent accumulators.
                var    sum  = Vector256 <float> .Zero;
                var    sum0 = Vector256 <float> .Zero;
                var    sum1 = Vector256 <float> .Zero;
                var    sum2 = Vector256 <float> .Zero;
                var    sum3 = Vector256 <float> .Zero;
                var    sum4 = Vector256 <float> .Zero;
                var    sum5 = Vector256 <float> .Zero;
                var    sum6 = Vector256 <float> .Zero;
                var    sum7 = Vector256 <float> .Zero;
                float *pp1  = p0;
                float *pp2  = p1;
                double dsum = 0;
                for (int i = 0; i < lng / 64; i++)
                {
                    sum0 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 00), Avx.LoadVector256(pp2 + 00), sum0);
                    sum1 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 08), Avx.LoadVector256(pp2 + 08), sum1);
                    sum2 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 16), Avx.LoadVector256(pp2 + 16), sum2);
                    sum3 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 24), Avx.LoadVector256(pp2 + 24), sum3);

                    sum4 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 32), Avx.LoadVector256(pp2 + 32), sum4);
                    sum5 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 40), Avx.LoadVector256(pp2 + 40), sum5);
                    sum6 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 48), Avx.LoadVector256(pp2 + 48), sum6);
                    sum7 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 56), Avx.LoadVector256(pp2 + 56), sum7);

                    pp1 += 64;
                    pp2 += 64;
                    // Keep the running result in a double to improve precision:
                    // every 1024 iterations, flush the accumulators into dsum and reset them.
                    if (i % 1024 == 1023)
                    {
                        sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));
                        Avx.Store(tmp, sum);
                        dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
                        sum0  = Vector256 <float> .Zero;
                        sum1  = Vector256 <float> .Zero;
                        sum2  = Vector256 <float> .Zero;
                        sum3  = Vector256 <float> .Zero;
                        sum4  = Vector256 <float> .Zero;
                        sum5  = Vector256 <float> .Zero;
                        sum6  = Vector256 <float> .Zero;
                        sum7  = Vector256 <float> .Zero;
                    }
                }
                // Combine whatever has accumulated since the last flush.
                sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));

                // Remaining whole groups of 8 after the last 64-element group.
                for (int i = lng / 64 * 64; i <= lng - 8; i += 8)
                {
                    sum = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum);
                }

                Avx.Store(tmp, sum);
                dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];


                // Scalar tail for the last (lng % 8) elements.
                for (int i = lng / 8 * 8; i < lng; i++)
                {
                    dsum += p0[i] * p1[i];
                }
                return((float)dsum);
            }
        }
        /// <summary>
        /// Integrates positions and rotations of the rigid bodies in <paramref name="block"/> over
        /// <paramref name="deltaTime"/>. Uses an FMA/AVX path for positions when available, otherwise
        /// a scalar per-entity loop.
        /// </summary>
        /// <param name="deltaTime">Time step multiplied into the linear and angular velocities.</param>
        /// <param name="block">Accessor for the component arrays being processed.</param>
        /// <remarks>
        /// NOTE(review): in the SIMD path <c>oldPosFloats</c> and <c>posFloats</c> are both pinned from the
        /// same <c>pos</c> data, so their equality test compares a value with itself; the loops iterate
        /// up to <c>block.length</c> floats rather than the length of the float view; and aligned loads
        /// assume 16-byte alignment. These look unintended but are left as-is - confirm with the owner.
        /// </remarks>
        public override void ProcessBlock(float deltaTime, BlockAccessor block)
        {
            var rbs  = block.GetComponentData <RigidBody>();
            var pos  = block.GetComponentData <Position>();
            var rot  = block.GetComponentData <Rotation>();
            var vel  = block.GetReadOnlyComponentData <Velocity>();
            var avel = block.GetReadOnlyComponentData <AngularVelocity>();

            if (Fma.IsSupported && Avx.IsSupported)
            {
                unsafe {
                    Vector128 <float> deltaF = Vector128.Create(deltaTime);
                    fixed(float *oldPosFloats = pos.Cast <Position, float>())
                    fixed(float *posFloats = pos.Cast <Position, float>())
                    fixed(float *velFloats = vel.Cast <Velocity, float>())
                    {
                        int i = 0;

                        // Vector loop: only whole groups of four floats, so nothing is read or
                        // written at or beyond index block.length.
                        for (; i + 4 <= block.length; i += 4)
                        {
                            var op     = Sse.LoadAlignedVector128(&oldPosFloats[i]);
                            var p      = Sse.LoadAlignedVector128(&posFloats[i]);
                            var v      = Sse.LoadAlignedVector128(&velFloats[i]);
                            var result = Fma.MultiplyAdd(deltaF, v, p);
                            var bools  = Sse.CompareEqual(op, p);

                            // Store only the lanes where the comparison succeeded.
                            Avx.MaskStore(&posFloats[i], bools, result);
                        }

                        // Scalar tail: continue where the vector loop stopped, so each element is
                        // integrated exactly once (previously the last four were processed twice).
                        for (; i < block.length; i++)
                        {
                            if (oldPosFloats[i] == posFloats[i])
                            {
                                posFloats[i] = posFloats[i] + velFloats[i] * deltaTime;
                            }
                        }
                    }
                }

                // Rotation update, applied only to bodies whose position and rotation match the last recorded state.
                for (int i = 0; i < block.length; i++)
                {
                    if (pos[i].value == rbs[i].lastPosition && rot[i].value == rbs[i].lastRotation)
                    {
                        quat x = quat.FromAxisAngle(avel[i].value.x * deltaTime, vec3.UnitX);
                        quat y = quat.FromAxisAngle(avel[i].value.y * deltaTime, vec3.UnitY);
                        quat z = quat.FromAxisAngle(avel[i].value.z * deltaTime, vec3.UnitZ);
                        rot[i].value = rot[i].value * x * y * z;

                        rbs[i].lastPosition = pos[i].value;
                        rbs[i].lastRotation = rot[i].value;
                    }
                }
            }
            else
            {
                // Scalar fallback: position and rotation are integrated together per entity.
                for (int i = 0; i < block.length; i++)
                {
                    if (pos[i].value == rbs[i].lastPosition && rot[i].value == rbs[i].lastRotation)
                    {
                        pos[i].value += vel[i].value * deltaTime;

                        quat x = quat.FromAxisAngle(avel[i].value.x * deltaTime, vec3.UnitX);
                        quat y = quat.FromAxisAngle(avel[i].value.y * deltaTime, vec3.UnitY);
                        quat z = quat.FromAxisAngle(avel[i].value.z * deltaTime, vec3.UnitZ);
                        rot[i].value = rot[i].value * x * y * z;

                        rbs[i].lastPosition = pos[i].value;
                        rbs[i].lastRotation = rot[i].value;
                    }
                }
            }
        }
Ejemplo n.º 22
0
 /// <summary>
 /// Fast exponential approximation: computes <c>EXP_C2 * V + EXP_C1</c>, converts the result to
 /// 32-bit integers with truncation, and reinterprets those integer bits as floats.
 /// </summary>
 /// <param name="V">Input exponents.</param>
 /// <returns>The reinterpreted vector.</returns>
 public static __m256 fast_exp256_ps(__m256 V)
 {
     var scaled = Fma.MultiplyAdd(EXP_C2, V, EXP_C1);
     var bits   = Avx2.ConvertToVector256Int32WithTruncation(scaled);

     return Vector256.AsSingle(bits);
 }
Ejemplo n.º 23
0
        /// <summary>
        /// Computes the dot product of the overlapping part of two float buffers using 256-bit FMA,
        /// then adds the leftover elements of the longer buffer unchanged.
        /// </summary>
        /// <param name="vector1">First operand.</param>
        /// <param name="vector2">Second operand.</param>
        /// <returns>
        /// The sum of <c>vector1[i] * vector2[i]</c> over the common length, plus the sum of the
        /// remaining elements of the longer buffer. Two empty buffers yield 0.
        /// </returns>
        public static float DotMultiplyIntrinsicWFmaWSpanPtr(ref Memory <float> vector1, ref Memory <float> vector2)
        {
            var span1   = vector1.Span;
            var span2   = vector2.Span;
            var cnt     = Math.Min(span1.Length, span2.Length);
            // All lanes must start at zero; CreateScalarUnsafe leaves the upper lanes undefined.
            var v3      = Vector256 <float> .Zero;
            var vectLen = Vector256 <float> .Count;
            var vectCnt = cnt / vectLen;
            var total   = 0f;

#if TEST
            var file = Path.GetTempFileName();
            using var writer = new StreamWriter(file);
            Console.WriteLine($"Intrinsic with FmaWPtr Mult. results will be written into {file}");
#endif

            unsafe
            {
                // Pin both buffers so the GC cannot move them while raw pointers are in use.
                // Pinning an empty span yields a null pointer, which is never dereferenced below.
                fixed(float *base1 = span1)
                fixed(float *base2 = span2)
                {
                    var ptr1 = base1;
                    var ptr2 = base2;

                    // Vector part: accumulate eight products per iteration.
                    for (var i = 0; i < vectCnt; i++)
                    {
                        var v1 = Avx.LoadVector256(ptr1);
                        var v2 = Avx.LoadVector256(ptr2);
                        v3    = Fma.MultiplyAdd(v1, v2, v3);
                        ptr1 += vectLen;
                        ptr2 += vectLen;
#if TEST
                        writer.WriteLine($"{v1.ToString()}\t{v2.ToString()}\t{v3.ToString()}");
#endif
                    }

                    // Horizontal sum of the accumulator lanes.
                    for (var lane = 0; lane < vectLen; lane++)
                    {
                        total += v3.GetElement(lane);
                    }

                    // Scalar tail: ptr1/ptr2 already point at the first element not covered above.
                    for (var i = vectCnt * vectLen; i < cnt; i++)
                    {
                        total += *ptr1++ * *ptr2++;
                    }
                }
            }

            // When the lengths differ, the extra elements of the longer buffer are added as-is.
            if (vector1.Length != vector2.Length)
            {
                var h = vector1.Length > vector2.Length ? span1 : span2;
                for (var j = cnt; j < h.Length; j++)
                {
                    total += h[j];
                }
            }

            return(total);
        }
Ejemplo n.º 24
0
            /// <summary>
            /// Converts <paramref name="cb"/> source bytes into the same number of floats.
            /// </summary>
            /// <param name="ipstart">First source byte.</param>
            /// <param name="opstart">Destination buffer, written as <c>float</c> values.</param>
            /// <param name="cb">Number of source bytes (and output floats) to process.</param>
            /// <remarks>
            /// Full vector-width groups are converted as <c>(value - offset) * scale</c>; whatever is left is
            /// mapped through <c>valueTable</c>, eight bytes at a time and then one at a time.
            /// NOTE(review): the table is presumably precomputed with the same mapping - confirm.
            /// </remarks>
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                fixed (float* table = &valueTable[0])
                {
                    byte* src = ipstart;
                    byte* srcEnd = ipstart + cb;
                    float* dst = (float*)opstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var scaleVec = Vector256.Create(scale);

                        // With FMA the offset is pre-multiplied by the scale, because
                        // x * scale - offset * scale is algebraically (x - offset) * scale.
                        var offsetVec = Fma.IsSupported ? Vector256.Create(offset * scale) : Vector256.Create(offset);

                        // Each pass consumes one 256-bit vector worth of bytes and emits as many floats.
                        for (byte* vecLimit = srcEnd - Vector256<byte>.Count; src <= vecLimit; src += Vector256<byte>.Count, dst += Vector256<byte>.Count)
                        {
                            var f0 = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src));
                            var f1 = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src + Vector256<int>.Count));
                            var f2 = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 2));
                            var f3 = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 3));

                            if (Fma.IsSupported)
                            {
                                f0 = Fma.MultiplySubtract(f0, scaleVec, offsetVec);
                                f1 = Fma.MultiplySubtract(f1, scaleVec, offsetVec);
                                f2 = Fma.MultiplySubtract(f2, scaleVec, offsetVec);
                                f3 = Fma.MultiplySubtract(f3, scaleVec, offsetVec);
                            }
                            else
                            {
                                f0 = Avx.Multiply(Avx.Subtract(f0, offsetVec), scaleVec);
                                f1 = Avx.Multiply(Avx.Subtract(f1, offsetVec), scaleVec);
                                f2 = Avx.Multiply(Avx.Subtract(f2, offsetVec), scaleVec);
                                f3 = Avx.Multiply(Avx.Subtract(f3, offsetVec), scaleVec);
                            }

                            Avx.Store(dst, f0);
                            Avx.Store(dst + Vector256<int>.Count, f1);
                            Avx.Store(dst + Vector256<int>.Count * 2, f2);
                            Avx.Store(dst + Vector256<int>.Count * 3, f3);
                        }
                    }
                    else if (Sse41.IsSupported)
                    {
                        var scaleVec = Vector128.Create(scale);
                        var offsetVec = Vector128.Create(offset);

                        // Each pass consumes one 128-bit vector worth of bytes and emits as many floats.
                        for (byte* vecLimit = srcEnd - Vector128<byte>.Count; src <= vecLimit; src += Vector128<byte>.Count, dst += Vector128<byte>.Count)
                        {
                            var f0 = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src));
                            var f1 = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src + Vector128<int>.Count));
                            var f2 = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 2));
                            var f3 = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 3));

                            f0 = Sse.Multiply(Sse.Subtract(f0, offsetVec), scaleVec);
                            f1 = Sse.Multiply(Sse.Subtract(f1, offsetVec), scaleVec);
                            f2 = Sse.Multiply(Sse.Subtract(f2, offsetVec), scaleVec);
                            f3 = Sse.Multiply(Sse.Subtract(f3, offsetVec), scaleVec);

                            Sse.Store(dst, f0);
                            Sse.Store(dst + Vector128<int>.Count, f1);
                            Sse.Store(dst + Vector128<int>.Count * 2, f2);
                            Sse.Store(dst + Vector128<int>.Count * 3, f3);
                        }
                    }
#elif VECTOR_CONVERT
                    var scaleVec = new VectorF(scale);
                    var offsetVec = new VectorF(offset);

                    // Each pass widens one Vector<byte> twice (byte -> 16-bit -> 32-bit) and converts to float.
                    for (byte* vecLimit = srcEnd - Vector<byte>.Count; src <= vecLimit; src += Vector<byte>.Count, dst += Vector<byte>.Count)
                    {
                        var packed = Unsafe.ReadUnaligned<Vector<byte>>(src);
                        Vector.Widen(packed, out var wide0, out var wide1);
                        Vector.Widen(wide0, out var quad0, out var quad1);
                        Vector.Widen(wide1, out var quad2, out var quad3);

                        var f0 = Vector.ConvertToSingle(Vector.AsVectorInt32(quad0));
                        var f1 = Vector.ConvertToSingle(Vector.AsVectorInt32(quad1));
                        var f2 = Vector.ConvertToSingle(Vector.AsVectorInt32(quad2));
                        var f3 = Vector.ConvertToSingle(Vector.AsVectorInt32(quad3));

                        f0 = (f0 - offsetVec) * scaleVec;
                        f1 = (f1 - offsetVec) * scaleVec;
                        f2 = (f2 - offsetVec) * scaleVec;
                        f3 = (f3 - offsetVec) * scaleVec;

                        Unsafe.WriteUnaligned(dst, f0);
                        Unsafe.WriteUnaligned(dst + VectorF.Count, f1);
                        Unsafe.WriteUnaligned(dst + VectorF.Count * 2, f2);
                        Unsafe.WriteUnaligned(dst + VectorF.Count * 3, f3);
                    }
#endif

                    // Table-driven path, unrolled by eight; all lookups happen before any store.
                    for (byte* unrollLimit = srcEnd - 8; src <= unrollLimit; src += 8, dst += 8)
                    {
                        float t0 = table[(uint)src[0]];
                        float t1 = table[(uint)src[1]];
                        float t2 = table[(uint)src[2]];
                        float t3 = table[(uint)src[3]];
                        float t4 = table[(uint)src[4]];
                        float t5 = table[(uint)src[5]];
                        float t6 = table[(uint)src[6]];
                        float t7 = table[(uint)src[7]];

                        dst[0] = t0;
                        dst[1] = t1;
                        dst[2] = t2;
                        dst[3] = t3;
                        dst[4] = t4;
                        dst[5] = t5;
                        dst[6] = t6;
                        dst[7] = t7;
                    }

                    // Final bytes, one at a time.
                    for (; src < srcEnd; src++, dst++)
                    {
                        dst[0] = table[(uint)src[0]];
                    }
                }
            }
Ejemplo n.º 25
0
 /// <summary>
 /// Computes <c>left * right + add</c> by applying <c>Fma.MultiplyAdd</c> separately to the
 /// <c>V1</c> and <c>V2</c> parts of the operands.
 /// </summary>
 public static Vector512<float> MultiplyAdd(Vector512<float> left, Vector512<float> right, Vector512<float> add)
 {
     var first  = Fma.MultiplyAdd(left.V1, right.V1, add.V1);
     var second = Fma.MultiplyAdd(left.V2, right.V2, add.V2);

     return new Vector512<float>(first, second);
 }
Ejemplo n.º 26
0
 /// <summary>
 /// Multiply-add over two sequences: lazily yields <c>a * @this[i] + c[i]</c> for each pair of elements.
 /// The result stops at the end of the shorter sequence.
 /// </summary>
 public static IEnumerable<Vector256<double>> MulAdd(
     this IEnumerable<Vector256<double>> @this,
     Vector256<double> a,
     IEnumerable<Vector256<double>> c)
 {
     return @this.Zip(c, (multiplicand, addend) => Fma.MultiplyAdd(a, multiplicand, addend));
 }
Ejemplo n.º 27
0
    /// <summary>
    /// Divides each signed 16-bit lane of <paramref name="dividend"/> by the matching lane of
    /// <paramref name="divisor"/>; the float quotient is converted back with truncation.
    /// </summary>
    /// <param name="dividend">Eight 16-bit numerators.</param>
    /// <param name="divisor">Eight 16-bit denominators. Zero lanes are not handled specially.</param>
    /// <returns>The eight 16-bit quotients, in the same lane order as the inputs.</returns>
    /// <remarks>
    /// Based on https://stackoverflow.com/a/51458507/347870. Each pair of 16-bit lanes is split into
    /// two 32-bit integers, converted to float, multiplied by a refined reciprocal of the divisor,
    /// and packed back into 16-bit lanes.
    /// </remarks>
    public static Vector128<short> Divide(this Vector128<short> dividend, Vector128<short> divisor)
    {
        // Convert to two 32-bit integers: the odd (high) lanes by an arithmetic shift,
        // the even (low) lanes by shifting up and back down to sign-extend them.
        Vector128<int> a_hi_epi32       = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
        Vector128<int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
        Vector128<int> a_lo_epi32       = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);

        Vector128<int> b_hi_epi32       = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
        Vector128<int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
        Vector128<int> b_lo_epi32       = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

        // Convert to 32-bit floats
        Vector128<float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
        Vector128<float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
        Vector128<float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
        Vector128<float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

        // Calculate the (approximate) reciprocal
        Vector128<float> b_hi_rcp = Sse.Reciprocal(b_hi);
        Vector128<float> b_lo_rcp = Sse.Reciprocal(b_lo);

        // Refinement factor (two - rcp * b). The constant is deliberately a hair above 2;
        // it is taken verbatim from the referenced answer.
        Vector128<float> b_hi_inv_1;
        Vector128<float> b_lo_inv_1;
        Vector128<float> two = Vector128.Create(2.00000051757f);

        if (Fma.IsSupported)
        {
            b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
            b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
        }
        else
        {
            Vector128<float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
            Vector128<float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
            b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
            b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
        }

        // Compensate for the loss of precision in the approximate reciprocal
        Vector128<float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
        Vector128<float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

        // Perform the division by multiplication
        Vector128<float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
        Vector128<float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

        // Convert back to integers
        Vector128<int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
        Vector128<int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

        // Move the high-lane quotient into the upper 16 bits of each 32-bit element
        Vector128<int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

        // Blend the bits, and return
        if (Sse41.IsSupported)
        {
            // 0xAA selects the odd 16-bit lanes from the shifted high result, the even ones from the low result.
            return Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA);
        }
        else
        {
            // Keep only the lower 16 bits of each low-lane quotient. The mask has to be 0x0000FFFF per
            // 32-bit element: a negative quotient is sign-extended, and its upper bits would otherwise
            // be OR-ed over the high-lane result.
            Vector128<int> lo_mask       = Vector128.Create(0x0000FFFF);
            Vector128<int> lo_epi32_mask = Sse2.And(lo_epi32, lo_mask);
            return Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16();
        }
    }
Ejemplo n.º 28
0
 // Checks the scalar lane of Fma.MultiplyAdd(a, b, _c32) against ReferenceMultiplyAdd on the same scalars.
 static void TestExplicitFmaUsage1(ref Vector128<float> a, float b)
 {
     var reference = ReferenceMultiplyAdd(a.ToScalar(), b, _c32.ToScalar());
     var fused = Fma.MultiplyAdd(a, Vector128.CreateScalarUnsafe(b), _c32).ToScalar();

     CompareFloats(reference, fused);
 }
Ejemplo n.º 29
0
        /// <summary>
        /// Fills rows of <paramref name="iterations"/> with single-precision escape-time counts,
        /// processing eight horizontally adjacent points per step with AVX/FMA.
        /// </summary>
        /// <param name="iterations">Output grid indexed as [row, column]; its dimensions define height and width.</param>
        /// <param name="startScanline">First row to process.</param>
        /// <param name="increment">Row step; rows startScanline, startScanline + increment, ... are processed.</param>
        /// <param name="offsetX">Forwarded to <c>Impl.GetPointCoordinate</c>.</param>
        /// <param name="offsetY">Forwarded to <c>Impl.GetPointCoordinate</c>.</param>
        /// <param name="zoom">Forwarded to <c>Impl.GetPointCoordinate</c>.</param>
        /// <param name="maxIterations">Iteration cap; stored values are the lane count modulo this number.</param>
        /// <param name="cancel">Polled before every row and every group of eight points; work stops once it is true.</param>
        public static unsafe void ComputeSingle(
            uint[,] iterations,
            int startScanline, int increment,
            double offsetX, double offsetY,
            double zoom,
            uint maxIterations,
            ref bool cancel)
        {
            const int stride = 8;

            int height = iterations.GetLength(0);
            int width  = iterations.GetLength(1);

            var iterationCap = Vector256.Create((float)maxIterations);
            var escapeLimit  = Vector256.Create(4.0f);
            var ones         = Vector256.Create(1.0f);
            var twos         = Vector256.Create(2.0f);

            // Scratch buffers: one slot per vector lane.
            float* realParts = stackalloc float[stride];
            float* imagParts = stackalloc float[stride];
            float* counts    = stackalloc float[stride];

            for (int row = startScanline; row < height && !cancel; row += increment)
            {
                for (int col = 0; col < width && !cancel; col += stride)
                {
                    // Gather the coordinates of the eight points of this group. Columns past the right
                    // edge are still computed; they are simply never stored.
                    for (int lane = 0; lane < stride; lane++)
                    {
                        var point = Impl.GetPointCoordinate(col + lane, row, width, height, offsetX, offsetY, zoom);
                        realParts[lane] = (float)point.X;
                        imagParts[lane] = (float)point.Y;
                    }

                    var cr = Avx.LoadVector256(realParts);
                    var ci = Avx.LoadVector256(imagParts);
                    var zr = cr;
                    var zi = ci;
                    var it = Vector256.Create(0f);

                    while (true)
                    {
                        var zrSquared = Avx.Multiply(zr, zr);
                        var ziSquared = Avx.Multiply(zi, zi);
                        var magnitudeSquared = Avx.Add(zrSquared, ziSquared);

                        // A lane stays active while it is within the escape limit and has not passed the cap.
                        var withinLimit = Avx.Compare(magnitudeSquared, escapeLimit, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                        var belowCap    = Avx.Compare(it, iterationCap, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                        var active      = Avx.And(withinLimit, belowCap);

                        if (Avx.MoveMask(active) == 0)
                        {
                            break;
                        }

                        // z = z^2 + c; the counter advances only in lanes that are still active.
                        zi = Fma.MultiplyAdd(twos, Avx.Multiply(zr, zi), ci);
                        zr = Avx.Add(Avx.Subtract(zrSquared, ziSquared), cr);
                        it = Avx.Add(it, Avx.And(ones, active));
                    }

                    // Every lane is finished: store the counts for the columns that exist.
                    Avx.Store(counts, it);

                    for (int lane = 0; lane < stride && col + lane < width; lane++)
                    {
                        iterations[row, col + lane] = (uint)counts[lane] % maxIterations;
                    }
                }
            }
        }
Ejemplo n.º 30
0
        /// <summary>
        /// For each of <paramref name="ow"/> positions starting at <paramref name="ox"/>, accumulates the
        /// element-wise product of one run of <paramref name="smapy"/> floats from <paramref name="tstart"/>
        /// with the floats at <paramref name="pmapy"/>, and writes the horizontal sum of the accumulator
        /// as one float to <paramref name="ostart"/>.
        /// </summary>
        /// <remarks>
        /// Work is counted in 128-bit groups (<c>smapy / Vector128&lt;float&gt;.Count</c>); floats beyond
        /// the last full group are not read. With AVX, groups are consumed four and then two at a time
        /// before the 128-bit loop takes whatever is left.
        /// </remarks>
        unsafe void IConvolver.WriteDestLine(byte *tstart, byte *ostart, int ox, int ow, byte *pmapy, int smapy)
        {
            float* output = (float*)ostart;
            int xEnd = ox + ow;
            int runStride = smapy;
            int groupsPerRun = smapy / Vector128<float>.Count;

            for (; ox < xEnd; ox++)
            {
                int remaining = groupsPerRun;

                float* samples = (float*)tstart + ox * runStride;
                float* weights = (float*)pmapy;

                Vector128<float> sum128 = Vector128<float>.Zero;

                if (Avx.IsSupported && remaining >= 2)
                {
                    Vector256<float> sum256 = Vector256<float>.Zero;

                    // Two 256-bit vectors (four 128-bit groups) per pass.
                    while (remaining >= 4)
                    {
                        var s0 = Avx.LoadVector256(samples);
                        var s1 = Avx.LoadVector256(samples + Vector256<float>.Count);
                        samples += Vector256<float>.Count * 2;

                        if (Fma.IsSupported)
                        {
                            sum256 = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, sum256);
                            sum256 = Fma.MultiplyAdd(Avx.LoadVector256(weights + Vector256<float>.Count), s1, sum256);
                        }
                        else
                        {
                            sum256 = Avx.Add(sum256, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                            sum256 = Avx.Add(sum256, Avx.Multiply(s1, Avx.LoadVector256(weights + Vector256<float>.Count)));
                        }

                        weights += Vector256<float>.Count * 2;
                        remaining -= 4;
                    }

                    // One more 256-bit vector if at least two groups are left.
                    if (remaining >= 2)
                    {
                        remaining -= 2;

                        var s0 = Avx.LoadVector256(samples);
                        samples += Vector256<float>.Count;

                        if (Fma.IsSupported)
                        {
                            sum256 = Fma.MultiplyAdd(Avx.LoadVector256(weights), s0, sum256);
                        }
                        else
                        {
                            sum256 = Avx.Add(sum256, Avx.Multiply(s0, Avx.LoadVector256(weights)));
                        }

                        weights += Vector256<float>.Count;
                    }

                    // Fold the 256-bit accumulator into 128 bits.
                    sum128 = Sse.Add(sum256.GetLower(), sum256.GetUpper());
                }

                // Remaining 128-bit groups (all of them when the AVX path was not taken).
                while (remaining != 0)
                {
                    var s0 = Sse.LoadVector128(samples);
                    samples += Vector128<float>.Count;

                    if (Fma.IsSupported)
                    {
                        sum128 = Fma.MultiplyAdd(Sse.LoadVector128(weights), s0, sum128);
                    }
                    else
                    {
                        sum128 = Sse.Add(sum128, Sse.Multiply(s0, Sse.LoadVector128(weights)));
                    }

                    weights += Vector128<float>.Count;
                    remaining--;
                }

                *output++ = sum128.HorizontalAdd();
            }
        }