示例#1
0
        /// <summary>
        /// Computes the dot product of the two lowest elements of <paramref name="left"/> and
        /// <paramref name="right"/>, using the widest x86 instruction set that is available.
        /// </summary>
        /// <param name="left">First operand; only its lower 128 bits are read on the hardware paths.</param>
        /// <param name="right">Second operand; only its lower 128 bits are read on the hardware paths.</param>
        /// <returns>
        /// A 256-bit vector built from the 128-bit dot-product result (presumably repeated in both halves by
        /// <c>Helpers.DuplicateToVector256</c> - confirm against that helper), or the result of
        /// <c>DotProduct2D_Software</c> when no SSE2 support exists.
        /// </returns>
        public static Vector256 <double> DotProduct2D(Vector256 <double> left, Vector256 <double> right)
        {
            // Preferred path: SSE4.1 exposes dppd, a single-instruction dot product.
            if (Sse41.IsSupported)
            {
                // Multiply both element pairs and write the sum to every element of the result.
                const byte multiplyAndBroadcastAll = 0b_0011_1111;

                var dotSse41 = Sse41.DotProduct(left.GetLower(), right.GetLower(), multiplyAndBroadcastAll);
                return Helpers.DuplicateToVector256(dotSse41);
            }

            // SSE3: element-wise multiply followed by one horizontal add.
            if (Sse3.IsSupported)
            {
                var products = Sse2.Multiply(left.GetLower(), right.GetLower());
                var summed   = Sse3.HorizontalAdd(products, products);
                return Helpers.DuplicateToVector256(summed);
            }

            // SSE2: no horizontal add, so add the products to a shuffled copy of themselves.
            if (Sse2.IsSupported)
            {
                var products   = Sse2.Multiply(left.GetLower(), right.GetLower());
                var rearranged = Sse2.Shuffle(products, products, ShuffleValues.YXYX);
                var total      = Sse2.Add(products, rearranged);

                return total.ToVector256Unsafe().WithUpper(total);
            }

            // No usable SIMD support at all.
            return DotProduct2D_Software(left, right);
        }
        /// <summary>
        /// Returns the largest absolute value in <paramref name="array"/> using 256-bit AVX loads.
        /// </summary>
        /// <param name="array">Input samples; must hold at least eight elements (checked by a debug assertion only).</param>
        /// <returns>The maximum of the absolute values of all elements.</returns>
        internal static unsafe float AbsMaxAvx(this ReadOnlySpan <float> array)
        {
            const int Lanes = 8; // Vector256<float>.Count

            Debug.Assert(array.Length >= Lanes, "Input can't be smaller than the vector size.");

            // Only the sign bit is set in -0.0f, so AndNot(signBit, x) clears the sign and yields |x|.
            Vector256<float> signBit = Vector256.Create(-0.0f);

            int total      = array.Length;
            int tail       = total % Lanes;
            int alignedEnd = total - tail;

            fixed (float* start = array)
            {
                Vector256<float> running = Avx.AndNot(signBit, Avx.LoadVector256(start));

                int offset = Lanes;
                while (offset < alignedEnd)
                {
                    Vector256<float> magnitudes = Avx.AndNot(signBit, Avx.LoadVector256(start + offset));
                    running = Avx.Max(running, magnitudes);
                    offset += Lanes;
                }

                if (tail != 0)
                {
                    // Cover the leftover elements with one final load that overlaps already-visited data;
                    // re-reading elements cannot change a maximum.
                    Vector256<float> lastBlock = Avx.AndNot(signBit, Avx.LoadVector256(start + total - Lanes));
                    running = Avx.Max(running, lastBlock);
                }

                // Fold 8 lanes -> 4 -> 2 -> 1.
                Vector128<float> folded = Avx.Max(running.GetLower(), running.GetUpper());
                Vector128<float> upperPair = Avx.Permute(folded, 0b00001110);
                folded = Avx.Max(folded, upperPair);
                Vector128<float> neighbour = Avx.Permute(folded, 0b00000001);
                folded = Avx.Max(folded, neighbour);

                return folded.GetElement(0);
            }
        }
示例#3
0
        /// <summary>
        /// Multiplies two 256-bit single-precision vectors element by element.
        /// </summary>
        /// <param name="left">The first factor.</param>
        /// <param name="right">The second factor.</param>
        /// <returns>The element-wise product of <paramref name="left"/> and <paramref name="right"/>.</returns>
        public static Vector256 <float> Multiply(Vector256 <float> left, Vector256 <float> right)
        {
            if (Avx.IsSupported)
            {
                return Avx.Multiply(left, right);
            }

            // Without AVX, process each 128-bit half separately. The upper half of the result must be
            // computed from the UPPER halves of both operands (the previous code paired it with
            // right.GetLower(), producing wrong values in elements 4..7).
            return FromLowHigh(
                Multiply(left.GetLower(), right.GetLower()),
                Multiply(left.GetUpper(), right.GetUpper()));
        }
示例#4
0
        /// <summary>
        /// Adds two 256-bit single-precision vectors element by element.
        /// </summary>
        /// <param name="left">The first addend.</param>
        /// <param name="right">The second addend.</param>
        /// <returns>The element-wise sum of <paramref name="left"/> and <paramref name="right"/>.</returns>
        public static Vector256 <float> Add(Vector256 <float> left, Vector256 <float> right)
        {
            if (Avx.IsSupported)
            {
                return Avx.Add(left, right);
            }

            // Without AVX, process each 128-bit half separately. The upper half of the result must be
            // computed from the UPPER halves of both operands (the previous code paired it with
            // right.GetLower(), producing wrong values in elements 4..7).
            return FromLowHigh(
                Add(left.GetLower(), right.GetLower()),
                Add(left.GetUpper(), right.GetUpper()));
        }
示例#5
0
        /// <summary>
        /// Subtracts <paramref name="right"/> from <paramref name="left"/> element by element.
        /// </summary>
        /// <param name="left">The minuend.</param>
        /// <param name="right">The subtrahend.</param>
        /// <returns>The element-wise difference <c>left - right</c>.</returns>
        public static Vector256 <float> Subtract(Vector256 <float> left, Vector256 <float> right)
        {
            if (Avx.IsSupported)
            {
                return Avx.Subtract(left, right);
            }

            // Without AVX, process each 128-bit half separately. The upper half of the result must be
            // computed from the UPPER halves of both operands (the previous code paired it with
            // right.GetLower(), producing wrong values in elements 4..7).
            return FromLowHigh(
                Subtract(left.GetLower(), right.GetLower()),
                Subtract(left.GetUpper(), right.GetUpper()));
        }
示例#6
0
            /// <summary>
            /// Sums the even-indexed elements (0, 2, 4 and 6) of <paramref name="accumulator"/>.
            /// </summary>
            /// <param name="accumulator">The eight 32-bit integers to reduce.</param>
            /// <returns>The wrapping 32-bit sum of elements 0, 2, 4 and 6.</returns>
            public static int EvenReduceSum(Vector256 <int> accumulator)
            {
                // Fold the two 128-bit lanes together: element k now holds acc[k] + acc[k + 4].
                Vector128<int> upperLane = accumulator.GetUpper();
                Vector128<int> lowerLane = accumulator.GetLower();
                Vector128<int> lanes     = Sse2.Add(lowerLane, upperLane);

                // Bring elements 2 and 3 down to positions 0 and 1, then add: element 0 becomes [0] + [2].
                Vector128<int> highPair = Sse2.Shuffle(lanes, 0b_11_10_11_10);
                Vector128<int> folded   = Sse2.Add(lanes, highPair);

                // Vector128<int>.ToScalar() isn't optimized pre-net5.0 https://github.com/dotnet/runtime/pull/37882
                return Sse2.ConvertToInt32(folded);
            }
示例#7
0
        /// <summary>
        /// Divides <paramref name="dividend"/> by <paramref name="divisor"/> element by element.
        /// </summary>
        /// <param name="dividend">The numerators.</param>
        /// <param name="divisor">The denominators.</param>
        /// <returns>The element-wise quotient <c>dividend / divisor</c>.</returns>
        public static Vector256 <float> Divide(Vector256 <float> dividend, Vector256 <float> divisor)
        {
            if (Avx.IsSupported)
            {
                return Avx.Divide(dividend, divisor);
            }

            // Without AVX, process each 128-bit half separately. The upper half of the dividend must be
            // divided by the UPPER half of the divisor (the previous code used divisor.GetLower(),
            // producing wrong values in elements 4..7).
            return FromLowHigh(
                Divide(dividend.GetLower(), divisor.GetLower()),
                Divide(dividend.GetUpper(), divisor.GetUpper()));
        }
示例#8
0
        /// <summary>
        /// Compares two 256-bit single-precision vectors for equality, element by element.
        /// </summary>
        /// <param name="left">The first operand.</param>
        /// <param name="right">The second operand.</param>
        /// <returns>
        /// The per-element comparison mask. On the AVX path an element is all-ones when the operands are
        /// equal and all-zeros otherwise; an element where either operand is NaN is reported as not equal.
        /// </returns>
        public static Vector256 <float> CompareEqual(Vector256 <float> left, Vector256 <float> right)
        {
            if (Avx.IsSupported)
            {
                // Use the ORDERED predicate: the unordered variant used previously reports a lane as
                // "equal" whenever either input is NaN, which contradicts IEEE 754 equality and the
                // behavior of the SSE cmpeqps compare.
                return Avx.Compare(left, right, FloatComparisonMode.OrderedEqualNonSignaling);
            }

            // Without AVX, compare each 128-bit half separately and stitch the halves together.
            return FromLowHigh(
                CompareEqual(left.GetLower(), right.GetLower()),
                CompareEqual(left.GetUpper(), right.GetUpper()));
        }
        /// <summary>
        /// Reverses the byte order inside each 128-bit half of <paramref name="source"/> and hands the
        /// reversed upper half followed by the reversed lower half to <c>JoinMask</c>.
        /// </summary>
        /// <param name="source">The 32 bytes to rearrange.</param>
        /// <returns>Whatever <c>JoinMask</c> produces from the two reversed halves (presumably the fully reversed vector - confirm against that helper).</returns>
        public unsafe static Vector256 <byte> Reverse(this Vector256 <byte> source)
        {
            // Shuffle control that maps output byte i to input byte 15 - i.
            byte* descendingIndices = stackalloc byte[16] {
                15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
            };
            Vector128<byte> reverseControl = Avx2.LoadVector128(descendingIndices);

            Vector128<byte> upperReversed = Avx2.Shuffle(source.GetUpper(), reverseControl);
            Vector128<byte> lowerReversed = Avx2.Shuffle(source.GetLower(), reverseControl);

            return JoinMask(upperReversed, lowerReversed);
        }
示例#10
0
            /// <summary>
            /// Adds together all eight 32-bit elements of <paramref name="accumulator"/>.
            /// </summary>
            /// <param name="accumulator">The vector to reduce.</param>
            /// <returns>The wrapping 32-bit sum of every element.</returns>
            public static int ReduceSum(Vector256 <int> accumulator)
            {
                // Step 1: fold the upper 128-bit lane onto the lower one (8 values -> 4).
                Vector128<int> upperLane = accumulator.GetUpper();
                Vector128<int> lowerLane = accumulator.GetLower();
                Vector128<int> partial   = Sse2.Add(lowerLane, upperLane);

                // Step 2: add each odd element onto its even neighbour (elements 0 and 2 carry the pair sums).
                Vector128<int> oddElements = Sse2.Shuffle(partial, 0b_11_11_01_01);
                partial = Sse2.Add(partial, oddElements);

                // Step 3: add the high pair sum onto the low one; element 0 now holds the total.
                Vector128<int> highElements = Sse2.Shuffle(partial, 0b_11_10_11_10);
                partial = Sse2.Add(partial, highElements);

                return Sse2.ConvertToInt32(partial);
            }
示例#11
0
        /// <summary>
        /// Adds together all elements of <paramref name="vector"/>.
        /// </summary>
        /// <param name="vector">The vector to reduce.</param>
        /// <returns>The sum of every element of <paramref name="vector"/>.</returns>
        public static double ReduceSum(this Vector <double> vector)
        {
#if NETCOREAPP3_0
            // Avx.IsSupported alone does not guarantee that Vector<double> is 256 bits wide (on AVX-only
            // hardware Vector<T> stays 128 bits), so reinterpreting it as Vector256<double> would read past
            // the value. Only take the fast path when the element counts really match.
            if (Avx.IsSupported && Vector <double> .Count == Vector256 <double> .Count)
            {
                Vector256 <double> wide = Unsafe.As <Vector <double>, Vector256 <double> >(ref vector);

                // [a0+a1, a0+a1, a2+a3, a2+a3]
                Vector256 <double> pairSums = Avx.HorizontalAdd(wide, wide);

                // Adding the two 128-bit halves leaves the full total in element 0.
                Vector128 <double> total = Sse2.Add(pairSums.GetLower(), pairSums.GetUpper());

                return total.ToScalar();
            }
#endif
            // Portable fallback: the dot product with an all-ones vector is the sum of the elements.
            return Vector.Dot(Vector <double> .One, vector);
        }
示例#12
0
            // Scales every sample by a constant volume using hardware intrinsics (.NET Core 3+ only).
            // Reads `length` floats from inbuffer and writes volume * sample to the same index of outbuffer.
            // Uses 256-bit AVX blocks when available, otherwise 128-bit SSE blocks, and finishes any
            // samples that do not fill a whole vector with plain scalar code.
            private static void AdjustVolumeAllSamples(float *inbuffer, float *outbuffer, int length, float volume)
            {
                int pos = 0;

                if (Avx.IsSupported)
                {
                    // Volume replicated into all eight lanes.
                    Vector256<float> gain = Vector256.Create(volume);

                    for (; length - pos >= Vector256<float>.Count; pos += Vector256<float>.Count)
                    {
                        Vector256<float> samples = Avx.LoadVector256(inbuffer + pos);
                        Avx.Store(outbuffer + pos, Avx.Multiply(gain, samples));
                    }

                    // At most one 4-sample block can remain after the 8-sample loop.
                    if (length - pos >= Vector128<float>.Count)
                    {
                        Vector128<float> samples = Sse.LoadVector128(inbuffer + pos);
                        Sse.Store(outbuffer + pos, Sse.Multiply(gain.GetLower(), samples));

                        pos += Vector128<float>.Count;
                    }
                }
                else if (Sse.IsSupported)
                {
                    // Volume replicated into all four lanes.
                    Vector128<float> gain = Vector128.Create(volume);

                    for (; length - pos >= Vector128<float>.Count; pos += Vector128<float>.Count)
                    {
                        Vector128<float> samples = Sse.LoadVector128(inbuffer + pos);
                        Sse.Store(outbuffer + pos, Sse.Multiply(gain, samples));
                    }
                }

                // Scalar tail (or the whole buffer when no SIMD support exists).
                for (; pos < length; pos++)
                {
                    outbuffer[pos] = volume * inbuffer[pos];
                }
            }
 /// <summary>
 /// Adds two 256-bit double-precision vectors element by element.
 /// </summary>
 /// <param name="first">The first addend.</param>
 /// <param name="second">The second addend.</param>
 /// <returns>The element-wise sum, computed with AVX, with ARM64 AdvSimd on each half, or with scalar code.</returns>
 public static Vector256 <double> Add(Vector256 <double> first, Vector256 <double> second)
 {
     // x86 with AVX: one 256-bit add.
     if (System.Runtime.Intrinsics.X86.Avx.IsSupported)
     {
         return System.Runtime.Intrinsics.X86.Avx.Add(first, second);
     }

     // ARM64: add the two 128-bit halves independently and recombine.
     if (System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.IsSupported)
     {
         var lowerSum = System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Add(first.GetLower(), second.GetLower());
         var upperSum = System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Add(first.GetUpper(), second.GetUpper());

         return Vector256.Create(lowerSum, upperSum);
     }

     // Portable fallback: plain scalar additions.
     double sum0 = first.GetElement(0) + second.GetElement(0);
     double sum1 = first.GetElement(1) + second.GetElement(1);
     double sum2 = first.GetElement(2) + second.GetElement(2);
     double sum3 = first.GetElement(3) + second.GetElement(3);

     return Vector256.Create(sum0, sum1, sum2, sum3);
 }
 /// <summary>
 /// Multiplies every element of <paramref name="value"/> by the scalar <paramref name="scale"/>.
 /// </summary>
 /// <param name="value">The vector to scale.</param>
 /// <param name="scale">The factor applied to each element.</param>
 /// <returns>The scaled vector, computed with AVX, with ARM64 AdvSimd on each half, or with scalar code.</returns>
 public static Vector256 <double> Scale(this Vector256 <double> value, double scale)
 {
     // x86 with AVX: broadcast the factor and do one 256-bit multiply.
     if (System.Runtime.Intrinsics.X86.Avx.IsSupported)
     {
         return System.Runtime.Intrinsics.X86.Avx.Multiply(value, Vector256.Create(scale));
     }

     // ARM64: scale the two 128-bit halves independently and recombine.
     if (System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.IsSupported)
     {
         var lowerScaled = System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Multiply(value.GetLower(), Vector128.Create(scale));
         var upperScaled = System.Runtime.Intrinsics.Arm.AdvSimd.Arm64.Multiply(value.GetUpper(), Vector128.Create(scale));

         return Vector256.Create(lowerScaled, upperScaled);
     }

     // Portable fallback: plain scalar multiplications.
     double scaled0 = value.GetElement(0) * scale;
     double scaled1 = value.GetElement(1) * scale;
     double scaled2 = value.GetElement(2) * scale;
     double scaled3 = value.GetElement(3) * scale;

     return Vector256.Create(scaled0, scaled1, scaled2, scaled3);
 }
示例#15
0
    /// <summary>
    /// Divides each of the eight signed 16-bit elements of <paramref name="dividend"/> by 10,
    /// using a multiply-and-shift sequence instead of a division instruction.
    /// </summary>
    /// <param name="dividend">The values to divide.</param>
    /// <returns>A vector whose element k is the quotient computed for element k of <paramref name="dividend"/>.</returns>
    public static Vector128 <short> DivideBy10(this Vector128 <short> dividend)
    {
        // Multiplying by 26215 and shifting right by 18 approximates a division by 10.
        const int Reciprocal = 26215;

        // Widen to 32 bits: each int holds two shorts, so split them into
        // the odd-indexed shorts (upper halves) and the even-indexed shorts (lower halves), sign-extended.
        Vector128<int> packed    = dividend.AsInt32();
        Vector128<int> oddLanes  = Sse2.ShiftRightArithmetic(packed, 16);
        Vector128<int> evenLanes = Sse2.ShiftRightArithmetic(Sse2.ShiftLeftLogical(packed, 16), 16);

        Vector128<int> oddQuotient;
        Vector128<int> evenQuotient;

        if (Avx2.IsSupported)
        {
            // Handle all eight widened values with a single 256-bit sequence.
            Vector256<int> widened  = Vector256.Create(evenLanes, oddLanes);
            Vector256<int> signFix  = Avx2.ShiftRightArithmetic(widened, 15);   // -1 for negative inputs, 0 otherwise
            Vector256<int> product  = Avx2.MultiplyLow(widened, Vector256.Create(Reciprocal));
            Vector256<int> floored  = Avx2.ShiftRightArithmetic(product, 18);
            Vector256<int> quotient = Avx2.Subtract(floored, signFix);

            oddQuotient  = quotient.GetUpper();
            evenQuotient = quotient.GetLower();
        }
        else
        {
            // Same arithmetic, performed on the two 128-bit groups separately.
            Vector128<int> oddSignFix  = Sse2.ShiftRightArithmetic(oddLanes, 15);
            Vector128<int> evenSignFix = Sse2.ShiftRightArithmetic(evenLanes, 15);

            Vector128<int> factor      = Vector128.Create(Reciprocal);
            Vector128<int> oddProduct  = Sse41.MultiplyLow(oddLanes, factor);
            Vector128<int> evenProduct = Sse41.MultiplyLow(evenLanes, factor);

            oddQuotient  = Sse2.Subtract(Sse2.ShiftRightArithmetic(oddProduct, 18), oddSignFix);
            evenQuotient = Sse2.Subtract(Sse2.ShiftRightArithmetic(evenProduct, 18), evenSignFix);
        }

        // Move the odd quotients up by two bytes so their low 16 bits sit in the odd short positions,
        // then interleave: mask 0xAA takes the odd shorts from the second operand.
        oddQuotient = Sse2.ShiftLeftLogical128BitLane(oddQuotient, 2);
        return Sse41.Blend(evenQuotient.AsInt16(), oddQuotient.AsInt16(), 0xAA);
    }
示例#16
0
        /// <summary>
        /// Intersects <paramref name="ray"/> with the box spanned by <c>Minimum</c> and <c>Maximum</c>
        /// using the slab method on 256-bit AVX vectors.
        /// </summary>
        /// <param name="ray">The ray whose <c>Origin</c> and <c>Direction</c> are tested.</param>
        /// <returns>
        /// The entry and exit parameters along the ray, or <c>(NaN, NaN)</c> when the entry value is greater
        /// than the exit value or the exit value is negative.
        /// </returns>
        public (double near, double far) IntersectAVX(Ray ray)
        {
            Vector256<double> rayOrigin = (Vector256<double>)ray.Origin;
            Vector256<double> rayDir    = (Vector256<double>)ray.Direction;

            Vector256<double> zero    = new Vector256<double>();
            Vector256<double> slabMin = (Vector256<double>)Minimum;
            Vector256<double> slabMax = (Vector256<double>)Maximum;

            // An axis with zero direction whose origin already lies between the bounds must not restrict
            // the result; widen that slab to (-inf, +inf) so that NaN doesn't propagate.
            Vector256<double> axisParallel = Avx.Compare(rayDir, zero, FloatComparisonMode.OrderedEqualNonSignaling);
            Vector256<double> atOrAboveMin = Avx.Compare(rayOrigin, slabMin, FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling);
            Vector256<double> atOrBelowMax = Avx.Compare(rayOrigin, slabMax, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
            Vector256<double> ignoreSlab   = Avx.And(axisParallel, Avx.And(atOrAboveMin, atOrBelowMax));

            slabMin = Avx.BlendVariable(slabMin, SIMDHelpers.BroadcastScalar4(double.NegativeInfinity), ignoreSlab);
            slabMax = Avx.BlendVariable(slabMax, SIMDHelpers.BroadcastScalar4(double.PositiveInfinity), ignoreSlab);

            // BlendVariable selects by the sign bit of its mask, so passing the direction itself swaps
            // the bounds on every axis with a negative direction (probably including -0).
            Vector256<double> entryBound = Avx.BlendVariable(slabMin, slabMax, rayDir);
            Vector256<double> exitBound  = Avx.BlendVariable(slabMax, slabMin, rayDir);

            // Per-axis ray parameters: (bound - origin) * (1 / direction).
            Vector256<double> inverseDir = Avx.Divide(Vector256.Create(1D), rayDir);
            Vector256<double> entry4     = Avx.Multiply(Avx.Subtract(entryBound, rayOrigin), inverseDir);
            Vector256<double> exit4      = Avx.Multiply(Avx.Subtract(exitBound, rayOrigin), inverseDir);

            // Largest entry value across the lanes.
            Vector128<double> entry2 = Sse2.Max(entry4.GetLower(), entry4.GetUpper());
            entry2 = Sse2.MaxScalar(entry2, SIMDHelpers.Swap(entry2));

            // Smallest exit value across the lanes.
            Vector128<double> exit2 = Sse2.Min(exit4.GetLower(), exit4.GetUpper());
            exit2 = Sse2.MinScalar(exit2, SIMDHelpers.Swap(exit2));

            bool entryAfterExit = Sse2.CompareScalarOrderedGreaterThan(entry2, exit2);
            bool exitBehindRay  = Sse2.CompareScalarOrderedLessThan(exit2, new Vector128<double>());

            if (entryAfterExit | exitBehindRay)
            {
                return (double.NaN, double.NaN);
            }

            return (entry2.ToScalar(), exit2.ToScalar());
        }
示例#17
0
        /// <summary>
        /// Builds a <c>Vector256&lt;Double&gt;</c> from generated values, validates its lower and upper
        /// halves, then validates the vector obtained by writing the halves back in swapped positions.
        /// </summary>
        public void RunBasicScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

            // Generate the input elements.
            var inputs = new Double[ElementCount];
            int index = 0;
            while (index < ElementCount)
            {
                inputs[index] = TestLibrary.Generator.GetDouble();
                index++;
            }

            Vector256<Double> source = Vector256.Create(inputs[0], inputs[1], inputs[2], inputs[3]);

            // Part 1: GetLower / GetUpper.
            Vector128<Double> lowerHalf = source.GetLower();
            Vector128<Double> upperHalf = source.GetUpper();
            ValidateGetResult(lowerHalf, upperHalf, inputs);

            // Part 2: WithLower / WithUpper, storing each half in the opposite position.
            Vector256<Double> swapped = source.WithLower(upperHalf).WithUpper(lowerHalf);
            ValidateWithResult(swapped, inputs);
        }
示例#18
0
        /// <summary>
        /// Reduces <paramref name="vector"/>, reinterpreted as four doubles, to its smallest or largest element.
        /// </summary>
        /// <param name="vector">The vector to reduce; it is reinterpreted as a <c>Vector256&lt;double&gt;</c>, so it is presumably expected to be 256 bits wide (callers should confirm).</param>
        /// <param name="doMin"><c>true</c> to compute the minimum, <c>false</c> to compute the maximum.</param>
        /// <returns>The minimum or maximum element.</returns>
        private static double MinMaxCore(Vector <double> vector, bool doMin)
        {
            Vector256<double> wide = Unsafe.As<Vector<double>, Vector256<double>>(ref vector);

            Vector128<double> upper = wide.GetUpper();
            Vector128<double> lower = wide.GetLower();

            // Copies of each half with its second element moved into the first position.
            Vector128<double> upperNeighbour = Avx.Permute(upper, 0b_01);
            Vector128<double> lowerNeighbour = Avx.Permute(lower, 0b_01);

            // Reduce inside each half first, then across the halves; element 0 carries the answer.
            Vector128<double> reduced = doMin
                ? Sse2.Min(Sse2.Min(lower, lowerNeighbour), Sse2.Min(upper, upperNeighbour))
                : Sse2.Max(Sse2.Max(lower, lowerNeighbour), Sse2.Max(upper, upperNeighbour));

            return reduced.ToScalar();
        }
示例#19
0
        /// <summary>
        /// Builds a <c>Vector256&lt;Int16&gt;</c> from generated values, validates its lower and upper
        /// halves, then validates the vector obtained by writing the halves back in swapped positions.
        /// </summary>
        public void RunBasicScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

            // Generate the input elements.
            var inputs = new Int16[ElementCount];
            int index = 0;
            while (index < ElementCount)
            {
                inputs[index] = TestLibrary.Generator.GetInt16();
                index++;
            }

            Vector256<Int16> source = Vector256.Create(
                inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5], inputs[6], inputs[7],
                inputs[8], inputs[9], inputs[10], inputs[11], inputs[12], inputs[13], inputs[14], inputs[15]);

            // Part 1: GetLower / GetUpper.
            Vector128<Int16> lowerHalf = source.GetLower();
            Vector128<Int16> upperHalf = source.GetUpper();
            ValidateGetResult(lowerHalf, upperHalf, inputs);

            // Part 2: WithLower / WithUpper, storing each half in the opposite position.
            Vector256<Int16> swapped = source.WithLower(upperHalf).WithUpper(lowerHalf);
            ValidateWithResult(swapped, inputs);
        }
        /// <summary>
        /// Builds a <c>Vector256&lt;Byte&gt;</c> from generated values, validates its lower and upper
        /// halves, then validates the vector obtained by writing the halves back in swapped positions.
        /// </summary>
        public void RunBasicScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

            // Generate the input elements.
            var inputs = new Byte[ElementCount];
            int index = 0;
            while (index < ElementCount)
            {
                inputs[index] = TestLibrary.Generator.GetByte();
                index++;
            }

            Vector256<Byte> source = Vector256.Create(
                inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5], inputs[6], inputs[7],
                inputs[8], inputs[9], inputs[10], inputs[11], inputs[12], inputs[13], inputs[14], inputs[15],
                inputs[16], inputs[17], inputs[18], inputs[19], inputs[20], inputs[21], inputs[22], inputs[23],
                inputs[24], inputs[25], inputs[26], inputs[27], inputs[28], inputs[29], inputs[30], inputs[31]);

            // Part 1: GetLower / GetUpper.
            Vector128<Byte> lowerHalf = source.GetLower();
            Vector128<Byte> upperHalf = source.GetUpper();
            ValidateGetResult(lowerHalf, upperHalf, inputs);

            // Part 2: WithLower / WithUpper, storing each half in the opposite position.
            Vector256<Byte> swapped = source.WithLower(upperHalf).WithUpper(lowerHalf);
            ValidateWithResult(swapped, inputs);
        }
示例#21
0
 /// <summary>
 /// Splits a 256-bit vector into its two 128-bit halves.
 /// </summary>
 /// <param name="vector">The vector to split.</param>
 /// <param name="low">Receives the lower 128 bits of <paramref name="vector"/>.</param>
 /// <param name="high">Receives the upper 128 bits of <paramref name="vector"/>.</param>
 public static void GetLowHigh <T>(Vector256 <T> vector, out Vector128 <T> low, out Vector128 <T> high) where T : struct
 {
     high = Vector256.GetUpper(vector);
     low  = Vector256.GetLower(vector);
 }
示例#22
0
 // Stores the two 128-bit halves of `a` to two independent addresses:
 // the lower half goes to loaddr first, then the upper half goes to hiaddr.
 private static void _mm256_storeu2_m128i(byte *hiaddr, byte *loaddr, Vector256 <byte> a)
 {
     Vector128<byte> lowerHalf = a.GetLower();
     Vector128<byte> upperHalf = Avx.ExtractVector128(a, 1);

     Sse2.Store(loaddr, lowerHalf);
     Sse2.Store(hiaddr, upperHalf);
 }
 // Adds together all eight single-precision elements of v.
 private static float SumVector256(Vector256 <float> v)
 {
     // First pass: sums of adjacent pairs -> 0+1, 2+3, .., .., 4+5, 6+7, .., ..
     Vector256<float> pairSums = Avx.HorizontalAdd(v, v);

     // Second pass: one total per 128-bit lane -> 0+1+2+3, .., .., .., 4+5+6+7, .., .., ..
     Vector256<float> laneSums = Avx.HorizontalAdd(pairSums, pairSums);

     float upperTotal = laneSums.GetUpper().ToScalar();
     float lowerTotal = laneSums.GetLower().ToScalar();

     return upperTotal + lowerTotal;
 }
示例#24
0
 /// <summary>
 /// Splits a 256-bit double-precision vector into its two 128-bit halves.
 /// </summary>
 /// <param name="vector">The vector to split.</param>
 /// <param name="low">Receives elements 0 and 1 of <paramref name="vector"/>.</param>
 /// <param name="high">Receives elements 2 and 3 of <paramref name="vector"/>.</param>
 public static void GetLowHigh(Vector256 <double> vector, out Vector128 <double> low, out Vector128 <double> high)
 {
     high = Vector256.GetUpper(vector);
     low  = Vector256.GetLower(vector);
 }
        // This function implements Algorithm 1 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf.
        // Compute the output value of the field-aware factorization, as the sum of the linear part and the latent part.
        // The linear part is the inner product of linearWeights and featureValues.
        // The latent part is the sum of all intra-field interactions in one field f, for all fields possible.
        //
        // Parameters:
        //   fieldIndices   - field index of each of the `count` active features.
        //   featureIndices - feature index of each active feature; used to index linearWeights and latentWeights.
        //   featureValues  - value of each active feature.
        //   linearWeights  - linear weights, indexed by feature index.
        //   latentWeights  - latent vectors, laid out as [feature][field][latentDim].
        //   latentSum      - scratch/output buffer of fieldCount * fieldCount * latentDim floats; zeroed and then filled here.
        //   response       - receives linear response + latent response.
        //   fieldCount     - number of fields.
        //   latentDim      - length of each latent vector.
        //   count          - number of active features.
        //
        // NOTE(review): every vector loop advances 8 floats at a time and stops when fewer than 8 remain,
        // so latentDim is presumably padded to a multiple of 8 by the caller - confirm.
        public static unsafe void CalculateIntermediateVariables(int *fieldIndices, int *featureIndices, float *featureValues,
                                                                 float *linearWeights, float *latentWeights, float *latentSum, float *response, int fieldCount, int latentDim, int count)
        {
            Contracts.Assert(Avx.IsSupported);

            // The number of all possible fields.
            int    m              = fieldCount;
            int    d              = latentDim;
            int    c              = count;
            int *  pf             = fieldIndices;
            int *  pi             = featureIndices;
            float *px             = featureValues;
            float *pw             = linearWeights;
            float *pv             = latentWeights;
            float *pq             = latentSum;
            float  linearResponse = 0;
            float  latentResponse = 0;

            // Clear the whole m * m * d accumulator before summing into it.
            Unsafe.InitBlock(pq, 0, (uint)(m * m * d * sizeof(float)));

            // y accumulates the inter-field terms, tmp the intra-field terms; both are reduced at the end.
            Vector256 <float> y   = Vector256 <float> .Zero;
            Vector256 <float> tmp = Vector256 <float> .Zero;

            for (int i = 0; i < c; i++)
            {
                int f = pf[i];
                int j = pi[i];
                linearResponse += pw[j] * px[i];

                // Broadcast the feature value (and its square) to all eight lanes.
                Vector256 <float> x  = Avx.BroadcastScalarToVector256(px + i);
                Vector256 <float> xx = Avx.Multiply(x, x);

                // tmp -= <v_j,f, v_j,f> * x * x
                int vBias = j * m * d + f * d;

                // j-th feature's latent vector in the f-th field hidden space.
                float *vjf = pv + vBias;

                for (int k = 0; k + 8 <= d; k += 8)
                {
                    Vector256 <float> vjfBuffer = Avx.LoadVector256(vjf + k);
                    // MultiplyAddNegated is defined elsewhere; per the comment above it presumably yields tmp - (a * b).
                    tmp = MultiplyAddNegated(Avx.Multiply(vjfBuffer, vjfBuffer), xx, tmp);
                }

                for (int fprime = 0; fprime < m; fprime++)
                {
                    vBias = j * m * d + fprime * d;
                    int    qBias    = f * m * d + fprime * d;
                    float *vjfprime = pv + vBias;
                    float *qffprime = pq + qBias;

                    // q_f,f' += v_j,f' * x
                    for (int k = 0; k + 8 <= d; k += 8)
                    {
                        Vector256 <float> vjfprimeBuffer = Avx.LoadVector256(vjfprime + k);
                        Vector256 <float> q = Avx.LoadVector256(qffprime + k);
                        q = MultiplyAdd(vjfprimeBuffer, x, q);
                        Avx.Store(qffprime + k, q);
                    }
                }
            }

            for (int f = 0; f < m; f++)
            {
                // tmp += <q_f,f, q_f,f>
                float *qff = pq + f * m * d + f * d;
                for (int k = 0; k + 8 <= d; k += 8)
                {
                    Vector256 <float> qffBuffer = Avx.LoadVector256(qff + k);

                    // Intra-field interactions.
                    tmp = MultiplyAdd(qffBuffer, qffBuffer, tmp);
                }

                // y += <q_f,f', q_f',f>, f != f'
                // This loop handles inter-field interactions because f != f'.
                for (int fprime = f + 1; fprime < m; fprime++)
                {
                    float *qffprime = pq + f * m * d + fprime * d;
                    float *qfprimef = pq + fprime * m * d + f * d;
                    for (int k = 0; k + 8 <= d; k += 8)
                    {
                        // Inter-field interaction.
                        Vector256 <float> qffprimeBuffer = Avx.LoadVector256(qffprime + k);
                        Vector256 <float> qfprimefBuffer = Avx.LoadVector256(qfprimef + k);
                        y = MultiplyAdd(qffprimeBuffer, qfprimefBuffer, y);
                    }
                }
            }

            // Combine the two accumulators (presumably y += 0.5 * tmp, judging by the name _point5 - confirm),
            // then reduce the eight lanes: add the swapped 128-bit halves, followed by two horizontal adds.
            y   = MultiplyAdd(_point5, tmp, y);
            tmp = Avx.Add(y, Avx.Permute2x128(y, y, 1));
            tmp = Avx.HorizontalAdd(tmp, tmp);
            y   = Avx.HorizontalAdd(tmp, tmp);
            Sse.StoreScalar(&latentResponse, y.GetLower()); // The lowest slot is the response value.
            *response = linearResponse + latentResponse;
        }
示例#26
0
        // Convolves one source line of interleaved float samples with the per-output weights stored in the map.
        //
        //   istart    - start of the input line (read as floats).
        //   tstart/cb - start and byte length of the output region; three floats are written per output step.
        //   mapxstart - map data: for each output step, one int source index followed by smapx * channels weights.
        //   smapx     - number of weighted samples per output step.
        //   smapy     - used only to derive the output stride (smapy * 4 floats between output steps).
        //
        // NOTE(review): the code keeps exactly three accumulators and writes tp[0..2], so `channels`
        // (defined elsewhere) is presumably 3 - confirm. Samples are consumed in groups of four, so smapx is
        // presumably a multiple of 4; any remainder of smapx / 4 is not processed.
        unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy)
        {
            float *tp = (float *)tstart, tpe = (float *)(tstart + cb);
            float *pmapx   = (float *)mapxstart;
            int    kstride = smapx * channels;
            int    tstride = smapy * 4;
            int    vcnt    = smapx / Vector128 <float> .Count;

            while (tp < tpe)
            {
                // Each map entry starts with the index of the first source sample, stored as an int.
                int ix   = *(int *)pmapx++;
                int lcnt = vcnt;

                // ip walks the input samples, mp walks this entry's weights; pmapx skips ahead to the next entry.
                float *ip = (float *)istart + ix * channels;
                float *mp = pmapx;
                pmapx += kstride;

                Vector128 <float> av0, av1, av2;

                if (Avx.IsSupported && lcnt >= 2)
                {
                    Vector256 <float> ax0 = Vector256 <float> .Zero, ax1 = ax0, ax2 = ax0;

                    // AVX path: consume two 4-sample groups (three 256-bit vectors) per iteration.
                    for (; lcnt >= 2; lcnt -= 2)
                    {
                        var iv0 = Avx.LoadVector256(ip);
                        var iv1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
                        var iv2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2);
                        ip += Vector256 <int> .Count * channels;

                        if (Fma.IsSupported)
                        {
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax1 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax1);
                            ax2 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 2), iv2, ax2);
                        }
                        else
                        {
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                            ax1 = Avx.Add(ax1, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                            ax2 = Avx.Add(ax2, Avx.Multiply(iv2, Avx.LoadVector256(mp + Vector256 <float> .Count * 2)));
                        }
                        mp += Vector256 <float> .Count * channels;
                    }

                    // Fold the six 128-bit halves down to the three accumulators used by the SSE loop:
                    // halves that sit at the same position within a 12-float group are added together.
                    av0 = Sse.Add(ax0.GetLower(), ax1.GetUpper());
                    av1 = Sse.Add(ax0.GetUpper(), ax2.GetLower());
                    av2 = Sse.Add(ax1.GetLower(), ax2.GetUpper());
                }
                else
                {
                    av0 = av1 = av2 = Vector128 <float> .Zero;
                }

                // SSE path: handles every group when AVX is not used, or the single group left over after the AVX loop.
                for (; lcnt != 0; lcnt--)
                {
                    var iv0 = Sse.LoadVector128(ip);
                    var iv1 = Sse.LoadVector128(ip + Vector128 <float> .Count);
                    var iv2 = Sse.LoadVector128(ip + Vector128 <float> .Count * 2);
                    ip += Vector128 <float> .Count * channels;

                    if (Fma.IsSupported)
                    {
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0);
                        av1 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count), iv1, av1);
                        av2 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count * 2), iv2, av2);
                    }
                    else
                    {
                        av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp)));
                        av1 = Sse.Add(av1, Sse.Multiply(iv1, Sse.LoadVector128(mp + Vector128 <float> .Count)));
                        av2 = Sse.Add(av2, Sse.Multiply(iv2, Sse.LoadVector128(mp + Vector128 <float> .Count * 2)));
                    }
                    mp += Vector128 <float> .Count * channels;
                }

                // The three accumulators hold 12 interleaved partial sums. The shuffles line up, in elements
                // 0..2 of avs0, the partial sums that still have to be added to av0[0], av1[0] and av2[0].
                var avs0 = Sse.Add(Sse.Add(
                                       Sse.Shuffle(av0, av0, 0b_00_10_01_11),
                                       Sse.Shuffle(av1, av1, 0b_00_01_11_10)),
                                   Sse.Shuffle(av2, av2, 0b_00_11_10_01)
                                   );
                // avs1 brings element 1 of avs0 down to element 0; avs2 brings element 2 down to element 0.
                var avs1 = Sse3.IsSupported ?
                           Sse3.MoveHighAndDuplicate(avs0) :
                           Sse.Shuffle(avs0, avs0, 0b_11_11_01_01);
                var avs2 = Sse.UnpackHigh(avs0, avs0);

                // Write the three totals for this output step and advance to the next one.
                tp[0] = Sse.AddScalar(av0, avs0).ToScalar();
                tp[1] = Sse.AddScalar(av1, avs1).ToScalar();
                tp[2] = Sse.AddScalar(av2, avs2).ToScalar();
                tp   += tstride;
            }
        }