/// <summary>
/// Multiplies two <see cref="RtMatrix"/> values as 4x4 matrices (value1 * value2).
/// </summary>
/// <remarks>
/// Two code paths are visible: an AVX2 path, taken when AVX2 is supported and
/// <c>useIntrinsics</c> is set, and a scalar path that fills a local <c>m</c>
/// element by element.
/// NOTE(review): this listing contains no braces and no return statements, and every
/// intrinsic expression below closes one more parenthesis than it opens, so an enclosing
/// call (presumably a store of the computed row) seems to have been lost. Confirm against
/// the original file before relying on this text.
/// </remarks>
public static unsafe RtMatrix operator *(RtMatrix value1, RtMatrix value2)
            if (Avx2.IsSupported && useIntrinsics)
                // Row 1 of value1. Each Permute4x64 broadcasts one 64-bit element of the row,
                // which is multiplied by the matching row of value2; the four products are summed.
                // Presumably the M** fields are 64-bit values laid out contiguously - TODO confirm.
                var row = Avx.LoadVector256(&value1.M11);
                          Avx.Add(Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0x00), Avx.LoadVector256(&value2.M11)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0x55), Avx.LoadVector256(&value2.M21))),
                                  Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0xAA), Avx.LoadVector256(&value2.M31)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0xFF), Avx.LoadVector256(&value2.M41)))));

                // 0x00 is _MM_SHUFFLE(0,0,0,0), 0x55 is _MM_SHUFFLE(1,1,1,1), etc.
                // TODO: Replace with a method once it's added to the API.

                // Row 2 of value1, same pattern as above.
                row = Avx.LoadVector256(&value1.M21);
                          Avx.Add(Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0x00), Avx.LoadVector256(&value2.M11)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0x55), Avx.LoadVector256(&value2.M21))),
                                  Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0xAA), Avx.LoadVector256(&value2.M31)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0xFF), Avx.LoadVector256(&value2.M41)))));

                // Row 3 of value1.
                row = Avx.LoadVector256(&value1.M31);
                          Avx.Add(Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0x00), Avx.LoadVector256(&value2.M11)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0x55), Avx.LoadVector256(&value2.M21))),
                                  Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0xAA), Avx.LoadVector256(&value2.M31)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0xFF), Avx.LoadVector256(&value2.M41)))));

                // Row 4 of value1.
                row = Avx.LoadVector256(&value1.M41);
                          Avx.Add(Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0x00), Avx.LoadVector256(&value2.M11)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0x55), Avx.LoadVector256(&value2.M21))),
                                  Avx.Add(Avx.Multiply(Avx2.Permute4x64(row, 0xAA), Avx.LoadVector256(&value2.M31)),
                                          Avx.Multiply(Avx2.Permute4x64(row, 0xFF), Avx.LoadVector256(&value2.M41)))));

            // Scalar path: each element of m is the dot product of a row of value1
            // with a column of value2.
            RtMatrix m;

            // First row
            m.M11 = value1.M11 * value2.M11 + value1.M12 * value2.M21 + value1.M13 * value2.M31 + value1.M14 * value2.M41;
            m.M12 = value1.M11 * value2.M12 + value1.M12 * value2.M22 + value1.M13 * value2.M32 + value1.M14 * value2.M42;
            m.M13 = value1.M11 * value2.M13 + value1.M12 * value2.M23 + value1.M13 * value2.M33 + value1.M14 * value2.M43;
            m.M14 = value1.M11 * value2.M14 + value1.M12 * value2.M24 + value1.M13 * value2.M34 + value1.M14 * value2.M44;

            // Second row
            m.M21 = value1.M21 * value2.M11 + value1.M22 * value2.M21 + value1.M23 * value2.M31 + value1.M24 * value2.M41;
            m.M22 = value1.M21 * value2.M12 + value1.M22 * value2.M22 + value1.M23 * value2.M32 + value1.M24 * value2.M42;
            m.M23 = value1.M21 * value2.M13 + value1.M22 * value2.M23 + value1.M23 * value2.M33 + value1.M24 * value2.M43;
            m.M24 = value1.M21 * value2.M14 + value1.M22 * value2.M24 + value1.M23 * value2.M34 + value1.M24 * value2.M44;

            // Third row
            m.M31 = value1.M31 * value2.M11 + value1.M32 * value2.M21 + value1.M33 * value2.M31 + value1.M34 * value2.M41;
            m.M32 = value1.M31 * value2.M12 + value1.M32 * value2.M22 + value1.M33 * value2.M32 + value1.M34 * value2.M42;
            m.M33 = value1.M31 * value2.M13 + value1.M32 * value2.M23 + value1.M33 * value2.M33 + value1.M34 * value2.M43;
            m.M34 = value1.M31 * value2.M14 + value1.M32 * value2.M24 + value1.M33 * value2.M34 + value1.M34 * value2.M44;

            // Fourth row
            m.M41 = value1.M41 * value2.M11 + value1.M42 * value2.M21 + value1.M43 * value2.M31 + value1.M44 * value2.M41;
            m.M42 = value1.M41 * value2.M12 + value1.M42 * value2.M22 + value1.M43 * value2.M32 + value1.M44 * value2.M42;
            m.M43 = value1.M41 * value2.M13 + value1.M42 * value2.M23 + value1.M43 * value2.M33 + value1.M44 * value2.M43;
            m.M44 = value1.M41 * value2.M14 + value1.M42 * value2.M24 + value1.M43 * value2.M34 + value1.M44 * value2.M44;

Beispiel #2
 /// <summary>
 /// Adds two 256-bit vectors of doubles lane by lane by delegating to <c>Avx.Add</c>.
 /// </summary>
 /// <param name="left">First addend.</param>
 /// <param name="right">Second addend.</param>
 /// <returns>The element-wise sum of <paramref name="left"/> and <paramref name="right"/>.</returns>
 public static Vector256<double> op_Addition(Vector256<double> left, Vector256<double> right)
 {
     Vector256<double> sum = Avx.Add(left, right);
     return sum;
 }
        /// <summary>
        /// Returns a new <c>VectorArg256</c> whose vector is <c>_rgb</c> with
        /// <paramref name="f"/> added to every lane.
        /// </summary>
        /// <param name="f">Scalar offset that is broadcast to all eight float lanes.</param>
        /// <returns>A new <c>VectorArg256</c> wrapping the lane-wise sum.</returns>
        public VectorArg256 Change(float f)
        {
            // Vector256.Create(float) broadcasts the scalar to every lane. It replaces
            // Avx.SetAllVector256, which only existed in the pre-release intrinsics API.
            Vector256<float> offset = Vector256.Create(f);

            return new VectorArg256(Avx.Add(offset, _rgb));
        }
Beispiel #4
        // Generic math

        /// <summary>
        /// Adds <paramref name="lhs"/> and <paramref name="rhs"/> by delegating to <c>Avx.Add</c>.
        /// </summary>
        /// <param name="lhs">Left operand.</param>
        /// <param name="rhs">Right operand.</param>
        /// <returns>The result of <c>Avx.Add(lhs, rhs)</c>.</returns>
        public static f32 Add(f32 lhs, f32 rhs)
        {
            return Avx.Add(lhs, rhs);
        }
Beispiel #5
        /// <summary>
        /// Computes a growth effective age for each of the four lanes of
        /// <paramref name="treeHeight"/> and outputs the potential height growth over
        /// <paramref name="timeStepInYears"/>.
        /// </summary>
        /// <param name="site">Supplies the coefficients B1, B2, X1, X2toB2 and SiteIndexFromGround read below.</param>
        /// <param name="timeStepInYears">Time step added to the growth effective age when projecting height.</param>
        /// <param name="treeHeight">Current heights, one per lane.</param>
        /// <param name="potentialHeightGrowth">Receives projected height minus <paramref name="treeHeight"/>.</param>
        /// <remarks>
        /// NOTE(review): no braces or return statement are present in this listing; presumably
        /// growthEffectiveAge is returned. Confirm against the original file.
        /// </remarks>
        public static Vector128 <float> GetBrucePsmeAbgrGrowthEffectiveAge(SiteConstants site, float timeStepInYears, Vector128 <float> treeHeight, out Vector128 <float> potentialHeightGrowth)
            // Broadcast the scalar site coefficients to all four lanes.
            Vector128 <float> B1     = AvxExtensions.BroadcastScalarToVector128(site.B1);
            Vector128 <float> B2     = AvxExtensions.BroadcastScalarToVector128(site.B2);
            Vector128 <float> X2toB2 = AvxExtensions.BroadcastScalarToVector128(site.X2toB2);
            Vector128 <float> siteIndexFromGround128 = AvxExtensions.BroadcastScalarToVector128(site.SiteIndexFromGround);
            Vector128 <float> X1 = AvxExtensions.BroadcastScalarToVector128(site.X1);

            // XX1 = ln(treeHeight / siteIndexFromGround) / B1 + X2toB2
            Vector128 <float> XX1                = Avx.Add(Avx.Divide(MathV.Ln(Avx.Divide(treeHeight, siteIndexFromGround128)), B1), X2toB2);
            // Mask of lanes where XX1 <= 0 (the comparison is less-than-or-equal even though the name says "less than").
            Vector128 <float> xx1lessThanZero    = Avx.CompareLessThanOrEqual(XX1, Vector128 <float> .Zero);
            // growthEffectiveAge = XX1^(1/B2) - X1.
            // NOTE(review): Reciprocal maps to the approximate-reciprocal instruction, so 1/B2 is not exact - confirm this precision is acceptable.
            Vector128 <float> growthEffectiveAge = Avx.Subtract(MathV.Pow(XX1, Avx.Reciprocal(B2)), X1);

            // Lanes selected by the mask are replaced with 500.
            growthEffectiveAge = Avx.BlendVariable(growthEffectiveAge, AvxExtensions.BroadcastScalarToVector128(500.0F), xx1lessThanZero);

            // potentialHeight = siteIndexFromGround * exp(B1 * ((growthEffectiveAge + timeStepInYears + X1)^B2 - X2toB2))
            Vector128 <float> timeStepInYearsPlusX1 = AvxExtensions.BroadcastScalarToVector128(timeStepInYears + site.X1);
            Vector128 <float> potentialHeightPower  = Avx.Multiply(B1, Avx.Subtract(MathV.Pow(Avx.Add(growthEffectiveAge, timeStepInYearsPlusX1), B2), X2toB2));
            Vector128 <float> potentialHeight       = Avx.Multiply(siteIndexFromGround128, MathV.Exp(potentialHeightPower));

            potentialHeightGrowth = Avx.Subtract(potentialHeight, treeHeight);

        /// <summary>
        /// Computes windowed averages of <paramref name="values"/> with a window of
        /// <c>2 * halfWindow + 1</c> elements, using pinned pointers and 256-bit vector helpers.
        /// </summary>
        /// <param name="values">Input samples.</param>
        /// <param name="halfWindow">Half-width of the window; the full window is <c>2 * halfWindow + 1</c>.</param>
        /// <remarks>
        /// The visible scalar tails show the scheme: a first pass writes
        /// <c>(*valueWindowSize - *valueCurrent) / windowSize</c> into the result, the first
        /// result element is set to the mean of the first window, and a second pass adds each
        /// element's predecessor onto it.
        /// NOTE(review): this listing is missing braces, pointer increments, return statements,
        /// and parts of several expressions (see inline notes). Confirm against the original file.
        /// </remarks>
        public unsafe override double[] Applay(double[] values, int halfWindow)
            var windowSize = 2 * halfWindow + 1;
            var resultSize = values.Length - windowSize + 1;

            // NOTE(review): the body of this check is not present in this listing.
            if (resultSize == 0)

            var a   = new double[resultSize];
            var sum = 0d;

            fixed(double *valueStart = values, aStart = a)
                var valueCurrent       = valueStart;
                var valueEndwindowSize = valueCurrent + windowSize;

                // Sum of the first window. NOTE(review): no pointer increment is visible here.
                while (valueCurrent < valueEndwindowSize)
                    sum += *valueCurrent;

                var aCurrent     = aStart + 1;
                var aEnd         = aStart + resultSize;
                // (resultSize - 1) rounded down to a multiple of 16.
                var aUnrolledEnd = aStart + (((resultSize - 1) >> 4) << 4);

                valueCurrent = valueStart;

                var valueWindowSize = valueStart + windowSize;
                var vWindowSize     = Vector256.Create((double)windowSize);

                // The vectors below carry addresses (as ulong) offset by 4, 8 and 12 doubles.
                // NOTE(review): Vector256.Create has no three-argument overload, so a first
                // element appears to be missing from each of these calls.
                var vCurrent = Vector256.Create(
                    (ulong)aCurrent + 4 * sizeof(double),
                    (ulong)aCurrent + 8 * sizeof(double),
                    (ulong)aCurrent + 12 * sizeof(double));

                var vValueCurrent = Vector256.Create(
                    (ulong)valueCurrent + 4 * sizeof(double),
                    (ulong)valueCurrent + 8 * sizeof(double),
                    (ulong)valueCurrent + 12 * sizeof(double));

                var vValueWindowSize = Vector256.Create(
                    (ulong)valueWindowSize + 4 * sizeof(double),
                    (ulong)valueWindowSize + 8 * sizeof(double),
                    (ulong)valueWindowSize + 12 * sizeof(double));

                // Per-iteration advance of 16 doubles, expressed in bytes.
                var vShiftIndex1 = Vector256.Create(16ul * sizeof(double));

                // First pass, unrolled four vectors at a time.
                // NOTE(review): the calls that wrap the loads in each region (presumably a
                // subtract, a divide and a store) are cut off in this listing.
                while (aCurrent < aUnrolledEnd)
                    #region  1

                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(0)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(0))),


                    #region  2

                        (double *)vCurrent.GetElement(1),
                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(1)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(1))),


                    #region  3

                        (double *)vCurrent.GetElement(2),
                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(2)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(2))),


                    #region  4

                        (double *)vCurrent.GetElement(3),
                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(3)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(3))),


                    // NOTE(review): the addresses are advanced with a floating-point add on
                    // reinterpreted bits (AsDouble / Avx.Add / AsUInt64), not an integer add.
                    // That only equals integer addition while the bit patterns stay below 2^52
                    // (subnormal doubles) and denormals are not flushed to zero - confirm intent.
                    vCurrent         = Avx.Add(vCurrent.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();
                    vValueCurrent    = Avx.Add(vValueCurrent.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();
                    vValueWindowSize = Avx.Add(vValueWindowSize.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();

                    aCurrent = (double *)vCurrent.GetElement(0);

                valueWindowSize = (double *)vValueWindowSize.GetElement(0);
                valueCurrent    = (double *)vValueCurrent.GetElement(0);

                // Scalar tail of the first pass. NOTE(review): no pointer increments are visible.
                while (aCurrent < aEnd)
                    *aCurrent = (*valueWindowSize - *valueCurrent) / windowSize;

                var aPrev = aStart;
                aCurrent = aStart + 1;
                aEnd     = aStart + resultSize;

                // The first output element is the mean of the first window.
                *aPrev = sum / windowSize;

                // (resultSize - 1) rounded down to a multiple of 4.
                aUnrolledEnd = aStart + (((resultSize - 1) >> 2) << 2);

                // NOTE(review): as above, these Create calls show only three of the elements.
                vCurrent = Vector256.Create(
                    (ulong)aCurrent + sizeof(double),
                    (ulong)aCurrent + 2 * sizeof(double),
                    (ulong)aCurrent + 3 * sizeof(double));

                var vPrev = Vector256.Create(
                    (ulong)aPrev + sizeof(double),
                    (ulong)aPrev + 2 * sizeof(double),
                    (ulong)aPrev + 3 * sizeof(double));

                // Per-iteration advance of 4 doubles, expressed in bytes.
                var vShiftIndex = Vector256.Create(4ul * sizeof(double));

                // Second pass: add each element's predecessor onto it, four elements per iteration.
                while (aCurrent < aUnrolledEnd)
                    #region  1

                    *aCurrent += *(double *)vPrev.GetElement(0);


                    #region  2

                    *(double *)vCurrent.GetElement(1) += *(double *)vPrev.GetElement(1);


                    #region  3

                    *(double *)vCurrent.GetElement(2) += *(double *)vPrev.GetElement(2);


                    #region  4

                    *(double *)vCurrent.GetElement(3) += *(double *)vPrev.GetElement(3);


                    // Same reinterpreted floating-point address advance as in the first pass.
                    vCurrent = Avx.Add(vCurrent.AsDouble(), vShiftIndex.AsDouble()).AsUInt64();
                    vPrev    = Avx.Add(vPrev.AsDouble(), vShiftIndex.AsDouble()).AsUInt64();

                    aCurrent = (double *)vCurrent.GetElement(0);

                aPrev = (double *)vPrev.GetElement(0);

                // Scalar tail of the second pass. NOTE(review): no pointer increments are visible.
                while (aCurrent < aEnd)
                    *aCurrent += *aPrev;

        /// <summary>
        /// For output positions from <paramref name="ox"/> up to <c>ox + ow</c>, accumulates
        /// products of floats read from the <paramref name="tstart"/> buffer and from
        /// <paramref name="pmapy"/>, and writes the horizontal sum as one float to
        /// <paramref name="ostart"/>.
        /// </summary>
        /// <param name="tstart">Start of the float input buffer; row <c>ox</c> begins at <c>ox * smapy</c> floats.</param>
        /// <param name="ostart">Start of the float output buffer.</param>
        /// <param name="ox">First output position.</param>
        /// <param name="ow">Number of output positions.</param>
        /// <param name="pmapy">Start of the float weights multiplied against the input.</param>
        /// <param name="smapy">Number of floats per row; also determines the vector count.</param>
        /// <remarks>
        /// NOTE(review): braces and <c>else</c> keywords are absent from this listing, so the FMA
        /// and Multiply+Add forms appear back to back, and no increment of <c>ox</c> is visible.
        /// Confirm against the original file.
        /// </remarks>
        unsafe void IConvolver.WriteDestLine(byte *tstart, byte *ostart, int ox, int ow, byte *pmapy, int smapy)
            float *op = (float *)ostart;
            int    xc = ox + ow, tstride = smapy;
            // Number of 128-bit float vectors covering smapy floats.
            int    vcnt = smapy / Vector128 <float> .Count;

            while (ox < xc)
                int lcnt = vcnt;

                float *tp = (float *)tstart + ox * tstride;
                float *mp = (float *)pmapy;

                Vector128 <float> av0;

                // AVX path: lcnt counts 128-bit units, so one 256-bit load consumes two of them.
                if (Avx.IsSupported && lcnt >= 2)
                    var ax0 = Vector256 <float> .Zero;

                    // Two 256-bit loads (four 128-bit units) per iteration.
                    for (; lcnt >= 4; lcnt -= 4)
                        var iv0 = Avx.LoadVector256(tp);
                        var iv1 = Avx.LoadVector256(tp + Vector256 <float> .Count);
                        tp += Vector256 <float> .Count * 2;

                        // Fused multiply-add when available; the Multiply+Add lines are presumably the fallback branch.
                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                        mp += Vector256 <float> .Count * 2;

                    // One remaining 256-bit load.
                    if (lcnt >= 2)
                        lcnt -= 2;

                        var iv0 = Avx.LoadVector256(tp);
                        tp += Vector256 <float> .Count;

                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));

                        mp += Vector256 <float> .Count;

                    // Fold the 256-bit accumulator into 128 bits; the Zero assignment is presumably the non-AVX branch.
                    av0 = Sse.Add(ax0.GetLower(), ax0.GetUpper());
                    av0 = Vector128 <float> .Zero;

                // 128-bit tail for whatever units remain.
                for (; lcnt != 0; lcnt--)
                    var iv0 = Sse.LoadVector128(tp);
                    tp += Vector128 <float> .Count;

                    if (Fma.IsSupported)
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0);
                        av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp)));

                    mp += Vector128 <float> .Count;

                // Sum the four accumulator lanes into one output float.
                *op++ = av0.HorizontalAdd();
Beispiel #8
        /// <summary>
        /// Blends the bytes of <paramref name="nextPicture"/> into
        /// <paramref name="currentPicture"/>. Per the scalar tail, the blend factor is
        /// <c>Attack</c> when the existing value is lower than the incoming one and
        /// <c>Decay</c> otherwise, and the new value is
        /// <c>incoming * factor + existing * (1 - factor)</c>.
        /// </summary>
        /// <param name="currentPicture">Image whose data is read and overwritten.</param>
        /// <param name="nextPicture">Image supplying the incoming values.</param>
        /// <remarks>
        /// NOTE(review): braces are missing and several call argument lists in the vector loop
        /// are cut off in this listing. Confirm against the original file.
        /// </remarks>
        public unsafe void Process(MutableByteImage currentPicture, MutableByteImage nextPicture)
            float MaxFactor = 1;

            // Attack and Decay replicated into arrays so they can be pinned and read through pointers.
            float[] attackAr = new float[] { Attack, Attack, Attack, Attack };
            float[] decayAr  = new float[] { Decay, Decay, Decay, Decay };

            int length = nextPicture.Data.Length;

            // Not referenced in the lines visible here.
            float *MaxFactorPtr = &MaxFactor;

            fixed(float *AttackPtr = attackAr)
            fixed(float *DecayPtr     = decayAr)
            fixed(byte *currentPicPtr = currentPicture.Data)
            fixed(byte *nextPicPtr    = nextPicture.Data)
                byte *currentPxPtr = currentPicPtr;
                byte *nextPxPtr    = nextPicPtr;

                // Number of bytes left over after grouping by four.
                int remainingLength = length % 4;

                // Vector loop, four bytes per iteration.
                // NOTE(review): as written the bound is i < length, so when length is not a
                // multiple of 4 the last iteration looks like it reaches past length - confirm.
                for (int i = 0; i < length; i += 4)
                    var currentColor     = *nextPxPtr;
                    var workingDataColor = *currentPxPtr;

                    var currentColorPtr     = nextPxPtr;
                    var workingDataColorPtr = currentPxPtr;

                    // NOTE(review): the arguments of this call are cut off in this listing.
                    var cmpResult = Avx.ConvertToVector128Single(

                    // Selects Attack where cmpResult bits are set and Decay where they are clear,
                    // then combines the two halves.
                    var pixelFactor = Avx.Add(
                        Avx.And(cmpResult, Avx.BroadcastScalarToVector128(AttackPtr)),
                        Avx.AndNot(cmpResult, Avx.BroadcastScalarToVector128(DecayPtr))

                    // NOTE(review): the arguments of this call are cut off in this listing.
                    var result = Avx.Add(

                    // TODO improve Store
                    // NOTE(review): all four stores target *currentPxPtr and no advance of
                    // currentPxPtr is visible - confirm against the original.
                    *currentPxPtr = (byte)Avx.Extract(result, 0);
                    *currentPxPtr = (byte)Avx.Extract(result, 1);
                    *currentPxPtr = (byte)Avx.Extract(result, 2);
                    *currentPxPtr = (byte)Avx.Extract(result, 3);

                    nextPxPtr += 4;

                // Scalar tail for the leftover bytes.
                for (int i = 0; i < remainingLength; i++)
                    var currentColor     = *nextPxPtr;
                    var workingDataColor = *currentPxPtr;

                    var newPixelFactor = workingDataColor < currentColor ? Attack : Decay;

                    var newPixelValue = (byte)((currentColor * newPixelFactor) + (workingDataColor * (1 - newPixelFactor)));

                    *currentPxPtr = newPixelValue;
        /// <summary>
        /// Walks the map at <paramref name="mapxstart"/>: each entry is an int start index
        /// followed by <paramref name="smapx"/> float weights. For each entry the weights are
        /// multiplied with floats read from <paramref name="istart"/> at that index, the
        /// products are summed, and the sum is written to the <paramref name="tstart"/> buffer,
        /// which is then advanced by <paramref name="smapy"/> floats.
        /// </summary>
        /// <param name="istart">Start of the float input line.</param>
        /// <param name="tstart">Start of the float output buffer.</param>
        /// <param name="cb">Byte length of the output range; the loop ends at <c>tstart + cb</c>.</param>
        /// <param name="mapxstart">Start of the index/weight map.</param>
        /// <param name="smapx">Number of float weights per map entry.</param>
        /// <param name="smapy">Stride, in floats, between successive outputs.</param>
        /// <remarks>
        /// NOTE(review): braces and <c>else</c> keywords are absent from this listing, so the FMA
        /// and Multiply+Add forms appear back to back. Confirm against the original file.
        /// </remarks>
        unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy)
            float *tp = (float *)tstart, tpe = (float *)(tstart + cb);
            float *pmapx   = (float *)mapxstart;
            int    kstride = smapx;
            int    tstride = smapy;
            // Number of 128-bit float vectors covering smapx floats.
            int    vcnt    = smapx / Vector128 <float> .Count;

            while (tp < tpe)
                // The map entry starts with an int index, read through the float pointer.
                int ix   = *(int *)pmapx++;
                int lcnt = vcnt;

                float *ip = (float *)istart + ix;
                float *mp = pmapx;
                pmapx += kstride;

                Vector128 <float> av0;

                // AVX path: lcnt counts 128-bit units, so one 256-bit load consumes two of them.
                if (Avx.IsSupported && lcnt >= 2)
                    var ax0 = Vector256 <float> .Zero;

                    // Four 256-bit loads (eight 128-bit units) per iteration.
                    for (; lcnt >= 8; lcnt -= 8)
                        var iv0 = Avx.LoadVector256(ip);
                        var iv1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
                        var iv2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2);
                        var iv3 = Avx.LoadVector256(ip + Vector256 <float> .Count * 3);
                        ip += Vector256 <float> .Count * 4;

                        // Fused multiply-add when available; the Multiply+Add lines are presumably the fallback branch.
                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 2), iv2, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 3), iv3, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv2, Avx.LoadVector256(mp + Vector256 <float> .Count * 2)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv3, Avx.LoadVector256(mp + Vector256 <float> .Count * 3)));
                        mp += Vector256 <float> .Count * 4;

                    // Remainder handled with three, two or one 256-bit loads.
                    if (lcnt >= 6)
                        lcnt -= 6;

                        var iv0 = Avx.LoadVector256(ip);
                        var iv1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
                        var iv2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2);
                        ip += Vector256 <float> .Count * 3;

                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 2), iv2, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv2, Avx.LoadVector256(mp + Vector256 <float> .Count * 2)));
                        mp += Vector256 <float> .Count * 3;
                    else if (lcnt >= 4)
                        lcnt -= 4;

                        var iv0 = Avx.LoadVector256(ip);
                        var iv1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
                        ip += Vector256 <float> .Count * 2;

                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                        mp += Vector256 <float> .Count * 2;
                    else if (lcnt >= 2)
                        lcnt -= 2;

                        var iv0 = Avx.LoadVector256(ip);
                        ip += Vector256 <float> .Count;

                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));

                        mp += Vector256 <float> .Count;

                    // Fold the 256-bit accumulator into 128 bits; the Zero assignment is presumably the non-AVX branch.
                    av0 = Sse.Add(ax0.GetLower(), ax0.GetUpper());
                    av0 = Vector128 <float> .Zero;

                // 128-bit tail for whatever units remain.
                for (; lcnt != 0; lcnt--)
                    var iv0 = Sse.LoadVector128(ip);
                    ip += Vector128 <float> .Count;

                    if (Fma.IsSupported)
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0);
                        av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp)));

                    mp += Vector128 <float> .Count;

                // Sum the four accumulator lanes into one output float, then step to the next output slot.
                tp[0] = av0.HorizontalAdd();
                tp   += tstride;
        /// <summary>
        /// Multi-channel variant: walks the map at <paramref name="mapxstart"/> (an int start
        /// index followed by weights per entry), accumulates weight-times-input products into a
        /// four-lane accumulator, and writes the four lanes to <c>tp[0..3]</c> before advancing
        /// the output by <c>smapy * channels</c> floats.
        /// </summary>
        /// <param name="istart">Start of the float input line; entry <c>ix</c> begins at <c>ix * channels</c> floats.</param>
        /// <param name="tstart">Start of the float output buffer.</param>
        /// <param name="cb">Byte length of the output range; the loop ends at <c>tstart + cb</c>.</param>
        /// <param name="mapxstart">Start of the index/weight map.</param>
        /// <param name="smapx">Per-entry weight count before multiplying by <c>channels</c>.</param>
        /// <param name="smapy">Output stride before multiplying by <c>channels</c>.</param>
        /// <remarks>
        /// NOTE(review): braces and <c>else</c> keywords are absent from this listing, so the FMA
        /// and Multiply+Add forms appear back to back. The loads are hard-coded as four vectors
        /// per step while the pointer advances use <c>channels</c>, which is defined outside this
        /// block; presumably <c>channels</c> is 4 - confirm.
        /// </remarks>
        unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy)
            float *tp = (float *)tstart, tpe = (float *)(tstart + cb);
            float *pmapx   = (float *)mapxstart;
            int    kstride = smapx * channels;
            int    tstride = smapy * channels;
            int    vcnt    = smapx / Vector128 <float> .Count;

            while (tp < tpe)
                // The map entry starts with an int index, read through the float pointer.
                int ix   = *(int *)pmapx++;
                int lcnt = vcnt;

                float *ip = (float *)istart + ix * channels;
                float *mp = pmapx;
                pmapx += kstride;

                Vector128 <float> av0;

                if (Avx.IsSupported && lcnt >= 2)
                    var ax0 = Vector256 <float> .Zero;

                    // Four 256-bit loads per iteration; lcnt drops by two each time.
                    for (; lcnt >= 2; lcnt -= 2)
                        var iv0 = Avx.LoadVector256(ip);
                        var iv1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
                        var iv2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2);
                        var iv3 = Avx.LoadVector256(ip + Vector256 <float> .Count * 3);
                        // Vector256<int>.Count equals Vector256<float>.Count (both are 8 lanes).
                        ip += Vector256 <int> .Count * channels;

                        // Fused multiply-add when available; the Multiply+Add lines are presumably the fallback branch.
                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 2), iv2, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 3), iv3, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv2, Avx.LoadVector256(mp + Vector256 <float> .Count * 2)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv3, Avx.LoadVector256(mp + Vector256 <float> .Count * 3)));
                        mp += Vector256 <float> .Count * channels;

                    // Fold the 256-bit accumulator into 128 bits; the Zero assignment is presumably the non-AVX branch.
                    av0 = Sse.Add(ax0.GetLower(), ax0.GetUpper());
                    av0 = Vector128 <float> .Zero;

                // 128-bit tail: four 128-bit loads per remaining unit.
                for (; lcnt != 0; lcnt--)
                    var iv0 = Sse.LoadVector128(ip);
                    var iv1 = Sse.LoadVector128(ip + Vector128 <float> .Count);
                    var iv2 = Sse.LoadVector128(ip + Vector128 <float> .Count * 2);
                    var iv3 = Sse.LoadVector128(ip + Vector128 <float> .Count * 3);
                    ip += Vector128 <float> .Count * channels;

                    if (Fma.IsSupported)
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0);
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count), iv1, av0);
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count * 2), iv2, av0);
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count * 3), iv3, av0);
                        av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp)));
                        av0 = Sse.Add(av0, Sse.Multiply(iv1, Sse.LoadVector128(mp + Vector128 <float> .Count)));
                        av0 = Sse.Add(av0, Sse.Multiply(iv2, Sse.LoadVector128(mp + Vector128 <float> .Count * 2)));
                        av0 = Sse.Add(av0, Sse.Multiply(iv3, Sse.LoadVector128(mp + Vector128 <float> .Count * 3)));
                    mp += Vector128 <float> .Count * channels;

                // Write accumulator lanes 0..3: each shuffle/unpack moves the wanted lane into
                // position 0 so ToScalar can read it (lane 1, lane 2, lane 3 respectively).
                tp[0] = av0.ToScalar();
                tp[1] = Sse.Shuffle(av0, av0, 0b_11_10_01_01).ToScalar();
                tp[2] = Sse.UnpackHigh(av0, av0).ToScalar();
                tp[3] = Sse.Shuffle(av0, av0, 0b_11_10_01_11).ToScalar();
                tp   += tstride;
        /// <summary>
        /// Multi-channel variant: for output positions from <paramref name="ox"/> up to
        /// <c>ox + ow</c>, accumulates products of floats from the <paramref name="tstart"/>
        /// buffer and weights from <paramref name="pmapy"/> into a four-lane accumulator and
        /// writes the four lanes to <c>op[0..3]</c>, advancing the output by <c>channels</c> floats.
        /// </summary>
        /// <param name="tstart">Start of the float input buffer; row <c>ox</c> begins at <c>ox * smapy * channels</c> floats.</param>
        /// <param name="ostart">Start of the float output buffer.</param>
        /// <param name="ox">First output position.</param>
        /// <param name="ow">Number of output positions.</param>
        /// <param name="pmapy">Start of the float weights multiplied against the input.</param>
        /// <param name="smapy">Per-row weight count before multiplying by <c>channels</c>.</param>
        /// <remarks>
        /// NOTE(review): braces and <c>else</c> keywords are absent from this listing, and no
        /// increment of <c>ox</c> is visible. The loads are hard-coded as four vectors per step
        /// while pointer advances use <c>channels</c>, which is defined outside this block;
        /// presumably <c>channels</c> is 4 - confirm.
        /// </remarks>
        unsafe void IConvolver.WriteDestLine(byte *tstart, byte *ostart, int ox, int ow, byte *pmapy, int smapy)
            float *op = (float *)ostart;
            int    xc = ox + ow, tstride = smapy * channels;
            int    vcnt = smapy / Vector128 <float> .Count;

            while (ox < xc)
                int lcnt = vcnt;

                float *tp = (float *)tstart + ox * tstride;
                float *mp = (float *)pmapy;

                Vector128 <float> av0;

                if (Avx.IsSupported && lcnt >= 2)
                    var ax0 = Vector256 <float> .Zero;

                    // Four 256-bit loads per iteration; lcnt drops by two each time.
                    for (; lcnt >= 2; lcnt -= 2)
                        var iv0 = Avx.LoadVector256(tp);
                        var iv1 = Avx.LoadVector256(tp + Vector256 <float> .Count);
                        var iv2 = Avx.LoadVector256(tp + Vector256 <float> .Count * 2);
                        var iv3 = Avx.LoadVector256(tp + Vector256 <float> .Count * 3);
                        // Vector256<int>.Count equals Vector256<float>.Count (both are 8 lanes).
                        tp += Vector256 <int> .Count * channels;

                        // Fused multiply-add when available; the Multiply+Add lines are presumably the fallback branch.
                        if (Fma.IsSupported)
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 2), iv2, ax0);
                            ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 3), iv3, ax0);
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv2, Avx.LoadVector256(mp + Vector256 <float> .Count * 2)));
                            ax0 = Avx.Add(ax0, Avx.Multiply(iv3, Avx.LoadVector256(mp + Vector256 <float> .Count * 3)));
                        mp += Vector256 <float> .Count * channels;

                    // Fold the 256-bit accumulator into 128 bits; the Zero assignment is presumably the non-AVX branch.
                    av0 = Sse.Add(ax0.GetLower(), ax0.GetUpper());
                    av0 = Vector128 <float> .Zero;

                // 128-bit tail: four 128-bit loads per remaining unit.
                for (; lcnt != 0; lcnt--)
                    var iv0 = Sse.LoadVector128(tp);
                    var iv1 = Sse.LoadVector128(tp + Vector128 <float> .Count);
                    var iv2 = Sse.LoadVector128(tp + Vector128 <float> .Count * 2);
                    var iv3 = Sse.LoadVector128(tp + Vector128 <float> .Count * 3);
                    tp += Vector128 <float> .Count * channels;

                    if (Fma.IsSupported)
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0);
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count), iv1, av0);
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count * 2), iv2, av0);
                        av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count * 3), iv3, av0);
                        av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp)));
                        av0 = Sse.Add(av0, Sse.Multiply(iv1, Sse.LoadVector128(mp + Vector128 <float> .Count)));
                        av0 = Sse.Add(av0, Sse.Multiply(iv2, Sse.LoadVector128(mp + Vector128 <float> .Count * 2)));
                        av0 = Sse.Add(av0, Sse.Multiply(iv3, Sse.LoadVector128(mp + Vector128 <float> .Count * 3)));
                    mp += Vector128 <float> .Count * channels;

                // Write accumulator lanes 0..3: each shuffle/unpack moves the wanted lane into
                // position 0 so ToScalar can read it (lane 1, lane 2, lane 3 respectively).
                op[0] = av0.ToScalar();
                op[1] = Sse.Shuffle(av0, av0, 0b_11_10_01_01).ToScalar();
                op[2] = Sse.UnpackHigh(av0, av0).ToScalar();
                op[3] = Sse.Shuffle(av0, av0, 0b_11_10_01_11).ToScalar();
                op   += channels;
Beispiel #12
        public static unsafe float GetScribnerBoardFeetPerAcre(Trees trees)
            // for now, assume all trees are of the same species
            if (trees.Species != FiaCode.PseudotsugaMenziesii)
                throw new NotSupportedException();
            if (trees.Units != Units.English)
                throw new NotSupportedException();

            // Douglas-fir
            #if DEBUG
            Vector128 <float> v6p8 = AvxExtensions.BroadcastScalarToVector128(6.8F);
            Vector128 <float> v10k = AvxExtensions.BroadcastScalarToVector128(10.0F * 1000.0F);

            // constants
            Vector128 <float> forestersEnglish = AvxExtensions.BroadcastScalarToVector128(Constant.ForestersEnglish);
            Vector128 <float> one = AvxExtensions.BroadcastScalarToVector128(1.0F);
            Vector128 <float> six = AvxExtensions.BroadcastScalarToVector128(6.0F);

            Vector128 <float> vm3p21809   = AvxExtensions.BroadcastScalarToVector128(-3.21809F); // b4
            Vector128 <float> v0p04948    = AvxExtensions.BroadcastScalarToVector128(0.04948F);
            Vector128 <float> vm0p15664   = AvxExtensions.BroadcastScalarToVector128(-0.15664F);
            Vector128 <float> v2p02132    = AvxExtensions.BroadcastScalarToVector128(2.02132F);
            Vector128 <float> v1p63408    = AvxExtensions.BroadcastScalarToVector128(1.63408F);
            Vector128 <float> vm0p16184   = AvxExtensions.BroadcastScalarToVector128(-0.16184F);
            Vector128 <float> v1p033      = AvxExtensions.BroadcastScalarToVector128(1.033F);
            Vector128 <float> v1p382937   = AvxExtensions.BroadcastScalarToVector128(1.382937F);
            Vector128 <float> vm0p4015292 = AvxExtensions.BroadcastScalarToVector128(-0.4015292F);
            Vector128 <float> v0p087266   = AvxExtensions.BroadcastScalarToVector128(0.087266F);
            Vector128 <float> vm0p174533  = AvxExtensions.BroadcastScalarToVector128(-0.174533F);

            Vector128 <float> vm0p6896598794 = AvxExtensions.BroadcastScalarToVector128(-0.6896598794F); // rc6-rs632
            Vector128 <float> v0p993         = AvxExtensions.BroadcastScalarToVector128(0.993F);
            Vector128 <float> v0p174439      = AvxExtensions.BroadcastScalarToVector128(0.174439F);
            Vector128 <float> v0p117594      = AvxExtensions.BroadcastScalarToVector128(0.117594F);
            Vector128 <float> vm8p210585     = AvxExtensions.BroadcastScalarToVector128(-8.210585F);
            Vector128 <float> v0p236693      = AvxExtensions.BroadcastScalarToVector128(0.236693F);
            Vector128 <float> v0p00001345    = AvxExtensions.BroadcastScalarToVector128(0.00001345F);
            Vector128 <float> v0p00001937    = AvxExtensions.BroadcastScalarToVector128(0.00001937F);
            Vector128 <float> v1p001491      = AvxExtensions.BroadcastScalarToVector128(1.001491F);
            Vector128 <float> vm6p924097     = AvxExtensions.BroadcastScalarToVector128(-6.924097F);
            Vector128 <float> v0p912733      = AvxExtensions.BroadcastScalarToVector128(0.912733F);
            Vector128 <float> v0p00001351    = AvxExtensions.BroadcastScalarToVector128(0.00001351F);

            fixed(float *dbh = &trees.Dbh[0], expansionFactors = &trees.LiveExpansionFactor[0], height = &trees.Height[0])
                Vector128 <float> standBoardFeetPerAcre = Vector128 <float> .Zero;

                for (int treeIndex = 0; treeIndex < trees.Count; treeIndex += Constant.Simd128x4.Width)
                    Vector128 <float> dbhInInches  = Avx.LoadVector128(dbh + treeIndex);
                    Vector128 <float> heightInFeet = Avx.LoadVector128(height + treeIndex);

                    Vector128 <float> logDbhInInches  = MathV.Log10(dbhInInches);
                    Vector128 <float> logHeightInFeet = MathV.Log10(heightInFeet);
                    // FiaCode.PseudotsugaMenziesii => -3.21809F + 0.04948F * logHeightInFeet * logDbhInInches - 0.15664F * logDbhInInches * logDbhInInches +
                    //                                  2.02132F * logDbhInInches + 1.63408F * logHeightInFeet - 0.16184F * logHeightInFeet * logHeightInFeet,
                    Vector128 <float> cvtsl = Avx.Add(vm3p21809, Avx.Multiply(v0p04948, Avx.Multiply(logHeightInFeet, logDbhInInches)));
                    cvtsl = Avx.Add(cvtsl, Avx.Multiply(vm0p15664, Avx.Multiply(logDbhInInches, logDbhInInches)));
                    cvtsl = Avx.Add(cvtsl, Avx.Multiply(v2p02132, logDbhInInches));
                    cvtsl = Avx.Add(cvtsl, Avx.Multiply(v1p63408, logHeightInFeet));
                    cvtsl = Avx.Add(cvtsl, Avx.Multiply(vm0p16184, Avx.Multiply(logHeightInFeet, logHeightInFeet)));
                    Vector128 <float> cubicFeet = MathV.Exp10(cvtsl);

                    Vector128 <float> dbhSquared            = Avx.Multiply(dbhInInches, dbhInInches); // could be consolidated by merging other scaling constants with Forester's constant for basal area
                    Vector128 <float> basalAreaInSquareFeet = Avx.Multiply(forestersEnglish, dbhSquared);
                    // b4 = cubicFeet / (1.033F * (1.0F + 1.382937F * MathV.Exp(-4.015292F * dbhInInches / 10.0F)) * (basalAreaInSquareFeet + 0.087266F) - 0.174533F);
                    Vector128 <float> b4 = Avx.Divide(cubicFeet, Avx.Add(Avx.Multiply(v1p033,
                                                                                      Avx.Multiply(Avx.Add(one, Avx.Multiply(v1p382937,
                                                                                                   Avx.Add(basalAreaInSquareFeet, v0p087266))),
                    Vector128 <float> cv4 = Avx.Multiply(b4, Avx.Subtract(basalAreaInSquareFeet, v0p087266));

                    // conversion to Scribner volumes for 32 foot trees
                    // Waddell 2014:32
                    // rc6 = 0.993F * (1.0F - MathF.Pow(0.62F, dbhInInches - 6.0F));
                    Vector128 <float> rc6   = Avx.Multiply(v0p993, Avx.Subtract(one, MathV.Exp(Avx.Multiply(vm0p6896598794, Avx.Subtract(dbhInInches, six))))); // log2(0.62) = -0.6896598794
                    Vector128 <float> cv6   = Avx.Multiply(rc6, cv4);
                    Vector128 <float> logB4 = MathV.Log10(b4);
                    // float rs616 = MathF.Pow(10.0F, 0.174439F + 0.117594F * logDbhInInches * logB4 - 8.210585F / (dbhInInches * dbhInInches) + 0.236693F * logB4 - 0.00001345F * b4 * b4 - 0.00001937F * dbhInInches * dbhInInches);
                    Vector128 <float> rs616l = Avx.Add(v0p174439, Avx.Multiply(v0p117594, Avx.Multiply(logDbhInInches, logB4)));
                    rs616l = Avx.Add(rs616l, Avx.Divide(vm8p210585, dbhSquared));
                    rs616l = Avx.Add(rs616l, Avx.Multiply(v0p236693, logB4));
                    rs616l = Avx.Subtract(rs616l, Avx.Multiply(v0p00001345, Avx.Multiply(b4, b4)));
                    rs616l = Avx.Subtract(rs616l, Avx.Multiply(v0p00001937, dbhSquared));
                    Vector128 <float> rs616 = MathV.Exp10(rs616l);
                    Vector128 <float> sv616 = Avx.Multiply(rs616, cv6); // Scribner board foot volume to a 6 inch top for 16 foot logs
                    // float rs632 = 1.001491F - 6.924097F / tarif + 0.00001351F * dbhInInches * dbhInInches;
                    Vector128 <float> rs632 = Avx.Add(v1p001491, Avx.Divide(vm6p924097, Avx.Multiply(v0p912733, b4)));
                    rs632 = Avx.Add(rs632, Avx.Multiply(v0p00001351, dbhSquared));
                    Vector128 <float> zeroVolumeMask = Avx.CompareLessThanOrEqual(dbhInInches, six);
                    Vector128 <float> sv632          = Avx.Multiply(rs632, sv616); // Scribner board foot volume to a 6 inch top for 32 foot logs
                    sv632 = Avx.BlendVariable(sv632, Vector128 <float> .Zero, zeroVolumeMask);

                    #if DEBUG
                    DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(rc6, Vector128 <float> .Zero, zeroVolumeMask), Vector128 <float> .Zero));
                    DebugV.Assert(Avx.CompareLessThanOrEqual(rc6, one));
                    DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(rs616, one, zeroVolumeMask), one));
                    DebugV.Assert(Avx.CompareLessThanOrEqual(Avx.BlendVariable(rs616, Vector128 <float> .Zero, zeroVolumeMask), v6p8));
                    DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(rs632, Vector128 <float> .Zero, zeroVolumeMask), Vector128 <float> .Zero));
                    DebugV.Assert(Avx.CompareLessThanOrEqual(Avx.BlendVariable(rs632, Vector128 <float> .Zero, zeroVolumeMask), one));
                    DebugV.Assert(Avx.CompareGreaterThanOrEqual(Avx.BlendVariable(sv632, Vector128 <float> .Zero, zeroVolumeMask), Vector128 <float> .Zero));
                    DebugV.Assert(Avx.CompareLessThanOrEqual(Avx.BlendVariable(sv632, Vector128 <float> .Zero, zeroVolumeMask), v10k));

                    Vector128 <float> expansionFactor = Avx.LoadVector128(expansionFactors + treeIndex);
                    standBoardFeetPerAcre = Avx.Add(standBoardFeetPerAcre, Avx.Multiply(expansionFactor, sv632));

                standBoardFeetPerAcre = Avx.HorizontalAdd(standBoardFeetPerAcre, standBoardFeetPerAcre);
                standBoardFeetPerAcre = Avx.HorizontalAdd(standBoardFeetPerAcre, standBoardFeetPerAcre);
Beispiel #13
        /// <summary>
        /// Computes the 32-point kernel of <paramref name="i"/>. The 32 inputs are first combined
        /// into 32 scalar intermediates (four groups of eight), which are then merged eight floats
        /// at a time with AVX/FMA using the factors stored in <c>omegas[5]</c>.
        /// </summary>
        /// <param name="i">Input values; elements 0..31 are read.</param>
        /// <param name="omegas">
        /// Factor tables. <c>omegas[3]</c>, <c>omegas[4]</c> and the first 16 entries of
        /// <c>omegas[5]</c> are read. The reference itself is never reassigned.
        /// </param>
        /// <returns>A newly allocated array of 32 values.</returns>
        /// <exception cref="ArgumentException"><c>omegas[5]</c> holds fewer than 16 entries.</exception>
        public static unsafe ComplexFloat[] Kernel32(ComplexFloat[] i, ref ComplexFloat[][] omegas)
        {
            // omegas[5] is read through raw pointers below, so its length has to be checked by hand.
            if (omegas[5].Length < 16)
            {
                throw new ArgumentException("omegas[5] must contain at least 16 entries.", nameof(omegas));
            }

            ComplexFloat[] result = new ComplexFloat[32];
            ComplexFloat[] tmp    = new ComplexFloat[32];

            ComplexFloat ami = i[0] - i[8];
            ComplexFloat api = i[0] + i[8];
            ComplexFloat fmn = i[5] - i[13];
            ComplexFloat fpn = i[5] + i[13];

            ComplexFloat xami = i[16] - i[24];
            ComplexFloat xapi = i[16] + i[24];
            ComplexFloat xfmn = i[21] - i[29];
            ComplexFloat xfpn = i[21] + i[29];

            // Group 1: even-indexed inputs 0..14.
            tmp[0] = api + i[2] + i[4] + i[6] + i[10] + i[12] + i[14];
            tmp[1] = ami + (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
            tmp[2] = api - i[4] - i[12] + (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
            tmp[3] = ami - (i[4] - i[12]).TimesMinusI() - (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()) * omegas[3][3];
            tmp[4] = api - i[2] + i[4] - i[6] - i[10] + i[12] - i[14];
            tmp[5] = ami - (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
            tmp[6] = api - i[4] - i[12] - (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
            tmp[7] = ami - (i[4] - i[12]).TimesMinusI() + (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()).TimesMinusI();

            // Group 2: odd-indexed inputs 1..15.
            tmp[8]  = i[1] + i[3] + fpn + i[7] + i[9] + i[11] + i[15];
            tmp[9]  = omegas[4][1] * (i[1] - i[9] + (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI());
            tmp[10] = omegas[4][2] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() + i[1] - fpn + i[9]);
            tmp[11] = omegas[4][3] * (omegas[3][3] * (i[11] - i[3] + (i[7] - i[15]).TimesMinusI()) - i[1] + i[9] + (fmn).TimesMinusI());
            tmp[12] = (i[1] - i[3] + fpn - i[7] + i[9] - i[11] - i[15]).TimesMinusI();
            tmp[13] = (i[1] - i[9] - (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI()) * omegas[4][5];
            tmp[14] = omegas[4][6] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() - i[1] + fpn - i[9]);
            tmp[15] = omegas[4][7] * ((i[11] - i[3] + (i[7] - i[15]).TimesMinusI()).TimesMinusI() + i[1] - i[9] - (fmn).TimesMinusI());

            // Group 3: even-indexed inputs 16..30. Same formulas as group 1 with every index shifted by 16.
            tmp[16] = xapi + i[18] + i[20] + i[22] + i[28] + i[26] + i[30];
            tmp[17] = xami + (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
            tmp[18] = xapi - i[20] - i[28] + (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
            tmp[19] = xami - (i[20] - i[28]).TimesMinusI() - (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()) * omegas[3][3];
            // tmp[20] and tmp[21] must mirror tmp[4] and tmp[5]: i[18] (not i[28]) is the shifted i[2],
            // and the last difference is i[20] - i[28] (the shifted i[4] - i[12]).
            tmp[20] = xapi - i[18] + i[20] - i[22] - i[26] + i[28] - i[30];
            tmp[21] = xami - (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
            tmp[22] = xapi - i[20] - i[28] - (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
            tmp[23] = xami - (i[20] - i[28]).TimesMinusI() + (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()).TimesMinusI();

            // Group 4: odd-indexed inputs 17..31. Same formulas as group 2 with every index shifted by 16.
            tmp[24] = i[17] + i[19] + xfpn + i[23] + i[25] + i[27] + i[31];
            tmp[25] = omegas[4][1] * (i[17] - i[25] + (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI());
            tmp[26] = omegas[4][2] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() + i[17] - xfpn + i[25]);
            tmp[27] = omegas[4][3] * (omegas[3][3] * (i[27] - i[19] + (i[23] - i[31]).TimesMinusI()) - i[17] + i[25] + (xfmn).TimesMinusI());
            tmp[28] = (i[17] - i[19] + xfpn - i[23] + i[25] - i[27] - i[31]).TimesMinusI();
            tmp[29] = (i[17] - i[25] - (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI()) * omegas[4][5];
            tmp[30] = omegas[4][6] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() - i[17] + xfpn - i[25]);
            tmp[31] = omegas[4][7] * ((i[27] - i[19] + (i[23] - i[31]).TimesMinusI()).TimesMinusI() + i[17] - i[25] - (xfmn).TimesMinusI());

            // 32 complex floats = 64 floats, split into four parts A, B, C, D of 8 complex
            // values (16 floats) each. A 256-bit register holds 8 floats, so every part is
            // processed in two halves (float offsets 0 and 8).
            fixed (ComplexFloat* entry = result, om5 = omegas[5], t = tmp)
            {
                float* partA = (float*)entry;
                float* partB = partA + 16;
                float* partC = partA + 32;
                float* partD = partA + 48;

                float* omPart1 = (float*)om5;
                float* omPart2 = omPart1 + 16;

                float* tmpPart1 = (float*)t;
                float* tmpPart2 = tmpPart1 + 16;
                float* tmpPart3 = tmpPart1 + 32;
                float* tmpPart4 = tmpPart1 + 48;

                for (int offset = 0; offset < 16; offset += 8)
                {
                    Vector256<float> t1 = Avx.LoadVector256(tmpPart1 + offset);
                    Vector256<float> t2 = Avx.LoadVector256(tmpPart2 + offset);
                    Vector256<float> t3 = Avx.LoadVector256(tmpPart3 + offset);
                    Vector256<float> t4 = Avx.LoadVector256(tmpPart4 + offset);

                    // First pass: pairwise sums and differences of the intermediates.
                    Vector256<float> a = Avx.Add(t1, t2);
                    Vector256<float> b = Avx.Subtract(t1, t2);
                    Vector256<float> c = Avx.Add(t3, t4);
                    Vector256<float> d = Avx.Subtract(t3, t4);

                    // Second pass: A + B, A - B, C + D, C - D ...
                    Vector256<float> sumAB  = Avx.Add(a, b);
                    Vector256<float> diffAB = Avx.Subtract(a, b);
                    Vector256<float> sumCD  = Avx.Add(c, d);
                    Vector256<float> diffCD = Avx.Subtract(c, d);

                    // ... and omega * (C + D), omega * (C - D).
                    Vector256<float> omegaSumCD  = MultiplyPackedComplex(sumCD, Avx.LoadVector256(omPart1 + offset));
                    Vector256<float> omegaDiffCD = MultiplyPackedComplex(diffCD, Avx.LoadVector256(omPart2 + offset));

                    // (A+B) + (C+D)
                    Avx.Store(partA + offset, Avx.Add(sumAB, sumCD));
                    // (A-B) + (C-D)
                    Avx.Store(partB + offset, Avx.Add(diffAB, diffCD));
                    // (A+B) + omega*(C+D)
                    Avx.Store(partC + offset, Avx.Add(sumAB, omegaSumCD));
                    // (A-B) + omega*(C-D)
                    Avx.Store(partD + offset, Avx.Add(diffAB, omegaDiffCD));
                }
            }

            return result;
        }

        /// <summary>
        /// Multiplies the packed values in <paramref name="values"/> by the packed factors in
        /// <paramref name="factors"/> using the class-level shuffle masks and a fused
        /// multiply-add/subtract (described as complex multiplication by the calling code).
        /// </summary>
        private static Vector256<float> MultiplyPackedComplex(Vector256<float> values, Vector256<float> factors)
        {
            Vector256<float> factorsSwapped = Avx.Shuffle(factors, factors, imm8bShuffle);
            Vector256<float> valuesIm       = Avx.Shuffle(values, values, imm8aImShuffle);
            Vector256<float> valuesRe       = Avx.Shuffle(values, values, imm8aReShuffle);

            return Fma.MultiplyAddSubtract(valuesRe, factors, Avx.Multiply(valuesIm, factorsSwapped));
        }
Beispiel #14
        /// <summary>
        /// Sums the series x^-1/1 + x^-3/3 + x^-5/5 + ... four terms per iteration using AVX.
        /// </summary>
        /// <param name="x">Series argument.</param>
        /// <param name="stepThreshold">
        /// Summation stops once the magnitude of the last (smallest-power) term of a group of
        /// four drops below this value.
        /// </param>
        /// <param name="maxN">Upper bound on the number of terms to add.</param>
        /// <returns>
        /// The partial sum, or <see cref="double.NaN"/> when AVX is unavailable (in which case
        /// <c>Status</c> is set to <c>NotSupported</c>).
        /// </returns>
        /// <remarks>
        /// Sets <c>N</c> to the number of terms added and <c>Status</c> to
        /// <c>TooManyIterations</c> when <c>N &gt;= maxN</c>, otherwise to <c>Success</c>.
        /// </remarks>
        protected override unsafe double CalculateImpl(double x, double stepThreshold, int maxN)
        {
            if (!Avx.IsSupported)
            {
                // Leave before touching any AVX instruction; the status tells the caller why.
                Status = TaylorSeriesStatus.NotSupported;
                return double.NaN;
            }

            const int vectorSize = 256 / 8 / sizeof(double);

            // v8888 <- (8, 8, 8, 8)
            var v8888 = Vector256.Create(8.0);

            // xPow8 <- (x^8, x^8, x^8, x^8), obtained by squaring three times
            var xPow8 = Vector256.Create(x);
            xPow8 = Avx.Multiply(xPow8, xPow8);
            xPow8 = Avx.Multiply(xPow8, xPow8);
            xPow8 = Avx.Multiply(xPow8, xPow8);

            // up <- (x^(-1), x^(-3), x^(-5), x^(-7)), each lane derived from the previous one
            double xSquared = x * x;
            double xPowMinus1 = 1 / x;
            double xPowMinus3 = xPowMinus1 / xSquared;
            double xPowMinus5 = xPowMinus3 / xSquared;
            double xPowMinus7 = xPowMinus5 / xSquared;
            var up = Vector256.Create(xPowMinus1, xPowMinus3, xPowMinus5, xPowMinus7);

            // down <- (1, 3, 5, 7)
            var down = Vector256.Create(1.0, 3.0, 5.0, 7.0);

            // sum <- (0, 0, 0, 0)
            var sum = Vector256<double>.Zero;

            N = 0;
            while (N < maxN)
            {
                // div <- up / down, i.e. the next four terms of the series
                var div = Avx.Divide(up, down);
                sum = Avx.Add(sum, div);
                N += vectorSize;

                // The last lane holds the smallest term of this group.
                var last = div.GetElement(vectorSize - 1);
                if (Math.Abs(last) < stepThreshold)
                {
                    break;
                }

                // up <- up / (x^8, x^8, x^8, x^8)
                up = Avx.Divide(up, xPow8);
                // down <- down + (8, 8, 8, 8)
                down = Avx.Add(down, v8888);
            }

            Status = N >= maxN ? TaylorSeriesStatus.TooManyIterations : TaylorSeriesStatus.Success;

            // Horizontal sum of the four lanes, added left to right.
            return sum.GetElement(0) + sum.GetElement(1) + sum.GetElement(2) + sum.GetElement(3);
        }
Beispiel #15
 /// <summary>
 /// Lazily adds two sequences of 256-bit double vectors pair by pair.
 /// </summary>
 /// <param name="this">Left-hand operands.</param>
 /// <param name="other">Right-hand operands.</param>
 /// <returns>
 /// A deferred sequence with one <c>Avx.Add</c> result per pair produced by <c>Zip</c>.
 /// </returns>
 public static IEnumerable<Vector256<double>> Add(
     this IEnumerable<Vector256<double>> @this,
     IEnumerable<Vector256<double>> other)
 {
     return @this.Zip(other, (left, right) => Avx.Add(left, right));
 }
Beispiel #16
        /// <summary>
        /// Smoke test for <c>Avx.Add</c> on <c>Vector256&lt;float&gt;</c> and
        /// <c>Vector256&lt;double&gt;</c>, invoked both directly and through reflection.
        /// </summary>
        /// <param name="args">Unused.</param>
        /// <returns>
        /// <c>Pass</c> when every check succeeds or AVX is not supported; <c>Fail</c> otherwise.
        /// </returns>
        static unsafe int Main(string[] args)
        {
            int testResult = Pass;

            if (Avx.IsSupported)
            {
                using (TestTable<float> floatTable = new TestTable<float>(
                           new float[8] { 1, -5, 100, 0, 1, -5, 100, 0 },
                           new float[8] { 22, -1, -50, 0, 22, -1, -50, 0 },
                           new float[8]))
                using (TestTable<double> doubleTable = new TestTable<double>(
                           new double[4] { 1, -5, 100, 0 },
                           new double[4] { 22, -1, -50, 0 },
                           new double[4]))
                {
                    // float: direct call
                    var vf1 = Unsafe.Read<Vector256<float>>(floatTable.inArray1Ptr);
                    var vf2 = Unsafe.Read<Vector256<float>>(floatTable.inArray2Ptr);
                    var vf3 = Avx.Add(vf1, vf2);
                    Unsafe.Write(floatTable.outArrayPtr, vf3);

                    if (!floatTable.CheckResult((x, y, z) => x + y == z))
                    {
                        Console.WriteLine("AVX Add failed on float:");
                        foreach (var item in floatTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        // Terminate the value dump so the next message starts on its own line.
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    // float: the same overload resolved and invoked through reflection
                    vf3 = (Vector256<float>)typeof(Avx).GetMethod(nameof(Avx.Add), new Type[] { vf1.GetType(), vf2.GetType() }).Invoke(null, new object[] { vf1, vf2 });
                    Unsafe.Write(floatTable.outArrayPtr, vf3);

                    if (!floatTable.CheckResult((x, y, z) => x + y == z))
                    {
                        Console.WriteLine("AVX Add failed via reflection on float:");
                        foreach (var item in floatTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    // double: direct call
                    var vd1 = Unsafe.Read<Vector256<double>>(doubleTable.inArray1Ptr);
                    var vd2 = Unsafe.Read<Vector256<double>>(doubleTable.inArray2Ptr);
                    var vd3 = Avx.Add(vd1, vd2);
                    Unsafe.Write(doubleTable.outArrayPtr, vd3);

                    if (!doubleTable.CheckResult((x, y, z) => x + y == z))
                    {
                        Console.WriteLine("AVX Add failed on double:");
                        foreach (var item in doubleTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    // double: through reflection
                    vd3 = (Vector256<double>)typeof(Avx).GetMethod(nameof(Avx.Add), new Type[] { vd1.GetType(), vd2.GetType() }).Invoke(null, new object[] { vd1, vd2 });
                    Unsafe.Write(doubleTable.outArrayPtr, vd3);

                    if (!doubleTable.CheckResult((x, y, z) => x + y == z))
                    {
                        Console.WriteLine("AVX Add failed via reflection on double:");
                        foreach (var item in doubleTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }
                }
            }

            return testResult;
        }
Beispiel #17
    // Entry point of the hardware-intrinsics check: compares the IsSupported state of each x86
    // instruction-set class (and of Vector<T>) against the expectation for the current build
    // configuration. Returns 100 when s_success is still true at the end, otherwise 1.
    static int Main()
        s_success = true;

        // We expect the AOT compiler generated HW intrinsics with the following characteristics:
        // * TRUE = IsSupported assumed to be true, no runtime check
        // * NULL = IsSupported is a runtime check, code should be behind the check or bad things happen
        // * FALSE = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used
        // The test is compiled with multiple defines to test this.

        // NOTE(review): the same locals are declared three times below and are followed by an
        // unconditional #error. Presumably each group was selected by a conditional-compilation
        // directive (#if / #elif / #else / #endif) that is missing from this copy - confirm and
        // restore the directives; the code cannot compile in its current form.

        // Expectation set 1: 16-byte vectors, only the SSE/SSE2 group assumed present.
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 16;
        bool?Sse2AndBelow       = true;
        bool?Sse3Group          = null;
        bool?AesLzPcl           = null;
        bool?Sse4142            = null;
        bool?PopCnt             = null;
        bool?Avx12    = false;
        bool?FmaBmi12 = false;
        // Expectation set 2: 16-byte vectors, SSE3 group and SSE4.1/4.2 also assumed present.
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 16;
        bool?Sse2AndBelow       = true;
        bool?Sse3Group          = true;
        bool?AesLzPcl           = null;
        bool?Sse4142            = true;
        bool?PopCnt             = null;
        bool?Avx12    = false;
        bool?FmaBmi12 = false;
        // Expectation set 3: 32-byte vectors, AVX/AVX2 assumed present, FMA/BMI checked at run time.
        bool vectorsAccelerated = true;
        int  byteVectorLength   = 32;
        bool?Sse2AndBelow       = true;
        bool?Sse3Group          = true;
        bool?AesLzPcl           = null;
        bool?Sse4142            = true;
        bool?PopCnt             = null;
        bool?Avx12    = true;
        bool?FmaBmi12 = null;
        // Presumably the fallback branch for a build that defines none of the expected symbols.
#error Who dis?

        if (vectorsAccelerated != Vector.IsHardwareAccelerated)
            throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}");

        if (byteVectorLength != Vector <byte> .Count)
            throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}");

        // Each Check call receives: a display name, the expected state (true / null / false as
        // described above), the address of a helper defined elsewhere, the actual IsSupported
        // value, and an optional lambda that exercises one intrinsic of the class (null when no
        // such probe is supplied). NOTE(review): Check itself is not visible here - confirm how
        // it uses these arguments.
        Check("Sse", Sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Sse.X64", Sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128 <float> .Zero) == 0);

        Check("Sse2", Sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128 <ushort> .Zero, 0) == 0);
        Check("Sse2.X64", Sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128 <double> .Zero) == 0);

        Check("Sse3", Sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Sse3.X64", Sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);

        Check("Ssse3", Sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128 <short> .Zero).Equals(Vector128 <ushort> .Zero));
        Check("Ssse3.X64", Sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);

        Check("Sse41", Sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128 <int> .Zero, Vector128 <int> .Zero).Equals(Vector128 <int> .Zero));
        Check("Sse41.X64", Sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128 <long> .Zero, 0) == 0);

        Check("Sse42", Sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
        Check("Sse42.X64", Sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);

        Check("Aes", AesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128 <byte> .Zero, 0).Equals(Vector128.Create((byte)99)));
        Check("Aes.X64", AesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);

        Check("Avx", Avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256 <double> .Zero, Vector256 <double> .Zero).Equals(Vector256 <double> .Zero));
        Check("Avx.X64", Avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);

        Check("Avx2", Avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256 <int> .Zero).Equals(Vector256 <uint> .Zero));
        Check("Avx2.X64", Avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);

        Check("Bmi1", FmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
        Check("Bmi1.X64", FmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);

        Check("Bmi2", FmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
        Check("Bmi2.X64", FmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);

        Check("Fma", FmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128 <float> .Zero, Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
        Check("Fma.X64", FmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);

        Check("Lzcnt", AesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
        Check("Lzcnt.X64", AesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);

        Check("Pclmulqdq", AesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128 <long> .Zero, Vector128 <long> .Zero, 0).Equals(Vector128 <long> .Zero));
        Check("Pclmulqdq.X64", AesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);

        Check("Popcnt", PopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
        Check("Popcnt.X64", PopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);

        // 100 signals success to the test harness; any failure yields 1.
        return(s_success ? 100 : 1);
Beispiel #18
 return(new AvxVec3()
     x = Avx.Add(x, other.x),
     y = Avx.Add(y, other.y),
     z = Avx.Add(z, other.z)
Beispiel #19
        /// <summary>
        /// Computes the dot product of two float buffers with AVX, walking both buffers through
        /// pinned pointers: eight lanes per step, then a scalar loop for the remainder.
        /// </summary>
        /// <param name="vector1">First operand.</param>
        /// <param name="vector2">Second operand.</param>
        /// <returns>
        /// The sum of products over the common length. When the lengths differ, the unmatched
        /// elements of the longer buffer are added to the result as they are.
        /// </returns>
        public static float DotMultiplyIntrinsicWAvxWSpanPtr(ref Memory <float> vector1, ref Memory <float> vector2)
        {
            var span1   = vector1.Span;
            var span2   = vector2.Span;
            var cnt     = Math.Min(span1.Length, span2.Length);
            var vectLen = Vector256<float>.Count;
            var vectCnt = cnt / vectLen;
            var total   = 0f;

            // All eight lanes are summed below, so every lane must start at zero.
            // (Vector256.CreateScalarUnsafe only defines lane 0.)
            var accumulator = Vector256<float>.Zero;

#if TEST
            // NOTE(review): nothing is written to this writer in the code visible here - confirm
            // whether per-step diagnostics were intended.
            var file = Path.GetTempFileName();
            using var writer = new StreamWriter(file);
            Console.WriteLine($"Intrinsic with AvxWPtr Mult. results will be written into {file}");
#endif

            unsafe
            {
                // Pin both buffers for as long as raw pointers into them are in use; an empty
                // span yields a null pointer, which is never dereferenced because cnt is 0.
                fixed (float* start1 = span1, start2 = span2)
                {
                    float* ptr1 = start1;
                    float* ptr2 = start2;

                    // Vector part: multiply eight pairs at a time and accumulate per lane.
                    for (var block = 0; block < vectCnt; block++)
                    {
                        var v1 = Avx.LoadVector256(ptr1);
                        var v2 = Avx.LoadVector256(ptr2);
                        accumulator = Avx.Add(accumulator, Avx.Multiply(v1, v2));
                        ptr1 += vectLen;
                        ptr2 += vectLen;
                    }

                    // Horizontal sum of the lanes.
                    for (var lane = 0; lane < vectLen; lane++)
                    {
                        total += accumulator.GetElement(lane);
                    }

                    // Scalar remainder; the pointers already sit on the first unprocessed pair.
                    for (var index = vectCnt * vectLen; index < cnt; index++)
                    {
                        total += *ptr1++ * *ptr2++;
                    }
                }
            }

            // NOTE(review): unmatched elements of the longer buffer are added verbatim, which is
            // unusual for a dot product - kept for compatibility, confirm it is intended.
            if (span1.Length != span2.Length)
            {
                var longer = span1.Length > span2.Length ? span1 : span2;
                for (var j = cnt; j < longer.Length; j++)
                {
                    total += longer[j];
                }
            }

            return total;
        }
Beispiel #20
        // Sharpens one line of float samples: for each sample it takes the difference between the
        // `ystart` line and the `bstart` line, optionally suppresses differences whose magnitude is
        // not above `thresh`, scales the difference by `amt`, and adds it to the sample read from
        // `cstart`, writing the result to `ostart`.
        //   cstart/ystart - input lines, read starting at offset `ox` (cstart scaled by `channels`).
        //   bstart/ostart - read/written from their first element.
        //   ow            - number of pixels to process (multiplied by `channels` to get the end pointer).
        //   gamma         - when set, the addition is performed on the square root of the sample and
        //                   the result is squared again.
        // NOTE(review): this copy of the code has lost its braces and some bare keywords (`do`,
        // `else`); the comments below mark where they appear to be missing - confirm against the
        // original source.
        unsafe void IConvolver.SharpenLine(byte *cstart, byte *ystart, byte *bstart, byte *ostart, int ox, int ow, float amt, float thresh, bool gamma)
            float *ip = (float *)cstart + (uint)ox * channels, yp = (float *)ystart + (uint)ox, bp = (float *)bstart, op = (float *)ostart;
            float *ipe = ip + (uint)ow * channels;

            // Thresholding is only active for a positive threshold.
            bool threshold = thresh > 0f;

            // 256-bit path: taken when AVX is available and at least one full vector remains.
            if (Avx.IsSupported && ip <= ipe - VectorAvx.Count)
                var vthresh = Vector256.Create(threshold ? thresh : -1f);
                // 0x7fffffff clears the sign bit, so And-ing with it yields the absolute value.
                var vmsk    = Vector256.Create(0x7fffffff).AsSingle();
                var vamt    = Vector256.Create(amt);
                var vmin    = VectorAvx.Zero;

                // Pull the end pointer back by one vector so `ip <= ipe` means "a full vector fits".
                ipe -= VectorAvx.Count;
                // NOTE(review): a `do` opening the loop closed by `} while` below is presumably missing here.
                    var vd = Avx.Subtract(Avx.LoadVector256(yp), Avx.LoadVector256(bp));
                    yp += VectorAvx.Count;
                    bp += VectorAvx.Count;

                    if (threshold)
                        // Keep only the lanes whose |difference| exceeds the threshold; others become 0.
                        var sm = HWIntrinsics.AvxCompareGreaterThan(Avx.And(vd, vmsk), vthresh);
                        vd = Avx.And(vd, sm);
                    vd = Avx.Multiply(vd, vamt);

                    var v0 = Avx.LoadVector256(ip);
                    ip += VectorAvx.Count;

                    if (gamma)
                        // Clamp to >= 0, approximate sqrt as x * rsqrt(x), add the difference,
                        // clamp again and square.
                        // NOTE(review): for a lane equal to 0, x * rsqrt(x) is 0 * +inf; presumably the
                        // following Max discards the resulting NaN - confirm.
                        v0 = Avx.Max(v0, vmin);
                        v0 = Avx.Multiply(v0, Avx.ReciprocalSqrt(v0));
                        v0 = Avx.Add(v0, vd);
                        v0 = Avx.Max(v0, vmin);
                        v0 = Avx.Multiply(v0, v0);
                        // NOTE(review): the next Add is presumably the stripped `else` branch (non-gamma case).
                        v0 = Avx.Add(v0, vd);

                    Avx.Store(op, v0);
                    op += VectorAvx.Count;
                } while (ip <= ipe);
                // Restore the real end pointer for the scalar tail loop.
                ipe += VectorAvx.Count;
            // 128-bit path: same steps as above using the Sse class and VectorSse.Count floats per step.
            else if (ip <= ipe - VectorSse.Count)
                var vthresh = Vector128.Create(threshold ? thresh : -1f);
                var vmsk    = Vector128.Create(0x7fffffff).AsSingle();
                var vamt    = Vector128.Create(amt);
                var vmin    = VectorSse.Zero;

                ipe -= VectorSse.Count;
                // NOTE(review): a `do` is presumably missing here as well.
                    var vd = Sse.Subtract(Sse.LoadVector128(yp), Sse.LoadVector128(bp));
                    yp += VectorSse.Count;
                    bp += VectorSse.Count;

                    if (threshold)
                        var sm = Sse.CompareGreaterThan(Sse.And(vd, vmsk), vthresh);
                        vd = Sse.And(vd, sm);
                    vd = Sse.Multiply(vd, vamt);

                    var v0 = Sse.LoadVector128(ip);
                    ip += VectorSse.Count;

                    if (gamma)
                        v0 = Sse.Max(v0, vmin);
                        v0 = Sse.Multiply(v0, Sse.ReciprocalSqrt(v0));
                        v0 = Sse.Add(v0, vd);
                        v0 = Sse.Max(v0, vmin);
                        v0 = Sse.Multiply(v0, v0);
                        // NOTE(review): presumably the stripped `else` branch, as in the AVX path.
                        v0 = Sse.Add(v0, vd);

                    Sse.Store(op, v0);
                    op += VectorSse.Count;
                } while (ip <= ipe);
                ipe += VectorSse.Count;

            float fmin = VectorSse.Zero.ToScalar();

            // Scalar loop for whatever the vector paths left over (or everything, if neither ran).
            while (ip < ipe)
                float dif = *yp++ - *bp++;
                float c0  = *ip++;

                // Unlike the vector paths, a below-threshold difference skips the update entirely.
                if (!threshold || Math.Abs(dif) > thresh)
                    dif *= amt;

                    if (gamma)
                        c0  = MathUtil.MaxF(c0, fmin).Sqrt();
                        c0  = MathUtil.MaxF(c0 + dif, fmin);
                        c0 *= c0;
                        // NOTE(review): presumably the stripped `else` branch (non-gamma case).
                        c0 += dif;

                *op++ = c0;
        // Computes Mandelbrot iteration counts for a grid of points, eight x-coordinates at a time
        // using Vector256<float>. The grid size is derived from TOTALBYTES and ratioy_x, the
        // per-point iteration counts are written to results2 and the final |z|^2 values to testValue2.
        // NOTE(review): this copy has lost its braces and apparently some short statements; no
        // increments of countX, countY or resVectorNumber are visible - confirm against the original.
        public unsafe void Vector256Mandel()
            int floatL3Size = TOTALBYTES / sizeof(float);

            // Derive the resolution and round both dimensions down to a multiple of 8 (the vector width).
            resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x));
            if (resolutionX % 8 != 0)
                resolutionX -= resolutionX % 8;
            resolutionY = (int)MathF.Floor(resolutionX * ratioy_x);
            if (resolutionY % 8 != 0)
                resolutionY -= resolutionY % 8;
            STEP_X         = (RIGHT_X - LEFT_X) / resolutionX;
            STEP_Y         = STEP_X; // ratioy_x * STEP_X; Bug from reddit comment
            numberOfPoints = resolutionX * resolutionY;
            results2       = new float[numberOfPoints];

            // Precompute the coordinates of every column (x) and row (y).
            xPoints = new float[resolutionX];
            yPoints = new float[resolutionY];
            for (int i = 0; i < resolutionX; i++)
                xPoints.Span[i] = LEFT_X + i * STEP_X;
            for (int i = 0; i < resolutionY; i++)
                yPoints.Span[i] = TOP_Y - i * STEP_Y;

            int countX = 0, countY = 0;
            int maxInter = 256;
            int inter;
            // y is processed one scalar row at a time; x, the results and the test values are
            // reinterpreted as spans of 8-float vectors.
            ReadOnlySpan <float> ySpan = yPoints.Span;// MemoryMarshal.Cast<float, Vector256<float>>(yPoints.Span);
            ReadOnlySpan <Vector256 <float> > xSpan    = MemoryMarshal.Cast <float, Vector256 <float> >(xPoints.Span);
            Span <Vector256 <float> >         res      = MemoryMarshal.Cast <float, Vector256 <float> >(results2.Span);
            Span <Vector256 <float> >         testSpan = MemoryMarshal.Cast <float, Vector256 <float> >(testValue2.Span);
            int resVectorNumber = 0;

            Vector256 <float> xVec, yVec;
            var oneVec  = Vector256.Create(1.0f);
            var fourVec = Vector256.Create(4.0f);

            while (countY < ySpan.Length)
                // Broadcast the current row's y coordinate to all eight lanes.
                var currYVec = Vector256.Create(ySpan[countY]);
                while (countX < xSpan.Length)
                    Vector256 <float> currXVec = xSpan[countX];
                    var xSquVec  = Vector256.Create(0.0f);
                    var ySquVec  = Vector256.Create(0.0f);
                    var zSquVec  = Vector256.Create(0.0f);
                    var interVec = Vector256.Create(0.0f);
                    // Per-lane increment: 1 while the lane is still within the radius, 0 afterwards.
                    Vector256 <float> sumVector = oneVec;
                    inter = 0;
                    bool goOn = true;
                    while (goOn)
                        // x' = x^2 - y^2 + cx;  y' = (x + y)^2 - y^2 - x^2 + cy  (i.e. 2xy + cy).
                        xVec    = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec);
                        yVec    = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec);
                        xSquVec = Avx.Multiply(xVec, xVec);
                        ySquVec = Avx.Multiply(yVec, yVec);
                        zSquVec = Avx.Multiply(Avx.Add(xVec, yVec), Avx.Add(xVec, yVec));
                        Vector256 <float> test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // <= 4.0?
                        sumVector = Avx.BlendVariable(Vector256 <float> .Zero, sumVector, test);                                                          // selects from second if true, from first otherwise
                        goOn      = (Avx.MoveMask(test) > 0) & (inter < maxInter);                                                                        //any of the values still alive, and inter still below cutoff value?
                        if (goOn)
                            interVec = Avx.Add(interVec, sumVector);
                        inter = goOn ? inter + 1 : inter;
                    // Store the final |z|^2 and the per-lane iteration counts for this vector.
                    testSpan[resVectorNumber] = Avx.Add(xSquVec, ySquVec);
                    res[resVectorNumber]      = interVec;
                    // NOTE(review): increments of resVectorNumber and countX are presumably missing here.
                // Restart the column scan for the next row.
                // NOTE(review): an increment of countY is presumably missing here.
                countX = 0;
        // This function implements Algorithm 1 in
        // (NOTE(review): the reference that followed "in" is missing from this copy.)
        // Compute the output value of the field-aware factorization, as the sum of the linear part and the latent part.
        // The linear part is the inner product of linearWeights and featureValues.
        // The latent part is the sum of all intra-field interactions in one field f, for all fields possible
        //
        // Inputs: `count` entries of (fieldIndices, featureIndices, featureValues); linearWeights is
        // indexed by feature index; latentWeights is indexed as [feature][field][k] with `latentDim`
        // values per (feature, field) pair. latentSum is a scratch/output buffer of
        // fieldCount * fieldCount * latentDim floats that is zeroed and then filled here.
        // The scalar result is written to *response.
        // All latent loops step by 8 and stop at the last full group of 8, so any remainder of
        // latentDim beyond a multiple of 8 is not processed by this function.
        // NOTE(review): MultiplyAdd / MultiplyAddNegated are helpers defined elsewhere; from the
        // comments they presumably compute a*b + c and c - a*b respectively - confirm.
        public static unsafe void CalculateIntermediateVariables(int *fieldIndices, int *featureIndices, float *featureValues,
                                                                 float *linearWeights, float *latentWeights, float *latentSum, float *response, int fieldCount, int latentDim, int count)

            // The number of all possible fields.
            int    m              = fieldCount;
            int    d              = latentDim;
            int    c              = count;
            int *  pf             = fieldIndices;
            int *  pi             = featureIndices;
            float *px             = featureValues;
            float *pw             = linearWeights;
            float *pv             = latentWeights;
            float *pq             = latentSum;
            float  linearResponse = 0;
            float  latentResponse = 0;

            // Clear the whole m x m x d latent-sum buffer before accumulating into it.
            Unsafe.InitBlock(pq, 0, (uint)(m * m * d * sizeof(float)));

            Vector256 <float> y   = Vector256 <float> .Zero;
            Vector256 <float> tmp = Vector256 <float> .Zero;

            for (int i = 0; i < c; i++)
                int f = pf[i];
                int j = pi[i];
                linearResponse += pw[j] * px[i];

                // Broadcast the i-th feature value to all lanes, and its square.
                Vector256 <float> x  = Avx.BroadcastScalarToVector256(px + i);
                Vector256 <float> xx = Avx.Multiply(x, x);

                // tmp -= <v_j,f, v_j,f> * x * x
                int vBias = j * m * d + f * d;

                // j-th feature's latent vector in the f-th field hidden space.
                float *vjf = pv + vBias;

                for (int k = 0; k + 8 <= d; k += 8)
                    Vector256 <float> vjfBuffer = Avx.LoadVector256(vjf + k);
                    tmp = MultiplyAddNegated(Avx.Multiply(vjfBuffer, vjfBuffer), xx, tmp);

                for (int fprime = 0; fprime < m; fprime++)
                    vBias = j * m * d + fprime * d;
                    int    qBias    = f * m * d + fprime * d;
                    float *vjfprime = pv + vBias;
                    float *qffprime = pq + qBias;

                    // q_f,f' += v_j,f' * x
                    for (int k = 0; k + 8 <= d; k += 8)
                        Vector256 <float> vjfprimeBuffer = Avx.LoadVector256(vjfprime + k);
                        Vector256 <float> q = Avx.LoadVector256(qffprime + k);
                        q = MultiplyAdd(vjfprimeBuffer, x, q);
                        Avx.Store(qffprime + k, q);

            for (int f = 0; f < m; f++)
                // tmp += <q_f,f, q_f,f>
                float *qff = pq + f * m * d + f * d;
                for (int k = 0; k + 8 <= d; k += 8)
                    Vector256 <float> qffBuffer = Avx.LoadVector256(qff + k);

                    // Intra-field interactions.
                    tmp = MultiplyAdd(qffBuffer, qffBuffer, tmp);

                // y += <q_f,f', q_f',f>, f != f'
                // This loop handles inter-field interactions because f != f'.
                for (int fprime = f + 1; fprime < m; fprime++)
                    float *qffprime = pq + f * m * d + fprime * d;
                    float *qfprimef = pq + fprime * m * d + f * d;
                    for (int k = 0; k + 8 <= d; k += 8)
                        // Inter-field interaction.
                        Vector256 <float> qffprimeBuffer = Avx.LoadVector256(qffprime + k);
                        Vector256 <float> qfprimefBuffer = Avx.LoadVector256(qfprimef + k);
                        y = MultiplyAdd(qffprimeBuffer, qfprimefBuffer, y);

            // Combine: y += _point5 * tmp, then reduce the eight lanes to a single sum by adding
            // the swapped 128-bit halves followed by two horizontal adds.
            y   = MultiplyAdd(_point5, tmp, y);
            tmp = Avx.Add(y, Avx.Permute2x128(y, y, 1));
            tmp = Avx.HorizontalAdd(tmp, tmp);
            y   = Avx.HorizontalAdd(tmp, tmp);
            Sse.StoreScalar(&latentResponse, y.GetLower()); // The lowest slot is the response value.
            *response = linearResponse + latentResponse;
Beispiel #23
        // Dot product of two vectors of equal length (`lng` floats behind `ptr`).
        // Throws a plain Exception when the lengths differ.
        // Three code paths are visible, selected by length:
        //   lng < 8   - plain scalar loop;
        //   lng < 64  - one FMA accumulator over groups of 8 plus a scalar tail;
        //   otherwise - eight independent FMA accumulators over groups of 64, flushed into a
        //               double every 1024 iterations, then groups of 8, then a scalar tail.
        // NOTE(review): braces, `else` keywords and the `return` statements are missing from this
        // copy; presumably each path returns its own sum (the last one converted from double) -
        // confirm against the original source.
        static public float Dot(Vector v0, Vector v1)
            if (v0.lng != v1.lng)
                throw new Exception();
            int lng = v0.lng;

            float *p0  = v0.ptr;
            float *p1  = v1.ptr;
            // Scratch buffer used to spill a Vector256<float> for the horizontal sum.
            float *tmp = stackalloc float[8];

            // Path 1: too short for a single 8-float vector.
            if (lng < 8)
                float sum = 0;
                for (int i = 0; i < lng; i++)
                    sum += p0[i] * p1[i];
            // Path 2: short vectors, single accumulator.
            if (lng < 64)
                var sum0 = Vector256 <float> .Zero;

                for (int i = 0; i <= lng - 8; i += 8)
                    sum0 = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum0);

                Avx.Store(tmp, sum0);
                float sum = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

                // Scalar tail: elements after the last full group of 8.
                for (int i = lng / 8 * 8; i < lng; i++)
                    sum += p0[i] * p1[i];
                // Path 3 (presumably the stripped `else` branch): eight accumulators, 64 floats per iteration.
                var    sum  = Vector256 <float> .Zero;
                var    sum0 = Vector256 <float> .Zero;
                var    sum1 = Vector256 <float> .Zero;
                var    sum2 = Vector256 <float> .Zero;
                var    sum3 = Vector256 <float> .Zero;
                var    sum4 = Vector256 <float> .Zero;
                var    sum5 = Vector256 <float> .Zero;
                var    sum6 = Vector256 <float> .Zero;
                var    sum7 = Vector256 <float> .Zero;
                float *pp1  = p0;
                float *pp2  = p1;
                double dsum = 0;
                for (int i = 0; i < lng / 64; i++)
                    sum0 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 00), Avx.LoadVector256(pp2 + 00), sum0);
                    sum1 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 08), Avx.LoadVector256(pp2 + 08), sum1);
                    sum2 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 16), Avx.LoadVector256(pp2 + 16), sum2);
                    sum3 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 24), Avx.LoadVector256(pp2 + 24), sum3);

                    sum4 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 32), Avx.LoadVector256(pp2 + 32), sum4);
                    sum5 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 40), Avx.LoadVector256(pp2 + 40), sum5);
                    sum6 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 48), Avx.LoadVector256(pp2 + 48), sum6);
                    sum7 = Fma.MultiplyAdd(Avx.LoadVector256(pp1 + 56), Avx.LoadVector256(pp2 + 56), sum7);

                    pp1 += 64;
                    pp2 += 64;
                    // Every 1024 iterations: fold the float accumulators into the double total and
                    // reset them.
                    if (i % 1024 == 1023)
                        sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));
                        Avx.Store(tmp, sum);
                        dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
                        sum0  = Vector256 <float> .Zero;
                        sum1  = Vector256 <float> .Zero;
                        sum2  = Vector256 <float> .Zero;
                        sum3  = Vector256 <float> .Zero;
                        sum4  = Vector256 <float> .Zero;
                        sum5  = Vector256 <float> .Zero;
                        sum6  = Vector256 <float> .Zero;
                        sum7  = Vector256 <float> .Zero;
                // Merge whatever the eight accumulators hold after the last flush.
                sum = Avx.Add(Avx.Add(Avx.Add(sum0, sum1), Avx.Add(sum2, sum3)), Avx.Add(Avx.Add(sum4, sum5), Avx.Add(sum6, sum7)));

                // Remaining full groups of 8 after the last group of 64.
                for (int i = lng / 64 * 64; i <= lng - 8; i += 8)
                    sum = Fma.MultiplyAdd(Avx.LoadVector256(p0 + i), Avx.LoadVector256(p1 + i), sum);

                Avx.Store(tmp, sum);
                dsum += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

                // Scalar tail: elements after the last full group of 8.
                for (int i = lng / 8 * 8; i < lng; i++)
                    dsum += p0[i] * p1[i];
Beispiel #24
        // Returns a new VectorArg256 whose value is _rgb with f added to every lane.
        public VectorArg256 Change(float f)
        {
            // Broadcast f to all eight lanes and add it to the stored vector.
            return new VectorArg256(Avx.Add(Vector256.Create(f), _rgb));
        }
 // Lane-wise addition of two 256-bit float vectors via the AVX add instruction.
 public static Vector256 <float> op_Addition(Vector256 <float> left, Vector256 <float> right)
 {
     return Avx.Add(left, right);
 }
Beispiel #26
    // Evaluates, for every value in `x`, a weighted blend of two linearly interpolated lookup
    // tables: (1 - weightB) * interp(A, x) + weightB * interp(B, x). Table A is treated as
    // uniformly sampled over [minXA, maxXA] and table B over [minXB, maxXB]; inputs outside a
    // table's range are clamped to it. Four inputs are processed per iteration with AVX/AVX2.
    // NOTE(review): `outputVectorSize` is not defined in the visible code and the `return`
    // statement is missing from this copy; presumably the method returns `z` - confirm.
    // NOTE(review): the loop always loads and stores four doubles at offset i, so when x.Length
    // is not a multiple of 4 the last iteration reads past the end of `x` (and writes past
    // pZ + x.Length unless `z` is larger) - confirm how the caller sizes these arrays.
    // NOTE(review): `&x[0]`, `&A[0]`, `&B[0]`, `&z[0]` throw on empty arrays; no Avx/Avx2/Sse41
    // IsSupported check is visible here.
    private static unsafe double[] BilinearInterpol_AVX(
        double[] x,
        double[] A,
        double minXA,
        double maxXA,
        double[] B,
        double minXB,
        double maxXB,
        double weightB)
        double[] z = new double[outputVectorSize];

        fixed(double *pX = &x[0], pA = &A[0], pB = &B[0], pZ = &z[0])
            // Blend weights, broadcast to all four lanes.
            Vector256 <double> vWeightB = Vector256.Create(weightB);
            Vector256 <double> vWeightA = Vector256.Create(1 - weightB);

            Vector256 <double> vMinXA = Vector256.Create(minXA);
            Vector256 <double> vMaxXA = Vector256.Create(maxXA);
            Vector256 <double> vMinXB = Vector256.Create(minXB);
            Vector256 <double> vMaxXB = Vector256.Create(maxXB);

            // Spacing between consecutive table samples (a one-element table gives a spacing of 0).
            double             deltaA  = (maxXA - minXA) / (double)(A.Length - 1);
            double             deltaB  = (maxXB - minXB) / (double)(B.Length - 1);
            Vector256 <double> vDeltaA = Vector256.Create(deltaA);
            Vector256 <double> vDeltaB = Vector256.Create(deltaB);

            // Reciprocals so the loop can multiply instead of divide.
            double             invDeltaA  = 1.0 / deltaA;
            double             invDeltaB  = 1.0 / deltaB;
            Vector256 <double> vInvDeltaA = Vector256.Create(invDeltaA);
            Vector256 <double> vInvDeltaB = Vector256.Create(invDeltaB);

            // Index clamps: the highest valid index of each table, and the constant 1.
            Vector128 <int> ALengthMinusOne = Vector128.Create(A.Length - 1);
            Vector128 <int> BLengthMinusOne = Vector128.Create(B.Length - 1);
            Vector128 <int> One             = Vector128.Create(1);

            for (var i = 0; i < x.Length; i += Vector256 <double> .Count)
                Vector256 <double> currentX = Avx.LoadVector256(pX + i);

                // Determine the largest a, such that A[i] = f(xA) and xA <= x[i].
                // This involves casting from double to int; here we use a Vector conversion.
                Vector256 <double> aDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXA), vInvDeltaA);
                Vector128 <int>    a       = Avx.ConvertToVector128Int32WithTruncation(aDouble);
                // Clamp the index to [0, A.Length - 1]; the upper neighbour is clamped the same way.
                a = Sse41.Min(Sse41.Max(a, Vector128 <int> .Zero), ALengthMinusOne);
                Vector128 <int> aPlusOne = Sse41.Min(Sse2.Add(a, One), ALengthMinusOne);

                // Now, get the reference input, xA, for our index a.
                // This involves casting from  int to double.
                Vector256 <double> xA = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(a), vDeltaA), vMinXA);

                // Now, compute the lambda for our A reference point.
                Vector256 <double> currentXNormA = Avx.Max(vMinXA, Avx.Min(currentX, vMaxXA));
                Vector256 <double> lambdaA       = Avx.Multiply(Avx.Subtract(currentXNormA, xA), vInvDeltaA);

                // Now, we need to load up our reference points using Vector Gather operations.
                // The scale of 8 is the size of a double in bytes.
                Vector256 <double> AVector        = Avx2.GatherVector256(pA, a, 8);
                Vector256 <double> AVectorPlusOne = Avx2.GatherVector256(pA, aPlusOne, 8);

                // Now, do the all of the above for our B reference point.
                Vector256 <double> bDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXB), vInvDeltaB);
                Vector128 <int>    b       = Avx.ConvertToVector128Int32WithTruncation(bDouble);
                b = Sse41.Min(Sse41.Max(b, Vector128 <int> .Zero), BLengthMinusOne);
                Vector128 <int> bPlusOne = Sse41.Min(Sse2.Add(b, One), BLengthMinusOne);

                Vector256 <double> xB            = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(b), vDeltaB), vMinXB);
                Vector256 <double> currentXNormB = Avx.Max(vMinXB, Avx.Min(currentX, vMaxXB));
                Vector256 <double> lambdaB       = Avx.Multiply(Avx.Subtract(currentXNormB, xB), vInvDeltaB);

                Vector256 <double> BVector        = Avx2.GatherVector256(pB, b, 8);
                Vector256 <double> BVectorPlusOne = Avx2.GatherVector256(pB, bPlusOne, 8);

                // z = wA * (A[a] + lambdaA * (A[a+1] - A[a])) + wB * (B[b] + lambdaB * (B[b+1] - B[b]))
                Vector256 <double> newZ = Avx.Add(Avx.Multiply(vWeightA, Avx.Add(AVector, Avx.Multiply(lambdaA, Avx.Subtract(AVectorPlusOne, AVector)))),
                                                  Avx.Multiply(vWeightB, Avx.Add(BVector, Avx.Multiply(lambdaB, Avx.Subtract(BVectorPlusOne, BVector)))));
                Avx.Store(pZ + i, newZ);


        // Walk-through of Vector64/128/256 creation and a selection of Sse/Avx/Fma intrinsics.
        // Results are assigned to locals only; the trailing comments record the expected values.
        // NOTE(review): braces and possibly other short lines are missing from this copy, and the
        // method may continue beyond the visible text.
        public Intro()
            var middleVector = Vector128.Create(1.0f);                      // middleVector = <1,1,1,1>

            middleVector = Vector128.CreateScalar(-1.0f);                   // middleVector = <-1,0,0,0>
            var floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191>

            if (Avx.IsSupported)
                var left  = Vector256.Create(-2.5f);                     // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
                var right = Vector256.Create(5.0f);                      // <5, 5, 5, 5, 5, 5, 5, 5>
                Vector256 <float> result = Avx.AddSubtract(left, right); // result = <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5>
                left   = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, -70.0f, -80.0f);
                right  = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f);
                result = Avx.UnpackHigh(left, right);              // result = <-3, 3, -4, 4, -70, 70, -80, 80>
                result = Avx.UnpackLow(left, right);               // result = <-1, 0, -2, 2, -50, 50, -60, 60>
                result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-29, 0, 0, 0, -17400, 0, 0, 0>
                bool testResult = Avx.TestC(left, right);          // testResult = true
                testResult = Avx.TestC(right, left);               // testResult = false
                // right[0] is 0, so the first lane of this division is -1 / 0.
                Vector256 <float> result1 = Avx.Divide(left, right);
                var plusOne = Vector256.Create(1.0f);
                result = Avx.Compare(right, result1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
                result = Avx.Compare(right, result1, FloatComparisonMode.UnorderedNotLessThanNonSignaling);
                left   = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f);
                right  = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                // 0 / 0 in the first lane produces NaN.
                Vector256 <float> nanInFirstPosition = Avx.Divide(left, right);
                left = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f);
                // 1.1 / 0 in the first lane produces +Infinity.
                Vector256 <float> InfInFirstPosition = Avx.Divide(left, right);

                left  = Vector256.Create(-1.1f, 3.0f, 1.0f / 3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f);
                right = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                // A "true" lane has all bits set, which reads back as NaN when viewed as a float.
                Vector256 <float> compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN>
                Vector256 <float> mixed         = Avx.BlendVariable(left, right, compareResult);                                //  mixed = <-1.1, 2, 0.333..., 2, -50, -60, -70, -80> (right where the mask is set, left otherwise)

                //left = Vector256.Create(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f);
                //right = Vector256.Create(1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f);
                Vector256 <float> other = right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                bool bRes    = Avx.TestZ(plusOne, compareResult);
                bool bRes2   = Avx.TestC(plusOne, compareResult);
                // NOTE(review): TestZ(x, x) is true only when no lane has its sign bit set, so the
                // negation below means "at least one lane is true", not "all lanes are true".
                bool allTrue = !Avx.TestZ(compareResult, compareResult);
                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.OrderedEqualNonSignaling); // NOTE(review): the original comment here repeated the greater-than result from above; re-verify for these operands
                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.UnorderedEqualNonSignaling);
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling);
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.OrderedGreaterThanNonSignaling);
                var left128  = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
                var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f);
                Vector128 <float> compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0>

                // Packs the sign bit of each lane into the low 8 bits of an int.
                int res = Avx.MoveMask(compareResult);
                if (Fma.IsSupported)
                    Vector256 <float> resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element
                    resultFma = Fma.MultiplyAddNegated(left, right, other);            // = -(left * right) + other for each element
                    resultFma = Fma.MultiplySubtract(left, right, other);              // = left * right - other for each element
                    Fma.MultiplyAddSubtract(left, right, other);                       // even elements (0, 2, ...) subtract other, odd elements add other; result is discarded here
                // NOTE(review): the result comments on the lines below match the broadcast operands
                // used at the top of this block (left = -2.5, right = 5.0), except the DotProduct
                // value, which matches neither operand set. left and right have been reassigned
                // since, so the listed values do not describe what these calls return here.
                result = Avx.DotProduct(left, right, 0b1010_0001);                     // result = <-20, 0, 0, 0, -10000, 0, 0, 0>
                result = Avx.Floor(left);                                              // result = <-3, -3, -3, -3, -3, -3, -3, -3>
                result = Avx.Add(left, right);                                         // result = <2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5>
                result = Avx.Ceiling(left);                                            // result = <-2, -2, -2, -2, -2, -2, -2, -2>
                result = Avx.Multiply(left, right);                                    // result = <-12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5>
                result = Avx.HorizontalAdd(left, right);                               // result = <-5, -5, 10, 10, -5, -5, 10, 10>
                result = Avx.HorizontalSubtract(left, right);                          // result = <0, 0, 0, 0, 0, 0, 0, 0>
                double[] someDoubles      = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 };
                double[] someOtherDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
                double[] someResult       = new double[someDoubles.Length];
                float[]  someFloats       = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 };
                float[]  someOtherFloats  = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 };
                    // Load four doubles starting at index 1 and store them at the start of someResult.
                    fixed(double *ptr = &someDoubles[1])
                        fixed(double *ptr2 = &someResult[0])
                            Vector256 <double> res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8>

                            Avx.Store(ptr2, res2);

                    // Mask 0b0001_0001: multiply only element 0 of each 128-bit half and write the
                    // result to element 0 of that half.
                    fixed(float *ptr = &someFloats[0])
                        fixed(float *ptr2 = &someOtherFloats[0])
                            Vector256 <float> res2 = Avx.DotProduct(Avx.LoadVector256(ptr), Avx.LoadVector256(ptr2), 0b0001_0001);
                            //Avx.Store(ptr2, res2);