예제 #1
0
        public void RunClassLclFldScenario()
        {
            var test   = new AlternatingTernaryOpTest__MultiplyAddSubtractSingle();
            var result = Fma.MultiplyAddSubtract(test._fld1, test._fld2, test._fld3);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
        }
예제 #2
0
        public void RunStructLclFldScenario()
        {
            var test   = TestStruct.Create();
            var result = Fma.MultiplyAddSubtract(test._fld1, test._fld2, test._fld3);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
        }
        public void RunClassFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

            var result = Fma.MultiplyAddSubtract(_fld1, _fld2, _fld3);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
        }
예제 #4
0
        public void RunLclVarScenario_UnsafeRead()
        {
            var firstOp  = Unsafe.Read <Vector256 <Single> >(_dataTable.inArray1Ptr);
            var secondOp = Unsafe.Read <Vector256 <Single> >(_dataTable.inArray2Ptr);
            var thirdOp  = Unsafe.Read <Vector256 <Single> >(_dataTable.inArray3Ptr);
            var result   = Fma.MultiplyAddSubtract(firstOp, secondOp, thirdOp);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(firstOp, secondOp, thirdOp, _dataTable.outArrayPtr);
        }
        public void RunStructLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

            var test   = TestStruct.Create();
            var result = Fma.MultiplyAddSubtract(test._fld1, test._fld2, test._fld3);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
        }
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            var test   = new AlternatingTernaryOpTest__MultiplyAddSubtractDouble();
            var result = Fma.MultiplyAddSubtract(test._fld1, test._fld2, test._fld3);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
        }
예제 #7
0
        public void RunLclVarScenario_LoadAligned()
        {
            var firstOp  = Avx.LoadAlignedVector256((Single *)(_dataTable.inArray1Ptr));
            var secondOp = Avx.LoadAlignedVector256((Single *)(_dataTable.inArray2Ptr));
            var thirdOp  = Avx.LoadAlignedVector256((Single *)(_dataTable.inArray3Ptr));
            var result   = Fma.MultiplyAddSubtract(firstOp, secondOp, thirdOp);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(firstOp, secondOp, thirdOp, _dataTable.outArrayPtr);
        }
예제 #8
0
        public void RunBasicScenario_UnsafeRead()
        {
            var result = Fma.MultiplyAddSubtract(
                Unsafe.Read <Vector256 <Single> >(_dataTable.inArray1Ptr),
                Unsafe.Read <Vector256 <Single> >(_dataTable.inArray2Ptr),
                Unsafe.Read <Vector256 <Single> >(_dataTable.inArray3Ptr)
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
        }
        public void RunBasicScenario_Load()
        {
            var result = Fma.MultiplyAddSubtract(
                Sse2.LoadVector128((Double *)(_dataTable.inArray1Ptr)),
                Sse2.LoadVector128((Double *)(_dataTable.inArray2Ptr)),
                Sse2.LoadVector128((Double *)(_dataTable.inArray3Ptr))
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
        }
예제 #10
0
        public void RunClsVarScenario()
        {
            var result = Fma.MultiplyAddSubtract(
                _clsVar1,
                _clsVar2,
                _clsVar3
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_clsVar1, _clsVar2, _clsVar3, _dataTable.outArrayPtr);
        }
예제 #11
0
        public void RunBasicScenario_LoadAligned()
        {
            var result = Fma.MultiplyAddSubtract(
                Avx.LoadAlignedVector256((Single *)(_dataTable.inArray1Ptr)),
                Avx.LoadAlignedVector256((Single *)(_dataTable.inArray2Ptr)),
                Avx.LoadAlignedVector256((Single *)(_dataTable.inArray3Ptr))
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
        }
예제 #12
0
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            var firstOp  = Unsafe.Read <Vector128 <Double> >(_dataTable.inArray1Ptr);
            var secondOp = Unsafe.Read <Vector128 <Double> >(_dataTable.inArray2Ptr);
            var thirdOp  = Unsafe.Read <Vector128 <Double> >(_dataTable.inArray3Ptr);
            var result   = Fma.MultiplyAddSubtract(firstOp, secondOp, thirdOp);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(firstOp, secondOp, thirdOp, _dataTable.outArrayPtr);
        }
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            var op1    = Unsafe.Read <Vector128 <Double> >(_dataTable.inArray1Ptr);
            var op2    = Unsafe.Read <Vector128 <Double> >(_dataTable.inArray2Ptr);
            var op3    = Unsafe.Read <Vector128 <Double> >(_dataTable.inArray3Ptr);
            var result = Fma.MultiplyAddSubtract(op1, op2, op3);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(op1, op2, op3, _dataTable.outArrayPtr);
        }
예제 #14
0
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            var firstOp  = Sse.LoadAlignedVector128((Single *)(_dataTable.inArray1Ptr));
            var secondOp = Sse.LoadAlignedVector128((Single *)(_dataTable.inArray2Ptr));
            var thirdOp  = Sse.LoadAlignedVector128((Single *)(_dataTable.inArray3Ptr));
            var result   = Fma.MultiplyAddSubtract(firstOp, secondOp, thirdOp);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(firstOp, secondOp, thirdOp, _dataTable.outArrayPtr);
        }
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            var op1    = Sse2.LoadAlignedVector128((Double *)(_dataTable.inArray1Ptr));
            var op2    = Sse2.LoadAlignedVector128((Double *)(_dataTable.inArray2Ptr));
            var op3    = Sse2.LoadAlignedVector128((Double *)(_dataTable.inArray3Ptr));
            var result = Fma.MultiplyAddSubtract(op1, op2, op3);

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(op1, op2, op3, _dataTable.outArrayPtr);
        }
        public void RunBasicScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

            var result = Fma.MultiplyAddSubtract(
                Sse2.LoadVector128((Double *)(_dataTable.inArray1Ptr)),
                Sse2.LoadVector128((Double *)(_dataTable.inArray2Ptr)),
                Sse2.LoadVector128((Double *)(_dataTable.inArray3Ptr))
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
        }
예제 #17
0
        public void RunBasicScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_LoadAligned));

            var result = Fma.MultiplyAddSubtract(
                Avx.LoadAlignedVector256((Single *)(_dataTable.inArray1Ptr)),
                Avx.LoadAlignedVector256((Single *)(_dataTable.inArray2Ptr)),
                Avx.LoadAlignedVector256((Single *)(_dataTable.inArray3Ptr))
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
        }
예제 #18
0
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            var result = Fma.MultiplyAddSubtract(
                Unsafe.Read <Vector128 <Single> >(_dataTable.inArray1Ptr),
                Unsafe.Read <Vector128 <Single> >(_dataTable.inArray2Ptr),
                Unsafe.Read <Vector128 <Single> >(_dataTable.inArray3Ptr)
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
        }
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var test   = TestStruct.Create();
            var result = Fma.MultiplyAddSubtract(
                Sse2.LoadVector128((Double *)(&test._fld1)),
                Sse2.LoadVector128((Double *)(&test._fld2)),
                Sse2.LoadVector128((Double *)(&test._fld3))
                );

            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
        }
            public void RunStructFldScenario_Load(AlternatingTernaryOpTest__MultiplyAddSubtractDouble testClass)
            {
                fixed(Vector128 <Double> *pFld1 = &_fld1)
                fixed(Vector128 <Double> *pFld2 = &_fld2)
                fixed(Vector128 <Double> *pFld3 = &_fld3)
                {
                    var result = Fma.MultiplyAddSubtract(
                        Sse2.LoadVector128((Double *)(pFld1)),
                        Sse2.LoadVector128((Double *)(pFld2)),
                        Sse2.LoadVector128((Double *)(pFld3))
                        );

                    Unsafe.Write(testClass._dataTable.outArrayPtr, result);
                    testClass.ValidateResult(_fld1, _fld2, _fld3, testClass._dataTable.outArrayPtr);
                }
            }
예제 #21
0
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            fixed(Vector256 <Single> *pClsVar1 = &_clsVar1)
            fixed(Vector256 <Single> *pClsVar2 = &_clsVar2)
            fixed(Vector256 <Single> *pClsVar3 = &_clsVar3)
            {
                var result = Fma.MultiplyAddSubtract(
                    Avx.LoadVector256((Single *)(pClsVar1)),
                    Avx.LoadVector256((Single *)(pClsVar2)),
                    Avx.LoadVector256((Single *)(pClsVar3))
                    );

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(_clsVar1, _clsVar2, _clsVar3, _dataTable.outArrayPtr);
            }
        }
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed(Vector128 <Double> *pFld1 = &_fld1)
            fixed(Vector128 <Double> *pFld2 = &_fld2)
            fixed(Vector128 <Double> *pFld3 = &_fld3)
            {
                var result = Fma.MultiplyAddSubtract(
                    Sse2.LoadVector128((Double *)(pFld1)),
                    Sse2.LoadVector128((Double *)(pFld2)),
                    Sse2.LoadVector128((Double *)(pFld3))
                    );

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
            }
        }
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var test = new AlternatingTernaryOpTest__MultiplyAddSubtractDouble();

            fixed(Vector128 <Double> *pFld1 = &test._fld1)
            fixed(Vector128 <Double> *pFld2 = &test._fld2)
            fixed(Vector128 <Double> *pFld3 = &test._fld3)
            {
                var result = Fma.MultiplyAddSubtract(
                    Sse2.LoadVector128((Double *)(pFld1)),
                    Sse2.LoadVector128((Double *)(pFld2)),
                    Sse2.LoadVector128((Double *)(pFld3))
                    );

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
            }
        }
예제 #24
0
        public Intro()
        {
            var middleVector = Vector128.Create(1.0f);                      // middleVector = <1,1,1,1>

            middleVector = Vector128.CreateScalar(-1.0f);                   // middleVector = <-1,0,0,0>
            var floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191>

            if (Avx.IsSupported)
            {
                var left  = Vector256.Create(-2.5f);                     // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
                var right = Vector256.Create(5.0f);                      // <5, 5, 5, 5, 5, 5, 5, 5>
                Vector256 <float> result = Avx.AddSubtract(left, right); // result = <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5>xit
                left   = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, -70.0f, -80.0f);
                right  = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f);
                result = Avx.UnpackHigh(left, right);              // result = <-3, 3, -4, 4, -70, 70, -80, 80>
                result = Avx.UnpackLow(left, right);               // result = <-1, 1, -2, 2, -50, 50, -60, 60>
                result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-30, 0, 0, 0, -17400, 0, 0, 0>
                bool testResult = Avx.TestC(left, right);          // testResult = true
                testResult = Avx.TestC(right, left);               // testResult = false
                Vector256 <float> result1 = Avx.Divide(left, right);
                var plusOne = Vector256.Create(1.0f);
                result = Avx.Compare(right, result1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
                result = Avx.Compare(right, result1, FloatComparisonMode.UnorderedNotLessThanNonSignaling);
                left   = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f);
                right  = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                Vector256 <float> nanInFirstPosition = Avx.Divide(left, right);
                left = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f);
                Vector256 <float> InfInFirstPosition = Avx.Divide(left, right);

                left  = Vector256.Create(-1.1f, 3.0f, 1.0f / 3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f);
                right = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                Vector256 <float> compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN>
                Vector256 <float> mixed         = Avx.BlendVariable(left, right, compareResult);                                //  mixed = <-1, 2, -3, 2, -50, -60, -70, -80>

                //left = Vector256.Create(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f);
                //right = Vector256.Create(1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f);
                Vector256 <float> other = right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                bool bRes    = Avx.TestZ(plusOne, compareResult);
                bool bRes2   = Avx.TestC(plusOne, compareResult);
                bool allTrue = !Avx.TestZ(compareResult, compareResult);
                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.OrderedEqualNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN>
                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.UnorderedEqualNonSignaling);
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling);
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.OrderedGreaterThanNonSignaling);
                var left128  = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
                var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f);
                Vector128 <float> compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0>

                int res = Avx.MoveMask(compareResult);
                if (Fma.IsSupported)
                {
                    Vector256 <float> resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element
                    resultFma = Fma.MultiplyAddNegated(left, right, other);            // = -(left * right + other) for each element
                    resultFma = Fma.MultiplySubtract(left, right, other);              // = left * right - other for each element
                    Fma.MultiplyAddSubtract(left, right, other);                       // even elements (0, 2, ...) like MultiplyAdd, odd elements like MultiplySubtract
                }
                result = Avx.DotProduct(left, right, 0b1010_0001);                     // result = <-20, 0, 0, 0, -10000, 0, 0, 0>
                result = Avx.Floor(left);                                              // result = <-3, -3, -3, -3, -3, -3, -3, -3>
                result = Avx.Add(left, right);                                         // result = <2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5>
                result = Avx.Ceiling(left);                                            // result = <-2, -2, -2, -2, -2, -2, -2, -2>
                result = Avx.Multiply(left, right);                                    // result = <-12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5>
                result = Avx.HorizontalAdd(left, right);                               // result = <-5, -5, 10, 10, -5, -5, 10, 10>
                result = Avx.HorizontalSubtract(left, right);                          // result = <0, 0, 0, 0, 0, 0, 0, 0>
                double[] someDoubles      = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 };
                double[] someOtherDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
                double[] someResult       = new double[someDoubles.Length];
                float[]  someFloats       = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 };
                float[]  someOtherFloats  = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 };
                unsafe
                {
                    fixed(double *ptr = &someDoubles[1])
                    {
                        fixed(double *ptr2 = &someResult[0])
                        {
                            Vector256 <double> res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8>

                            Avx.Store(ptr2, res2);
                        }
                    }

                    fixed(float *ptr = &someFloats[0])
                    {
                        fixed(float *ptr2 = &someOtherFloats[0])
                        {
                            Vector256 <float> res2 = Avx.DotProduct(Avx.LoadVector256(ptr), Avx.LoadVector256(ptr2), 0b0001_0001);
                            //Avx.Store(ptr2, res2);
                        }
                    }
                }
            }
        }
예제 #25
0
        public static unsafe ComplexFloat[] Kernel32(ComplexFloat[] i, ref ComplexFloat[][] omegas)
        {
            ComplexFloat[] result = new ComplexFloat[32];
            ComplexFloat[] tmp    = new ComplexFloat[48];

            ComplexFloat ami = i[0] - i[8];
            ComplexFloat api = i[0] + i[8];
            ComplexFloat fmn = i[5] - i[13];
            ComplexFloat fpn = i[5] + i[13];

            ComplexFloat xami = i[16] - i[24];
            ComplexFloat xapi = i[16] + i[24];
            ComplexFloat xfmn = i[21] - i[29];
            ComplexFloat xfpn = i[21] + i[29];

            tmp[0] = api + i[2] + i[4] + i[6] + i[10] + i[12] + i[14];
            tmp[1] = ami + (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
            tmp[2] = api - i[4] - i[12] + (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
            tmp[3] = ami - (i[4] - i[12]).TimesMinusI() - (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()) * omegas[3][3];
            tmp[4] = api - i[2] + i[4] - i[6] - i[10] + i[12] - i[14];
            tmp[5] = ami - (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
            tmp[6] = api - i[4] - i[12] - (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
            tmp[7] = ami - (i[4] - i[12]).TimesMinusI() + (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()).TimesMinusI();

            tmp[8]  = i[1] + i[3] + fpn + i[7] + i[9] + i[11] + i[15];
            tmp[9]  = omegas[4][1] * (i[1] - i[9] + (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI());
            tmp[10] = omegas[4][2] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() + i[1] - fpn + i[9]);
            tmp[11] = omegas[4][3] * (omegas[3][3] * (i[11] - i[3] + (i[7] - i[15]).TimesMinusI()) - i[1] + i[9] + (fmn).TimesMinusI());
            tmp[12] = (i[1] - i[3] + fpn - i[7] + i[9] - i[11] - i[15]).TimesMinusI();
            tmp[13] = (i[1] - i[9] - (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI()) * omegas[4][5];
            tmp[14] = omegas[4][6] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() - i[1] + fpn - i[9]);
            tmp[15] = omegas[4][7] * ((i[11] - i[3] + (i[7] - i[15]).TimesMinusI()).TimesMinusI() + i[1] - i[9] - (fmn).TimesMinusI());

            tmp[16] = xapi + i[18] + i[20] + i[22] + i[28] + i[26] + i[30];
            tmp[17] = xami + (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
            tmp[18] = xapi - i[20] - i[28] + (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
            tmp[19] = xami - (i[20] - i[28]).TimesMinusI() - (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()) * omegas[3][3];
            tmp[20] = xapi - i[28] + i[20] - i[22] - i[26] + i[28] - i[30];
            tmp[21] = xami - (i[28] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[22]).TimesMinusI();
            tmp[22] = xapi - i[20] - i[28] - (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
            tmp[23] = xami - (i[20] - i[28]).TimesMinusI() + (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()).TimesMinusI();

            tmp[24] = i[17] + i[19] + xfpn + i[23] + i[25] + i[27] + i[31];
            tmp[25] = omegas[4][1] * (i[17] - i[25] + (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI());
            tmp[26] = omegas[4][2] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() + i[17] - xfpn + i[25]);
            tmp[27] = omegas[4][3] * (omegas[3][3] * (i[27] - i[19] + (i[23] - i[31]).TimesMinusI()) - i[17] + i[25] + (xfmn).TimesMinusI());
            tmp[28] = (i[17] - i[19] + xfpn - i[23] + i[25] - i[27] - i[31]).TimesMinusI();
            tmp[29] = (i[17] - i[25] - (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI()) * omegas[4][5];
            tmp[30] = omegas[4][6] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() - i[17] + xfpn - i[25]);
            tmp[31] = omegas[4][7] * ((i[27] - i[19] + (i[23] - i[31]).TimesMinusI()).TimesMinusI() + i[17] - i[25] - (xfmn).TimesMinusI());


            //32 complex floats = 64 floats
            //Divided into 4 parts A, B, C, D = each containing 8 complex floats, so 16 floats
            //AVX takes 8 floats at once, so will calculate in halves of those parts
            //Tmp will ocntain 6 octets

            fixed(ComplexFloat *entry = result, om5 = omegas[5], t = tmp)
            {
                Vector256 <float> a;
                Vector256 <float> b;
                Vector256 <float> bSwap;
                Vector256 <float> aIm;
                Vector256 <float> aRe;
                Vector256 <float> aIM_bSwap;

                float *partA    = (float *)entry;
                float *partB    = partA + 16;
                float *partC    = partA + 32;
                float *partD    = partA + 48;
                float *omPart1  = (float *)om5;
                float *omPart2  = omPart1 + 16;
                float *tmpPart1 = (float *)t;
                float *tmpPart2 = tmpPart1 + 16;
                float *tmpPart3 = tmpPart1 + 32;
                float *tmpPart4 = tmpPart1 + 48;
                float *tmpPart5 = tmpPart1 + 64;
                float *tmpPart6 = tmpPart1 + 80;

                //Summing up result

                Avx2.Store(partA, Avx2.Add(Avx2.LoadVector256(tmpPart1), Avx2.LoadVector256(tmpPart2)));
                Avx2.Store(partA + 8, Avx2.Add(Avx2.LoadVector256(tmpPart1 + 8), Avx2.LoadVector256(tmpPart2 + 8)));
                Avx2.Store(partB, Avx2.Subtract(Avx2.LoadVector256(tmpPart1), Avx2.LoadVector256(tmpPart2)));
                Avx2.Store(partB + 8, Avx2.Subtract(Avx2.LoadVector256(tmpPart1 + 8), Avx2.LoadVector256(tmpPart2 + 8)));

                Avx2.Store(partC, Avx2.Add(Avx2.LoadVector256(tmpPart3), Avx2.LoadVector256(tmpPart4)));
                Avx2.Store(partC + 8, Avx2.Add(Avx2.LoadVector256(tmpPart3 + 8), Avx2.LoadVector256(tmpPart4 + 8)));
                Avx2.Store(partD, Avx2.Subtract(Avx2.LoadVector256(tmpPart3), Avx2.LoadVector256(tmpPart4)));
                Avx2.Store(partD + 8, Avx2.Subtract(Avx2.LoadVector256(tmpPart3 + 8), Avx2.LoadVector256(tmpPart4 + 8)));



                //-------------------------------------------------------------------------------------------------------------

                //First part of each 8 complex part

                //Tmp[0] = A + B
                Avx2.Store(tmpPart1, Avx2.Add(Avx2.LoadVector256(partA), Avx2.LoadVector256(partB)));
                //Tmp[1] = A - B
                Avx2.Store(tmpPart2, Avx2.Subtract(Avx2.LoadVector256(partA), Avx2.LoadVector256(partB)));
                //Tmp[2] = C + D
                Avx2.Store(tmpPart3, Avx2.Add(Avx2.LoadVector256(partC), Avx2.LoadVector256(partD)));
                //Tmp[3] = C - D
                Avx2.Store(tmpPart4, Avx2.Subtract(Avx2.LoadVector256(partC), Avx2.LoadVector256(partD)));

                //Complex multiplication based on: https://www.researchgate.net/figure/Vectorized-complex-multiplication-using-AVX-2_fig2_337532904

                //Tmp[4] = omega * (C+D)
                a         = Avx2.LoadVector256(tmpPart3);
                b         = Avx2.LoadVector256(omPart1);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart5, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //Tmp[4] = omega * (C-D)
                a         = Avx2.LoadVector256(tmpPart4);
                b         = Avx2.LoadVector256(omPart2);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart6, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //(A+B) + (C+D)
                Avx2.Store(partA, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart3)));
                //(A-B) + (C-D)
                Avx2.Store(partB, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart4)));
                //(A+B) + omega*(C+D)
                Avx2.Store(partC, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart5)));
                //(A-B) + omega*(C-D)
                Avx2.Store(partD, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart6)));

                //--------------------------------------------------------------------------------------------------------------

                //Second part of each 8 complex part

                //Tmp[0] = A + B
                Avx2.Store(tmpPart1, Avx2.Add(Avx2.LoadVector256(partA + 8), Avx2.LoadVector256(partB + 8)));
                //Tmp[1] = A - B
                Avx2.Store(tmpPart2, Avx2.Subtract(Avx2.LoadVector256(partA + 8), Avx2.LoadVector256(partB + 8)));
                //Tmp[2] = C + D
                Avx2.Store(tmpPart3, Avx2.Add(Avx2.LoadVector256(partC + 8), Avx2.LoadVector256(partD + 8)));
                //Tmp[2] = C - D
                Avx2.Store(tmpPart4, Avx2.Subtract(Avx2.LoadVector256(partC + 8), Avx2.LoadVector256(partD + 8)));

                //Complex multiplication based on: https://www.researchgate.net/figure/Vectorized-complex-multiplication-using-AVX-2_fig2_337532904

                //Tmp[4] = omega * (C+D)
                a         = Avx2.LoadVector256(tmpPart3);
                b         = Avx2.LoadVector256(omPart1 + 8);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart5, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //Tmp[4] = omega * (C-D)
                a         = Avx2.LoadVector256(tmpPart4);
                b         = Avx2.LoadVector256(omPart2 + 8);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart6, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //(A+B) + (C+D)
                Avx2.Store(partA + 8, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart3)));
                //(A-B) + (C-D)
                Avx2.Store(partB + 8, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart4)));
                //(A+B) + omega*(C+D)
                Avx2.Store(partC + 8, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart5)));
                //(A-B) + omega*(C-D)
                Avx2.Store(partD + 8, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart6)));
            }

            return(result);

            //ComplexFloat[] result = new ComplexFloat[32];
            //ArraySegment<ComplexFloat> partA = new ArraySegment<ComplexFloat>(i, 0, 16);
            //ArraySegment<ComplexFloat> partB = new ArraySegment<ComplexFloat>(i, 16, 16);
            //Kernel16(partA.ToArray(), ref omegas).CopyTo(result, 0);
            //Kernel16(partA.ToArray(), ref omegas).CopyTo(result, 16);
            //return result;
        }