Ejemplo n.º 1
0
        /// <summary>
        /// Fully unrolled 32-element transform kernel: a scalar stage produces four
        /// 8-element partial results (two per 16-input half), which are then combined
        /// with AVX/AVX2/FMA vector instructions.
        /// </summary>
        /// <param name="i">Input samples; elements 0..31 are read.</param>
        /// <param name="omegas">
        /// Twiddle-factor tables. This method reads <c>omegas[3][1]</c>, <c>omegas[3][3]</c>,
        /// <c>omegas[4][1..7]</c> and the first 32 floats of <c>omegas[5]</c>.
        /// The table is only read; <c>ref</c> is kept for caller compatibility.
        /// </param>
        /// <returns>A newly allocated array of 32 <c>ComplexFloat</c> values.</returns>
        /// <remarks>
        /// The pointer arithmetic assumes <c>ComplexFloat</c> is exactly two consecutive
        /// floats (32 complex values = 64 floats). The hardware intrinsics used here require
        /// AVX, AVX2 and FMA support on the executing CPU.
        /// </remarks>
        public static unsafe ComplexFloat[] Kernel32(ComplexFloat[] i, ref ComplexFloat[][] omegas)
        {
            ComplexFloat[] result = new ComplexFloat[32];
            // Elements 0..31 hold the scalar-stage output; 32..47 are scratch for the
            // omega-multiplied products of the last stage (tmpPart5 / tmpPart6 below).
            ComplexFloat[] tmp    = new ComplexFloat[48];

            // Shared sub-expressions for the first half (inputs 0..15).
            ComplexFloat ami = i[0] - i[8];
            ComplexFloat api = i[0] + i[8];
            ComplexFloat fmn = i[5] - i[13];
            ComplexFloat fpn = i[5] + i[13];

            // The same sub-expressions for the second half (inputs 16..31).
            ComplexFloat xami = i[16] - i[24];
            ComplexFloat xapi = i[16] + i[24];
            ComplexFloat xfmn = i[21] - i[29];
            ComplexFloat xfpn = i[21] + i[29];

            // NOTE(review): tmp[7], tmp[15], tmp[23] and tmp[31] apply TimesMinusI() to the
            // term that tmp[3], tmp[11], tmp[19] and tmp[27] multiply by omegas[3][3].
            // This is kept exactly as found - confirm against a reference transform.

            // First half, even-indexed inputs (i[0], i[2], ..., i[14]).
            tmp[0] = api + i[2] + i[4] + i[6] + i[10] + i[12] + i[14];
            tmp[1] = ami + (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
            tmp[2] = api - i[4] - i[12] + (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
            tmp[3] = ami - (i[4] - i[12]).TimesMinusI() - (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()) * omegas[3][3];
            tmp[4] = api - i[2] + i[4] - i[6] - i[10] + i[12] - i[14];
            tmp[5] = ami - (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
            tmp[6] = api - i[4] - i[12] - (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
            tmp[7] = ami - (i[4] - i[12]).TimesMinusI() + (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()).TimesMinusI();

            // First half, odd-indexed inputs (i[1], i[3], ..., i[15]), scaled by omegas[4][k]
            // where a factor is applied.
            tmp[8]  = i[1] + i[3] + fpn + i[7] + i[9] + i[11] + i[15];
            tmp[9]  = omegas[4][1] * (i[1] - i[9] + (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI());
            tmp[10] = omegas[4][2] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() + i[1] - fpn + i[9]);
            tmp[11] = omegas[4][3] * (omegas[3][3] * (i[11] - i[3] + (i[7] - i[15]).TimesMinusI()) - i[1] + i[9] + (fmn).TimesMinusI());
            tmp[12] = (i[1] - i[3] + fpn - i[7] + i[9] - i[11] - i[15]).TimesMinusI();
            tmp[13] = (i[1] - i[9] - (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI()) * omegas[4][5];
            tmp[14] = omegas[4][6] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() - i[1] + fpn - i[9]);
            tmp[15] = omegas[4][7] * ((i[11] - i[3] + (i[7] - i[15]).TimesMinusI()).TimesMinusI() + i[1] - i[9] - (fmn).TimesMinusI());

            // Second half, even-indexed inputs: the tmp[0..7] formulas with every input index + 16.
            tmp[16] = xapi + i[18] + i[20] + i[22] + i[28] + i[26] + i[30];
            tmp[17] = xami + (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
            tmp[18] = xapi - i[20] - i[28] + (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
            tmp[19] = xami - (i[20] - i[28]).TimesMinusI() - (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()) * omegas[3][3];
            // Mirrors tmp[4]: the subtracted term is i[18] (it used to be i[28], which cancelled
            // against the later + i[28] and left i[18] out of the sum).
            tmp[20] = xapi - i[18] + i[20] - i[22] - i[26] + i[28] - i[30];
            // Mirrors tmp[5]: operands are (i[18] - i[26]) and (i[20] - i[28]), as in tmp[17].
            tmp[21] = xami - (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
            tmp[22] = xapi - i[20] - i[28] - (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
            tmp[23] = xami - (i[20] - i[28]).TimesMinusI() + (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()).TimesMinusI();

            // Second half, odd-indexed inputs: the tmp[8..15] formulas with every input index + 16.
            tmp[24] = i[17] + i[19] + xfpn + i[23] + i[25] + i[27] + i[31];
            tmp[25] = omegas[4][1] * (i[17] - i[25] + (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI());
            tmp[26] = omegas[4][2] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() + i[17] - xfpn + i[25]);
            tmp[27] = omegas[4][3] * (omegas[3][3] * (i[27] - i[19] + (i[23] - i[31]).TimesMinusI()) - i[17] + i[25] + (xfmn).TimesMinusI());
            tmp[28] = (i[17] - i[19] + xfpn - i[23] + i[25] - i[27] - i[31]).TimesMinusI();
            tmp[29] = (i[17] - i[25] - (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI()) * omegas[4][5];
            tmp[30] = omegas[4][6] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() - i[17] + xfpn - i[25]);
            tmp[31] = omegas[4][7] * ((i[27] - i[19] + (i[23] - i[31]).TimesMinusI()).TimesMinusI() + i[17] - i[25] - (xfmn).TimesMinusI());


            // 32 complex floats = 64 floats.
            // The result is divided into 4 parts A, B, C, D, each holding 8 complex floats = 16 floats.
            // A 256-bit vector holds 8 floats, so every part is processed in two halves.
            // tmp contains 6 such 16-float parts (tmpPart1..tmpPart6).

            // Pin the managed arrays so they can be addressed through raw float pointers.
            fixed(ComplexFloat *entry = result, om5 = omegas[5], t = tmp)
            {
                Vector256 <float> a;
                Vector256 <float> b;
                Vector256 <float> bSwap;
                Vector256 <float> aIm;
                Vector256 <float> aRe;
                Vector256 <float> aIM_bSwap;

                float *partA    = (float *)entry;
                float *partB    = partA + 16;
                float *partC    = partA + 32;
                float *partD    = partA + 48;
                float *omPart1  = (float *)om5;
                float *omPart2  = omPart1 + 16;
                float *tmpPart1 = (float *)t;
                float *tmpPart2 = tmpPart1 + 16;
                float *tmpPart3 = tmpPart1 + 32;
                float *tmpPart4 = tmpPart1 + 48;
                float *tmpPart5 = tmpPart1 + 64;
                float *tmpPart6 = tmpPart1 + 80;

                // Combine the scalar-stage outputs per 16-input half:
                // A = tmp[0..7] + tmp[8..15],   B = tmp[0..7] - tmp[8..15],
                // C = tmp[16..23] + tmp[24..31], D = tmp[16..23] - tmp[24..31].

                Avx2.Store(partA, Avx2.Add(Avx2.LoadVector256(tmpPart1), Avx2.LoadVector256(tmpPart2)));
                Avx2.Store(partA + 8, Avx2.Add(Avx2.LoadVector256(tmpPart1 + 8), Avx2.LoadVector256(tmpPart2 + 8)));
                Avx2.Store(partB, Avx2.Subtract(Avx2.LoadVector256(tmpPart1), Avx2.LoadVector256(tmpPart2)));
                Avx2.Store(partB + 8, Avx2.Subtract(Avx2.LoadVector256(tmpPart1 + 8), Avx2.LoadVector256(tmpPart2 + 8)));

                Avx2.Store(partC, Avx2.Add(Avx2.LoadVector256(tmpPart3), Avx2.LoadVector256(tmpPart4)));
                Avx2.Store(partC + 8, Avx2.Add(Avx2.LoadVector256(tmpPart3 + 8), Avx2.LoadVector256(tmpPart4 + 8)));
                Avx2.Store(partD, Avx2.Subtract(Avx2.LoadVector256(tmpPart3), Avx2.LoadVector256(tmpPart4)));
                Avx2.Store(partD + 8, Avx2.Subtract(Avx2.LoadVector256(tmpPart3 + 8), Avx2.LoadVector256(tmpPart4 + 8)));



                //-------------------------------------------------------------------------------------------------------------

                // First half (floats 0..7) of each 8-complex part.
                // From here on tmpPart1..tmpPart6 are reused as scratch; only their first 8 floats are written.

                //Tmp[0] = A + B
                Avx2.Store(tmpPart1, Avx2.Add(Avx2.LoadVector256(partA), Avx2.LoadVector256(partB)));
                //Tmp[1] = A - B
                Avx2.Store(tmpPart2, Avx2.Subtract(Avx2.LoadVector256(partA), Avx2.LoadVector256(partB)));
                //Tmp[2] = C + D
                Avx2.Store(tmpPart3, Avx2.Add(Avx2.LoadVector256(partC), Avx2.LoadVector256(partD)));
                //Tmp[3] = C - D
                Avx2.Store(tmpPart4, Avx2.Subtract(Avx2.LoadVector256(partC), Avx2.LoadVector256(partD)));

                //Complex multiplication based on: https://www.researchgate.net/figure/Vectorized-complex-multiplication-using-AVX-2_fig2_337532904

                //Tmp[4] = omega * (C+D), omega taken from floats 0..7 of omegas[5]
                a         = Avx2.LoadVector256(tmpPart3);
                b         = Avx2.LoadVector256(omPart1);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart5, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //Tmp[5] = omega * (C-D), omega taken from floats 16..23 of omegas[5]
                a         = Avx2.LoadVector256(tmpPart4);
                b         = Avx2.LoadVector256(omPart2);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart6, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //(A+B) + (C+D)
                Avx2.Store(partA, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart3)));
                //(A-B) + (C-D)
                Avx2.Store(partB, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart4)));
                //(A+B) + omega*(C+D)
                Avx2.Store(partC, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart5)));
                //(A-B) + omega*(C-D)
                Avx2.Store(partD, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart6)));

                //--------------------------------------------------------------------------------------------------------------

                // Second half (floats 8..15) of each 8-complex part; same steps with a +8 float offset.

                //Tmp[0] = A + B
                Avx2.Store(tmpPart1, Avx2.Add(Avx2.LoadVector256(partA + 8), Avx2.LoadVector256(partB + 8)));
                //Tmp[1] = A - B
                Avx2.Store(tmpPart2, Avx2.Subtract(Avx2.LoadVector256(partA + 8), Avx2.LoadVector256(partB + 8)));
                //Tmp[2] = C + D
                Avx2.Store(tmpPart3, Avx2.Add(Avx2.LoadVector256(partC + 8), Avx2.LoadVector256(partD + 8)));
                //Tmp[3] = C - D
                Avx2.Store(tmpPart4, Avx2.Subtract(Avx2.LoadVector256(partC + 8), Avx2.LoadVector256(partD + 8)));

                //Complex multiplication based on: https://www.researchgate.net/figure/Vectorized-complex-multiplication-using-AVX-2_fig2_337532904

                //Tmp[4] = omega * (C+D), omega taken from floats 8..15 of omegas[5]
                a         = Avx2.LoadVector256(tmpPart3);
                b         = Avx2.LoadVector256(omPart1 + 8);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart5, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //Tmp[5] = omega * (C-D), omega taken from floats 24..31 of omegas[5]
                a         = Avx2.LoadVector256(tmpPart4);
                b         = Avx2.LoadVector256(omPart2 + 8);
                bSwap     = Avx2.Shuffle(b, b, imm8bShuffle);
                aIm       = Avx2.Shuffle(a, a, imm8aImShuffle);
                aRe       = Avx2.Shuffle(a, a, imm8aReShuffle);
                aIM_bSwap = Avx.Multiply(aIm, bSwap);
                Avx2.Store(tmpPart6, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

                //(A+B) + (C+D)
                Avx2.Store(partA + 8, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart3)));
                //(A-B) + (C-D)
                Avx2.Store(partB + 8, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart4)));
                //(A+B) + omega*(C+D)
                Avx2.Store(partC + 8, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart5)));
                //(A-B) + omega*(C-D)
                Avx2.Store(partD + 8, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart6)));
            }

            return result;
        }