/// <summary>
/// Computes a 32-element complex result from a 32-element complex input in two phases:
/// a scalar phase that fills <c>tmp[0..31]</c> from the input and the tables
/// <c>omegas[3]</c> / <c>omegas[4]</c>, followed by AVX/FMA vector stages that combine
/// those values with <c>omegas[5]</c>.
/// NOTE(review): judging by the name and the omega tables this is presumably a 32-point
/// FFT kernel with <c>omegas</c> holding twiddle factors - confirm against the callers.
/// </summary>
/// <param name="i">Input samples; elements 0..31 are read.</param>
/// <param name="omegas">
/// Lookup tables. Reads <c>omegas[3][1]</c>, <c>omegas[3][3]</c>, <c>omegas[4][1..3]</c>,
/// <c>omegas[4][5..7]</c> and the first 16 entries of <c>omegas[5]</c>.
/// Passed by <c>ref</c> but never reassigned here.
/// </param>
/// <returns>A newly allocated array of 32 values.</returns>
/// <exception cref="System.ArgumentException">
/// <c>omegas[5]</c> has fewer than 16 entries (it is read through a raw pointer, so a
/// short table would otherwise be read out of bounds).
/// </exception>
public static unsafe ComplexFloat[] Kernel32(ComplexFloat[] i, ref ComplexFloat[][] omegas)
{
    // omegas[5] is pinned and read as 32 raw floats below; guard the unchecked access.
    if (omegas[5].Length < 16)
    {
        throw new System.ArgumentException("omegas[5] must contain at least 16 entries.", nameof(omegas));
    }

    ComplexFloat[] result = new ComplexFloat[32];
    // Six groups of 8 complex values: groups 0-3 hold the scalar-phase output,
    // groups 4-5 are scratch space for the vectorized complex products.
    ComplexFloat[] tmp = new ComplexFloat[48];

    // Shared sub-expressions for the lower half (inputs 0..15).
    ComplexFloat ami = i[0] - i[8];
    ComplexFloat api = i[0] + i[8];
    ComplexFloat fmn = i[5] - i[13];
    ComplexFloat fpn = i[5] + i[13];

    // Same sub-expressions for the upper half (inputs 16..31).
    ComplexFloat xami = i[16] - i[24];
    ComplexFloat xapi = i[16] + i[24];
    ComplexFloat xfmn = i[21] - i[29];
    ComplexFloat xfpn = i[21] + i[29];

    // ---- Lower half, even-indexed inputs -> tmp[0..7] ----
    tmp[0] = api + i[2] + i[4] + i[6] + i[10] + i[12] + i[14];
    tmp[1] = ami + (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
    tmp[2] = api - i[4] - i[12] + (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
    tmp[3] = ami - (i[4] - i[12]).TimesMinusI() - (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()) * omegas[3][3];
    tmp[4] = api - i[2] + i[4] - i[6] - i[10] + i[12] - i[14];
    tmp[5] = ami - (i[2] - i[10] + (i[6] - i[14]).TimesMinusI()) * omegas[3][1] + (i[4] - i[12]).TimesMinusI();
    tmp[6] = api - i[4] - i[12] - (i[2] - i[6] + i[10] - i[14]).TimesMinusI();
    tmp[7] = ami - (i[4] - i[12]).TimesMinusI() + (i[10] - i[2] + (i[6] - i[14]).TimesMinusI()).TimesMinusI();

    // ---- Lower half, odd-indexed inputs -> tmp[8..15] ----
    tmp[8] = i[1] + i[3] + fpn + i[7] + i[9] + i[11] + i[15];
    tmp[9] = omegas[4][1] * (i[1] - i[9] + (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI());
    tmp[10] = omegas[4][2] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() + i[1] - fpn + i[9]);
    tmp[11] = omegas[4][3] * (omegas[3][3] * (i[11] - i[3] + (i[7] - i[15]).TimesMinusI()) - i[1] + i[9] + (fmn).TimesMinusI());
    tmp[12] = (i[1] - i[3] + fpn - i[7] + i[9] - i[11] - i[15]).TimesMinusI();
    tmp[13] = (i[1] - i[9] - (i[3] - i[11] + (i[7] - i[15]).TimesMinusI()) * omegas[3][1] + (fmn).TimesMinusI()) * omegas[4][5];
    tmp[14] = omegas[4][6] * ((i[3] - i[7] + i[11] - i[15]).TimesMinusI() - i[1] + fpn - i[9]);
    tmp[15] = omegas[4][7] * ((i[11] - i[3] + (i[7] - i[15]).TimesMinusI()).TimesMinusI() + i[1] - i[9] - (fmn).TimesMinusI());

    // ---- Upper half, even-indexed inputs -> tmp[16..23] ----
    // Each line mirrors tmp[k - 16] with every input index shifted by 16.
    tmp[16] = xapi + i[18] + i[20] + i[22] + i[28] + i[26] + i[30];
    tmp[17] = xami + (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
    tmp[18] = xapi - i[20] - i[28] + (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
    tmp[19] = xami - (i[20] - i[28]).TimesMinusI() - (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()) * omegas[3][3];
    // Fixed: previously subtracted i[28] (which then cancelled with "+ i[28]") instead of i[18].
    tmp[20] = xapi - i[18] + i[20] - i[22] - i[26] + i[28] - i[30];
    // Fixed: previously used i[28] in place of i[18] and i[22] in place of i[28].
    tmp[21] = xami - (i[18] - i[26] + (i[22] - i[30]).TimesMinusI()) * omegas[3][1] + (i[20] - i[28]).TimesMinusI();
    tmp[22] = xapi - i[20] - i[28] - (i[18] - i[22] + i[26] - i[30]).TimesMinusI();
    tmp[23] = xami - (i[20] - i[28]).TimesMinusI() + (i[26] - i[18] + (i[22] - i[30]).TimesMinusI()).TimesMinusI();

    // ---- Upper half, odd-indexed inputs -> tmp[24..31] ----
    tmp[24] = i[17] + i[19] + xfpn + i[23] + i[25] + i[27] + i[31];
    tmp[25] = omegas[4][1] * (i[17] - i[25] + (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI());
    tmp[26] = omegas[4][2] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() + i[17] - xfpn + i[25]);
    tmp[27] = omegas[4][3] * (omegas[3][3] * (i[27] - i[19] + (i[23] - i[31]).TimesMinusI()) - i[17] + i[25] + (xfmn).TimesMinusI());
    tmp[28] = (i[17] - i[19] + xfpn - i[23] + i[25] - i[27] - i[31]).TimesMinusI();
    tmp[29] = (i[17] - i[25] - (i[19] - i[27] + (i[23] - i[31]).TimesMinusI()) * omegas[3][1] + (xfmn).TimesMinusI()) * omegas[4][5];
    tmp[30] = omegas[4][6] * ((i[19] - i[23] + i[27] - i[31]).TimesMinusI() - i[17] + xfpn - i[25]);
    tmp[31] = omegas[4][7] * ((i[27] - i[19] + (i[23] - i[31]).TimesMinusI()).TimesMinusI() + i[17] - i[25] - (xfmn).TimesMinusI());

    // 32 complex floats = 64 floats.
    // Divided into 4 parts A, B, C, D, each containing 8 complex floats, i.e. 16 floats.
    // A 256-bit vector holds 8 floats at once, so each part is processed in two halves.
    // tmp contains 6 groups of 8 complex floats.
    // NOTE(review): the pointer arithmetic assumes ComplexFloat is exactly two consecutive
    // 32-bit floats (real, imaginary) - confirm against the struct definition.
    fixed (ComplexFloat* entry = result, om5 = omegas[5], t = tmp)
    {
        Vector256<float> a;
        Vector256<float> b;
        Vector256<float> bSwap;
        Vector256<float> aIm;
        Vector256<float> aRe;
        Vector256<float> aIM_bSwap;

        float* partA = (float*)entry;
        float* partB = partA + 16;
        float* partC = partA + 32;
        float* partD = partA + 48;

        float* omPart1 = (float*)om5;
        float* omPart2 = omPart1 + 16;

        float* tmpPart1 = (float*)t;
        float* tmpPart2 = tmpPart1 + 16;
        float* tmpPart3 = tmpPart1 + 32;
        float* tmpPart4 = tmpPart1 + 48;
        float* tmpPart5 = tmpPart1 + 64;
        float* tmpPart6 = tmpPart1 + 80;

        // Stage 1: A = T1 + T2, B = T1 - T2, C = T3 + T4, D = T3 - T4 (both halves of each part).
        Avx2.Store(partA, Avx2.Add(Avx2.LoadVector256(tmpPart1), Avx2.LoadVector256(tmpPart2)));
        Avx2.Store(partA + 8, Avx2.Add(Avx2.LoadVector256(tmpPart1 + 8), Avx2.LoadVector256(tmpPart2 + 8)));
        Avx2.Store(partB, Avx2.Subtract(Avx2.LoadVector256(tmpPart1), Avx2.LoadVector256(tmpPart2)));
        Avx2.Store(partB + 8, Avx2.Subtract(Avx2.LoadVector256(tmpPart1 + 8), Avx2.LoadVector256(tmpPart2 + 8)));
        Avx2.Store(partC, Avx2.Add(Avx2.LoadVector256(tmpPart3), Avx2.LoadVector256(tmpPart4)));
        Avx2.Store(partC + 8, Avx2.Add(Avx2.LoadVector256(tmpPart3 + 8), Avx2.LoadVector256(tmpPart4 + 8)));
        Avx2.Store(partD, Avx2.Subtract(Avx2.LoadVector256(tmpPart3), Avx2.LoadVector256(tmpPart4)));
        Avx2.Store(partD + 8, Avx2.Subtract(Avx2.LoadVector256(tmpPart3 + 8), Avx2.LoadVector256(tmpPart4 + 8)));

        // Stage 2: the same sequence is applied to the first (offset 0) and second (offset 8)
        // half of every part. The first 8 floats of each tmp group are reused as scratch.
        for (int half = 0; half < 16; half += 8)
        {
            // Tmp[0] = A + B
            Avx2.Store(tmpPart1, Avx2.Add(Avx2.LoadVector256(partA + half), Avx2.LoadVector256(partB + half)));
            // Tmp[1] = A - B
            Avx2.Store(tmpPart2, Avx2.Subtract(Avx2.LoadVector256(partA + half), Avx2.LoadVector256(partB + half)));
            // Tmp[2] = C + D
            Avx2.Store(tmpPart3, Avx2.Add(Avx2.LoadVector256(partC + half), Avx2.LoadVector256(partD + half)));
            // Tmp[3] = C - D
            Avx2.Store(tmpPart4, Avx2.Subtract(Avx2.LoadVector256(partC + half), Avx2.LoadVector256(partD + half)));

            // Complex multiplication based on:
            // https://www.researchgate.net/figure/Vectorized-complex-multiplication-using-AVX-2_fig2_337532904
            // MultiplyAddSubtract subtracts in even lanes and adds in odd lanes, giving
            // (re*re - im*im, re*im + im*re) per complex pair - provided the imm8* shuffle
            // constants (defined outside this method) select the lanes their names suggest.

            // Tmp[4] = omega * (C+D)
            a = Avx2.LoadVector256(tmpPart3);
            b = Avx2.LoadVector256(omPart1 + half);
            bSwap = Avx2.Shuffle(b, b, imm8bShuffle);
            aIm = Avx2.Shuffle(a, a, imm8aImShuffle);
            aRe = Avx2.Shuffle(a, a, imm8aReShuffle);
            aIM_bSwap = Avx.Multiply(aIm, bSwap);
            Avx2.Store(tmpPart5, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

            // Tmp[5] = omega * (C-D)
            a = Avx2.LoadVector256(tmpPart4);
            b = Avx2.LoadVector256(omPart2 + half);
            bSwap = Avx2.Shuffle(b, b, imm8bShuffle);
            aIm = Avx2.Shuffle(a, a, imm8aImShuffle);
            aRe = Avx2.Shuffle(a, a, imm8aReShuffle);
            aIM_bSwap = Avx.Multiply(aIm, bSwap);
            Avx2.Store(tmpPart6, Fma.MultiplyAddSubtract(aRe, b, aIM_bSwap));

            // (A+B) + (C+D)
            Avx2.Store(partA + half, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart3)));
            // (A-B) + (C-D)
            Avx2.Store(partB + half, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart4)));
            // (A+B) + omega*(C+D)
            Avx2.Store(partC + half, Avx.Add(Avx.LoadVector256(tmpPart1), Avx.LoadVector256(tmpPart5)));
            // (A-B) + omega*(C-D)
            Avx2.Store(partD + half, Avx.Add(Avx.LoadVector256(tmpPart2), Avx.LoadVector256(tmpPart6)));
        }
    }

    return result;

    // Earlier Kernel16-based variant, left commented out:
    //ComplexFloat[] result = new ComplexFloat[32];
    //ArraySegment<ComplexFloat> partA = new ArraySegment<ComplexFloat>(i, 0, 16);
    //ArraySegment<ComplexFloat> partB = new ArraySegment<ComplexFloat>(i, 16, 16);
    //Kernel16(partA.ToArray(), ref omegas).CopyTo(result, 0);
    //Kernel16(partA.ToArray(), ref omegas).CopyTo(result, 16);
    //return result;
}