/*
 * Runs every butterfly stage of the FFT in place on the data that starts at
 * fout[fout_ptr]. The buffer is pinned once and the pointer-based butterflies
 * (kf_bfly2/3/4/5) are selected by the factor stored for each stage.
 *
 * st        FFT state; st.factors holds (factor, length) pairs, st.shift scales the twiddle stride
 * fout      buffer transformed in place
 * fout_ptr  index of the first element to transform inside fout
 */
internal static unsafe void opus_fft_impl(FFTState st, int[] fout, int fout_ptr)
{
    int[] strides = new int[MAXFACTORS];

    /* st.shift can be -1 */
    int twiddleShift = 0;
    if (st.shift > 0)
    {
        twiddleShift = st.shift;
    }

    /* Walk the (factor, length) pairs until a length of 1 is reached,
       accumulating the running product of the factors as the stage strides. */
    strides[0] = 1;
    int stageCount = 0;
    while (true)
    {
        int radix = st.factors[2 * stageCount];
        int length = st.factors[2 * stageCount + 1];
        strides[stageCount + 1] = strides[stageCount] * radix;
        stageCount++;
        if (length == 1)
        {
            break;
        }
    }

    fixed (int* pinned = fout)
    {
        int* data = pinned + fout_ptr;
        int curLen = st.factors[2 * stageCount - 1];

        /* Stages are executed from the last factor back to the first. */
        for (int stage = stageCount - 1; stage >= 0; stage--)
        {
            int nextLen = (stage == 0) ? 1 : st.factors[2 * stage - 1];
            int stride = strides[stage];

            switch (st.factors[2 * stage])
            {
                case 2:
                    kf_bfly2(data, curLen, stride);
                    break;
                case 3:
                    kf_bfly3(data, stride << twiddleShift, st, curLen, stride, nextLen);
                    break;
                case 4:
                    kf_bfly4(data, stride << twiddleShift, st, curLen, stride, nextLen);
                    break;
                case 5:
                    kf_bfly5(data, stride << twiddleShift, st, curLen, stride, nextLen);
                    break;
            }

            curLen = nextLen;
        }
    }
}
/*
 * Three-input butterfly stage, performed in place on an int[] that stores
 * values as interleaved pairs (elements p and p + 1 are combined using the
 * complex-multiply pattern below). opus_fft_impl calls this for a factor of 3.
 *
 * Fout      buffer updated in place
 * fout_ptr  index inside Fout where this stage's data starts
 * fstride   twiddle step: tw1 advances by 2 * fstride and tw2 by 4 * fstride per inner iteration
 * st        FFT state; only st.twiddles is read here
 * m         inner iteration count per group; must be >= 1 because the do/while
 *           body runs before k is tested
 * N         number of groups handled by the outer loop
 * mm        group spacing: group i starts at fout_ptr + 2 * i * mm
 */
internal static void kf_bfly3(
    int[] Fout, int fout_ptr, int fstride, FFTState st, int m, int N, int mm )
{
    int i;
    int k;
    /* Offsets (in ints) from the first input pair to the second and third input pairs. */
    int m1 = 2 * m;
    int m2 = 4 * m;
    /* Indices into st.twiddles, which is also read as interleaved pairs. */
    int tw1, tw2;
    int scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, scratch6, scratch7;
    /* Remember the start so every group offset is computed from the same base. */
    int Fout_beg = fout_ptr;
    for (i = 0; i < N; i++)
    {
        fout_ptr = Fout_beg + 2 * i * mm;
        /* Twiddle indices restart at 0 for every group. */
        tw1 = tw2 = 0;
        /* For non-custom modes, m is guaranteed to be a multiple of 4. */
        k = m;
        do
        {
            /* (scratch2, scratch3) = second input pair times the twiddle pair at tw1. */
            scratch2 = (S_MUL(Fout[fout_ptr + m1], st.twiddles[tw1]) - S_MUL(Fout[fout_ptr + m1 + 1], st.twiddles[tw1 + 1]));
            scratch3 = (S_MUL(Fout[fout_ptr + m1], st.twiddles[tw1 + 1]) + S_MUL(Fout[fout_ptr + m1 + 1], st.twiddles[tw1]));
            /* (scratch4, scratch5) = third input pair times the twiddle pair at tw2. */
            scratch4 = (S_MUL(Fout[fout_ptr + m2], st.twiddles[tw2]) - S_MUL(Fout[fout_ptr + m2 + 1], st.twiddles[tw2 + 1]));
            scratch5 = (S_MUL(Fout[fout_ptr + m2], st.twiddles[tw2 + 1]) + S_MUL(Fout[fout_ptr + m2 + 1], st.twiddles[tw2]));
            /* Sum and difference of the two rotated inputs. */
            scratch6 = scratch2 + scratch4;
            scratch7 = scratch3 + scratch5;
            scratch0 = scratch2 - scratch4;
            scratch1 = scratch3 - scratch5;
            tw1 += fstride * 2;
            tw2 += fstride * 4;
            /* Second slot temporarily holds first input minus HALF_OF(sum); this must
               happen before the first slot is overwritten below. */
            Fout[fout_ptr + m1] = Fout[fout_ptr + 0] - HALF_OF(scratch6);
            Fout[fout_ptr + m1 + 1] = Fout[fout_ptr + 1] - HALF_OF(scratch7);
            /* NOTE(review): -28378 is presumably -sin(2*pi/3) in Q15 - confirm against S_MUL's scaling. */
            scratch0 = S_MUL(scratch0, -28378);
            scratch1 = S_MUL(scratch1, -28378);
            /* First output: first input plus the sum of the rotated inputs. */
            Fout[fout_ptr + 0] += scratch6;
            Fout[fout_ptr + 1] += scratch7;
            /* Third output is derived from the temporary in the second slot, so it is
               written before the second slot receives its final value. */
            Fout[fout_ptr + m2] = Fout[fout_ptr + m1] + scratch1;
            Fout[fout_ptr + m2 + 1] = Fout[fout_ptr + m1 + 1] - scratch0;
            Fout[fout_ptr + m1] -= scratch1;
            Fout[fout_ptr + m1 + 1] += scratch0;
            /* Advance by one interleaved pair. */
            fout_ptr += 2;
        } while ((--k) != 0);
    }
}
/*
 * Pointer-based three-input butterfly stage, performed in place on a buffer
 * that stores values as interleaved pairs (p[0] and p[1] are combined using
 * the complex-multiply pattern below). The unsafe opus_fft_impl calls this
 * for a factor of 3.
 *
 * Fout      pointer to the first element of this stage's data; the caller must keep the memory pinned
 * fstride   twiddle step: tw1 advances by 2 * fstride and tw2 by 4 * fstride per inner iteration
 * st        FFT state; only st.twiddles is read here
 * m         inner iteration count per group; must be >= 1 because the do/while
 *           body runs before k is tested
 * N         number of groups handled by the outer loop
 * mm        group spacing: group i starts at Fout + 2 * i * mm
 */
internal static unsafe void kf_bfly3(
    int *Fout, int fstride, FFTState st, int m, int N, int mm )
{
    int i;
    int k;
    /* Offsets (in ints) from the first input pair to the second and third input pairs. */
    int m1 = 2 * m;
    int m2 = 4 * m;
    /* Indices into st.twiddles, which is also read as interleaved pairs. */
    int tw1, tw2;
    int scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, scratch6, scratch7;
    /* Remember the start so every group offset is computed from the same base. */
    int *Fout_beg = Fout;
    for (i = 0; i < N; i++)
    {
        Fout = Fout_beg + 2 * i * mm;
        /* Twiddle indices restart at 0 for every group. */
        tw1 = tw2 = 0;
        /* For non-custom modes, m is guaranteed to be a multiple of 4. */
        k = m;
        do
        {
            /* (scratch2, scratch3) = second input pair times the twiddle pair at tw1. */
            scratch2 = (S_MUL(*(Fout + m1), st.twiddles[tw1]) - S_MUL(*(Fout + m1 + 1), st.twiddles[tw1 + 1]));
            scratch3 = (S_MUL(*(Fout + m1), st.twiddles[tw1 + 1]) + S_MUL(*(Fout + m1 + 1), st.twiddles[tw1]));
            /* (scratch4, scratch5) = third input pair times the twiddle pair at tw2. */
            scratch4 = (S_MUL(*(Fout + m2), st.twiddles[tw2]) - S_MUL(*(Fout + m2 + 1), st.twiddles[tw2 + 1]));
            scratch5 = (S_MUL(*(Fout + m2), st.twiddles[tw2 + 1]) + S_MUL(*(Fout + m2 + 1), st.twiddles[tw2]));
            /* Sum and difference of the two rotated inputs. */
            scratch6 = scratch2 + scratch4;
            scratch7 = scratch3 + scratch5;
            scratch0 = scratch2 - scratch4;
            scratch1 = scratch3 - scratch5;
            tw1 += fstride * 2;
            tw2 += fstride * 4;
            /* Second slot temporarily holds first input minus HALF_OF(sum); this must
               happen before the first slot is overwritten below. */
            *(Fout + m1) = *(Fout) - HALF_OF(scratch6);
            *(Fout + m1 + 1) = *(Fout + 1) - HALF_OF(scratch7);
            /* NOTE(review): -28378 is presumably -sin(2*pi/3) in Q15 - confirm against S_MUL's scaling. */
            scratch0 = S_MUL(scratch0, -28378);
            scratch1 = S_MUL(scratch1, -28378);
            /* First output: first input plus the sum of the rotated inputs. */
            *(Fout) += scratch6;
            *(Fout + 1) += scratch7;
            /* Third output is derived from the temporary in the second slot, so it is
               written before the second slot receives its final value. */
            *(Fout + m2) = *(Fout + m1) + scratch1;
            *(Fout + m2 + 1) = *(Fout + m1 + 1) - scratch0;
            *(Fout + m1) -= scratch1;
            *(Fout + m1 + 1) += scratch0;
            /* Advance by one interleaved pair. */
            Fout += 2;
        } while ((--k) != 0);
    }
}
/*
 * Runs every butterfly stage of the FFT in place on the data that starts at
 * fout[fout_ptr], using the array-indexed butterflies (kf_bfly2/3/4/5)
 * selected by the factor stored for each stage. The stride scratch array
 * comes from fstrides.Value rather than being allocated per call.
 *
 * st        FFT state; st.factors holds (factor, length) pairs, st.shift scales the twiddle stride
 * fout      buffer transformed in place
 * fout_ptr  index of the first element to transform inside fout
 */
internal static void opus_fft_impl(FFTState st, int[] fout, int fout_ptr)
{
    int[] strides = fstrides.Value;

    /* st.shift can be -1 */
    int twiddleShift = 0;
    if (st.shift > 0)
    {
        twiddleShift = st.shift;
    }

    /* Walk the (factor, length) pairs until a length of 1 is reached,
       accumulating the running product of the factors as the stage strides. */
    strides[0] = 1;
    int stageCount = 0;
    while (true)
    {
        int radix = st.factors[2 * stageCount];
        int length = st.factors[2 * stageCount + 1];
        strides[stageCount + 1] = strides[stageCount] * radix;
        stageCount++;
        if (length == 1)
        {
            break;
        }
    }

    int curLen = st.factors[2 * stageCount - 1];

    /* Stages are executed from the last factor back to the first. */
    for (int stage = stageCount - 1; stage >= 0; stage--)
    {
        int nextLen = (stage == 0) ? 1 : st.factors[2 * stage - 1];
        int stride = strides[stage];

        switch (st.factors[2 * stage])
        {
            case 2:
                kf_bfly2(fout, fout_ptr, curLen, stride);
                break;
            case 3:
                kf_bfly3(fout, fout_ptr, stride << twiddleShift, st, curLen, stride, nextLen);
                break;
            case 4:
                kf_bfly4(fout, fout_ptr, stride << twiddleShift, st, curLen, stride, nextLen);
                break;
            case 5:
                kf_bfly5(fout, fout_ptr, stride << twiddleShift, st, curLen, stride, nextLen);
                break;
        }

        curLen = nextLen;
    }
}
/*
 * Out-of-place forward FFT entry point: scales each interleaved input pair,
 * stores it at its bit-reversed position in fout, then runs the butterfly
 * stages on fout via opus_fft_impl.
 *
 * st    FFT state supplying nfft, bitrev, scale and scale_shift
 * fin   source buffer (2 ints per bin); must not be the same array as fout
 * fout  destination buffer, fully overwritten
 */
internal static void opus_fft(FFTState st, int[] fin, int[] fout)
{
    /* Allows us to scale with MULT16_32_Q16() */
    int scale_shift = st.scale_shift - 1;
    short scale = st.scale;

    Inlines.OpusAssert(fin != fout, "In-place FFT not supported");

    /* Bit-reverse the input */
    for (int bin = 0; bin < st.nfft; bin++)
    {
        int src = 2 * bin;
        int dst = 2 * st.bitrev[bin];

        fout[dst] = Inlines.SHR32(Inlines.MULT16_32_Q16(scale, fin[src]), scale_shift);
        fout[dst + 1] = Inlines.SHR32(Inlines.MULT16_32_Q16(scale, fin[src + 1]), scale_shift);
    }

    opus_fft_impl(st, fout, 0);
}
/*
 * Forward MDCT (array-indexed version).
 *
 * The work is done in four steps: window/fold the input into the temporary f,
 * pre-rotate f into the bit-reversed temporary f2, run an N/4-point FFT on f2
 * with KissFFT.opus_fft_impl, then post-rotate f2 into output.
 *
 * Note: in this implementation the input array is only read; all intermediate
 * results go to the freshly allocated temporaries f and f2.
 *
 * l           lookup providing the base size n, the trig table and one FFT state per shift
 * input       source samples, read starting at input_ptr
 * input_ptr   index of the first sample in input
 * output      destination array
 * output_ptr  index of the first output element
 * window      window coefficients; overlap entries are read
 * overlap     overlap length used for the windowed regions
 * shift       selects l.kfft[shift]; N is halved this many times and the trig
 *             offset advanced by each halved N
 * stride      spacing between consecutive output elements
 */
internal static void clt_mdct_forward(MDCTLookup l, int[] input, int input_ptr, int[] output, int output_ptr, int[] window, int overlap, int shift, int stride)
{
    int i;
    int N, N2, N4;
    int[] f;
    int[] f2;
    FFTState st = l.kfft[shift];
    short[] trig;
    int trig_ptr = 0;
    int scale;
    /* One less than the state's shift; applied with PSHR32 after MULT16_32_Q16 in the pre-rotation. */
    int scale_shift = st.scale_shift - 1;
    scale = st.scale;
    N = l.n;
    trig = l.trig;
    /* Skip past the trig sub-tables of the larger transform sizes. */
    for (i = 0; i < shift; i++)
    {
        N = N >> 1;
        trig_ptr += N;
    }
    N2 = N >> 1;
    N4 = N >> 2;
    f = new int[N2];
    f2 = new int[N4 * 2];
    /* Consider the input to be composed of four blocks: [a, b, c, d] */
    /* Window, shuffle, fold */
    {
        /* Temp pointers to make it really clear to the compiler what we're doing */
        /* xp1 walks forward and xp2 walks backward through input; wp1/wp2 do the same through window. */
        int xp1 = input_ptr + (overlap >> 1);
        int xp2 = input_ptr + N2 - 1 + (overlap >> 1);
        int yp = 0;
        int wp1 = (overlap >> 1);
        int wp2 = ((overlap >> 1) - 1);
        /* First windowed region: (overlap + 3) >> 2 output pairs. */
        for (i = 0; i < ((overlap + 3) >> 2); i++)
        {
            /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
            f[yp++] = Inlines.MULT16_32_Q15(window[wp2], input[xp1 + N2]) + Inlines.MULT16_32_Q15(window[wp1], input[xp2]);
            f[yp++] = Inlines.MULT16_32_Q15(window[wp1], input[xp1]) - Inlines.MULT16_32_Q15(window[wp2], input[xp2 - N2]);
            xp1 += 2;
            xp2 -= 2;
            wp1 += 2;
            wp2 -= 2;
        }
        /* Restart the window indices at the two ends of the window for the last region. */
        wp1 = 0;
        wp2 = (overlap - 1);
        /* Middle region: samples are copied without windowing. */
        for (; i < N4 - ((overlap + 3) >> 2); i++)
        {
            /* Real part arranged as a-bR, Imag part arranged as -c-dR */
            f[yp++] = input[xp2];
            f[yp++] = input[xp1];
            xp1 += 2;
            xp2 -= 2;
        }
        /* Last windowed region: runs up to N4 output pairs in total. */
        for (; i < N4; i++)
        {
            /* Real part arranged as a-bR, Imag part arranged as -c-dR */
            f[yp++] = Inlines.MULT16_32_Q15(window[wp2], input[xp2]) - Inlines.MULT16_32_Q15(window[wp1], input[xp1 - N2]);
            f[yp++] = Inlines.MULT16_32_Q15(window[wp2], input[xp1]) + Inlines.MULT16_32_Q15(window[wp1], input[xp2 + N2]);
            xp1 += 2;
            xp2 -= 2;
            wp1 += 2;
            wp2 -= 2;
        }
    }
    /* Pre-rotation */
    {
        int yp = 0;
        int t = trig_ptr;
        for (i = 0; i < N4; i++)
        {
            short t0, t1;
            int re, im, yr, yi;
            /* The two trig values for bin i sit N4 entries apart in the table. */
            t0 = trig[t + i];
            t1 = trig[t + N4 + i];
            re = f[yp++];
            im = f[yp++];
            /* Rotate (re, im) by (t0, t1). */
            yr = KissFFT.S_MUL(re, t0) - 
KissFFT.S_MUL(im, t1);
            yi = KissFFT.S_MUL(im, t0) + KissFFT.S_MUL(re, t1);
            /* Scale and store at the bit-reversed position expected by opus_fft_impl. */
            f2[2 * st.bitrev[i]] = Inlines.PSHR32(Inlines.MULT16_32_Q16(scale, yr), scale_shift);
            f2[2 * st.bitrev[i] + 1] = Inlines.PSHR32(Inlines.MULT16_32_Q16(scale, yi), scale_shift);
        }
    }
    /* N/4 complex FFT, does not downscale anymore */
    KissFFT.opus_fft_impl(st, f2, 0);
    /* Post-rotate */
    {
        /* Temp pointers to make it really clear to the compiler what we're doing */
        int fp = 0;
        /* yp1 fills output upward from the start, yp2 downward from element N2 - 1 (both in units of stride). */
        int yp1 = output_ptr;
        int yp2 = output_ptr + (stride * (N2 - 1));
        int t = trig_ptr;
        for (i = 0; i < N4; i++)
        {
            int yr, yi;
            yr = KissFFT.S_MUL(f2[fp + 1], trig[t + N4 + i]) - KissFFT.S_MUL(f2[fp], trig[t + i]);
            yi = KissFFT.S_MUL(f2[fp], trig[t + N4 + i]) + KissFFT.S_MUL(f2[fp + 1], trig[t + i]);
            output[yp1] = yr;
            output[yp2] = yi;
            fp += 2;
            yp1 += (2 * stride);
            yp2 -= (2 * stride);
        }
    }
}
/*
 * Five-input butterfly stage, performed in place on an int[] that stores
 * values as interleaved pairs (elements p and p + 1 are combined using the
 * complex-multiply pattern below). opus_fft_impl calls this for a factor of 5.
 *
 * Fout      buffer updated in place
 * fout_ptr  index inside Fout where this stage's data starts
 * fstride   twiddle step: tw1..tw4 advance by 2, 4, 6 and 8 times fstride per inner iteration
 * st        FFT state; only st.twiddles is read here
 * m         inner iteration count per group; the five inputs are 2 * m ints apart
 * N         number of groups handled by the outer loop
 * mm        group spacing: group i starts at fout_ptr + 2 * i * mm
 */
internal static void kf_bfly5(
    int[] Fout, int fout_ptr, int fstride, FFTState st, int m, int N, int mm )
{
    /* Running indices of the five input/output pairs inside Fout. */
    int Fout0, Fout1, Fout2, Fout3, Fout4;
    int i, u;
    int scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, scratch6, scratch7, scratch8, scratch9, scratch10, scratch11, scratch12, scratch13, scratch14, scratch15, scratch16, scratch17, scratch18, scratch19, scratch20, scratch21, scratch22, scratch23, scratch24, scratch25;
    int Fout_beg = fout_ptr;
    /* NOTE(review): these look like Q15 values of cos/-sin(2*pi/5) (ya) and
       cos/-sin(4*pi/5) (yb) - confirm against S_MUL's scaling. */
    short ya_r = 10126;
    short ya_i = -31164;
    short yb_r = -26510;
    short yb_i = -19261;
    /* Indices into st.twiddles, which is also read as interleaved pairs. */
    int tw1, tw2, tw3, tw4;
    for (i = 0; i < N; i++)
    {
        /* Twiddle indices restart at 0 for every group. */
        tw1 = tw2 = tw3 = tw4 = 0;
        fout_ptr = Fout_beg + 2 * i * mm;
        Fout0 = fout_ptr;
        Fout1 = fout_ptr + (2 * m);
        Fout2 = fout_ptr + (4 * m);
        Fout3 = fout_ptr + (6 * m);
        Fout4 = fout_ptr + (8 * m);
        /* For non-custom modes, m is guaranteed to be a multiple of 4. */
        for (u = 0; u < m; ++u)
        {
            /* Save the first input; its slot is overwritten before it is needed again. */
            scratch0 = Fout[Fout0 + 0];
            scratch1 = Fout[Fout0 + 1];
            /* Inputs 1..4, each multiplied by its own twiddle pair. */
            scratch2 = (S_MUL(Fout[Fout1 + 0], st.twiddles[tw1]) - S_MUL(Fout[Fout1 + 1], st.twiddles[tw1 + 1]));
            scratch3 = (S_MUL(Fout[Fout1 + 0], st.twiddles[tw1 + 1]) + S_MUL(Fout[Fout1 + 1], st.twiddles[tw1]));
            scratch4 = (S_MUL(Fout[Fout2 + 0], st.twiddles[tw2]) - S_MUL(Fout[Fout2 + 1], st.twiddles[tw2 + 1]));
            scratch5 = (S_MUL(Fout[Fout2 + 0], st.twiddles[tw2 + 1]) + S_MUL(Fout[Fout2 + 1], st.twiddles[tw2]));
            scratch6 = (S_MUL(Fout[Fout3 + 0], st.twiddles[tw3]) - S_MUL(Fout[Fout3 + 1], st.twiddles[tw3 + 1]));
            scratch7 = (S_MUL(Fout[Fout3 + 0], st.twiddles[tw3 + 1]) + S_MUL(Fout[Fout3 + 1], st.twiddles[tw3]));
            scratch8 = (S_MUL(Fout[Fout4 + 0], st.twiddles[tw4]) - S_MUL(Fout[Fout4 + 1], st.twiddles[tw4 + 1]));
            scratch9 = (S_MUL(Fout[Fout4 + 0], st.twiddles[tw4 + 1]) + S_MUL(Fout[Fout4 + 1], st.twiddles[tw4]));
            tw1 += (2 * fstride);
            tw2 += (4 * fstride);
            tw3 += (6 * fstride);
            tw4 += (8 * fstride);
            /* Sums and differences of the outer (1, 4) and inner (2, 3) rotated inputs. */
            scratch14 = scratch2 + scratch8;
            scratch15 = scratch3 + scratch9;
            scratch20 = scratch2 - scratch8;
            scratch21 = scratch3 - scratch9;
            scratch16 = scratch4 + 
scratch6;
            scratch17 = scratch5 + scratch7;
            scratch18 = scratch4 - scratch6;
            scratch19 = scratch5 - scratch7;
            /* Output 0: sum of all five inputs. */
            Fout[Fout0 + 0] += scratch14 + scratch16;
            Fout[Fout0 + 1] += scratch15 + scratch17;
            /* Outputs 1 and 4 share (scratch10, scratch11) and differ by the sign of (scratch12, scratch13). */
            scratch10 = scratch0 + S_MUL(scratch14, ya_r) + S_MUL(scratch16, yb_r);
            scratch11 = scratch1 + S_MUL(scratch15, ya_r) + S_MUL(scratch17, yb_r);
            scratch12 = S_MUL(scratch21, ya_i) + S_MUL(scratch19, yb_i);
            scratch13 = 0 - S_MUL(scratch20, ya_i) - S_MUL(scratch18, yb_i);
            Fout[Fout1 + 0] = scratch10 - scratch12;
            Fout[Fout1 + 1] = scratch11 - scratch13;
            Fout[Fout4 + 0] = scratch10 + scratch12;
            Fout[Fout4 + 1] = scratch11 + scratch13;
            /* Outputs 2 and 3 share (scratch22, scratch23) and differ by the sign of (scratch24, scratch25). */
            scratch22 = scratch0 + S_MUL(scratch14, yb_r) + S_MUL(scratch16, ya_r);
            scratch23 = scratch1 + S_MUL(scratch15, yb_r) + S_MUL(scratch17, ya_r);
            scratch24 = 0 - S_MUL(scratch21, yb_i) + S_MUL(scratch19, ya_i);
            scratch25 = S_MUL(scratch20, yb_i) - S_MUL(scratch18, ya_i);
            Fout[Fout2 + 0] = scratch22 + scratch24;
            Fout[Fout2 + 1] = scratch23 + scratch25;
            Fout[Fout3 + 0] = scratch22 - scratch24;
            Fout[Fout3 + 1] = scratch23 - scratch25;
            /* Advance every index by one interleaved pair. */
            Fout0 += 2;
            Fout1 += 2;
            Fout2 += 2;
            Fout3 += 2;
            Fout4 += 2;
        }
    }
}
/*
 * Four-input butterfly stage, performed in place on an int[] that stores
 * values as interleaved pairs (elements p and p + 1). opus_fft_impl calls
 * this for a factor of 4.
 *
 * Fout      buffer updated in place
 * fout_ptr  index inside Fout where this stage's data starts
 * fstride   twiddle step: tw1, tw2, tw3 advance by 2, 4 and 6 times fstride
 *           per inner iteration (unused when m == 1)
 * st        FFT state; only st.twiddles is read, and only when m != 1
 * m         inner iteration count per group; the four inputs are 2 * m ints apart
 * N         number of groups handled by the outer loop
 * mm        group spacing: group i starts at fout_ptr + 2 * i * mm (unused
 *           when m == 1, where groups are 8 consecutive ints)
 */
internal static void kf_bfly4(
    int[] Fout, int fout_ptr, int fstride, FFTState st, int m, int N, int mm)
{
    int i;
    if (m == 1)
    {
        /* Degenerate case where all the twiddles are 1. */
        int scratch0, scratch1, scratch2, scratch3;
        for (i = 0; i < N; i++)
        {
            /* The four pairs of this group live at offsets 0-1, 2-3, 4-5 and 6-7. */
            /* Difference and sum of pairs 0 and 2 (the sum stays in slot 0). */
            scratch0 = Fout[fout_ptr + 0] - Fout[fout_ptr + 4];
            scratch1 = Fout[fout_ptr + 1] - Fout[fout_ptr + 5];
            Fout[fout_ptr + 0] += Fout[fout_ptr + 4];
            Fout[fout_ptr + 1] += Fout[fout_ptr + 5];
            /* Sum of pairs 1 and 3. */
            scratch2 = Fout[fout_ptr + 2] + Fout[fout_ptr + 6];
            scratch3 = Fout[fout_ptr + 3] + Fout[fout_ptr + 7];
            /* Slot 2 must be written before slot 0 receives its final value. */
            Fout[fout_ptr + 4] = Fout[fout_ptr + 0] - scratch2;
            Fout[fout_ptr + 5] = Fout[fout_ptr + 1] - scratch3;
            Fout[fout_ptr + 0] += scratch2;
            Fout[fout_ptr + 1] += scratch3;
            /* Difference of pairs 1 and 3, read before slots 1 and 3 are overwritten. */
            scratch2 = Fout[fout_ptr + 2] - Fout[fout_ptr + 6];
            scratch3 = Fout[fout_ptr + 3] - Fout[fout_ptr + 7];
            /* Slots 1 and 3 combine the two differences with swapped components and opposite signs. */
            Fout[fout_ptr + 2] = scratch0 + scratch3;
            Fout[fout_ptr + 3] = scratch1 - scratch2;
            Fout[fout_ptr + 6] = scratch0 - scratch3;
            Fout[fout_ptr + 7] = scratch1 + scratch2;
            /* Next group of four pairs. */
            fout_ptr += 8;
        }
    }
    else
    {
        int j;
        int scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, scratch6, scratch7, scratch8, scratch9, scratch10, scratch11;
        /* Indices into st.twiddles, which is also read as interleaved pairs. */
        int tw1, tw2, tw3;
        int Fout_beg = fout_ptr;
        for (i = 0; i < N; i++)
        {
            fout_ptr = Fout_beg + 2 * i * mm;
            /* Running indices of inputs 1, 2 and 3; input 0 is at fout_ptr. */
            int m1 = fout_ptr + (2 * m);
            int m2 = fout_ptr + (4 * m);
            int m3 = fout_ptr + (6 * m);
            /* Twiddle indices restart at 0 for every group. */
            tw3 = tw2 = tw1 = 0;
            /* m is guaranteed to be a multiple of 4. */
            for (j = 0; j < m; j++)
            {
                /* Inputs 1..3, each multiplied by its own twiddle pair. */
                scratch0 = (S_MUL(Fout[m1], st.twiddles[tw1]) - S_MUL(Fout[m1 + 1], st.twiddles[tw1 + 1]));
                scratch1 = (S_MUL(Fout[m1], st.twiddles[tw1 + 1]) + S_MUL(Fout[m1 + 1], st.twiddles[tw1]));
                scratch2 = (S_MUL(Fout[m2], st.twiddles[tw2]) - S_MUL(Fout[m2 + 1], st.twiddles[tw2 + 1]));
                scratch3 = (S_MUL(Fout[m2], st.twiddles[tw2 + 1]) + S_MUL(Fout[m2 + 1], st.twiddles[tw2]));
                scratch4 = (S_MUL(Fout[m3], st.twiddles[tw3]) - S_MUL(Fout[m3 + 1], st.twiddles[tw3 + 1]));
                scratch5 = (S_MUL(Fout[m3], st.twiddles[tw3 + 1]) + S_MUL(Fout[m3 + 1], st.twiddles[tw3]));
                /* Difference and sum of input 0 and rotated input 2 (the sum stays in slot 0). */
                scratch10 = Fout[fout_ptr] - scratch2;
                scratch11 = Fout[fout_ptr + 1] - scratch3;
                Fout[fout_ptr] += scratch2;
                Fout[fout_ptr + 1] += scratch3;
                /* Sum and difference of rotated inputs 1 and 3. */
                scratch6 = scratch0 + scratch4;
                scratch7 = scratch1 + scratch5;
                scratch8 = scratch0 - scratch4;
                scratch9 = scratch1 - scratch5;
                /* Slot 2 must be written before slot 0 receives its final value. */
                Fout[m2] = Fout[fout_ptr] - scratch6;
                Fout[m2 + 1] = Fout[fout_ptr + 1] - scratch7;
                tw1 += fstride * 2;
                tw2 += fstride * 4;
                tw3 += fstride * 6;
                Fout[fout_ptr] += scratch6;
                Fout[fout_ptr + 1] += scratch7;
                /* Slots 1 and 3 combine the two differences with swapped components and opposite signs. */
                Fout[m1] = scratch10 + scratch9;
                Fout[m1 + 1] = scratch11 - scratch8;
                Fout[m3] = scratch10 - scratch9;
                Fout[m3 + 1] = scratch11 + scratch8;
                /* Advance every index by one interleaved pair. */
                fout_ptr += 2;
                m1 += 2;
                m2 += 2;
                m3 += 2;
            }
        }
    }
}
/*
 * Forward MDCT (pointer-based version).
 *
 * The work is done in four steps: window/fold the input into the temporary f,
 * pre-rotate f into the bit-reversed temporary f2, run an N/4-point FFT on f2
 * with KissFFT.opus_fft_impl, then post-rotate f2 into output. The trig table,
 * input, window, both temporaries and output are pinned with fixed blocks
 * while they are accessed through pointers.
 *
 * Note: in this implementation the input array is only read; all intermediate
 * results go to the freshly allocated temporaries f and f2.
 *
 * l           lookup providing the base size n, the trig table and one FFT state per shift
 * input       source samples, read starting at input_ptr
 * input_ptr   index of the first sample in input
 * output      destination array
 * output_ptr  index of the first output element
 * window      window coefficients; overlap entries are read
 * overlap     overlap length used for the windowed regions
 * shift       selects l.kfft[shift]; N is halved this many times and the trig
 *             pointer advanced by each halved N
 * stride      spacing between consecutive output elements
 */
internal static unsafe void clt_mdct_forward(MDCTLookup l, int[] input, int input_ptr, int[] output, int output_ptr, int[] window, int overlap, int shift, int stride)
{
    int i;
    int N, N2, N4;
    int[] f;
    int[] f2;
    FFTState st = l.kfft[shift];
    int scale;
    /* One less than the state's shift; applied with PSHR32 after MULT16_32_Q16 in the pre-rotation. */
    int scale_shift = st.scale_shift - 1;
    scale = st.scale;
    N = l.n;
    fixed(short *ptrig_base = l.trig)
    {
        short *trig = ptrig_base;
        /* Skip past the trig sub-tables of the larger transform sizes. */
        for (i = 0; i < shift; i++)
        {
            N = N >> 1;
            trig += N;
        }
        N2 = N >> 1;
        N4 = N >> 2;
        f = new int[N2];
        f2 = new int[N4 * 2];
        fixed(int *pinput_base = input, pwindow = window, pf = f, pf2 = f2)
        {
            int *pinput = pinput_base + input_ptr;
            /* Consider the input to be composed of four blocks: [a, b, c, d] */
            /* Window, shuffle, fold */
            {
                /* Temp pointers to make it really clear to the compiler what we're doing */
                /* xp1 walks forward and xp2 walks backward through input; wp1/wp2 do the same through window. */
                int *xp1 = pinput + (overlap >> 1);
                int *xp2 = pinput + N2 - 1 + (overlap >> 1);
                int *yp = pf;
                int *wp1 = pwindow + (overlap >> 1);
                int *wp2 = pwindow + ((overlap >> 1) - 1);
                /* First windowed region: (overlap + 3) >> 2 output pairs. */
                for (i = 0; i < ((overlap + 3) >> 2); i++)
                {
                    /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
                    *yp++ = Inlines.MULT16_32_Q15(*wp2, xp1[N2]) + Inlines.MULT16_32_Q15(*wp1, *xp2);
                    *yp++ = Inlines.MULT16_32_Q15(*wp1, *xp1) - Inlines.MULT16_32_Q15(*wp2, xp2[0 - N2]);
                    xp1 += 2;
                    xp2 -= 2;
                    wp1 += 2;
                    wp2 -= 2;
                }
                /* Restart the window pointers at the two ends of the window for the last region. */
                wp1 = pwindow;
                wp2 = pwindow + (overlap - 1);
                /* Middle region: samples are copied without windowing. */
                for (; i < N4 - ((overlap + 3) >> 2); i++)
                {
                    /* Real part arranged as a-bR, Imag part arranged as -c-dR */
                    *yp++ = *xp2;
                    *yp++ = *xp1;
                    xp1 += 2;
                    xp2 -= 2;
                }
                /* Last windowed region: runs up to N4 output pairs in total. */
                for (; i < N4; i++)
                {
                    /* Real part arranged as a-bR, Imag part arranged as -c-dR */
                    *yp++ = Inlines.MULT16_32_Q15(*wp2, *xp2) - Inlines.MULT16_32_Q15(*wp1, xp1[0 - N2]);
                    *yp++ = Inlines.MULT16_32_Q15(*wp2, *xp1) + Inlines.MULT16_32_Q15(*wp1, xp2[N2]);
                    xp1 += 2;
                    xp2 -= 2;
                    wp1 += 2;
                    wp2 -= 2;
                }
            }
            /* Pre-rotation */
            {
                int * yp = pf;
                short *t = trig;
                for (i = 0; i < N4; i++)
                {
                    short t0, t1;
                    int re, im, yr, yi;
                    /* The two trig values for bin i sit N4 entries apart in the table. */
                    t0 = t[i];
                    t1 = t[N4 + i];
                    re = *yp++;
                    im = *yp++;
                    /* Rotate (re, im) by (t0, t1). */
                    yr = 
KissFFT.S_MUL(re, t0) - KissFFT.S_MUL(im, t1);
                    yi = KissFFT.S_MUL(im, t0) + KissFFT.S_MUL(re, t1);
                    /* Scale and store at the bit-reversed position expected by opus_fft_impl. */
                    pf2[2 * st.bitrev[i]] = Inlines.PSHR32(Inlines.MULT16_32_Q16(scale, yr), scale_shift);
                    pf2[2 * st.bitrev[i] + 1] = Inlines.PSHR32(Inlines.MULT16_32_Q16(scale, yi), scale_shift);
                }
            }
            /* N/4 complex FFT, does not downscale anymore */
            /* The managed array f2 is passed here; pf2 points at the same storage. */
            KissFFT.opus_fft_impl(st, f2, 0);
            /* Post-rotate */
            fixed(int *poutput_base = output)
            {
                /* Temp pointers to make it really clear to the compiler what we're doing */
                int * fp = pf2;
                /* yp1 fills output upward from the start, yp2 downward from element N2 - 1 (both in units of stride). */
                int * yp1 = poutput_base + output_ptr;
                int * yp2 = poutput_base + output_ptr + (stride * (N2 - 1));
                short *t = trig;
                for (i = 0; i < N4; i++)
                {
                    int yr, yi;
                    yr = KissFFT.S_MUL(fp[1], t[N4 + i]) - KissFFT.S_MUL(fp[0], t[i]);
                    yi = KissFFT.S_MUL(fp[0], t[N4 + i]) + KissFFT.S_MUL(fp[1], t[i]);
                    *yp1 = yr;
                    *yp2 = yi;
                    fp += 2;
                    yp1 += (2 * stride);
                    yp2 -= (2 * stride);
                }
            }
        }
    }
}
/*
 * Pointer-based four-input butterfly stage, performed in place on a buffer
 * that stores values as interleaved pairs (p[0] and p[1]). The unsafe
 * opus_fft_impl calls this for a factor of 4.
 *
 * Fout      pointer to the first element of this stage's data; the caller must keep the memory pinned
 * fstride   twiddle step: tw1, tw2, tw3 advance by 2, 4 and 6 times fstride
 *           per inner iteration (unused when m == 1)
 * st        FFT state; only st.twiddles is read, and only when m != 1
 * m         inner iteration count per group; the four inputs are 2 * m ints apart
 * N         number of groups handled by the outer loop
 * mm        group spacing: group i starts at Fout + 2 * i * mm (unused when
 *           m == 1, where groups are 8 consecutive ints)
 */
internal static unsafe void kf_bfly4(
    int *Fout, int fstride, FFTState st, int m, int N, int mm)
{
    int i;
    if (m == 1)
    {
        /* Degenerate case where all the twiddles are 1. */
        int scratch0, scratch1, scratch2, scratch3;
        for (i = 0; i < N; i++)
        {
            /* The four pairs of this group live at offsets 0-1, 2-3, 4-5 and 6-7. */
            /* Difference and sum of pairs 0 and 2 (the sum stays in slot 0). */
            scratch0 = *(Fout) - *(Fout + 4);
            scratch1 = *(Fout + 1) - *(Fout + 5);
            *(Fout + 0) += *(Fout + 4);
            *(Fout + 1) += *(Fout + 5);
            /* Sum of pairs 1 and 3. */
            scratch2 = *(Fout + 2) + *(Fout + 6);
            scratch3 = *(Fout + 3) + *(Fout + 7);
            /* Slot 2 must be written before slot 0 receives its final value. */
            *(Fout + 4) = *(Fout + 0) - scratch2;
            *(Fout + 5) = *(Fout + 1) - scratch3;
            *(Fout + 0) += scratch2;
            *(Fout + 1) += scratch3;
            /* Difference of pairs 1 and 3, read before slots 1 and 3 are overwritten. */
            scratch2 = *(Fout + 2) - *(Fout + 6);
            scratch3 = *(Fout + 3) - *(Fout + 7);
            /* Slots 1 and 3 combine the two differences with swapped components and opposite signs. */
            *(Fout + 2) = scratch0 + scratch3;
            *(Fout + 3) = scratch1 - scratch2;
            *(Fout + 6) = scratch0 - scratch3;
            *(Fout + 7) = scratch1 + scratch2;
            /* Next group of four pairs. */
            Fout += 8;
        }
    }
    else
    {
        int j;
        int scratch0, scratch1, scratch2, scratch3, scratch4, scratch5, scratch6, scratch7, scratch8, scratch9, scratch10, scratch11;
        /* Indices into st.twiddles, which is also read as interleaved pairs. */
        int tw1, tw2, tw3;
        int *Fout_beg = Fout;
        for (i = 0; i < N; i++)
        {
            Fout = Fout_beg + 2 * i * mm;
            /* Running pointers to inputs 1, 2 and 3; input 0 is at Fout. */
            int *m1 = Fout + (2 * m);
            int *m2 = Fout + (4 * m);
            int *m3 = Fout + (6 * m);
            /* Twiddle indices restart at 0 for every group. */
            tw3 = tw2 = tw1 = 0;
            /* m is guaranteed to be a multiple of 4. */
            for (j = 0; j < m; j++)
            {
                /* Inputs 1..3, each multiplied by its own twiddle pair. */
                scratch0 = (S_MUL(*m1, st.twiddles[tw1]) - S_MUL(*(m1 + 1), st.twiddles[tw1 + 1]));
                scratch1 = (S_MUL(*m1, st.twiddles[tw1 + 1]) + S_MUL(*(m1 + 1), st.twiddles[tw1]));
                scratch2 = (S_MUL(*m2, st.twiddles[tw2]) - S_MUL(*(m2 + 1), st.twiddles[tw2 + 1]));
                scratch3 = (S_MUL(*m2, st.twiddles[tw2 + 1]) + S_MUL(*(m2 + 1), st.twiddles[tw2]));
                scratch4 = (S_MUL(*m3, st.twiddles[tw3]) - S_MUL(*(m3 + 1), st.twiddles[tw3 + 1]));
                scratch5 = (S_MUL(*m3, st.twiddles[tw3 + 1]) + S_MUL(*(m3 + 1), st.twiddles[tw3]));
                /* Difference and sum of input 0 and rotated input 2 (the sum stays in slot 0). */
                scratch10 = *(Fout) - scratch2;
                scratch11 = *(Fout + 1) - scratch3;
                *(Fout) += scratch2;
                *(Fout + 1) += scratch3;
                /* Sum and difference of rotated inputs 1 and 3. */
                scratch6 = scratch0 + scratch4;
                scratch7 = scratch1 + scratch5;
                scratch8 = scratch0 - scratch4;
                scratch9 = scratch1 - scratch5;
                /* Slot 2 must be written before slot 0 receives its final value. */
                *m2 = *(Fout) - scratch6;
                *(m2 + 1) = *(Fout + 1) - scratch7;
                tw1 += fstride * 2;
                tw2 += fstride * 4;
                tw3 += fstride * 6;
                *(Fout) += scratch6;
                *(Fout + 1) += scratch7;
                /* Slots 1 and 3 combine the two differences with swapped components and opposite signs. */
                *m1 = scratch10 + scratch9;
                *(m1 + 1) = scratch11 - scratch8;
                *m3 = scratch10 - scratch9;
                *(m3 + 1) = scratch11 + scratch8;
                /* Advance every pointer by one interleaved pair. */
                Fout += 2;
                m1 += 2;
                m2 += 2;
                m3 += 2;
            }
        }
    }
}