/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 *
 */

#include <emmintrin.h>
#include <assert.h>

/* Assumed include: OMX_F32 and OMX_INT used below come from the
 * OpenMAX DL type definitions. */
#include "dl/api/omxtypes.h"

/**
 * Two data formats are used internally by the FFT routines. The
 * interface to the main external FFT routines uses interleaved complex
 * values, where each real part is immediately followed by its
 * imaginary part.
 *
 * The other is the split format, where a complex vector is split such
 * that all of the real values are placed in the first half of the
 * vector and the corresponding imaginary values are placed in the
 * second half, in the same order. The conversion from interleaved
 * complex values to split format and back is transparent to the
 * external FFT interface.
 *
 * VComplex uses the split format.
 */
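
/*
 * For example, four complex values (r0,i0), (r1,i1), (r2,i2), (r3,i3)
 * are laid out as
 *   interleaved: r0 i0 r1 i1 r2 i2 r3 i3
 *   split:       r0 r1 r2 r3 followed, at a fixed offset, by i0 i1 i2 i3
 */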

/** VComplex holds four complex float elements, with the real parts
 * stored in real and the corresponding imaginary parts in imag.
 */
typedef struct VComplex {
  __m128 real;
  __m128 imag;
} VC;
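
/*
 * Illustrative sketch, not part of the original API: how four complex
 * values (r0,i0) .. (r3,i3) map onto one VC. Note that _mm_set_ps takes
 * its arguments in high-to-low lane order, so lane 0 holds r0 and i0.
 */
static __inline void VC_SET_EXAMPLE(VC *out,
                                    OMX_F32 r0, OMX_F32 i0,
                                    OMX_F32 r1, OMX_F32 i1,
                                    OMX_F32 r2, OMX_F32 i2,
                                    OMX_F32 r3, OMX_F32 i3) {
  out->real = _mm_set_ps(r3, r2, r1, r0);
  out->imag = _mm_set_ps(i3, i2, i1, i0);
}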

/* out = a * b */
static __inline void VC_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
      _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
      _mm_mul_ps(a->imag, b->real));
}

/* out = conj(a) * b */
static __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
      _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
      _mm_mul_ps(a->imag, b->real));
}
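
/*
 * Scalar reference for a single lane, shown for illustration only (it is
 * not used by the SSE code above, which applies the same formulas to all
 * four lanes at once):
 *   VC_MUL:      out = (a.r*b.r - a.i*b.i) + j*(a.r*b.i + a.i*b.r)
 *   VC_CONJ_MUL: out = (a.r*b.r + a.i*b.i) + j*(a.r*b.i - a.i*b.r)
 */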

/* Scale complex by a real factor */
static __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
  out->real = _mm_mul_ps(factor, a->real);
  out->imag = _mm_mul_ps(factor, a->imag);
}

/* out = a + b */
static __inline void VC_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_add_ps(b->real, a->imag);
}

/* VC_ADD and store the result in split format. */
static __inline void VC_ADD_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));
}

/* out = a - b */
static __inline void VC_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = b.real - a.imag
 */
static __inline void VC_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(b->real, a->imag);
}

/* VC_SUB and store the result in split format. */
static __inline void VC_SUB_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));
}

/**
 * out.real = a.real + b.real
 * out.imag = a.imag - b.imag
 */
static __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag - b.real
 */
static __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(a->imag, b->real);
}

/* VC_ADD_SUB_X and store the result in split format. */
static __inline void VC_ADD_SUB_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));
}

/**
 * out.real = a.real - b.real
 * out.imag = a.imag + b.imag
 */
static __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_add_ps(a->imag, b->real);
}

/* VC_SUB_ADD_X and store the result in split format. */
static __inline void VC_SUB_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));
}

/**
 * out[0]      = in.real
 * out[offset] = in.imag
 */
static __inline void VC_STORE_SPLIT(
    OMX_F32 *out,
    VC *in,
    OMX_INT offset) {
  _mm_store_ps(out, in->real);
  _mm_store_ps(out + offset, in->imag);
}

/**
 * out.real = in[0]
 * out.imag = in[offset]
 */
static __inline void VC_LOAD_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_load_ps(in);
  out->imag = _mm_load_ps(in + offset);
}

/**
 * Vector Complex unpack from split format to interleaved format:
 * out.real holds the first four interleaved values (r0, i0, r1, i1) and
 * out.imag holds the last four (r2, i2, r3, i3).
 */
static __inline void VC_UNPACK(VC *out, VC *in) {
  out->real = _mm_unpacklo_ps(in->real, in->imag);
  out->imag = _mm_unpackhi_ps(in->real, in->imag);
}

/**
 * Vector Complex load from an interleaved complex array.
 * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
 * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
 */
static __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
  __m128 temp0 = _mm_load_ps(in);
  __m128 temp1 = _mm_load_ps(in + 4);
  out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));
  out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));
}
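
/*
 * Illustrative sketch (an assumed helper, not part of the original file):
 * convert four interleaved complex values [r0 i0 r1 i1 r2 i2 r3 i3] into
 * split form, writing the four real parts to out[0..3] and the four
 * imaginary parts to out[offset..offset + 3]. Both in and out (and
 * out + offset) are assumed to be 16-byte aligned.
 */
static __inline void VC_INTERLEAVE_TO_SPLIT_EXAMPLE(
    OMX_F32 *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  VC v;
  VC_LOAD_INTERLEAVE(&v, in);
  VC_STORE_SPLIT(out, &v, offset);
}
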
/**
 * Vector Complex load in split format.
 * The input addresses need not be 16-byte aligned.
 */
static __inline void VC_LOADU_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_loadu_ps(in);
  out->imag = _mm_loadu_ps(in + offset);
}

/* Reverse the order of the complex vector. */
static __inline void VC_REVERSE(VC *v) {
  v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
  v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
}

/**
 * Vector Complex store to an interleaved complex array.
 * out[0] = in.real[0]
 * out[1] = in.imag[0]
 * out[2] = in.real[1]
 * out[3] = in.imag[1]
 * out[4] = in.real[2]
 * out[5] = in.imag[2]
 * out[6] = in.real[3]
 * out[7] = in.imag[3]
 */
static __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/**
 * Vector Complex store in interleaved format.
 * The output address need not be 16-byte aligned.
 */
static __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/* VC_ADD_X and store the result in split format. */
static __inline void VC_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));
}

/**
 * VC_SUB_X and store the result in reverse order.
 * The output addresses need not be 16-byte aligned.
 */
static __inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  __m128 t;
  t = _mm_sub_ps(a->real, b->imag);
  _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
  t = _mm_sub_ps(b->real, a->imag);
  _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
}

/**
 * Vector Complex load from interleaved format into split format,
 * storing the result in two __m128 registers.
 */
static __inline void VC_LOAD_SHUFFLE(
    __m128 *out0,
    __m128 *out1,
    const OMX_F32 *in) {
  VC temp;
  VC_LOAD_INTERLEAVE(&temp, in);
  *out0 = temp.real;
  *out1 = temp.imag;
}

/* Finish the butterfly calculation of forward radix4 and store the outputs. */
static __inline void RADIX4_FWD_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CADD_SUB_X out1, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);

  /* CSUB_ADD_X out3, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
}

/* Finish the butterfly calculation of inverse radix4 and store the outputs. */
static __inline void RADIX4_INV_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CSUB_ADD_X out1, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);

  /* CADD_SUB_X out3, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
}
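
/*
 * Note: the cross ("X") stores implement the multiplications by -j and +j
 * in the radix-4 butterfly. In the forward case above, out1 = t1 - j*t3
 * and out3 = t1 + j*t3; the inverse case uses the opposite signs, which is
 * why the ADD_SUB_X and SUB_ADD_X stores are swapped between the two
 * functions.
 */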

/* Radix4 forward butterfly */
static __inline void RADIX4_FWD_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, Tw1, T1 */
  VC_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, Tw2, T2 */
  VC_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, Tw3, T3 */
  VC_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix4 inverse butterfly */
static __inline void RADIX4_INV_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, conj(Tw1), T1 */
  VC_CONJ_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, conj(Tw2), T2 */
  VC_CONJ_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, conj(Tw3), T3 */
  VC_CONJ_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix4 butterfly in first stage for both forward and inverse */
static __inline void RADIX4_BUTTERFLY_FS(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  /* CADD t0, T0, T2 */
  VC_ADD(t0, T0, T2);

  /* CSUB t1, T0, T2 */
  VC_SUB(t1, T0, T2);

  /* CADD t2, T1, T3 */
  VC_ADD(t2, T1, T3);

  /* CSUB t3, T1, T3 */
  VC_SUB(t3, T1, T3);
}
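
/*
 * Illustrative sketch (an assumed usage example, not part of the original
 * file): one forward radix4 butterfly over four complex elements per input,
 * reading and writing split-format buffers and applying precomputed
 * split-format twiddle factors Tw1..Tw3. Here n is the distance, in floats,
 * between the real and imaginary halves, as in the load/store helpers above.
 */
static __inline void RADIX4_FWD_STAGE_EXAMPLE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    const OMX_F32 *in0,
    const OMX_F32 *in1,
    const OMX_F32 *in2,
    const OMX_F32 *in3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    OMX_INT n) {
  VC T0, T1, T2, T3;
  VC t0, t1, t2, t3;

  /* Load four split-format inputs. */
  VC_LOAD_SPLIT(&T0, in0, n);
  VC_LOAD_SPLIT(&T1, in1, n);
  VC_LOAD_SPLIT(&T2, in2, n);
  VC_LOAD_SPLIT(&T3, in3, n);

  /* Twiddle and combine, then store the four split-format outputs. */
  RADIX4_FWD_BUTTERFLY(&t0, &t1, &t2, &t3, Tw1, Tw2, Tw3, &T0, &T1, &T2, &T3);
  RADIX4_FWD_BUTTERFLY_STORE(out0, out1, out2, out3, &t0, &t1, &t2, &t3, n);
}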

/**
 * Load 16 float elements (four SSE registers), forming a 4 x 4 matrix,
 * and transpose the matrix:
 * 3,  2,  1,  0                  12, 8,  4,  0
 * 7,  6,  5,  4        =====>    13, 9,  5,  1
 * 11, 10, 9,  8                  14, 10, 6,  2
 * 15, 14, 13, 12                 15, 11, 7,  3
 * The transpose is done twice: once for the real parts (loaded from
 * pT0..pT3) and once for the imaginary parts (loaded from pT0 + n..pT3 + n).
 */
static __inline void VC_LOAD_MATRIX_TRANSPOSE(
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3,
    const OMX_F32 *pT0,
    const OMX_F32 *pT1,
    const OMX_F32 *pT2,
    const OMX_F32 *pT3,
    OMX_INT n) {
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;
  __m128 xmm6;
  __m128 xmm7;

  xmm0 = _mm_load_ps(pT0);
  xmm1 = _mm_load_ps(pT1);
  xmm2 = _mm_load_ps(pT2);
  xmm3 = _mm_load_ps(pT3);

  /* Matrix transpose of the real parts. */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

  xmm0 = _mm_load_ps(pT0 + n);
  xmm1 = _mm_load_ps(pT1 + n);
  xmm2 = _mm_load_ps(pT2 + n);
  xmm3 = _mm_load_ps(pT3 + n);

  /* Matrix transpose of the imaginary parts. */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
}
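
/*
 * Equivalent sketch of the transpose above using the _MM_TRANSPOSE4_PS
 * macro from <xmmintrin.h>, shown for illustration only (the macro
 * transposes the four registers in place, so the loads go through
 * temporaries). This helper is not part of the original file.
 */
static __inline void VC_LOAD_MATRIX_TRANSPOSE_ALT(
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3,
    const OMX_F32 *pT0,
    const OMX_F32 *pT1,
    const OMX_F32 *pT2,
    const OMX_F32 *pT3,
    OMX_INT n) {
  __m128 r0 = _mm_load_ps(pT0);
  __m128 r1 = _mm_load_ps(pT1);
  __m128 r2 = _mm_load_ps(pT2);
  __m128 r3 = _mm_load_ps(pT3);
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
  T0->real = r0;
  T1->real = r1;
  T2->real = r2;
  T3->real = r3;

  r0 = _mm_load_ps(pT0 + n);
  r1 = _mm_load_ps(pT1 + n);
  r2 = _mm_load_ps(pT2 + n);
  r3 = _mm_load_ps(pT3 + n);
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
  T0->imag = r0;
  T1->imag = r1;
  T2->imag = r2;
  T3->imag = r3;
}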