/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 *
 */

#include "dl/api/omxtypes.h"
#include "dl/sp/src/x86/x86SP_SSE_Math.h"

// This function handles the case where set_count == 2, so the set loop
// cannot be unrolled by four to fill an SSE vector (4 elements).
static void InternalUnroll2Inv(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n) {
  OMX_INT i;
  OMX_INT n_by_2 = n >> 1;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;
  OMX_F32 *out0 = out;

  for (i = 0; i < n_by_2; i += 8) {
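    // Twiddle pointers step through the table at 1x, 2x and 3x the loop
    // index; the tw*e pointers pick up the factors that get packed into the
    // upper two SIMD lanes below.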
    const OMX_F32 *tw1  = twiddle + i;
    const OMX_F32 *tw2  = tw1 + i;
    const OMX_F32 *tw3  = tw2 + i;
    const OMX_F32 *tw1e = tw1 + 4;
    const OMX_F32 *tw2e = tw2 + 8;
    const OMX_F32 *tw3e = tw3 + 12;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;
    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

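    // Each shuffle places one twiddle component in the lower two lanes and
    // the matching tw*e component in the upper two.  The imaginary parts are
    // stored n_mul_2 floats after the real parts in the twiddle table.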
    v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),
                                _mm_load_ss(tw1e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
                                _mm_load_ss(tw1e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),
                                _mm_load_ss(tw2e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
                                _mm_load_ss(tw2e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),
                                _mm_load_ss(tw3e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
                                _mm_load_ss(tw3e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));

    __m128 xmm0;
    __m128 xmm1;
    __m128 xmm2;
    __m128 xmm3;
    __m128 xmm4;
    __m128 xmm5;
    __m128 xmm6;
    __m128 xmm7;

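    // Load 16 consecutive real input samples and regroup them pairwise into
    // the four butterfly inputs v_t0..v_t3.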
    const OMX_F32 *in0 = in + (i << 1);
    xmm0 = _mm_load_ps(in0);
    xmm1 = _mm_load_ps(in0 + 4);
    xmm2 = _mm_load_ps(in0 + 8);
    xmm3 = _mm_load_ps(in0 + 12);
    v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));

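    // The imaginary parts live n floats after the real parts; regroup them
    // the same way.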
    xmm4 = _mm_load_ps(in0 + n);
    xmm5 = _mm_load_ps(in0 + n + 4);
    xmm6 = _mm_load_ps(in0 + n + 8);
    xmm7 = _mm_load_ps(in0 + n + 12);
    v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

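    // Apply the twiddled inverse radix-4 butterfly and scatter the four
    // results to the quarter-spaced output blocks.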
    RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                         &v_tw1, &v_tw2, &v_tw3,
                         &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }
}

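// One radix-4 stage of the inverse complex-to-complex FFT, vectorized with
// SSE.  Inputs and outputs use the split complex format: real parts first,
// imaginary parts n floats later.  sub_size and sub_num describe how the
// stage is decomposed into groups and sets.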
void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n,
    OMX_INT sub_size,
    OMX_INT sub_num) {
  OMX_INT set;
  OMX_INT grp;
  OMX_INT step = sub_num >> 1;
  OMX_INT set_count = sub_num >> 2;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;

  OMX_F32 *out0 = out;

  if (set_count == 2) {
    InternalUnroll2Inv(in, out, twiddle, n);
    return;
  }

  // grp == 0: the twiddle factors are all 1, so no twiddle multiplies are
  // needed.
  for (set = 0; set < set_count; set += 4) {
    const OMX_F32 *in0 = in + set;
    const OMX_F32 *in1 = in0 + set_count;
    const OMX_F32 *in2 = in1 + set_count;
    const OMX_F32 *in3 = in2 + set_count;

    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

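    // Split-format loads: the real part of each element is at in*, the
    // imaginary part n floats later.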
    VC_LOAD_SPLIT(&v_t0, in0, n);
    VC_LOAD_SPLIT(&v_t1, in1, n);
    VC_LOAD_SPLIT(&v_t2, in2, n);
    VC_LOAD_SPLIT(&v_t3, in3, n);

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
                        &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }

  for (grp = 1; grp < sub_size; ++grp) {
    const OMX_F32 *tw1 = twiddle + grp * step;
    const OMX_F32 *tw2 = tw1 + grp * step;
    const OMX_F32 *tw3 = tw2 + grp * step;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;

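    // Every set in this group shares the same twiddle factors, so broadcast
    // each component across all four SIMD lanes.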
    v_tw1.real = _mm_load1_ps(tw1);
    v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
    v_tw2.real = _mm_load1_ps(tw2);
    v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
    v_tw3.real = _mm_load1_ps(tw3);
    v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);

    for (set = 0; set < set_count; set += 4) {
      const OMX_F32 *in0 = in + set + grp * sub_num;
      const OMX_F32 *in1 = in0 + set_count;
      const OMX_F32 *in2 = in1 + set_count;
      const OMX_F32 *in3 = in2 + set_count;

      VC v_t0;
      VC v_t1;
      VC v_t2;
      VC v_t3;
      VC v_t4;
      VC v_t5;
      VC v_t6;
      VC v_t7;

      VC_LOAD_SPLIT(&v_t0, in0, n);
      VC_LOAD_SPLIT(&v_t1, in1, n);
      VC_LOAD_SPLIT(&v_t2, in2, n);
      VC_LOAD_SPLIT(&v_t3, in3, n);

      OMX_F32 *out1 = out0 + n_by_4;
      OMX_F32 *out2 = out1 + n_by_4;
      OMX_F32 *out3 = out2 + n_by_4;

      RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                           &v_tw1, &v_tw2, &v_tw3,
                           &v_t0, &v_t1, &v_t2, &v_t3);

      RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                                 &v_t4, &v_t5, &v_t6, &v_t7, n);

      out0 += 4;
    }
  }
}