1/*
2 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 *
10 */
11
12#include <stdint.h>
13
14#include "dl/api/omxtypes.h"
15#include "dl/sp/api/mipsSP.h"
16
17OMXResult mips_FFTInv_CCSToR_F32_complex(const OMX_F32* pSrc,
18                                         OMX_F32* pDst,
19                                         const MIPSFFTSpec_R_FC32* pFFTSpec) {
20  OMX_U32 num_transforms, step;
21  /* Quarter, half and three-quarters of transform size. */
22  OMX_U32 n1_4, n1_2, n3_4;
23  const OMX_F32* w_re_ptr;
24  const OMX_F32* w_im_ptr;
25  OMX_U32 fft_size = 1 << pFFTSpec->order;
26  OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf;
27  const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
28  OMX_FC32* p_dst;
29  OMX_U16* p_bitrev = pFFTSpec->pBitRevInv;
30  OMX_F32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, factor;
31  OMX_F32 w_re, w_im;
32
33  w_re_ptr = pFFTSpec->pTwiddle + 1;
34  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - 1;
35
36  /*
37   * Preliminary loop performing input adaptation due to computing real FFT
38   * through complex FFT of half the size.
39   */
40  for (uint32_t n = 1; n < fft_size / 8; ++n) {
41    tmp1 = p_src[n].Re;
42    tmp2 = p_src[n].Im;
43    tmp3 = p_src[fft_size / 2 - n].Re;
44    tmp4 = p_src[fft_size / 2 - n].Im;
45
46    tmp5 = tmp1 + tmp3;
47    tmp6 = tmp1 - tmp3;
48    tmp7 = tmp2 + tmp4;
49    tmp8 = tmp2 - tmp4;
50
51    w_re = *w_re_ptr;
52    w_im = *w_im_ptr;
53
54    p_buf[p_bitrev[n]].Re = 0.5f * (tmp5 - w_re * tmp7 - w_im * tmp6);
55    p_buf[p_bitrev[n]].Im = 0.5f * (tmp8 + w_re * tmp6 - w_im * tmp7);
56    p_buf[p_bitrev[fft_size / 2 - n]].Re =
57        0.5f * (tmp5 + w_re * tmp7 + w_im * tmp6);
58    p_buf[p_bitrev[fft_size / 2 - n]].Im =
59        0.5f * (-tmp8 + w_re * tmp6 - w_im * tmp7);
60
61    tmp1 = p_src[n + fft_size / 4].Re;
62    tmp2 = p_src[n + fft_size / 4].Im;
63    tmp3 = p_src[fft_size / 4 - n].Re;
64    tmp4 = p_src[fft_size / 4 - n].Im;
65
66    tmp5 = tmp1 + tmp3;
67    tmp6 = tmp1 - tmp3;
68    tmp7 = tmp2 + tmp4;
69    tmp8 = tmp2 - tmp4;
70
71    p_buf[p_bitrev[n + fft_size / 4]].Re =
72        0.5f * (tmp5 + w_im * tmp7 - w_re * tmp6);
73    p_buf[p_bitrev[n + fft_size / 4]].Im =
74        0.5f * (tmp8 - w_im * tmp6 - w_re * tmp7);
75    p_buf[p_bitrev[fft_size / 4 - n]].Re =
76        0.5f * (tmp5 - w_im * tmp7 + w_re * tmp6);
77    p_buf[p_bitrev[fft_size / 4 - n]].Im =
78        0.5f * (-tmp8 - w_im * tmp6 - w_re * tmp7);
79
80    ++w_re_ptr;
81    --w_im_ptr;
82  }
83  tmp1 = p_src[fft_size / 8].Re;
84  tmp2 = p_src[fft_size / 8].Im;
85  tmp3 = p_src[3 * fft_size / 8].Re;
86  tmp4 = p_src[3 * fft_size / 8].Im;
87
88  tmp5 = tmp1 + tmp3;
89  tmp6 = tmp1 - tmp3;
90  tmp7 = tmp2 + tmp4;
91  tmp8 = tmp2 - tmp4;
92
93  w_re = *w_re_ptr;
94  w_im = *w_im_ptr;
95
96  p_buf[p_bitrev[fft_size / 8]].Re = 0.5f * (tmp5 - w_re * tmp7 - w_im * tmp6);
97  p_buf[p_bitrev[fft_size / 8]].Im = 0.5f * (tmp8 + w_re * tmp6 - w_im * tmp7);
98  p_buf[p_bitrev[3 * fft_size / 8]].Re =
99      0.5f * (tmp5 + w_re * tmp7 + w_im * tmp6);
100  p_buf[p_bitrev[3 * fft_size / 8]].Im =
101      0.5f * (-tmp8 + w_re * tmp6 - w_im * tmp7);
102
103  p_buf[p_bitrev[0]].Re = 0.5f * (p_src[0].Re + p_src[fft_size / 2].Re);
104  p_buf[p_bitrev[0]].Im = 0.5f * (p_src[0].Re - p_src[fft_size / 2].Re);
105  p_buf[p_bitrev[fft_size / 4]].Re = p_src[fft_size / 4].Re;
106  p_buf[p_bitrev[fft_size / 4]].Im = -p_src[fft_size / 4].Im;
107
108  /*
109   * Loop performing sub-transforms of size 4,
110   * which contain 2 butterfly operations.
111   */
112  num_transforms = (SUBTRANSFORM_CONST >> (17 - pFFTSpec->order)) | 1;
113  for (uint32_t n = 0; n < num_transforms; ++n) {
114    OMX_U32 offset = pFFTSpec->pOffset[n] << 2;
115    OMX_FC32* p_tmp = p_buf + offset;
116
117    tmp1 = p_tmp[0].Re + p_tmp[1].Re;
118    tmp2 = p_tmp[0].Im + p_tmp[1].Im;
119    tmp3 = p_tmp[0].Re - p_tmp[1].Re;
120    tmp4 = p_tmp[0].Im - p_tmp[1].Im;
121    tmp5 = p_tmp[2].Re + p_tmp[3].Re;
122    tmp6 = p_tmp[2].Im + p_tmp[3].Im;
123    tmp8 = p_tmp[2].Im - p_tmp[3].Im;
124    tmp7 = p_tmp[2].Re - p_tmp[3].Re;
125
126    p_tmp[0].Re = tmp1 + tmp5;
127    p_tmp[2].Re = tmp1 - tmp5;
128    p_tmp[0].Im = tmp2 + tmp6;
129    p_tmp[2].Im = tmp2 - tmp6;
130    p_tmp[1].Re = tmp3 - tmp8;
131    p_tmp[3].Re = tmp3 + tmp8;
132    p_tmp[1].Im = tmp4 + tmp7;
133    p_tmp[3].Im = tmp4 - tmp7;
134  }
135
136  num_transforms = (num_transforms >> 1) | 1;
137  /*
138   * Loop performing sub-transforms of size 8,
139   * which contain four butterfly operations.
140   */
141  for (uint32_t n = 0; n < num_transforms; ++n) {
142    OMX_U32 offset = pFFTSpec->pOffset[n] << 3;
143    OMX_FC32* p_tmp = p_buf + offset;
144
145    tmp1 = p_tmp[4].Re + p_tmp[5].Re;
146    tmp3 = p_tmp[6].Re + p_tmp[7].Re;
147    tmp2 = p_tmp[4].Im + p_tmp[5].Im;
148    tmp4 = p_tmp[6].Im + p_tmp[7].Im;
149
150    tmp5 = tmp1 + tmp3;
151    tmp7 = tmp1 - tmp3;
152    tmp6 = tmp2 + tmp4;
153    tmp8 = tmp2 - tmp4;
154
155    tmp1 = p_tmp[4].Re - p_tmp[5].Re;
156    tmp2 = p_tmp[4].Im - p_tmp[5].Im;
157    tmp3 = p_tmp[6].Re - p_tmp[7].Re;
158    tmp4 = p_tmp[6].Im - p_tmp[7].Im;
159
160    p_tmp[4].Re = p_tmp[0].Re - tmp5;
161    p_tmp[0].Re = p_tmp[0].Re + tmp5;
162    p_tmp[4].Im = p_tmp[0].Im - tmp6;
163    p_tmp[0].Im = p_tmp[0].Im + tmp6;
164    p_tmp[6].Re = p_tmp[2].Re + tmp8;
165    p_tmp[2].Re = p_tmp[2].Re - tmp8;
166    p_tmp[6].Im = p_tmp[2].Im - tmp7;
167    p_tmp[2].Im = p_tmp[2].Im + tmp7;
168
169    tmp5 = SQRT1_2 * (tmp1 - tmp2);
170    tmp7 = SQRT1_2 * (tmp3 + tmp4);
171    tmp6 = SQRT1_2 * (tmp1 + tmp2);
172    tmp8 = SQRT1_2 * (tmp4 - tmp3);
173
174    tmp1 = tmp5 + tmp7;
175    tmp3 = tmp5 - tmp7;
176    tmp2 = tmp6 + tmp8;
177    tmp4 = tmp6 - tmp8;
178
179    p_tmp[5].Re = p_tmp[1].Re - tmp1;
180    p_tmp[1].Re = p_tmp[1].Re + tmp1;
181    p_tmp[5].Im = p_tmp[1].Im - tmp2;
182    p_tmp[1].Im = p_tmp[1].Im + tmp2;
183    p_tmp[7].Re = p_tmp[3].Re + tmp4;
184    p_tmp[3].Re = p_tmp[3].Re - tmp4;
185    p_tmp[7].Im = p_tmp[3].Im - tmp3;
186    p_tmp[3].Im = p_tmp[3].Im + tmp3;
187  }
188
189  step = 1 << (pFFTSpec->order - 4);
190  n1_4 = 4;
191  /* Outer loop that loops over FFT stages. */
192  for (uint32_t fft_stage = 4; fft_stage <= pFFTSpec->order - 2; ++fft_stage) {
193    n1_2 = 2 * n1_4;
194    n3_4 = 3 * n1_4;
195    num_transforms = (num_transforms >> 1) | 1;
196    for (uint32_t n = 0; n < num_transforms; ++n) {
197      OMX_U32 offset = pFFTSpec->pOffset[n] << fft_stage;
198      OMX_FC32* p_tmp = p_buf + offset;
199
200      tmp1 = p_tmp[n1_2].Re + p_tmp[n3_4].Re;
201      tmp2 = p_tmp[n1_2].Re - p_tmp[n3_4].Re;
202      tmp3 = p_tmp[n1_2].Im + p_tmp[n3_4].Im;
203      tmp4 = p_tmp[n1_2].Im - p_tmp[n3_4].Im;
204
205      p_tmp[n1_2].Re = p_tmp[0].Re - tmp1;
206      p_tmp[0].Re = p_tmp[0].Re + tmp1;
207      p_tmp[n1_2].Im = p_tmp[0].Im - tmp3;
208      p_tmp[0].Im = p_tmp[0].Im + tmp3;
209      p_tmp[n3_4].Re = p_tmp[n1_4].Re + tmp4;
210      p_tmp[n1_4].Re = p_tmp[n1_4].Re - tmp4;
211      p_tmp[n3_4].Im = p_tmp[n1_4].Im - tmp2;
212      p_tmp[n1_4].Im = p_tmp[n1_4].Im + tmp2;
213
214      w_re_ptr = pFFTSpec->pTwiddle + step;
215      w_im_ptr =
216          pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
217
218      /*
219       * Loop performing split-radix butterfly operations for one sub-transform.
220       */
221      for (uint32_t i = 1; i < n1_4; ++i) {
222        w_re = *w_re_ptr;
223        w_im = *w_im_ptr;
224
225        tmp1 = w_re * p_tmp[n1_2 + i].Re - w_im * p_tmp[n1_2 + i].Im;
226        tmp2 = w_re * p_tmp[n1_2 + i].Im + w_im * p_tmp[n1_2 + i].Re;
227        tmp3 = w_re * p_tmp[n3_4 + i].Re + w_im * p_tmp[n3_4 + i].Im;
228        tmp4 = w_re * p_tmp[n3_4 + i].Im - w_im * p_tmp[n3_4 + i].Re;
229
230        tmp5 = tmp1 + tmp3;
231        tmp1 = tmp1 - tmp3;
232        tmp6 = tmp2 + tmp4;
233        tmp2 = tmp2 - tmp4;
234
235        p_tmp[n1_2 + i].Re = p_tmp[i].Re - tmp5;
236        p_tmp[i].Re = p_tmp[i].Re + tmp5;
237        p_tmp[n1_2 + i].Im = p_tmp[i].Im - tmp6;
238        p_tmp[i].Im = p_tmp[i].Im + tmp6;
239        p_tmp[n3_4 + i].Re = p_tmp[n1_4 + i].Re + tmp2;
240        p_tmp[n1_4 + i].Re = p_tmp[n1_4 + i].Re - tmp2;
241        p_tmp[n3_4 + i].Im = p_tmp[n1_4 + i].Im - tmp1;
242        p_tmp[n1_4 + i].Im = p_tmp[n1_4 + i].Im + tmp1;
243
244        w_re_ptr += step;
245        w_im_ptr -= step;
246      }
247    }
248    step >>= 1;
249    n1_4 <<= 1;
250  }
251
252  /* Last FFT stage, write data to output buffer. */
253  n1_2 = 2 * n1_4;
254  n3_4 = 3 * n1_4;
255  factor = (OMX_F32)2.0f / fft_size;
256
257  p_dst = (OMX_FC32*)pDst;
258
259  tmp1 = p_buf[n1_2].Re + p_buf[n3_4].Re;
260  tmp2 = p_buf[n1_2].Re - p_buf[n3_4].Re;
261  tmp3 = p_buf[n1_2].Im + p_buf[n3_4].Im;
262  tmp4 = p_buf[n1_2].Im - p_buf[n3_4].Im;
263
264  p_dst[n1_2].Re = factor * (p_buf[0].Re - tmp1);
265  p_dst[0].Re = factor * (p_buf[0].Re + tmp1);
266  p_dst[n1_2].Im = factor * (p_buf[0].Im - tmp3);
267  p_dst[0].Im = factor * (p_buf[0].Im + tmp3);
268  p_dst[n3_4].Re = factor * (p_buf[n1_4].Re + tmp4);
269  p_dst[n1_4].Re = factor * (p_buf[n1_4].Re - tmp4);
270  p_dst[n3_4].Im = factor * (p_buf[n1_4].Im - tmp2);
271  p_dst[n1_4].Im = factor * (p_buf[n1_4].Im + tmp2);
272
273  w_re_ptr = pFFTSpec->pTwiddle + step;
274  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
275
276  for (uint32_t i = 1; i < n1_4; ++i) {
277    w_re = *w_re_ptr;
278    w_im = *w_im_ptr;
279
280    tmp1 = w_re * p_buf[n1_2 + i].Re - w_im * p_buf[n1_2 + i].Im;
281    tmp2 = w_re * p_buf[n1_2 + i].Im + w_im * p_buf[n1_2 + i].Re;
282    tmp3 = w_re * p_buf[n3_4 + i].Re + w_im * p_buf[n3_4 + i].Im;
283    tmp4 = w_re * p_buf[n3_4 + i].Im - w_im * p_buf[n3_4 + i].Re;
284
285    tmp5 = tmp1 + tmp3;
286    tmp1 = tmp1 - tmp3;
287    tmp6 = tmp2 + tmp4;
288    tmp2 = tmp2 - tmp4;
289
290    p_dst[n1_2 + i].Re = factor * (p_buf[i].Re - tmp5);
291    p_dst[i].Re = factor * (p_buf[i].Re + tmp5);
292    p_dst[n1_2 + i].Im = factor * (p_buf[i].Im - tmp6);
293    p_dst[i].Im = factor * (p_buf[i].Im + tmp6);
294    p_dst[n3_4 + i].Re = factor * (p_buf[n1_4 + i].Re + tmp2);
295    p_dst[n1_4 + i].Re = factor * (p_buf[n1_4 + i].Re - tmp2);
296    p_dst[n3_4 + i].Im = factor * (p_buf[n1_4 + i].Im - tmp1);
297    p_dst[n1_4 + i].Im = factor * (p_buf[n1_4 + i].Im + tmp1);
298
299    w_re_ptr += step;
300    w_im_ptr -= step;
301  }
302
303  return OMX_Sts_NoErr;
304}
305