1/*
2 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 *
10 */
11
12#include <stdint.h>
13
14#include "dl/api/omxtypes.h"
15#include "dl/sp/api/mipsSP.h"
16
17/*
18 * Forward real FFT for FFT sizes larger than 16. Computed using the complex
19 * FFT for half the size.
20 */
21OMXResult mips_FFTFwd_RToCCS_F32_complex(const OMX_F32* pSrc,
22                                         OMX_F32* pDst,
23                                         const MIPSFFTSpec_R_FC32* pFFTSpec) {
24  OMX_U32 n1_4, num_transforms, step;
25  const OMX_F32* w_re_ptr;
26  const OMX_F32* w_im_ptr;
27  OMX_U32 fft_size = 1 << pFFTSpec->order;
28  OMX_FC32* p_dst = (OMX_FC32*)pDst;
29  OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf;
30  OMX_F32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
31  OMX_F32 w_re, w_im;
32
33  /*
34   * Loop performing sub-transforms of size 4,
35   * which contain 2 butterfly operations.
36   */
37  num_transforms = (SUBTRANSFORM_CONST >> (17 - pFFTSpec->order)) | 1;
38  for (uint32_t n = 0; n < num_transforms; ++n) {
39    /*
40     * n is in the range (0 .. num_transforms - 1).
41     * The size of the pFFTSpec->pOffset is (((SUBTRANSFORM_CONST >>
42     * (16 - TWIDDLE_TABLE_ORDER)) | 1)).
43     */
44    OMX_U32 offset = pFFTSpec->pOffset[n] << 2;
45    /*
46     * Offset takes it's value from pFFTSpec->pOffset table which is initialized
47     * in the omxSP_FFTInit_R_F32 function, and is constant afterwards.
48     */
49    OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset;
50    OMX_FC32* p_tmp = p_buf + offset;
51    /* Treating the input as a complex vector. */
52    const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
53
54    tmp1 = p_src[p_bitrev[0]].Re + p_src[p_bitrev[1]].Re;
55    tmp2 = p_src[p_bitrev[2]].Re + p_src[p_bitrev[3]].Re;
56    tmp3 = p_src[p_bitrev[0]].Im + p_src[p_bitrev[1]].Im;
57    tmp4 = p_src[p_bitrev[2]].Im + p_src[p_bitrev[3]].Im;
58
59    p_tmp[0].Re = tmp1 + tmp2;
60    p_tmp[2].Re = tmp1 - tmp2;
61    p_tmp[0].Im = tmp3 + tmp4;
62    p_tmp[2].Im = tmp3 - tmp4;
63
64    tmp1 = p_src[p_bitrev[0]].Re - p_src[p_bitrev[1]].Re;
65    tmp2 = p_src[p_bitrev[2]].Re - p_src[p_bitrev[3]].Re;
66    tmp3 = p_src[p_bitrev[0]].Im - p_src[p_bitrev[1]].Im;
67    tmp4 = p_src[p_bitrev[2]].Im - p_src[p_bitrev[3]].Im;
68
69    p_tmp[1].Re = tmp1 + tmp4;
70    p_tmp[3].Re = tmp1 - tmp4;
71    p_tmp[1].Im = tmp3 - tmp2;
72    p_tmp[3].Im = tmp3 + tmp2;
73  }
74
75  /*
76   * Loop performing sub-transforms of size 8,
77   * which contain four butterfly operations.
78   */
79  num_transforms = (num_transforms >> 1) | 1;
80  for (uint32_t n = 0; n < num_transforms; ++n) {
81    OMX_U32 offset = pFFTSpec->pOffset[n] << 3;
82    OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset;
83    OMX_FC32* p_tmp = p_buf + offset;
84    const OMX_FC32* p_src = (const OMX_FC32*)pSrc;
85
86    tmp1 = p_src[p_bitrev[4]].Re + p_src[p_bitrev[5]].Re;
87    tmp2 = p_src[p_bitrev[6]].Re + p_src[p_bitrev[7]].Re;
88    tmp3 = p_src[p_bitrev[4]].Im + p_src[p_bitrev[5]].Im;
89    tmp4 = p_src[p_bitrev[6]].Im + p_src[p_bitrev[7]].Im;
90
91    tmp5 = tmp1 + tmp2;
92    tmp1 = tmp1 - tmp2;
93    tmp2 = tmp3 + tmp4;
94    tmp3 = tmp3 - tmp4;
95
96    p_tmp[4].Re = p_tmp[0].Re - tmp5;
97    p_tmp[0].Re = p_tmp[0].Re + tmp5;
98    p_tmp[4].Im = p_tmp[0].Im - tmp2;
99    p_tmp[0].Im = p_tmp[0].Im + tmp2;
100    p_tmp[6].Re = p_tmp[2].Re - tmp3;
101    p_tmp[2].Re = p_tmp[2].Re + tmp3;
102    p_tmp[6].Im = p_tmp[2].Im + tmp1;
103    p_tmp[2].Im = p_tmp[2].Im - tmp1;
104
105    tmp1 = p_src[p_bitrev[4]].Re - p_src[p_bitrev[5]].Re;
106    tmp2 = p_src[p_bitrev[6]].Re - p_src[p_bitrev[7]].Re;
107    tmp3 = p_src[p_bitrev[4]].Im - p_src[p_bitrev[5]].Im;
108    tmp4 = p_src[p_bitrev[6]].Im - p_src[p_bitrev[7]].Im;
109
110    tmp5 = SQRT1_2 * (tmp1 + tmp3);
111    tmp1 = SQRT1_2 * (tmp3 - tmp1);
112    tmp3 = SQRT1_2 * (tmp2 - tmp4);
113    tmp2 = SQRT1_2 * (tmp2 + tmp4);
114
115    tmp4 = tmp5 + tmp3;
116    tmp5 = tmp5 - tmp3;
117    tmp3 = tmp1 + tmp2;
118    tmp1 = tmp1 - tmp2;
119
120    p_tmp[5].Re = p_tmp[1].Re - tmp4;
121    p_tmp[1].Re = p_tmp[1].Re + tmp4;
122    p_tmp[5].Im = p_tmp[1].Im - tmp3;
123    p_tmp[1].Im = p_tmp[1].Im + tmp3;
124    p_tmp[7].Re = p_tmp[3].Re - tmp1;
125    p_tmp[3].Re = p_tmp[3].Re + tmp1;
126    p_tmp[7].Im = p_tmp[3].Im + tmp5;
127    p_tmp[3].Im = p_tmp[3].Im - tmp5;
128  }
129
130  step = 1 << (pFFTSpec->order - 4);
131  n1_4 = 4; /* Quarter of the sub-transform size. */
132  /* Outer loop that loops over FFT stages. */
133  for (uint32_t fft_stage = 4; fft_stage <= pFFTSpec->order - 1; ++fft_stage) {
134    OMX_U32 n1_2 = 2 * n1_4;
135    OMX_U32 n3_4 = 3 * n1_4;
136    num_transforms = (num_transforms >> 1) | 1;
137    /*
138     * Loop performing sub-transforms of size 16 and higher.
139     * The actual size depends on the stage.
140     */
141    for (uint32_t n = 0; n < num_transforms; ++n) {
142      OMX_U32 offset = pFFTSpec->pOffset[n] << fft_stage;
143      OMX_FC32* p_tmp = p_buf + offset;
144
145      tmp1 = p_tmp[n1_2].Re + p_tmp[n3_4].Re;
146      tmp2 = p_tmp[n1_2].Re - p_tmp[n3_4].Re;
147      tmp3 = p_tmp[n1_2].Im + p_tmp[n3_4].Im;
148      tmp4 = p_tmp[n1_2].Im - p_tmp[n3_4].Im;
149
150      p_tmp[n1_2].Re = p_tmp[0].Re - tmp1;
151      p_tmp[n1_2].Im = p_tmp[0].Im - tmp3;
152      p_tmp[0].Re = p_tmp[0].Re + tmp1;
153      p_tmp[0].Im = p_tmp[0].Im + tmp3;
154      p_tmp[n3_4].Re = p_tmp[n1_4].Re - tmp4;
155      p_tmp[n3_4].Im = p_tmp[n1_4].Im + tmp2;
156      p_tmp[n1_4].Re = p_tmp[n1_4].Re + tmp4;
157      p_tmp[n1_4].Im = p_tmp[n1_4].Im - tmp2;
158
159      /* Twiddle table is initialized for the maximal FFT size. */
160      w_re_ptr = pFFTSpec->pTwiddle + step;
161      w_im_ptr =
162          pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
163
164      /*
165       * Loop performing split-radix butterfly operations for
166       * one sub-transform.
167       */
168      for (uint32_t i = 1; i < n1_4; ++i) {
169        w_re = *w_re_ptr;
170        w_im = *w_im_ptr;
171
172        tmp1 = w_re * p_tmp[n1_2 + i].Re + w_im * p_tmp[n1_2 + i].Im;
173        tmp2 = w_re * p_tmp[n1_2 + i].Im - w_im * p_tmp[n1_2 + i].Re;
174        tmp3 = w_re * p_tmp[n3_4 + i].Re - w_im * p_tmp[n3_4 + i].Im;
175        tmp4 = w_re * p_tmp[n3_4 + i].Im + w_im * p_tmp[n3_4 + i].Re;
176
177        tmp5 = tmp1 + tmp3;
178        tmp1 = tmp1 - tmp3;
179        tmp6 = tmp2 + tmp4;
180        tmp2 = tmp2 - tmp4;
181
182        p_tmp[n1_2 + i].Re = p_tmp[i].Re - tmp5;
183        p_tmp[n1_2 + i].Im = p_tmp[i].Im - tmp6;
184        p_tmp[i].Re = p_tmp[i].Re + tmp5;
185        p_tmp[i].Im = p_tmp[i].Im + tmp6;
186        p_tmp[n3_4 + i].Re = p_tmp[n1_4 + i].Re - tmp2;
187        p_tmp[n3_4 + i].Im = p_tmp[n1_4 + i].Im + tmp1;
188        p_tmp[n1_4 + i].Re = p_tmp[n1_4 + i].Re + tmp2;
189        p_tmp[n1_4 + i].Im = p_tmp[n1_4 + i].Im - tmp1;
190
191        w_re_ptr += step;
192        w_im_ptr -= step;
193      }
194    }
195    step >>= 1;
196    n1_4 <<= 1;
197  }
198
199  /* Additional computation to get the output for full FFT size. */
200  w_re_ptr = pFFTSpec->pTwiddle + step;
201  w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step;
202
203  for (uint32_t i = 1; i < fft_size / 8; ++i) {
204    tmp1 = p_buf[i].Re;
205    tmp2 = p_buf[i].Im;
206    tmp3 = p_buf[fft_size / 2 - i].Re;
207    tmp4 = p_buf[fft_size / 2 - i].Im;
208
209    tmp5 = tmp1 + tmp3;
210    tmp6 = tmp1 - tmp3;
211    tmp7 = tmp2 + tmp4;
212    tmp8 = tmp2 - tmp4;
213
214    tmp1 = p_buf[i + fft_size / 4].Re;
215    tmp2 = p_buf[i + fft_size / 4].Im;
216    tmp3 = p_buf[fft_size / 4 - i].Re;
217    tmp4 = p_buf[fft_size / 4 - i].Im;
218
219    w_re = *w_re_ptr;
220    w_im = *w_im_ptr;
221
222    p_dst[i].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6);
223    p_dst[i].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7);
224    p_dst[fft_size / 2 - i].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6);
225    p_dst[fft_size / 2 - i].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7);
226
227    tmp5 = tmp1 + tmp3;
228    tmp6 = tmp1 - tmp3;
229    tmp7 = tmp2 + tmp4;
230    tmp8 = tmp2 - tmp4;
231
232    p_dst[i + fft_size / 4].Re = 0.5f * (tmp5 - w_im * tmp7 - w_re * tmp6);
233    p_dst[i + fft_size / 4].Im = 0.5f * (tmp8 + w_im * tmp6 - w_re * tmp7);
234    p_dst[fft_size / 4 - i].Re = 0.5f * (tmp5 + w_im * tmp7 + w_re * tmp6);
235    p_dst[fft_size / 4 - i].Im = 0.5f * (-tmp8 + w_im * tmp6 - w_re * tmp7);
236
237    w_re_ptr += step;
238    w_im_ptr -= step;
239  }
240  tmp1 = p_buf[fft_size / 8].Re;
241  tmp2 = p_buf[fft_size / 8].Im;
242  tmp3 = p_buf[3 * fft_size / 8].Re;
243  tmp4 = p_buf[3 * fft_size / 8].Im;
244
245  tmp5 = tmp1 + tmp3;
246  tmp6 = tmp1 - tmp3;
247  tmp7 = tmp2 + tmp4;
248  tmp8 = tmp2 - tmp4;
249
250  w_re = *w_re_ptr;
251  w_im = *w_im_ptr;
252
253  p_dst[fft_size / 8].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6);
254  p_dst[fft_size / 8].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7);
255  p_dst[3 * fft_size / 8].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6);
256  p_dst[3 * fft_size / 8].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7);
257
258  p_dst[0].Re = p_buf[0].Re + p_buf[0].Im;
259  p_dst[fft_size / 4].Re = p_buf[fft_size / 4].Re;
260  p_dst[fft_size / 2].Re = p_buf[0].Re - p_buf[0].Im;
261  p_dst[0].Im = 0.0f;
262  p_dst[fft_size / 4].Im = -p_buf[fft_size / 4].Im;
263  p_dst[fft_size / 2].Im = 0.0f;
264
265  return OMX_Sts_NoErr;
266}
267