/*
 * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
 * Copyright Takuya OOURA, 1996-2001
 *
 * You may use, copy, modify and distribute this code for any purpose (include
 * commercial use) and without fee. Please refer to this package when you modify
 * this code.
 *
 * Changes by the WebRTC authors:
 *    - Trivial type modifications.
 *    - Minimal code subset to do rdft of length 128.
 *    - Optimizations because of known length.
 *
 *  All changes are covered by the WebRTC license and IP grant:
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
21
22#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
23
24#include <math.h>
25
26#include "webrtc/system_wrappers/include/cpu_features_wrapper.h"
27#include "webrtc/typedefs.h"
28
// These tables used to be computed at run-time. For example, refer to:
// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/aec/aec_rdft.c?r=6564
// to see the initialization code.
//
// rdft_w layout, as consumed by the C code in this file:
//   [0, 32):  read two floats at a time as (real, imag) weights by
//             cft1st_128_C() and cftmdl_128_C().
//   [32, 64): read through `rdft_w + 32` by rftfsub_128_C() and
//             rftbsub_128_C().
const float rdft_w[64] = {
    1.0000000000f, 0.0000000000f, 0.7071067691f, 0.7071067691f,
    0.9238795638f, 0.3826834559f, 0.3826834559f, 0.9238795638f,
    0.9807852507f, 0.1950903237f, 0.5555702448f, 0.8314695954f,
    0.8314695954f, 0.5555702448f, 0.1950903237f, 0.9807852507f,
    0.9951847196f, 0.0980171412f, 0.6343933344f, 0.7730104327f,
    0.8819212914f, 0.4713967443f, 0.2902846634f, 0.9569403529f,
    0.9569403529f, 0.2902846634f, 0.4713967443f, 0.8819212914f,
    0.7730104327f, 0.6343933344f, 0.0980171412f, 0.9951847196f,
    0.7071067691f, 0.4993977249f, 0.4975923598f, 0.4945882559f,
    0.4903926253f, 0.4850156307f, 0.4784701765f, 0.4707720280f,
    0.4619397819f, 0.4519946277f, 0.4409606457f, 0.4288643003f,
    0.4157347977f, 0.4016037583f, 0.3865052164f, 0.3704755902f,
    0.3535533845f, 0.3357794881f, 0.3171966672f, 0.2978496552f,
    0.2777851224f, 0.2570513785f, 0.2356983721f, 0.2137775421f,
    0.1913417280f, 0.1684449315f, 0.1451423317f, 0.1214900985f,
    0.0975451618f, 0.0733652338f, 0.0490085706f, 0.0245338380f,
};
// (wk3r, wk3i) weight pairs. cft1st_128_C() and cftmdl_128_C() index this
// table with the same even offset `k1` they use for rdft_w, and apply the
// pair to the first of the two sub-groups handled per iteration.
const float rdft_wk3ri_first[16] = {
    1.000000000f, 0.000000000f, 0.382683456f, 0.923879564f,
    0.831469536f, 0.555570245f, -0.195090353f, 0.980785251f,
    0.956940353f, 0.290284693f, 0.098017156f, 0.995184720f,
    0.634393334f, 0.773010492f, -0.471396863f, 0.881921172f,
};
// (wk3r, wk3i) weight pairs. Companion of rdft_wk3ri_first: cft1st_128_C()
// and cftmdl_128_C() read it at the same offset `k1`, and apply the pair to
// the second of the two sub-groups handled per iteration.
const float rdft_wk3ri_second[16] = {
    -0.707106769f, 0.707106769f, -0.923879564f, -0.382683456f,
    -0.980785251f, 0.195090353f, -0.555570245f, -0.831469536f,
    -0.881921172f, 0.471396863f, -0.773010492f, -0.634393334f,
    -0.995184720f, -0.098017156f, -0.290284693f, -0.956940353f,
};
62ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
63    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f,
64    0.923879564f, 0.923879564f, 0.382683456f, 0.382683456f,
65    0.980785251f, 0.980785251f, 0.555570245f, 0.555570245f,
66    0.831469595f, 0.831469595f, 0.195090324f, 0.195090324f,
67    0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
68    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f,
69    0.956940353f, 0.956940353f, 0.471396744f, 0.471396744f,
70    0.773010433f, 0.773010433f, 0.098017141f, 0.098017141f,
71};
72ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
73    1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f,
74    0.707106769f, 0.707106769f, -0.707106769f, -0.707106769f,
75    0.923879564f, 0.923879564f, -0.382683456f, -0.382683456f,
76    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
77    0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f,
78    0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f,
79    0.831469595f, 0.831469595f, -0.555570245f, -0.555570245f,
80    0.195090324f, 0.195090324f, -0.980785251f, -0.980785251f,
81};
82ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
83    1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f,
84    0.382683456f, 0.382683456f, -0.923879564f, -0.923879564f,
85    0.831469536f, 0.831469536f, -0.980785251f, -0.980785251f,
86    -0.195090353f, -0.195090353f, -0.555570245f, -0.555570245f,
87    0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f,
88    0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f,
89    0.634393334f, 0.634393334f, -0.995184720f, -0.995184720f,
90    -0.471396863f, -0.471396863f, -0.290284693f, -0.290284693f,
91};
92ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
93    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
94    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
95    -0.195090324f, 0.195090324f, -0.831469595f, 0.831469595f,
96    -0.555570245f, 0.555570245f, -0.980785251f, 0.980785251f,
97    -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f,
98    -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f,
99    -0.290284663f, 0.290284663f, -0.881921291f, 0.881921291f,
100    -0.634393334f, 0.634393334f, -0.995184720f, 0.995184720f,
101};
102ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
103    -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f,
104    -0.707106769f, 0.707106769f, -0.707106769f, 0.707106769f,
105    -0.382683456f, 0.382683456f, -0.923879564f, 0.923879564f,
106    -0.923879564f, 0.923879564f, -0.382683456f, 0.382683456f,
107    -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f,
108    -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f,
109    -0.555570245f, 0.555570245f, -0.831469595f, 0.831469595f,
110    -0.980785251f, 0.980785251f, -0.195090324f, 0.195090324f,
111};
112ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
113    -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f,
114    -0.923879564f, 0.923879564f, 0.382683456f, -0.382683456f,
115    -0.555570245f, 0.555570245f, -0.195090353f, 0.195090353f,
116    -0.980785251f, 0.980785251f, 0.831469536f, -0.831469536f,
117    -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f,
118    -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f,
119    -0.773010492f, 0.773010492f, 0.098017156f, -0.098017156f,
120    -0.881921172f, 0.881921172f, 0.956940353f, -0.956940353f,
121};
122ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
123    0.707106769f, 0.707106769f, 0.707106769f, -0.707106769f,
124};
125
// Exchanges the two-float (re, im) element starting at a[p] with the one
// starting at a[q].
static inline void bitrv2_swap_pair(float* a, unsigned int p, unsigned int q) {
  const float xr = a[p + 0];
  const float xi = a[p + 1];
  a[p + 0] = a[q + 0];
  a[p + 1] = a[q + 1];
  a[q + 0] = xr;
  a[q + 1] = xi;
}

// Reorders `a` in place by swapping a fixed set of (re, im) pairs; this is
// the index-permutation step that aec_rdft_forward_128() and
// aec_rdft_inverse_128() run before the butterfly passes.
//
// a: array of at least 128 floats, treated as 64 interleaved (re, im)
//    pairs. Every swap is between two distinct pairs and no pair is swapped
//    more than once, so calling the function twice restores the input.
static void bitrv2_128_C(float* a) {
  /*
      Following things have been attempted but are no faster:
      (a) Storing the swap indexes in a LUT (index calculations are done
          for 'free' while waiting on memory/L1).
      (b) Consolidate the load/store of two consecutive floats by a 64 bit
          integer (execution is memory/L1 bound).
      (c) Do a mix of floats and 64 bit integer to maximize register
          utilization (execution is memory/L1 bound).
      (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
      (e) Hard-coding of the offsets to completely eliminates index
          calculations.
  */

  // Unsigned, like the indices below, so the offset arithmetic does not mix
  // signed and unsigned operands.
  static const unsigned int ip[4] = {0, 64, 32, 96};
  unsigned int j, j1, k, k1;

  for (k = 0; k < 4; k++) {
    // Off-diagonal (j < k) entries: four swaps per (j, k) combination.
    for (j = 0; j < k; j++) {
      j1 = 2 * j + ip[k];
      k1 = 2 * k + ip[j];
      bitrv2_swap_pair(a, j1, k1);
      j1 += 8;
      k1 += 16;
      bitrv2_swap_pair(a, j1, k1);
      j1 += 8;
      k1 -= 8;
      bitrv2_swap_pair(a, j1, k1);
      j1 += 8;
      k1 += 16;
      bitrv2_swap_pair(a, j1, k1);
    }
    // Diagonal (j == k) entry: a single swap.
    j1 = 2 * k + 8 + ip[k];
    k1 = j1 + 8;
    bitrv2_swap_pair(a, j1, k1);
  }
}
199
200static void cft1st_128_C(float* a) {
201  const int n = 128;
202  int j, k1, k2;
203  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
204  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
205
206  // The processing of the first set of elements was simplified in C to avoid
207  // some operations (multiplication by zero or one, addition of two elements
208  // multiplied by the same weight, ...).
209  x0r = a[0] + a[2];
210  x0i = a[1] + a[3];
211  x1r = a[0] - a[2];
212  x1i = a[1] - a[3];
213  x2r = a[4] + a[6];
214  x2i = a[5] + a[7];
215  x3r = a[4] - a[6];
216  x3i = a[5] - a[7];
217  a[0] = x0r + x2r;
218  a[1] = x0i + x2i;
219  a[4] = x0r - x2r;
220  a[5] = x0i - x2i;
221  a[2] = x1r - x3i;
222  a[3] = x1i + x3r;
223  a[6] = x1r + x3i;
224  a[7] = x1i - x3r;
225  wk1r = rdft_w[2];
226  x0r = a[8] + a[10];
227  x0i = a[9] + a[11];
228  x1r = a[8] - a[10];
229  x1i = a[9] - a[11];
230  x2r = a[12] + a[14];
231  x2i = a[13] + a[15];
232  x3r = a[12] - a[14];
233  x3i = a[13] - a[15];
234  a[8] = x0r + x2r;
235  a[9] = x0i + x2i;
236  a[12] = x2i - x0i;
237  a[13] = x0r - x2r;
238  x0r = x1r - x3i;
239  x0i = x1i + x3r;
240  a[10] = wk1r * (x0r - x0i);
241  a[11] = wk1r * (x0r + x0i);
242  x0r = x3i + x1r;
243  x0i = x3r - x1i;
244  a[14] = wk1r * (x0i - x0r);
245  a[15] = wk1r * (x0i + x0r);
246  k1 = 0;
247  for (j = 16; j < n; j += 16) {
248    k1 += 2;
249    k2 = 2 * k1;
250    wk2r = rdft_w[k1 + 0];
251    wk2i = rdft_w[k1 + 1];
252    wk1r = rdft_w[k2 + 0];
253    wk1i = rdft_w[k2 + 1];
254    wk3r = rdft_wk3ri_first[k1 + 0];
255    wk3i = rdft_wk3ri_first[k1 + 1];
256    x0r = a[j + 0] + a[j + 2];
257    x0i = a[j + 1] + a[j + 3];
258    x1r = a[j + 0] - a[j + 2];
259    x1i = a[j + 1] - a[j + 3];
260    x2r = a[j + 4] + a[j + 6];
261    x2i = a[j + 5] + a[j + 7];
262    x3r = a[j + 4] - a[j + 6];
263    x3i = a[j + 5] - a[j + 7];
264    a[j + 0] = x0r + x2r;
265    a[j + 1] = x0i + x2i;
266    x0r -= x2r;
267    x0i -= x2i;
268    a[j + 4] = wk2r * x0r - wk2i * x0i;
269    a[j + 5] = wk2r * x0i + wk2i * x0r;
270    x0r = x1r - x3i;
271    x0i = x1i + x3r;
272    a[j + 2] = wk1r * x0r - wk1i * x0i;
273    a[j + 3] = wk1r * x0i + wk1i * x0r;
274    x0r = x1r + x3i;
275    x0i = x1i - x3r;
276    a[j + 6] = wk3r * x0r - wk3i * x0i;
277    a[j + 7] = wk3r * x0i + wk3i * x0r;
278    wk1r = rdft_w[k2 + 2];
279    wk1i = rdft_w[k2 + 3];
280    wk3r = rdft_wk3ri_second[k1 + 0];
281    wk3i = rdft_wk3ri_second[k1 + 1];
282    x0r = a[j + 8] + a[j + 10];
283    x0i = a[j + 9] + a[j + 11];
284    x1r = a[j + 8] - a[j + 10];
285    x1i = a[j + 9] - a[j + 11];
286    x2r = a[j + 12] + a[j + 14];
287    x2i = a[j + 13] + a[j + 15];
288    x3r = a[j + 12] - a[j + 14];
289    x3i = a[j + 13] - a[j + 15];
290    a[j + 8] = x0r + x2r;
291    a[j + 9] = x0i + x2i;
292    x0r -= x2r;
293    x0i -= x2i;
294    a[j + 12] = -wk2i * x0r - wk2r * x0i;
295    a[j + 13] = -wk2i * x0i + wk2r * x0r;
296    x0r = x1r - x3i;
297    x0i = x1i + x3r;
298    a[j + 10] = wk1r * x0r - wk1i * x0i;
299    a[j + 11] = wk1r * x0i + wk1i * x0r;
300    x0r = x1r + x3i;
301    x0i = x1i - x3r;
302    a[j + 14] = wk3r * x0r - wk3i * x0i;
303    a[j + 15] = wk3r * x0i + wk3i * x0r;
304  }
305}
306
307static void cftmdl_128_C(float* a) {
308  const int l = 8;
309  const int n = 128;
310  const int m = 32;
311  int j0, j1, j2, j3, k, k1, k2, m2;
312  float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
313  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
314
315  for (j0 = 0; j0 < l; j0 += 2) {
316    j1 = j0 + 8;
317    j2 = j0 + 16;
318    j3 = j0 + 24;
319    x0r = a[j0 + 0] + a[j1 + 0];
320    x0i = a[j0 + 1] + a[j1 + 1];
321    x1r = a[j0 + 0] - a[j1 + 0];
322    x1i = a[j0 + 1] - a[j1 + 1];
323    x2r = a[j2 + 0] + a[j3 + 0];
324    x2i = a[j2 + 1] + a[j3 + 1];
325    x3r = a[j2 + 0] - a[j3 + 0];
326    x3i = a[j2 + 1] - a[j3 + 1];
327    a[j0 + 0] = x0r + x2r;
328    a[j0 + 1] = x0i + x2i;
329    a[j2 + 0] = x0r - x2r;
330    a[j2 + 1] = x0i - x2i;
331    a[j1 + 0] = x1r - x3i;
332    a[j1 + 1] = x1i + x3r;
333    a[j3 + 0] = x1r + x3i;
334    a[j3 + 1] = x1i - x3r;
335  }
336  wk1r = rdft_w[2];
337  for (j0 = m; j0 < l + m; j0 += 2) {
338    j1 = j0 + 8;
339    j2 = j0 + 16;
340    j3 = j0 + 24;
341    x0r = a[j0 + 0] + a[j1 + 0];
342    x0i = a[j0 + 1] + a[j1 + 1];
343    x1r = a[j0 + 0] - a[j1 + 0];
344    x1i = a[j0 + 1] - a[j1 + 1];
345    x2r = a[j2 + 0] + a[j3 + 0];
346    x2i = a[j2 + 1] + a[j3 + 1];
347    x3r = a[j2 + 0] - a[j3 + 0];
348    x3i = a[j2 + 1] - a[j3 + 1];
349    a[j0 + 0] = x0r + x2r;
350    a[j0 + 1] = x0i + x2i;
351    a[j2 + 0] = x2i - x0i;
352    a[j2 + 1] = x0r - x2r;
353    x0r = x1r - x3i;
354    x0i = x1i + x3r;
355    a[j1 + 0] = wk1r * (x0r - x0i);
356    a[j1 + 1] = wk1r * (x0r + x0i);
357    x0r = x3i + x1r;
358    x0i = x3r - x1i;
359    a[j3 + 0] = wk1r * (x0i - x0r);
360    a[j3 + 1] = wk1r * (x0i + x0r);
361  }
362  k1 = 0;
363  m2 = 2 * m;
364  for (k = m2; k < n; k += m2) {
365    k1 += 2;
366    k2 = 2 * k1;
367    wk2r = rdft_w[k1 + 0];
368    wk2i = rdft_w[k1 + 1];
369    wk1r = rdft_w[k2 + 0];
370    wk1i = rdft_w[k2 + 1];
371    wk3r = rdft_wk3ri_first[k1 + 0];
372    wk3i = rdft_wk3ri_first[k1 + 1];
373    for (j0 = k; j0 < l + k; j0 += 2) {
374      j1 = j0 + 8;
375      j2 = j0 + 16;
376      j3 = j0 + 24;
377      x0r = a[j0 + 0] + a[j1 + 0];
378      x0i = a[j0 + 1] + a[j1 + 1];
379      x1r = a[j0 + 0] - a[j1 + 0];
380      x1i = a[j0 + 1] - a[j1 + 1];
381      x2r = a[j2 + 0] + a[j3 + 0];
382      x2i = a[j2 + 1] + a[j3 + 1];
383      x3r = a[j2 + 0] - a[j3 + 0];
384      x3i = a[j2 + 1] - a[j3 + 1];
385      a[j0 + 0] = x0r + x2r;
386      a[j0 + 1] = x0i + x2i;
387      x0r -= x2r;
388      x0i -= x2i;
389      a[j2 + 0] = wk2r * x0r - wk2i * x0i;
390      a[j2 + 1] = wk2r * x0i + wk2i * x0r;
391      x0r = x1r - x3i;
392      x0i = x1i + x3r;
393      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
394      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
395      x0r = x1r + x3i;
396      x0i = x1i - x3r;
397      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
398      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
399    }
400    wk1r = rdft_w[k2 + 2];
401    wk1i = rdft_w[k2 + 3];
402    wk3r = rdft_wk3ri_second[k1 + 0];
403    wk3i = rdft_wk3ri_second[k1 + 1];
404    for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
405      j1 = j0 + 8;
406      j2 = j0 + 16;
407      j3 = j0 + 24;
408      x0r = a[j0 + 0] + a[j1 + 0];
409      x0i = a[j0 + 1] + a[j1 + 1];
410      x1r = a[j0 + 0] - a[j1 + 0];
411      x1i = a[j0 + 1] - a[j1 + 1];
412      x2r = a[j2 + 0] + a[j3 + 0];
413      x2i = a[j2 + 1] + a[j3 + 1];
414      x3r = a[j2 + 0] - a[j3 + 0];
415      x3i = a[j2 + 1] - a[j3 + 1];
416      a[j0 + 0] = x0r + x2r;
417      a[j0 + 1] = x0i + x2i;
418      x0r -= x2r;
419      x0i -= x2i;
420      a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
421      a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
422      x0r = x1r - x3i;
423      x0i = x1i + x3r;
424      a[j1 + 0] = wk1r * x0r - wk1i * x0i;
425      a[j1 + 1] = wk1r * x0i + wk1i * x0r;
426      x0r = x1r + x3i;
427      x0i = x1i - x3r;
428      a[j3 + 0] = wk3r * x0r - wk3i * x0i;
429      a[j3 + 1] = wk3r * x0i + wk3i * x0r;
430    }
431  }
432}
433
// Forward complex butterfly driver for the 128-float buffer `a`.
//
// Runs the first two passes through the cft1st_128 and cftmdl_128 dispatch
// pointers (so an architecture-specific version is used when one has been
// installed), then performs the last pass here: an unweighted butterfly
// over four (re, im) pairs spaced l = 32 floats apart.
//
// a: array of at least 128 floats, updated in place.
static void cftfsub_128_C(float* a) {
  const int l = 32;
  int j, j1, j2, j3;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);
  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = a[j + 1] + a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = a[j + 1] - a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i + x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i - x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i + x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i - x3r;
  }
}
463
// Backward (inverse-direction) complex butterfly driver for the 128-float
// buffer `a`.
//
// Runs the same first two passes as cftfsub_128_C() through the cft1st_128
// and cftmdl_128 dispatch pointers. The last pass, done here over four
// (re, im) pairs spaced l = 32 floats apart, differs from the forward one
// only in the signs applied to the imaginary parts.
//
// a: array of at least 128 floats, updated in place.
static void cftbsub_128_C(float* a) {
  const int l = 32;
  int j, j1, j2, j3;
  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;

  cft1st_128(a);
  cftmdl_128(a);

  for (j = 0; j < l; j += 2) {
    j1 = j + l;
    j2 = j1 + l;
    j3 = j2 + l;
    x0r = a[j] + a[j1];
    x0i = -a[j + 1] - a[j1 + 1];
    x1r = a[j] - a[j1];
    x1i = -a[j + 1] + a[j1 + 1];
    x2r = a[j2] + a[j3];
    x2i = a[j2 + 1] + a[j3 + 1];
    x3r = a[j2] - a[j3];
    x3i = a[j2 + 1] - a[j3 + 1];
    a[j] = x0r + x2r;
    a[j + 1] = x0i - x2i;
    a[j2] = x0r - x2r;
    a[j2 + 1] = x0i + x2i;
    a[j1] = x1r - x3i;
    a[j1 + 1] = x1i - x3r;
    a[j3] = x1r + x3i;
    a[j3 + 1] = x1i + x3r;
  }
}
494
495static void rftfsub_128_C(float* a) {
496  const float* c = rdft_w + 32;
497  int j1, j2, k1, k2;
498  float wkr, wki, xr, xi, yr, yi;
499
500  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
501    k2 = 128 - j2;
502    k1 = 32 - j1;
503    wkr = 0.5f - c[k1];
504    wki = c[j1];
505    xr = a[j2 + 0] - a[k2 + 0];
506    xi = a[j2 + 1] + a[k2 + 1];
507    yr = wkr * xr - wki * xi;
508    yi = wkr * xi + wki * xr;
509    a[j2 + 0] -= yr;
510    a[j2 + 1] -= yi;
511    a[k2 + 0] += yr;
512    a[k2 + 1] -= yi;
513  }
514}
515
516static void rftbsub_128_C(float* a) {
517  const float* c = rdft_w + 32;
518  int j1, j2, k1, k2;
519  float wkr, wki, xr, xi, yr, yi;
520
521  a[1] = -a[1];
522  for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
523    k2 = 128 - j2;
524    k1 = 32 - j1;
525    wkr = 0.5f - c[k1];
526    wki = c[j1];
527    xr = a[j2 + 0] - a[k2 + 0];
528    xi = a[j2 + 1] + a[k2 + 1];
529    yr = wkr * xr + wki * xi;
530    yi = wkr * xi - wki * xr;
531    a[j2 + 0] = a[j2 + 0] - yr;
532    a[j2 + 1] = yi - a[j2 + 1];
533    a[k2 + 0] = yr + a[k2 + 0];
534    a[k2 + 1] = yi - a[k2 + 1];
535  }
536  a[65] = -a[65];
537}
538
// Forward real DFT of length 128, computed in place.
//
// Calls the bitrv2_128, cftfsub_128 and rftfsub_128 dispatch pointers in
// that order, then replaces a[0] with a[0] + a[1] and a[1] with
// a[0] - a[1] (both computed from the values before the update).
// aec_rdft_init() must have been called first: the dispatch pointers have
// no initializer and are only assigned there.
//
// a: array of at least 128 floats; input samples on entry, transform on
//    return.
// NOTE(review): per Ooura's rdft convention a[0] and a[1] presumably end up
// holding the real DC and Nyquist terms, with (re, im) pairs for the other
// bins -- confirm against aec_rdft.h / the Ooura package documentation.
void aec_rdft_forward_128(float* a) {
  float xi;
  bitrv2_128(a);
  cftfsub_128(a);
  rftfsub_128(a);
  xi = a[0] - a[1];
  a[0] += a[1];
  a[1] = xi;
}
548
// Inverse real DFT of length 128, computed in place.
//
// First replaces a[1] with 0.5 * (a[0] - a[1]) and a[0] with a[0] minus that
// new a[1], then calls the rftbsub_128, bitrv2_128 and cftbsub_128 dispatch
// pointers in that order. Apart from that 0.5 factor, no scaling is applied
// in this function. aec_rdft_init() must have been called first: the
// dispatch pointers have no initializer and are only assigned there.
//
// a: array of at least 128 floats, in the layout produced by
//    aec_rdft_forward_128(); overwritten with the result.
// NOTE(review): callers presumably need to apply the 2/N normalisation of
// Ooura's rdft themselves -- confirm against the call sites.
void aec_rdft_inverse_128(float* a) {
  a[1] = 0.5f * (a[0] - a[1]);
  a[0] -= a[1];
  rftbsub_128(a);
  bitrv2_128(a);
  cftbsub_128(a);
}
556
557// code path selection
558RftSub128 cft1st_128;
559RftSub128 cftmdl_128;
560RftSub128 rftfsub_128;
561RftSub128 rftbsub_128;
562RftSub128 cftfsub_128;
563RftSub128 cftbsub_128;
564RftSub128 bitrv2_128;
565
566void aec_rdft_init(void) {
567  cft1st_128 = cft1st_128_C;
568  cftmdl_128 = cftmdl_128_C;
569  rftfsub_128 = rftfsub_128_C;
570  rftbsub_128 = rftbsub_128_C;
571  cftfsub_128 = cftfsub_128_C;
572  cftbsub_128 = cftbsub_128_C;
573  bitrv2_128 = bitrv2_128_C;
574#if defined(WEBRTC_ARCH_X86_FAMILY)
575  if (WebRtc_GetCPUInfo(kSSE2)) {
576    aec_rdft_init_sse2();
577  }
578#endif
579#if defined(MIPS_FPU_LE)
580  aec_rdft_init_mips();
581#endif
582#if defined(WEBRTC_HAS_NEON)
583  aec_rdft_init_neon();
584#elif defined(WEBRTC_DETECT_NEON)
585  if ((WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) != 0) {
586    aec_rdft_init_neon();
587  }
588#endif
589}
590