1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18/*
19
20 Pathname: ./src/fft_rx4_long.c
21 Funtions: fft_rx4_long
22
23------------------------------------------------------------------------------
24 REVISION HISTORY
25
26 Description:
27            (1) Eliminated search for max in the main loop.
28            (2) Reduced precision on w_256rx4 from Q15 to Q10
29
30 Description:
31            (1) Created function fft_rx4_long_no_max to overcome LTP problem.
32
33 Description:
34            (1) Modified shift so the accumulation growths faster than the
35                downshift, so now the input can be as high as 1.0 and saturation
36                will not occurre. The accumulation times the Q10 format will
37                never exceed 31 bits. This increases precision
38            (2) Eliminated unneeded data moves, used before for max search.
39            (3) Eliminated function fft_rx4_long_no_max.
40
41 Description:
42            (1) Added comment to explain max search elimination and
43                Q format during multiplications
44
45 Who:                       Date:
46 Description:
47
48------------------------------------------------------------------------------
49 INPUT AND OUTPUT DEFINITIONS
50
51 Inputs:
52    Data       =  Input complex vector, arranged in the following order:
53                  real, imag, real, imag...
54                  This is a complex vector whose elements (real and Imag) are
55                  Int32.
56                  type Int32 *
57
58    peak_value =  Input,  peak value of the input vector
59                  Output,  peak value of the resulting vector
60                  type Int32 *
61
62 Local Stores/Buffers/Pointers Needed:
63    None
64
65 Global Stores/Buffers/Pointers Needed:
66    None
67
68 Outputs:
69    None
70
71 Pointers and Buffers Modified:
72    calculation are done in-place and returned in Data
73
74 Local Stores Modified:
75    None
76
77 Global Stores Modified:
78    None
79
80------------------------------------------------------------------------------
81 FUNCTION DESCRIPTION
82
83    Fast Fourier Transform, radix 4 with Decimation in Frequency and block
84    floating point arithmetic.
85    The radix-4 FFT  simply divides the FFT into four smaller FFTs. Each of
86    the smaller FFTs is then further divided into smaller ones and so on.
87    It consists of log 4 N stages and each stage consists of N/4 dragonflies.
88
89    An FFT is nothing but a bundle of multiplications and summations which
90    may overflow during calculations.
91
92
93    This routine uses a scheme to test and scale the result output from
94    each FFT stage in order to fix the accumulation overflow.
95
96    The Input Data should be in Q13 format to get the highest precision.
97    At the end of each dragonfly calculation, a test for possible bit growth
98    is made, if bit growth is possible the Data is scale down back to Q13.
99
100------------------------------------------------------------------------------
101 REQUIREMENTS
102
103    This function should provide a fixed point FFT for an input array
104    of size 256.
105
106------------------------------------------------------------------------------
107 REFERENCES
108
109    [1] Advance Digital Signal Processing, J. Proakis, C. Rader, F. Ling,
110        C. Nikias, Macmillan Pub. Co.
111
112------------------------------------------------------------------------------
113 PSEUDO-CODE
114
115
116   MODIFY( x[] )
117   RETURN( exponent )
118
119------------------------------------------------------------------------------
120 RESOURCES USED
121   When the code is written for a specific target processor the
122     the resources used should be documented below.
123
124 STACK USAGE: [stack count for this module] + [variable to represent
125          stack usage for each subroutine called]
126
127     where: [stack usage variable] = stack usage for [subroutine
128         name] (see [filename].ext)
129
130 DATA MEMORY USED: x words
131
132 PROGRAM MEMORY USED: x words
133
134 CLOCK CYCLES: [cycle count equation for this module] + [variable
135           used to represent cycle count for each subroutine
136           called]
137
138     where: [cycle count variable] = cycle count for [subroutine
139        name] (see [filename].ext)
140
141------------------------------------------------------------------------------
142*/
143/*----------------------------------------------------------------------------
144; INCLUDES
145----------------------------------------------------------------------------*/
146
147#include "pv_audio_type_defs.h"
148#include "fft_rx4.h"
149
150#include "fxp_mul32.h"
151
152/*----------------------------------------------------------------------------
153; MACROS
154; Define module specific macros here
155----------------------------------------------------------------------------*/
156
157/*----------------------------------------------------------------------------
158; DEFINES
159; Include all pre-processor statements here. Include conditional
160; compile variables also.
161----------------------------------------------------------------------------*/
162
163/*----------------------------------------------------------------------------
164; LOCAL FUNCTION DEFINITIONS
165; Function Prototype declaration
166----------------------------------------------------------------------------*/
167
168/*----------------------------------------------------------------------------
169; LOCAL VARIABLE DEFINITIONS
170; Variable declaration - defined here and used outside this module
171----------------------------------------------------------------------------*/
172
173/*----------------------------------------------------------------------------
174; EXTERNAL FUNCTION REFERENCES
175; Declare functions defined elsewhere and referenced in this module
176----------------------------------------------------------------------------*/
177
178/*----------------------------------------------------------------------------
179; EXTERNAL VARIABLES REFERENCES
180; Declare variables used in this module but defined elsewhere
181----------------------------------------------------------------------------*/
182
183/*----------------------------------------------------------------------------
184; EXTERNAL GLOBAL STORE/BUFFER/POINTER REFERENCES
185; Declare variables used in this module but defined elsewhere
186----------------------------------------------------------------------------*/
187
188/*----------------------------------------------------------------------------
189; FUNCTION CODE
190----------------------------------------------------------------------------*/
191
192
193void fft_rx4_long(
194    Int32      Data[],
195    Int32      *peak_value)
196
197{
198    Int     n1;
199    Int     n2;
200    Int     j;
201    Int     k;
202    Int     i;
203
204    Int32   t1;
205    Int32   t2;
206    Int32   r1;
207    Int32   r2;
208    Int32   r3;
209    Int32   r4;
210    Int32   s1;
211    Int32   s2;
212    Int32   s3;
213    Int32   *pData1;
214    Int32   *pData2;
215    Int32   *pData3;
216    Int32   *pData4;
217    Int32   temp1;
218    Int32   temp2;
219    Int32   temp3;
220    Int32   temp4;
221    Int32   max;
222
223    Int32   exp_jw1;
224    Int32   exp_jw2;
225    Int32   exp_jw3;
226
227
228
229    const Int32  *pw = W_256rx4;
230
231    n2 = FFT_RX4_LONG;
232
233    for (k = FFT_RX4_LONG; k > 4; k >>= 2)
234    {
235
236        n1 = n2;
237        n2 >>= 2;
238
239        for (i = 0; i < FFT_RX4_LONG; i += n1)
240        {
241            pData1 = &Data[ i<<1];
242            pData2 = pData1 + n1;
243
244            temp1   = *pData1;
245            temp2   = *pData2;
246
247            r1      = temp1 + temp2;
248            r2      = temp1 - temp2;
249
250            pData3 = pData1 + (n1 >> 1);
251            pData4 = pData3 + n1;
252            temp3   = *pData3++;
253            temp4   = *pData4++;
254
255            t1      = temp3 + temp4;
256
257            *(pData1++) = (r1 + t1);
258            t2      = temp3 - temp4;
259            *(pData2++) = (r1 - t1);
260
261            temp1   = *pData1;
262            temp2   = *pData2;
263
264            s1      = temp1 + temp2;
265            temp3   = *pData3;
266            s2      = temp1 - temp2;
267            temp4   = *pData4;
268            *pData3--  = (s2 - t2);
269            *pData4--  = (s2 + t2);
270
271            t1      = temp3 + temp4;
272
273            *pData1    = (s1 + t1);
274            *pData2    = (s1 - t1);
275
276            r1      = temp3 - temp4;
277
278            *pData4    = (r2 - r1);
279            *pData3    = (r2 + r1);
280
281        }  /* i */
282
283
284
285        for (j = 1; j < n2; j++)
286        {
287
288            exp_jw1 = (*pw++);
289            exp_jw2 = (*pw++);
290            exp_jw3 = (*pw++);
291
292
293            for (i = j; i < FFT_RX4_LONG; i += n1)
294            {
295                pData1 = &Data[ i<<1];
296                pData2 = pData1 + n1;
297
298                temp1   = *pData1;
299                temp2   = *pData2++;
300
301                r1      = temp1 + temp2;
302                r2      = temp1 - temp2;
303
304                pData3 = pData1 + (n1 >> 1);
305                pData4 = pData3 + n1;
306                temp3   = *pData3++;
307                temp4   = *pData4++;
308
309                r3      = temp3 + temp4;
310                r4      = temp3 - temp4;
311
312                *(pData1++) = (r1 + r3);
313                r1          = (r1 - r3) << 1;
314
315                temp2   = *pData2;
316                temp1   = *pData1;
317
318                s1      = temp1 + temp2;
319                s2      = temp1 - temp2;
320                s3      = (s2 + r4) << 1;
321                s2      = (s2 - r4) << 1;
322
323                temp3   = *pData3;
324                temp4   = *pData4;
325
326                t1      = temp3 + temp4;
327                t2      = temp3 - temp4;
328
329                *pData1  = (s1 + t1);
330                s1       = (s1 - t1) << 1;
331
332                *pData2--  = cmplx_mul32_by_16(s1, -r1, exp_jw2);
333                r3      = (r2 - t2) << 1;
334                *pData2    = cmplx_mul32_by_16(r1,  s1, exp_jw2);
335
336                r2      = (r2 + t2) << 1;
337
338                *pData3--  = cmplx_mul32_by_16(s2, -r2, exp_jw1);
339                *pData3    = cmplx_mul32_by_16(r2,  s2, exp_jw1);
340
341                *pData4--  = cmplx_mul32_by_16(s3, -r3, exp_jw3);
342                *pData4    = cmplx_mul32_by_16(r3,  s3, exp_jw3);
343
344            }  /* i */
345
346        }  /*  j */
347
348    } /* k */
349
350
351    max = 0;
352
353    pData1 = Data - 7;
354
355
356    for (i = ONE_FOURTH_FFT_RX4_LONG; i != 0 ; i--)
357    {
358        pData1 += 7;
359        pData2 = pData1 + 4;
360
361
362        temp1   = *pData1;
363        temp2   = *pData2++;
364
365        r1      = temp1 + temp2;
366        r2      = temp1 - temp2;
367
368        pData3 = pData1 + 2;
369        pData4 = pData1 + 6;
370        temp1   = *pData3++;
371        temp2   = *pData4++;
372
373        t1      = temp1 + temp2;
374        t2      = temp1 - temp2;
375
376        temp1       = (r1 + t1);
377        r1          = (r1 - t1);
378        *(pData1++) = temp1;
379        max        |= (temp1 >> 31) ^ temp1;
380
381
382
383        temp2   = *pData2;
384        temp1   = *pData1;
385
386        s1      = temp1 + temp2;
387        s2      = temp1 - temp2;
388
389
390        temp1   = *pData3;
391        temp2   = *pData4;
392
393        s3      = (s2 + t2);
394        s2      = (s2 - t2);
395
396        t1      = temp1 + temp2;
397        t2      = temp1 - temp2;
398
399        temp1      = (s1 + t1);
400        *pData1    = temp1;
401        temp2      = (s1 - t1);
402
403        max       |= (temp1 >> 31) ^ temp1;
404        *pData2--  = temp2;
405        max       |= (temp2 >> 31) ^ temp2;
406
407        *pData2    = r1;
408        max       |= (r1 >> 31) ^ r1;
409        *pData3--  = s2;
410        max       |= (s2 >> 31) ^ s2;
411        *pData4--  = s3;
412        max       |= (s3 >> 31) ^ s3;
413
414        temp1      = (r2 - t2);
415        *pData4    = temp1;
416        temp2      = (r2 + t2);
417        *pData3    = temp2;
418        max       |= (temp1 >> 31) ^ temp1;
419        max       |= (temp2 >> 31) ^ temp2;
420
421    }  /* i */
422
423    *peak_value = max;
424
425    return ;
426
427}
428
429