@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of omxSP_FFTInv_CCSToR_S32_Sfs_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute the inverse FFT of a complex spectrum stored in CCS
@// (conjugate-symmetric) format, producing a real-valued F32 signal.
@//
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Import symbols required from other files.
@// Every routine listed here is called with BL from
@// omxSP_FFTInv_CCSToR_F32_Sfs further down in this file.

        @// Pre-twiddle pass that runs before the complex inverse FFT
        .extern  armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe

        @// First-stage ("fs") routines
        .extern  armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        .extern  armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
        .extern  armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe

        @// Intermediate-stage routines
        .extern  armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
        .extern  armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe

        @// Last-stage ("ls") routines
        .extern  armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
        .extern  armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe


@// Set debugging level
@//DEBUG_ON    SETL {TRUE}


@// Register aliases.
@//
@// These are C-preprocessor macros, so each name below is replaced by the
@// register on its right wherever it appears as a whole token in the rest
@// of the file. Several names deliberately share one physical register;
@// only one of the names sharing a register can hold a live value at any
@// given point.

@// Input registers

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3


@// Output registers (result shares r0 with pSrc)
#define result          r0

@// Local scratch registers

@// argTwiddle shares r1 with pDst; argDst shares r2 with pFFTSpec.
#define argTwiddle      r1
#define argDst          r2
@// argScale, tmpOrder, pTwiddle (and x0r below) all live in r4.
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
@// subFFTNum and N are the same register (r6).
#define subFFTNum       r6
#define N               r6
@// order lives in r14, which is also the link register: every BL
@// overwrites it, so it must be tested or copied before a call.
#define order           r14
#define diff            r9
@// Total number of radix stages required to complete the FFT
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3

#define pOut1           r2
#define size            r7
#define step            r8
#define step1           r9
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@// Neon registers

#define dX0     D0.F32
#define dShift  D1.F32
#define dX1     D1.F32
#define dY0     D2.F32
#define dY1     D3.F32
#define dX0r    D0.F32
#define dX0i    D1.F32
#define dX1r    D2.F32
#define dX1i    D3.F32
#define dW0r    D4.F32
#define dW0i    D5.F32
#define dW1r    D6.F32
#define dW1i    D7.F32
#define dT0     D8.F32
#define dT1     D9.F32
#define dT2     D10.F32
#define dT3     D11.F32
@// NOTE(review): qT0..qT3 are named like Q registers but expand to D
@// registers. Nothing in this file references them; confirm the intended
@// register class before using them.
#define qT0     d12.F32
#define qT1     d14.F32
#define qT2     d16.F32
#define qT3     d18.F32
#define dY0r    D4.F32
#define dY0i    D5.F32
#define dY1r    D6.F32
#define dY1i    D7.F32
#define dzero   D20.F32

#define dY2     D4.F32
#define dY3     D5.F32
#define dW0     D6.F32
#define dW1     D7.F32
#define dW0Tmp  D10.F32
#define dW1Neg  D11.F32

@// sN / fN hold the FFT size as an integer and as a float respectively.
#define sN      S0.S32
#define fN      S1.F32
@// one must be the same as dScale[0]: S4 is the low 32-bit lane of D2, so
@// writing the scalar into one also sets the lane used by the
@// by-scalar multiplies in the scaling loop.
#define dScale  D2.F32
#define one     S4.F32

@// Q2 and Q3 (D4..D7) carry the data being scaled; they do not overlap D2.
#define qX0     Q2.F32
#define qX1     Q3.F32

@// ---------------------------------------------------------------------
@// omxSP_FFTInv_CCSToR_F32_Sfs
@//
@// Inverse FFT from a CCS-format complex spectrum to a real F32 signal.
@//
@// In:   pSrc     (r0) = source data
@//       pDst     (r1) = destination buffer
@//       pFFTSpec (r2) = FFT spec structure; the fields N, pBitRev,
@//                       pTwiddle and pBuf are read at byte offsets
@//                       0, 4, 8 and 12
@// Out:  result   (r0) = OMX_Sts_NoErr (no other value is returned)
@//
@// Flow: N <= 1 copies one 32-bit element from pSrc to pDst and returns.
@//       Otherwise the pre-twiddle routine is called, an N/2-point
@//       complex inverse FFT is assembled from radix-2/4/8 stage
@//       routines chosen by order = log2(N/2), and the output is
@//       multiplied by 1 / (N/2).
@//
@// NOTE(review): the register contract of the *_unsafe routines (which
@// registers they read, update and preserve) and the exact save/restore
@// behaviour of M_START / M_END are defined outside this file. This
@// code assumes the stage routines keep pSrc, subFFTNum and subFFTSize
@// up to date and preserve r4 across the first stage - confirm against
@// those sources before changing the call sequence.
@// ---------------------------------------------------------------------

        @// Allocate stack memory required by the function:
        @// one 4-byte slot that keeps the complex FFT size across the calls
        M_ALLOC4        complexFFTSize, 4

        @// Write function header
        M_START     omxSP_FFTInv_CCSToR_F32_Sfs,r11,d15

        @// Structure offsets for the FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @// Read the FFT size and the other structure parameters
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @// N <= 1 is treated separately: copy a single 32-bit element
        CMP     N,#1
        BGT     sizeGreaterThanOne
        VLD1    dX0[0],[pSrc]
        VST1    dX0[0],[pDst]

        B       End

sizeGreaterThanOne:

        @// Call the preTwiddle Radix2 stage before doing the complex IFFT
        BL    armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe


complexIFFT:

        ASR     N,N,#1                  @// N/2 point complex IFFT
        M_STR   N, complexFFTSize       @// Save N/2 for scaling later
        ADD     pSrc,pOut,N,LSL #3      @// pSrc = pOut + 8 bytes * (N/2)

        CLZ     order,N                 @// N/2 = 2^order
        RSB     order,order,#31
        MOV     subFFTSize,#1
        @// subFFTNum needs no initialisation: it is the same register as N.

        CMP     order,#3
        BGT     orderGreaterthan3       @// order > 3

        CMP     order,#1
        BGE     orderGreaterthan0       @// order > 0

        @// order == 0: a single complex element, no butterfly needed.
        @// Copy it to pDst and go straight to the scaling code.
        VLD1    dX0,[pSrc]
        VST1    dX0,[pDst]
        MOV     pSrc,pDst
        B       FFTEnd

orderGreaterthan0:
        @// Set the buffers appropriately for various orders (1, 2 or 3).
        @// The MOVs below do not touch the flags, so the result of this
        @// CMP is still valid at the BGE.
        CMP     order,#2
        MOVNE   argDst,pDst
        MOVEQ   argDst,pOut
        @// Pass the first stage destination in r5 (pOut)
        MOVEQ   pOut,pDst
        MOV     argTwiddle,pTwiddle

        BGE     orderGreaterthan1
        @// order == 1: the first stage is the only stage
        BL      armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        B       FFTEnd

orderGreaterthan1:
        @// order lives in r14 and is destroyed by the BL below, so keep a
        @// copy in r4. This overwrites pTwiddle, which has already been
        @// handed over in argTwiddle.
        MOV     tmpOrder,order
        BL      armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        CMP     tmpOrder,#2
        @// order == 3 needs one intermediate radix-2 stage
        BLGT    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe
        BL      armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
        B       FFTEnd


orderGreaterthan3:
specialScaleCase:

        @// Set input args to fft stages
        TST     order, #2
        MOVNE   argDst,pDst
        MOVEQ   argDst,pOut
        @// Pass the first stage destination in r5 (pOut)
        MOVEQ   pOut,pDst
        MOV     argTwiddle,pTwiddle

        @// Even order starts with a radix-4 first stage, odd order with a
        @// radix-8 first stage. The choice is made with an explicit branch
        @// so that it does not depend on the condition flags left behind
        @// by the routine that was called. order (r14) is tested before
        @// the BL overwrites it.
        TST     order,#0x00000001
        BNE     firstStageRadix8
        BL      armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
        B       firstStageDone

firstStageRadix8:
        BL      armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe

firstStageDone:
        CMP     subFFTNum,#4
        BLT     FFTEnd
        BEQ     lastStageUnscaledRadix4

        @// Run intermediate radix-4 stages until subFFTNum reaches 4
unscaledRadix4Loop:
        BL      armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        BNE     unscaledRadix4Loop

lastStageUnscaledRadix4:
        BL      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
        @// falls through to FFTEnd

FFTEnd:                                 @// Does only the scaling
        @// Scale the inverse FFT result by 1 / complexFFTSize

        M_LDR   N, complexFFTSize
        VMOV    sN,N
        VCVT    fN, sN                  @// fN = complexFFTSize as a float
        VMOV    one, 1.0
        VDIV    one, one, fN            @// one = dScale[0] = 1 / fN


        @// subFFTSize is used as the count of complex elements left to
        @// scale. On the order == 0 path it is still 1 (set above and never
        @// updated), so both tests below skip the scaling; the factor is
        @// 1.0 in that case anyway. On the other paths it is expected to
        @// equal complexFFTSize, a power of two >= 2.
        CMP     subFFTSize, #4
        BLT     scaleFFTData1
scaleFFTData:
        @// Scale 4 complex (8 float) elements at a time.
        @// pSrc holds the pDst pointer here; the :256 qualifier requires
        @// that address to be 32-byte aligned.
        VLD1    {qX0, qX1}, [pSrc :256]
        SUBS    subFFTSize, subFFTSize, #4
        VMUL    qX0, qX0, dScale[0]
        VMUL    qX1, qX1, dScale[0]
        VST1    {qX0, qX1}, [pSrc :256]!

        BGT     scaleFFTData
scaleFFTData1:
        @// Remaining 2 complex (4 float) elements, if any
        CMP     subFFTSize, #2
        BLT     End
        VLD1    {qX0}, [pSrc]
        VMUL    qX0, qX0, dScale[0]
        VST1    {qX0}, [pSrc]!
End:
        @// Set return value
        MOV     result, #OMX_Sts_NoErr

        @// Write function tail
        M_END



        .end
