@
@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Some code in this file was originally from file
@ omxSP_FFTInv_CToC_SC16_Sfs_s.S which was licensed as follows.
@ It has been relicensed with permission from the copyright holders.
@

@
@ File Name:  omxSP_FFTInv_CToC_SC16_Sfs_s.s
@ OpenMAX DL: v1.0.2
@ Last Modified Revision:   6729
@ Last Modified Date:       Tue, 17 Jul 2007
@
@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@

@
@ Description:
@ Compute an inverse FFT for a 16-bit real signal, with complex FFT routines.
@

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@ Pre-twiddle stage routines called before the complex IFFT.
.extern  armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe
.extern  armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe

@ Complex IFFT stage routines (fs = first stage, ps = intermediate stage
@ used for order 3, ls = last stage; Sfs marks the scaled variants).
@ Not every routine declared here is called from this file.
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
.extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
48
@
@ Register aliases.
@
@ These are plain preprocessor defines, so several names deliberately map
@ to the same physical register; two aliases of one register must never be
@ treated as live at the same time. Some aliases are not referenced by the
@ code in this file.
@

@ Input registers
#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3

@ Output registers (result shares r0 with pSrc)
#define result  r0

@ Local scratch registers
@ r1  : argTwiddle (aliases pDst)
@ r2  : argDst, diffMinusOne, pOut1 (alias pFFTSpec)
@ r3  : round (aliases scale)
@ r4  : argScale, pTwiddle, tmpOrder, x0r
@ r5  : pOut, x0i
@ r6  : subFFTNum, N
@ r7  : subFFTSize, size
@ r8  : count, step
@ r9  : diff, step1
@ r14 : order, zero (r14 is the link register, so every BL overwrites it)
#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define pTwiddle        r4
#define tmpOrder        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@ Total number of radix stages to complete the FFT
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3
#define pOut1           r2
#define size            r7
#define step            r8
#define step1           r9
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@ Neon registers
@ Q0 is the register pair D0:D1, so qShift overlaps both dX0 (D0) and
@ dShift (D1). Many D-register names below alias one another as well.
#define dX0             D0.S32
#define dShift          D1.S32
#define qShift          Q0.s16
#define dX1             D1.S32
#define dY0             D2.S32
#define dY1             D3.S32
#define dX0r            D0.S32
#define dX0i            D1.S32
#define dX1r            D2.S32
#define dX1i            D3.S32
#define dW0r            D4.S32
#define dW0i            D5.S32
#define dW1r            D6.S32
#define dW1i            D7.S32
#define dT0             D8.S32
#define dT1             D9.S32
#define dT2             D10.S32
#define dT3             D11.S32
#define qT0             Q6.S64
#define qT1             Q7.S64
#define qT0s            Q6.S16
#define qT1s            Q7.S16
#define qT2             Q8.S64
#define qT3             Q9.S64
#define dY0r            D4.S32
#define dY0i            D5.S32
#define dY1r            D6.S32
#define dY1i            D7.S32
#define dzero           D20.S32
#define dY2             D4.S32
#define dY3             D5.S32
#define dW0             D6.S32
#define dW1             D7.S32
#define dW0Tmp          D10.S32
#define dW1Neg          D11.S32
121
122
123
@
@ omxSP_FFTInv_CCSToR_S16_Sfs
@
@ In:   pSrc     (r0)  source pointer
@       pDst     (r1)  destination pointer
@       pFFTSpec (r2)  FFT spec structure; N, pTwiddle and pBuf are read
@                      from it at the ARMsFFTSpec_* offsets set below
@       scale    (r3)  scale factor requested by the caller
@ Out:  result   (r0)  always OMX_Sts_NoErr
@
@ Flow: run the pre-twiddle radix-2 stage, then an N/2 point complex IFFT
@ built from the external radix-2/4/8 stage routines, then apply the
@ remaining right shift (kept in diffOnStack) to the output in place.
@
@ The external stage routines take their arguments in fixed registers
@ (see the register aliases). Because order lives in r14 (the link
@ register), it is destroyed by every BL; it is copied to tmpOrder where
@ it is needed after a call.
@

    @ Allocate stack memory required by the function
        M_ALLOC4        diffOnStack, 4

    @ Write function header
        M_START     omxSP_FFTInv_CCSToR_S16_Sfs,r11,d15

@ Structure offsets for the FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @ Define stack arguments

        @ Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @ Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @ Call the preTwiddle Radix2 stage before doing the complex IFFT.
        @   scale == 0 : unscaled pre-twiddle
        @   scale  > 0 : scaled (Sfs) pre-twiddle
        @   scale  < 0 : no pre-twiddle call is made
        @ Exactly one routine is selected with an explicit branch, so the
        @ choice does not depend on the flags the callee returns with.

        CMP     scale,#0
        BEQ     preTwiddleUnscaled
        BLGT    armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe
        B       complexIFFT

preTwiddleUnscaled:
        BL      armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe

complexIFFT:

        ASR     N,N,#1                              @ N/2 point complex IFFT
        ADD     pSrc,pOut,N,LSL #2                  @ set pSrc as pOut1 (pOut + 4*N bytes)

        CLZ     order,N                             @ N = 2^order
        RSB     order,order,#31                     @ order = 31 - clz(N)
        MOV     subFFTSize,#1

        ADD     scale,scale,order                   @ FFTInverse has a final scaling factor by N

        CMP     order,#3
        BGT     orderGreaterthan3                   @ order > 3

        CMP     order,#1
        BGE     orderGreaterthan0                   @ order > 0

        @ order = 0: copy the single word from pSrc to pDst and let FFTEnd
        @ apply the shift stored in diffOnStack.
        M_STR   scale, diffOnStack,LT               @ order = 0
        LDRLT   x0r,[pSrc]
        STRLT   x0r,[pDst]
        MOVLT   pSrc,pDst
        BLT     FFTEnd

orderGreaterthan0:
        @ set the buffers appropriately for various orders
        @ The flags from this CMP are consumed by the conditional moves and
        @ by the BGE/BLLT further down; the instructions in between are
        @ written without the S suffix (M_STR is assumed to leave the flags
        @ untouched - it is defined in the included header).
        CMP     order,#2
        MOVNE   argDst,pDst
        MOVEQ   argDst,pOut
        MOVEQ   pOut,pDst                           @ Pass the first stage destination in RN5
        MOV     argTwiddle,pTwiddle
        @ Store the scale factor and scale at the end
        SUB     diff,scale,order
        M_STR   diff, diffOnStack
        BGE     orderGreaterthan1
        BLLT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  @ order = 1
        B       FFTEnd


orderGreaterthan1:
        @ order is in r14 and is overwritten by BL, so keep a copy.
        @ NOTE(review): this relies on the first-stage routine preserving
        @ r4 (tmpOrder) - confirm against the callee.
        MOV     tmpOrder,order                      @ tmpOrder = RN 4
        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
        CMP     tmpOrder,#2
        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe  @ order = 3 only
        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
        B       FFTEnd




orderGreaterthan3:
        @ check scale = 0 or scale = order
        @ scale already includes order (added above), so diff is the scale
        @ value the caller passed in.
        SUB     diff, scale, order                  @ scale > order

        TST     order, #2                           @ Set input args to fft stages
        MOVNE   argDst,pDst
        MOVEQ   argDst,pOut
        MOVEQ   pOut,pDst                           @ Pass the first stage destination in RN5
        MOV     argTwiddle,pTwiddle

        CMP     diff,#0
        M_STR   diff, diffOnStack
        BGE     scaleEqualsOrder

        @ Unscaled path (reached only when diff is negative).
        @ check for even or odd order: even order starts with a radix-4
        @ first stage, odd order with a radix-8 first stage. The choice is
        @ made with an explicit branch so that it does not depend on the
        @ flags left behind by the first-stage routine.

        TST     order,#0x00000001
        BNE     unscaledFirstStageRadix8
        BL      armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
        B       unscaledFirstStageDone

unscaledFirstStageRadix8:
        BL      armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe

unscaledFirstStageDone:
        CMP     subFFTNum,#4
        BLT     FFTEnd

        @ Loop invariant: flags hold the result of CMP subFFTNum,#4.
unscaledRadix4Loop:
        BEQ     lastStageUnscaledRadix4
        BL      armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        B       unscaledRadix4Loop

lastStageUnscaledRadix4:
        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
        B       FFTEnd

scaleEqualsOrder:
        @ Scaled path (diff >= 0): same structure as the unscaled path but
        @ using the SC16_Sfs stage routines.
        @ check for even or odd order, again with an explicit branch.

        TST     order,#0x00000001
        BNE     scaledFirstStageRadix8
        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
        B       scaledFirstStageDone

scaledFirstStageRadix8:
        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe

scaledFirstStageDone:
        CMP     subFFTNum,#4
        BLT     FFTEnd

        @ Loop invariant: flags hold the result of CMP subFFTNum,#4.
scaledRadix4Loop:
        BEQ     lastStageScaledRadix4
        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        B       scaledRadix4Loop

lastStageScaledRadix4:
        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe

FFTEnd:                                         @ Does only the scaling

        M_LDR   diff, diffOnStack
        CMP     diff,#0
        BLE     End                             @ nothing left to shift

        RSB     diff,diff,#0                    @ to use VRSHL for right shift by a variable
        VDUP    qShift,diff                     @ negative count => right shift

        @ Use parallel loads for bigger FFT size.
        CMP     subFFTSize, #8
        BLT     scaleLessFFTData

        @ 32 bytes per iteration, shifted in place as 16-bit lanes.
        @ The :256 qualifier requires pSrc to be 32-byte aligned here.
        @ NOTE(review): this loop uses VSHL (no rounding) while the scalar
        @ loop below uses VRSHL (rounding) - confirm this is intended.
scaleFFTData:
        VLD1    {qT0s, qT1s},[pSrc:256]         @ pSrc contains pDst pointer
        SUBS    subFFTSize,subFFTSize,#8
        VSHL    qT0s,qShift
        VSHL    qT1s,qShift
        VST1    {qT0s, qT1s},[pSrc:256]!
        BGT     scaleFFTData
        B       End

        @ One 32-bit word per iteration. dShift is D1, the upper half of
        @ qShift, and still holds the shift count written by VDUP; loading
        @ into dX0 (D0) only overwrites the lower half of Q0.
        @ NOTE(review): the shift here is applied to a 32-bit lane, whereas
        @ the vector loop above shifts 16-bit lanes - confirm this is
        @ correct for the data layout.
scaleLessFFTData:                               @ N = subFFTSize  ; dataptr = pDst  ; scale = diff
        VLD1    {dX0[0]},[pSrc]                 @ pSrc contains pDst pointer
        SUBS    subFFTSize,subFFTSize,#1
        VRSHL   dX0,dShift
        VST1    {dX0[0]},[pSrc]!
        BGT     scaleLessFFTData

End:
        @ Set return value
        MOV     result, #OMX_Sts_NoErr

        @ Write function tail
        M_END






    .end
302