1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
11@//  to support float instead of SC32.
12@//
13
14@//
15@// Description:
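16@// Compute the forward FFT of a real signal: an N/2-point complex FFT is run
17@// on the input and a final complex-to-real fix-up pass produces the output.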
18@//
19
20
21@// Include standard headers
22
23#include "dl/api/arm/armCOMM_s.h"
24#include "dl/api/arm/omxtypes_s.h"
25
26
27@// Import symbols required from other files
28@// (For example tables)
29
30        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
31        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
32        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
33        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
34        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
35        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
36        .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
37
38@// Set debugging level
39@//DEBUG_ON    SETL {TRUE}
40
41
42
43@// Guarding implementation by the processor name
44
45
46
47    @// Guarding implementation by the processor name
48
49@// Import symbols required from other files
50@// (For example tables)
51        .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
52        .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
53
54
55@// Input registers (read by the function body as its arguments)
56@// Note: scale (r3) is not referenced below in this file; r3 is reused as step.
57#define pSrc            r0
58#define pDst            r1
59#define pFFTSpec        r2
60#define scale           r3
61
62
63@// Output registers
64#define result          r0
65@// Local scratch registers. Several names alias the same physical register
66@// (r4: argScale/tmpOrder/pTwiddle/x0r/step1, r5: pOut/x0i/pTwiddleTmp,
67@// r6: subFFTNum/N/subFFTSizeTmp, ...), so a write through one name changes the others.
68#define argTwiddle      r1
69#define argDst          r2
70#define argScale        r4
71#define tmpOrder        r4
72#define pTwiddle        r4
73#define pOut            r5
74#define subFFTSize      r7
75#define subFFTNum       r6
76#define N               r6
77#define order           r14
78#define diff            r9
79@// Total number of radix stages required to complete the FFT
80#define count           r8
81#define x0r             r4
82#define x0i             r5
83#define diffMinusOne    r2
84#define subFFTSizeTmp   r6
85#define step            r3
86#define step1           r4
87#define twStep          r8
88#define zero            r9
89#define pTwiddleTmp     r5
90#define t0              r10
91
92@// Neon registers
93@// Note: the qT* names are bound to D registers here, and several names share a register.
94#define dX0       d0.f32
95#define dzero     d1.f32
96#define dZero     d2.f32
97#define dShift    d3.f32
98#define dX0r      d2.f32
99#define dX0i      d3.f32
100#define dX1r      d4.f32
101#define dX1i      d5.f32
102#define dT0       d6.f32
103#define dT1       d7.f32
104#define dT2       d8.f32
105#define dT3       d9.f32
106#define qT0       d10.f32
107#define qT1       d12.f32
108#define dW0r      d14.f32
109#define dW0i      d15.f32
110#define dW1r      d16.f32
111#define dW1i      d17.f32
112#define dY0r      d14.f32
113#define dY0i      d15.f32
114#define dY1r      d16.f32
115#define dY1i      d17.f32
116#define dY0rS64   d14.s64
117#define dY0iS64   d15.s64
118#define qT2       d18.f32
119#define qT3       d20.f32
120@// Aliases for the last three elements (none of this group is referenced below in this file)
121#define dX1       d3.f32
122#define dW0       d4.f32
123#define dW1       d5.f32
124#define dY0       d10.f32
125#define dY1       d11.f32
126#define dY2       d12.f32
127#define dY3       d13.f32
128@// half shares d0 with dX0; it is loaded with 0.5 just before the butterfly loop.
129#define half      d0.f32
130
131    @// omxSP_FFTFwd_RToCCS_F32_Sfs: forward FFT of a real F32 signal.
132    @// Reads r0 = pSrc, r1 = pDst, r2 = pFFTSpec; returns OMX_Sts_NoErr in r0.
133    @// M_START/M_END write the function header/tail (presumably from armCOMM_s.h).
134        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15
135
136@// Byte offsets of the fields read from the FFTSpec structure
137        .set    ARMsFFTSpec_N, 0
138        .set    ARMsFFTSpec_pBitRev, 4
139        .set    ARMsFFTSpec_pTwiddle, 8
140        .set    ARMsFFTSpec_pBuf, 12
141
142        @// (No stack arguments are read anywhere in this function.)
143
144        @// Read the FFT size N from the spec (its log2 is taken further below)
145        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
146
147        @// Read other structure parameters
148        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
149        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
150        @// pOut now holds the spec's pBuf pointer
151        @//  N=1 Treat separately
152        CMP     N,#1
153        BGT     sizeGreaterThanOne
154        VLD1    dX0[0],[pSrc]
155        MOV     zero,#0
156        VMOV    dzero[0],zero
157        VMOV    dZero[0],zero
158        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]
159        @// (the VST3 above writes three words to pDst: the input sample, 0, 0)
160        B       End
161
162
163
164sizeGreaterThanOne:
165        @// Do a N/2 point complex FFT including the scaling
166        @// NOTE(review): "scaling" may be left over from the S32 version; no scale is set up here.
167        MOV     N,N,ASR #1                          @// N/2 point complex FFT
168
169        CLZ     order,N                             @// N = 2^order
170        RSB     order,order,#31                     @// order = 31 - clz(N) (N already halved)
171        MOV     subFFTSize,#1
172        @//MOV     subFFTNum,N
173
174        CMP     order,#3
175        BGT     orderGreaterthan3                   @// order > 3
176        @// order <= 3: only the radix-2 stage routines are called below
177        CMP     order,#1
178        BGE     orderGreaterthan0                   @// order > 0; fall through for order = 0
179        VLD1    dX0,[pSrc]                          @// order = 0: copy 8 bytes pSrc -> pOut
180        VST1    dX0,[pOut]
181        MOV     pSrc,pOut
182        MOV     argDst,pDst
183        BLT     FFTEnd                              @// always taken: flags still from CMP order,#1
184
185orderGreaterthan0:
186        @// order = 2: argDst = pDst, pOut stays pBuf; order = 1 or 3: argDst = pBuf, pOut = pDst
187        CMP     order,#2
188        MOVEQ   argDst,pDst
189        MOVNE   argDst,pOut
190        @// Pass the first stage destination in r5 (pOut)
191        MOVNE   pOut,pDst
192        MOV     argTwiddle,pTwiddle                 @// argTwiddle is r1: overwrites pDst (copied above)
193
194        CMP     order,#1
195        BGT     orderGreaterthan1
196        @// order = 1
197        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
198        B       FFTEnd
199
200orderGreaterthan1:
201        CMP     order,#2
202        BGT     orderGreaterthan2
203        @// order =2
204        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
205        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
206        B       FFTEnd
207
208orderGreaterthan2:@// order =3
209        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
210        BL      armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
211        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
212
213        B       FFTEnd
214
215
216
217orderGreaterthan3:
218specialScaleCase:
219        @// (specialScaleCase is not branched to anywhere in this file)
220        @// Set input args to fft stages; bit 1 of order picks which buffer becomes argDst
221        TST     order, #2
222        MOVEQ   argDst,pDst
223        MOVNE   argDst,pOut
224        @// Pass the first stage destination in r5 (pOut)
225        MOVNE   pOut,pDst
226        MOV     argTwiddle,pTwiddle
227
228        @//check for even or odd order
229        @// NOTE: The following combination of BL's would work fine even though
230        @// the first BL would corrupt the flags. This is because the end of
231        @// the "grpZeroSetLoop" loop inside
232        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
233        @// to EQ
234
235        TST     order,#0x00000001
236        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
237        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
238        @// subFFTNum (r6) is not written in this file past this point; presumably the stage routines update it
239        CMP        subFFTNum,#4
240        BLT     FFTEnd
241        @// Run radix-4 stages until subFFTNum == 4, then run the radix-4 last stage
242
243unscaledRadix4Loop:
244        BEQ        lastStageUnscaledRadix4
245         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
246         CMP        subFFTNum,#4
247         B        unscaledRadix4Loop
248
249lastStageUnscaledRadix4:
250        BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
251        B        FFTEnd
252
253
254FFTEnd:
255finalComplexToRealFixup:
256        @// Here pSrc points at the complex FFT result Z and argDst at the output.
257        @// subFFTSize is 1 on the order = 0 path; otherwise presumably N/2 as left by the stage routines.
258        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
259        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
260        @// 1/2[2a+j0] - j [0+j2b]
261        @// (a+b, 0)
262
263        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
264        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
265        @// 1/2[2a+j0] + j [0+j2b]
266        @// (a-b, 0)
267
268        @// F(0) and F(N/2)
269        VLD2    {dX0r[0],dX0i[0]},[pSrc]!         @// lane 0 = Z(0).r, Z(0).i; pSrc += 8
270        MOV     zero,#0
271        VMOV    dX0r[1],zero
272        MOV     step,subFFTSize,LSL #3            @// step = N/2 * 8 bytes
273        VMOV    dX0i[1],zero
274        @// twStep = 3N/8 * 8 bytes pointing to W^1
275        SUB     twStep,step,subFFTSize,LSL #1
276
277        VADD    dY0r,dX0r,dX0i                    @// F(0) = ((Z0.r+Z0.i) , 0)
278        MOV     step1,subFFTSize,LSL #2           @// step1 = N/2 * 4 bytes
279        VSUB    dY0i,dX0r,dX0i                    @// F(N/2) = ((Z0.r-Z0.i) , 0)
280        SUBS    subFFTSize,subFFTSize,#2          @// sets the flags used by BLT/BEQ below
281
282        VST1    dY0r,[argDst],step                @// store F(0); argDst += step
283        ADD     pTwiddleTmp,argTwiddle,#8         @// W^2
284        VST1    dY0i,[argDst]!                    @// store F(N/2); argDst += 8
285        ADD     argTwiddle,argTwiddle,twStep      @// W^1
286
287        VDUP    dzero,zero
288        SUB     argDst,argDst,step                @// argDst = output base + 8
289        @// Nothing since the SUBS above has written the flags
290        BLT     End                               @// subFFTSize was below 2: finished
291        BEQ     lastElement                       @// subFFTSize was 2: only the last element is left
292        SUB     step,step,#24
293        SUB     step1,step1,#8                    @// (N/4-1)*8 bytes
294
295        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
296        @// Note: W^k is stored as negative values in the table
297        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
298        @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
299
300        VMOV    half, #0.5
301
302evenOddButterflyLoop:
303        @// Each pointer below is post-indexed forward by a register offset, post-incremented
304        @// by the access size, then stepped back by the same offset (net advance: access size).
305        VLD1    dW0r,[argTwiddle],step1
306        VLD1    dW1r,[argTwiddle]!
307
308        VLD2    {dX0r,dX0i},[pSrc],step
309        SUB     argTwiddle,argTwiddle,step1
310        VLD2    {dX1r,dX1i},[pSrc]!
311
312
313
314        SUB     step1,step1,#8                    @// (N/4-2)*8 bytes
315        VLD1    dW0i,[pTwiddleTmp],step1
316        VLD1    dW1i,[pTwiddleTmp]!
317        SUB     pSrc,pSrc,step                    @// net: pSrc advanced by 16 bytes this iteration
318
319        SUB     pTwiddleTmp,pTwiddleTmp,step1
320        VREV64  dX1r,dX1r
321        VREV64  dX1i,dX1i
322        SUBS    subFFTSize,subFFTSize,#4          @// loop counter; flags consumed by the BGT below
323
324
325
326        VSUB    dT2,dX0r,dX1r                     @// a-c
327        SUB     step1,step1,#8
328        VADD    dT0,dX0r,dX1r                     @// a+c
329        VSUB    dT1,dX0i,dX1i                     @// b-d
330        VADD    dT3,dX0i,dX1i                     @// b+d
331        VMUL   dT0,dT0,half[0]                    @// (a+c)/2
332        VMUL   dT1,dT1,half[0]                    @// (b-d)/2
333        VZIP    dW1r,dW1i
334        VZIP    dW0r,dW0i
335
336        @// qT0..qT3 (d10, d12, d18, d20): products of the twiddles with (a-c) and (b+d)
337        VMUL   qT0,dW1r,dT2
338        VMUL   qT1,dW1r,dT3
339        VMUL   qT2,dW0r,dT2
340        VMUL   qT3,dW0r,dT3
341
342        VMLA   qT0,dW1i,dT3
343        VMLS   qT1,dW1i,dT2
344
345        VMLS   qT2,dW0i,dT3
346        VMLA   qT3,dW0i,dT2
347
348
349        VMUL  dX1r,qT0,half[0]
350        VMUL  dX1i,qT1,half[0]
351
352        VSUB    dY1r,dT0,dX1i                     @// F(N/2 -1)
353        VADD    dY1i,dT1,dX1r
354        VNEG    dY1i,dY1i
355
356        VREV64  dY1r,dY1r
357        VREV64  dY1i,dY1i
358
359
360        VMUL  dX0r,qT2,half[0]
361        VMUL  dX0i,qT3,half[0]
362
363        VSUB    dY0r,dT0,dX0i                     @// F(1)
364        VADD    dY0i,dT1,dX0r
365
366
367        VST2    {dY0r,dY0i},[argDst],step
368        VST2    {dY1r,dY1i},[argDst]!
369        SUB     argDst,argDst,step                @// net: argDst advanced by 16 bytes this iteration
370        SUB     step,step,#32                     @// (N/2-4)*8 bytes
371
372        @// Flags here are still those of SUBS subFFTSize,#4; nothing in between sets them
373        BGT     evenOddButterflyLoop
374
375        @// set both the ptrs to the last element
376        SUB     pSrc,pSrc,#8
377        SUB     argDst,argDst,#8
378
379
380
381        @// Last element can be expanded as follows
382        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
383        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
384        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
385        @// (a-bc, -bd)
386        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
387
388lastElement:
389        VLD1    dX0r,[pSrc]                       @// load (a, b)
390
391        VST1    dX0r[0],[argDst]!                 @// store a
392        VNEG    dX0r,dX0r
393        VST1    dX0r[1],[argDst]!                 @// store -b
394
395End:
396        @// Set return value
397        MOV     result, #OMX_Sts_NoErr
398
399        @// Write function tail
400        M_END
401
402        .end
403