1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
11@//  to support float instead of SC32.
12@//
13
14@//
15@// Description:
16@// Compute FFT for a real signal
17@//
18@//
19
20
21@// Include standard headers
22
23#include "dl/api/arm/armCOMM_s.h"
24#include "dl/api/arm/omxtypes_s.h"
25
26@//        M_VARIANTS ARM1136JS
27
28@// Import symbols required from other files
29@// (For example tables)
30
31        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
32        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
33        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
34        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
35
36@// Set debugging level
37@//DEBUG_ON    SETL {TRUE}
38
39
40
41@// Guarding implementation by the processor name
42
43@//    IF  ARM1136JS
44
45@//Input Registers
46@// NOTE: every name in this section is a preprocessor alias; several names share one physical register.
47#define pSrc            r0
48#define pDst            r1
49#define pFFTSpec        r2
50@// r2 (pFFTSpec) is also named argDst, and r1 (pDst) is also named argTwiddle, further down.
51
52@// Output registers
53#define result          r0
54
55@//Local Scratch Registers
56@// NOTE(review): the four N=1 names below do not appear anywhere else in this file - confirm before removing.
57@// N=1 case
58#define scaleMinusOne   r2
59#define rnd             r2
60#define zero            r8
61#define Zero            r9
62@// Names for the pointers and counters used around the calls to the external FFT stage routines.
63@// N and subFFTNum are the same register (r6); argScale and pTwiddle are the same register (r4).
64#define argTwiddle      r1
65#define argDst          r2
66#define argScale        r4
67#define pTwiddle        r4
68#define pOut            r5
69#define subFFTSize      r7
70#define subFFTNum       r6
71#define N               r6
72#define order           r14
73#define diff            r9
74#define count           r8
75#define diffMinusOne    r10
76#define round           r3
77@// step1 shares r6 with N/subFFTNum; twStep and t0 share r12; pTwiddleTmp, t1 and order share r14.
78#define step            r3
79#define step1           r6
80#define twStep          r12
81#define pTwiddleTmp     r14
82#define t0              r12
83#define t1              r14              /*@// pTwiddleTmp*/
84#define t2              r0
85#define t3              r1               /*@// pSrc,argTwiddle*/
86#define t4              r6
87#define t5              r7               /*@// step1,subFFTSize*/
88@// Single-precision VFP registers: y0r, w1r and y1r all name s2; y0i, w1i and y1i all name s3.
89#define x0r     s0
90#define x0i     s1
91#define y0r     s2
92#define y0i     s3
93#define x1r     s4
94#define x1i     s5
95#define w1r     s2
96#define w1i     s3
97#define w0r     s6
98#define w0i     s7
99#define y1r     s2              /*@// w1r,w1i*/
100#define y1i     s3
101#define st0     s8
102#define st1     s9
103#define st2     s10
104#define st3     s11
105#define st4     s12
106#define st5     s13
107#define half    s15
108
109
110
111
112    @// Allocate stack memory required by the function
113    @// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp: forward FFT of a real signal, computed as an
114    @// N/2-point complex FFT followed by a complex-to-real fixup pass. In: r0 = pSrc,
115    @// r1 = pDst, r2 = pFFTSpec. Out: r0 = OMX_Sts_NoErr. N <= 1 returns without output.
116    @// Write function header
117        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11
118        @// NOTE(review): r11 is presumably the highest core register M_START saves - confirm in armCOMM_s.h
119@ Structure offsets for FFTSpec
120        .set    ARMsFFTSpec_N, 0
121        .set    ARMsFFTSpec_pBitRev, 4
122        .set    ARMsFFTSpec_pTwiddle, 8
123        .set    ARMsFFTSpec_pBuf, 12
124
125        @// Define stack arguments
126
127        @// Setup half value: 0x3f000000 is the single-precision bit pattern of 0.5
128        movw    N, #0                   @// Use N as a temp.
129        movt    N, #0x3f00
130        vmov.f32 half, N
131
132        @// Read the size from structure and take log
133        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
134
135        @// Read other structure parameters
136        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
137        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
138
139        @//  N<=1: treat separately
140        CMP     N,#1
141        BGT     sizeGreaterThanOne
142        // N<=1 is not supported: return without touching pDst
143        @// Set return value
144        MOV     result, #OMX_Sts_NoErr
145        B       FunctionEnd
146
147sizeGreaterThanOne:
148        @// Do a N/2 point complex FFT
149
150        MOV     N,N,ASR #1              @// N/2 point complex FFT
151        CLZ     order,N                 @// N = 2^order
152        RSB     order,order,#31         @// order = 31 - clz(N)
153        MOV     subFFTSize,#1
154        @//MOV     subFFTNum,N
155        @// (no move needed: subFFTNum and N are both aliases of r6)
156
157        CMP     order,#1
158        BGT     orderGreaterthan1       @// order > 1
159        vldmlt.f32 pSrc, {x0r, x0i}     @// order == 0: copy the single complex point to pOut
160        vstmlt.f32 pOut, {x0r, x0i}
161        MOVLT   pSrc,pOut
162        MOVLT   argDst,pDst
163        BLT     FFTEnd
164        @// order == 1: one radix-2 first stage, then the fixup
165        MOV     argDst,pOut             @// Set input args to fft stages
166        MOV     pOut,pDst               @// Set input args to fft stages
167        MOV     argTwiddle,pTwiddle
168
169        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
170        B     finalComplexToRealFixup
171
172orderGreaterthan1:
173        @// NOTE(review): bit 1 of order presumably picks the buffer so the fixup ends up writing pDst - confirm against the stage routines
174        TST     order, #2               @// Set input args to fft stages
175        MOVEQ   argDst,pDst
176        MOVNE   argDst,pOut
177        MOVNE   pOut,pDst               @// Pass the first stage dest in RN5
178        MOV     argTwiddle,pTwiddle
179
180        @//check for even or odd order
181
182        @// NOTE: The following combination of BL's would work fine
183        @// even though the first BL would corrupt the flags. This is
184        @// because the end of the "grpZeroSetLoop" loop inside
185        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
186        @// the Z flag to EQ
187
188        TST     order,#0x00000001
189        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
190        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
191        @// Keep calling the radix-4 stage until subFFTNum reaches 1
192unscaledRadix4Loop:
193        CMP        subFFTNum,#1
194         BEQ        FFTEnd
195         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
196         B        unscaledRadix4Loop
197
198FFTEnd:
199finalComplexToRealFixup:
200        @// The fixup reads the complex FFT result through pSrc and writes F(0)..F(N/2) through argDst
201        @// step = N/2 * 8 bytes
202        MOV     step,subFFTSize,LSL #3
203        @// twStep = 3N/8 * 8 bytes pointing to W^1
204        SUB     twStep,step,subFFTSize,LSL #1
205        @// step1 = N/4 * 8 = N/2*4 bytes
206        MOV     step1,subFFTSize,LSL #2
207        @// (N/4-1)*8 bytes
208        SUB     step1,step1,#8
209
210        @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
211        @// 1/2 [(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
212        @// 1/2 [2a+j0] - j [0+j2b]
213        @// (a+b, 0)
214
215        @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
216        @// 1/2 [(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
217        @// 1/2 [2a+j0] + j [0+j2b]
218        @// (a-b, 0)
219
220        @// F(0) and F(N/2)
221        vldm.f32 pSrc!, {x0r, x0i}
222        vadd.f32 y0r,x0r,x0i            @// F(0) = (Z0.r+Z0.i , 0)
223        vsub.f32 x0r,x0r,x0i            @// F(N/2) = (Z0.r-Z0.i , 0)
224        MOV     pTwiddleTmp,#0          @// integer 0 is the bit pattern of +0.0; r14 is not live here
225        vmov.f32 y0i, pTwiddleTmp       @// y0i = 0.0 (x - x would be NaN if x held NaN or Inf)
226        vmov.f32 x0i, pTwiddleTmp       @// x0i = 0.0
227        add      argDst, step
228        vstm.f32 argDst, {x0r, x0i}     @// {x0r,x0i}->[argDst, step]
229        sub      argDst, step
230        vstm.f32 argDst!, {y0r, y0i}
231
232        SUBS    subFFTSize,subFFTSize,#2
233        @// The flags from SUBS are consumed by BLT/BEQ below; the two ADDs do not set flags
234        ADD     pTwiddleTmp,argTwiddle,#8       @// W^2
235        ADD     argTwiddle,argTwiddle,twStep    @// W^1
236        BLT     End
237        BEQ     lastElement
238
239
240        @// F(k) = 1/2 [Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
241        @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
242        @// both of them require Z(1) and Z(N/2-1)
243
244        ASR     subFFTSize,subFFTSize,#1 @// loop count = (N/2 - 2) / 2 pairs
245evenOddButterflyLoop:
246
247        SUB     step,step,#16           @// (N/2-2)*8 bytes
248        @// x1 = element at [pSrc + step], x0 = element at [pSrc] (pSrc then advances by 8)
249        add      pSrc, step
250        vldm.f32 pSrc, {x1r, x1i}       @// {x1r, x1i} = [pSrc, step]
251        sub      pSrc, step
252        vldm.f32 pSrc!, {x0r, x0i}
253        add      argTwiddle, step1
254        vldm.f32 argTwiddle, {w1r, w1i}  @// {w1r, w1i} = [argTwiddle, step1]
255        sub      argTwiddle, step1
256        vldm.f32 argTwiddle!, {w0r, w0i} @// {w0r, w0i} = [argTwiddle], #8
257
258        SUB     step1,step1,#8
259        SUBS    subFFTSize,subFFTSize,#1 @// flags are consumed by BGT at the bottom of the loop
260        @// With x0 = a+jb and x1 = c+jd:
261        vsub.f32 st2,x0r,x1r            @// a-c
262        vadd.f32 st3,x0i,x1i            @// b+d
263        vadd.f32 st0,x0r,x1r            @// a+c
264        vsub.f32 st1,x0i,x1i            @// b-d
265
266        vmul.f32 x1r,w1r,st2
267        vmul.f32 x1i,w1r,st3
268        vmla.f32 x1r,w1i,st3            @// x1r = w1r*st2 + w1i*st3
269        @//RSB     x1r,x1r,#0
270        vmls.f32 x1i,w1i,st2            @// x1i = w1r*st3 - w1i*st2
271        @// y1 overwrites w1 (same registers s2/s3); it is the value stored at [argDst + step]
272        vsub.f32 y1r, st0, x1i
273        vadd.f32 y1i, x1r, st1
274        vneg.f32 y1i, y1i
275
276        vmul.f32  x0r,w0r,st2
277        vmul.f32  x0i,w0r,st3
278        vmls.f32  x0r,w0i,st3           @// x0r = w0r*st2 - w0i*st3
279        vmla.f32  x0i,w0i,st2           @// x0i = w0r*st3 + w0i*st2
280
281        vsub.f32   st4,st0,x0i          @// F(1)
282        vadd.f32   st5,x0r,st1
283
284        @// Apply the 1/2 factor to both outputs
285        vmul.f32 y1r, half
286        vmul.f32 y1i, half
287        vmul.f32 st4, half
288        vmul.f32 st5, half
289
290        add      argDst, step
291        vstm.f32 argDst, {y1r, y1i}     @// {y1r,y1i} -> [argDst,step]
292        sub      argDst, step
293        vstm.f32 argDst!, {st4, st5}
294
295
296        MOV     t0,argTwiddle           @// swap ptr for even and odd twiddles
297        MOV     argTwiddle,pTwiddleTmp
298        MOV     pTwiddleTmp,t0
299        @// Branch on the flags left by SUBS subFFTSize earlier in this iteration
300        BGT     evenOddButterflyLoop
301
302        @// Last element can be expanded as follows
303        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
304        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
305        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
306        @// (a-bc, -bd)
307
308lastElement:
309        vldm.f32 pSrc, {x0r, x0i}
310        vneg.f32 x0i, x0i               @// negate the imaginary part
311        vstm.f32 argDst, {x0r, x0i}
312
313End:
314        @// Set return value
315        MOV     result, #OMX_Sts_NoErr
316
317FunctionEnd:
318        @// Write function tail
319        M_END
320
321@//    ENDIF                                           @//ARM1136JS
322
323
324    @// Guarding implementation by the processor name
325
326
327
328    .end
329