1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
11@//  to support float instead of SC32.
12@//
13
14@//
15@// Description:
@// Compute the first-stage radix-8 FFT butterflies for an N-point complex signal
17@//
18@//
19
20
21@// Include standard headers
22
23#include "dl/api/arm/armCOMM_s.h"
24#include "dl/api/arm/omxtypes_s.h"
25
26@//        M_VARIANTS ARM1136JS
27
28@// Import symbols required from other files
29@// (For example tables)
30
31
32@// Set debugging level
33@//DEBUG_ON    SETL {TRUE}
34
35
36
37@// Guarding implementation by the processor name
38
39@//    IF  ARM1136JS
40
@// Input registers (listed by register number).
@// pTwiddle is named here but never referenced by the code in this file.

#define pSrc            r0
#define pTwiddle        r1
#define pDst            r2
#define pPingPongBuf    r5
#define subFFTNum       r6
#define subFFTSize      r7


@// Output registers: none declared here


@// Local scratch registers (core)

#define step1           r3
#define t0              r4
#define step2           r8
#define pointStep       r12
#define grpSize         r14
#define setCount        r14             /* shares r14 with grpSize */

@// Real and imaginary parts (single-precision VFP registers)

#define x0r             s0
#define x0i             s1
#define x1r             s2
#define x1i             s3
#define x2r             s4
#define x2i             s5
#define t1r             s4              /* t1r/t1i share registers with x2r/x2i */
#define t1i             s5
#define x3r             s6
#define x3i             s7
#define t3r             s8              /* temporaries for the x3r/x3i pair */
#define t3i             s9
#define sr              s10
#define si              s11
#define roothalf        s12
80
81@// Define macros to load/store two float regs from/to the stack.
82        .macro M_VSTM r0, r1, p
83        .set    _Offset, _Workspace + \p\()_F
84        add     t0, sp, #_Offset
85        vstm.f32 t0, {\r0, \r1}
86        .endm
87
88        .macro M_VLDM r0, r1, p
89        .set    _Offset, _Workspace + \p\()_F
90        add     t0, sp, #_Offset
91        vldm.f32 t0, {\r0, \r1}
92        .endm
93
94@// Define constants
95
96        .macro FFTSTAGE scaled, inverse , name
97
98        @// Define stack arguments
99
100
101        @// Update grpCount and grpSize rightaway inorder to reuse
102        @// pSubFFTSize and pSubFFTNum regs
103
104        mov     subFFTSize, #8
105        lsr     grpSize, subFFTNum, #3
106        mov     subFFTNum, grpSize
107
108
109        @// pT0+1 increments pT0 by 8 bytes
110        @// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
111        @// Note: setCount = grpSize/8 (reuse the updated grpSize for
112        @// setCount)
113        MOV     pointStep,grpSize,LSL #3
114
115
116        @// Calculate the step of input data for the next set
117        MOV     step1,grpSize,LSL #4
118        MOV     step2,pointStep,LSL #3
119        SUB     step2,step2,pointStep           @// step2 = 7*pointStep
120
121
122        @// grp = 0 a special case since all the twiddle factors are 1
123        @// Loop on the sets
124
125        movw    t0,#0x04f3
126        movt    t0,#0x3f35
127        vmov.f32 roothalf, t0                   @// roothalf = sqrt(1/2)
128
129grpZeroSetLoop\name:
130
131        vldm.f32 pSrc, {x0r, x0i}               @// x0
132        add      pSrc, step1
133        vldm.f32 pSrc, {x1r, x1i}               @// x2
134        add      pSrc, step1
135        vldm.f32 pSrc, {x2r, x2i}               @// x4
136        add      pSrc, step1
137        vldm.f32 pSrc, {x3r, x3i}               @// x6
138        add      pSrc, step1
139
140        SUB     pSrc, pSrc, step2
141
142        @// finish first stage of 8 point FFT and save on stack
143
144        vadd.f32     x0r,x0r,x2r                @// u0
145        vadd.f32     x0i,x0i,x2i
146
147        vadd.f32     sr, x2r, x2r
148        vadd.f32     si, x2i, x2i
149        vsub.f32     x2r,x0r,sr                 @// u1
150        vsub.f32     x2i,x0i,si
151
152        M_VSTM   x0r,x0i, pU0
153        M_VSTM   x2r,x2i, pU1
154
155        vadd.f32     x1r,x1r,x3r                @// u4
156        vadd.f32     x1i,x1i,x3i
157
158        vadd.f32     sr, x3r, x3r
159        vadd.f32     si, x3i, x3i
160        vsub.f32     x3r,x1r,sr                 @// u5
161        vsub.f32     x3i,x1i,si
162
163        M_VSTM   x1r,x1i, pU4
164        M_VSTM   x3r,x3i, pU5
165
166
167        vldm    pSrc, {x0r, x0i}                @// x1
168        add     pSrc, step1
169        vldm    pSrc, {x1r, x1i}                @// x3
170        add     pSrc, step1
171        vldm    pSrc, {x2r, x2i}                @// x5
172        add     pSrc, step1
173        vldm    pSrc, {x3r, x3i}                @// x7
174        add     pSrc, #8
175
176        SUB     pSrc, pSrc, step2
177
178        vadd.f32     x0r,x0r,x2r                @// u2
179        vadd.f32     x0i,x0i,x2i
180
181        vadd.f32         sr, x2r, x2r
182        vadd.f32         si, x2i, x2i
183        vsub.f32     x2r,x0r,sr                 @// u3
184        vsub.f32     x2i,x0i,si
185
186        M_VSTM   x2r,x2i, pU3
187
188        vadd.f32     x1r,x1r,x3r                @// u6
189        vadd.f32     x1i,x1i,x3i
190
191        vadd.f32         sr, x3r, x3r
192        vadd.f32         si, x3i, x3i
193        vsub.f32     x3r,x1r,sr                 @// u7
194        vsub.f32     x3i,x1i,si
195
196        @// finish second and third stage of 8 point FFT
197
198        M_VSTM  x3r,x3i, pU7
199        M_VLDM  x2r,x2i, pU0
200
201        @// Decrement setcount
202        SUBS    setCount,setCount,#1
203        M_VLDM  x3r,x3i, pU4
204
205        vadd.f32     x0r,x0r,x1r                @// v4
206        vadd.f32     x0i,x0i,x1i
207
208        vadd.f32     sr, x1r, x1r
209        vadd.f32     si, x1i, x1i
210        vsub.f32     x1r,x0r,sr                 @// v6
211        vsub.f32     x1i,x0i,si
212
213        vadd.f32     x2r,x2r,x3r                @// v0
214        vadd.f32     x2i,x2i,x3i
215
216        vadd.f32     sr, x3r, x3r
217        vadd.f32     si, x3i, x3i
218        vsub.f32     x3r,x2r,sr                 @// v2
219        vsub.f32     x3i,x2i,si
220
221
222
223        vadd.f32     x2r,x2r,x0r                @// y0
224        vadd.f32     x2i,x2i,x0i
225
226        vadd.f32     sr, x0r, x0r
227        vadd.f32     si, x0i, x0i
228        vsub.f32     x0r,x2r,sr                 @// y4
229        vsub.f32     x0i,x2i,si
230
231        vstm    pDst, {x2r, x2i}                @// store y0
232        add     pDst, step1
233
234        vadd.f32     x3r,x3r,x1i                @// y6
235        vsub.f32     x3i,x3i,x1r
236
237        vadd.f32     sr, x1r, x1r
238        vadd.f32     si, x1i, x1i
239        vsub.f32     t1r,x3r,si                 @// t1r=x2r reg;t1i=x2i reg
240        vadd.f32     t1i,x3i,sr                 @// y2
241
242        .ifeqs  "\inverse", "TRUE"
243            vstm        pDst, {t1r, t1i}        @// store y2
244            add pDst, step1
245            vstm        pDst, {x0r, x0i}        @// store y4
246            add pDst, step1
247            vstm        pDst, {x3r, x3i}        @// store y6
248            add pDst, step1
249        .else
250            vstm        pDst, {x3r, x3i}        @// store y2
251            add pDst, step1
252            vstm        pDst, {x0r, x0i}        @// store y4
253            add pDst, step1
254            vstm        pDst, {t1r, t1i}        @// store y6
255            add pDst, step1
256        .endif
257
258        SUB     pDst, pDst, step2               @// set pDst to y1
259
260
261        M_VLDM  x0r,x0i,pU1                     @// Load u1,u3,u5,u7
262        M_VLDM  x1r,x1i,pU5
263        M_VLDM  x3r,x3i,pU7
264
265        vsub.f32     x0r,x0r,x1i                @// v1
266        vadd.f32     x0i,x0i,x1r
267        vadd.f32     sr, x1r, x1r
268        vadd.f32     si, x1i, x1i
269        vadd.f32     t1r,x0r,si                 @// t1r=x2r reg;t1i=x2i reg
270        vsub.f32     t1i,x0i,sr                 @// v3
271
272        M_VLDM  x1r,x1i,pU3
273
274        vsub.f32     x1r,x1r,x3i                @// v5
275        vadd.f32     x1i,x1i,x3r
276
277        vadd.f32     sr, x3r, x3r
278        vadd.f32     si, x3i, x3i
279        vadd.f32     t3r,x1r,si                 @// t3i = x3i
280        vsub.f32     t3i,x1i,sr                 @// v7
281
282        @// store v5  as (v5.r - v5.i,v5.r + v5.i)
283        @// store v7  as (v7.i + v7.r,v7.i - v7.r)
284
285        vadd.f32     x3r,t3i,t3r                @// v7
286        vsub.f32     x3i,t3i,t3r
287
288        vsub.f32     x1r,x1r,x1i                @// v5
289        vadd.f32     x1i, x1i
290        vadd.f32     x1i,x1r,x1i
291
292        vmul.f32  x3r, x3r, roothalf            @// (v7.i + v7.r)*(1/sqrt(2))
293        vmul.f32  x3i, x3i, roothalf            @// (v7.i - v7.r)*(1/sqrt(2))
294        vmul.f32  x1r, x1r, roothalf            @// (v5.r - v5.i)*(1/sqrt(2))
295        vmul.f32  x1i, x1i, roothalf            @// (v5.r + v5.i)*(1/sqrt(2))
296
297        vadd.f32     x2r,x2r,x3r                @// y7
298        vadd.f32     x2i,x2i,x3i
299
300        vadd.f32     sr, x3r, x3r
301        vadd.f32     si, x3i, x3i
302        vsub.f32     x3r,x2r,sr                 @// y3
303        vsub.f32     x3i,x2i,si
304
305
306        vsub.f32     x0r,x0r,x1r                @// y5
307        vsub.f32     x0i,x0i,x1i
308
309        vadd.f32     sr, x1r, x1r
310        vadd.f32     si, x1i, x1i
311        vadd.f32     x1r,x0r,sr                 @// y1
312        vadd.f32     x1i,x0i,si
313
314        .ifeqs  "\inverse", "TRUE"
315            vstm    pDst, {x1r, x1i}            @// store y1
316            add pDst, step1
317            vstm    pDst, {x3r, x3i}            @// store y3
318            add pDst, step1
319            vstm    pDst, {x0r, x0i}            @// store y5
320            add pDst, step1
321            vstm    pDst, {x2r, x2i}            @// store y7
322            add pDst, #8
323        .else
324            vstm    pDst, {x2r, x2i}            @// store y1
325            add pDst, step1
326            vstm    pDst, {x0r, x0i}            @// store y3
327            add pDst, step1
328            vstm    pDst, {x3r, x3i}            @// store y5
329            add pDst, step1
330            vstm    pDst, {x1r, x1i}            @// store y7
331            add pDst, #8
332        .endif
333
334        SUB     pDst, pDst, step2               @// update pDst for the next set
335
336
337        BGT     grpZeroSetLoop\name
338
339
340        @// reset pSrc to pDst for the next stage
341        SUB     pSrc,pDst,pointStep             @// pDst -= 2*grpSize
342        mov     pDst, pPingPongBuf
343
344
345        .endm
346
347
348
349
350
351        @// Allocate stack memory required by the function
352
353        @// Ensure 8 byte alignment to use M_VLDM
354        M_ALLOC8    pU0, 8
355        M_ALLOC8    pU1, 8
356        M_ALLOC8    pU3, 8
357        M_ALLOC8    pU4, 8
358        M_ALLOC8    pU5, 8
359        M_ALLOC8    pU7, 8
360
361        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
362            FFTSTAGE "FALSE","FALSE",FWD
363        M_END
364
365        @// Allocate stack memory required by the function
366
367        @// Ensure 8 byte alignment to use M_VLDM
368        M_ALLOC8    pU0, 8
369        M_ALLOC8    pU1, 8
370        M_ALLOC8    pU3, 8
371        M_ALLOC8    pU4, 8
372        M_ALLOC8    pU5, 8
373        M_ALLOC8    pU7, 8
374
375        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp,r4
376            FFTSTAGE "FALSE","TRUE",INV
377        M_END
378
379@//    ENDIF        @//ARM1136JS
380
381
382
383@// Guarding implementation by the processor name
384
385
386    .end
387