1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This file was originally licensed as follows. It has been
11@//  relicensed with permission from the copyright holders.
12@//
13
14@//
15@// File Name:  armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
16@// OpenMAX DL: v1.0.2
17@// Last Modified Revision:   7767
18@// Last Modified Date:       Thu, 27 Sep 2007
19@//
20@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@//
22@//
23@//
24@// Description:
25@// Compute a Radix 4 FFT stage for an N point complex signal
26@//
27
28
29@// Include standard headers
30
31#include "dl/api/arm/armCOMM_s.h"
32#include "dl/api/arm/omxtypes_s.h"
33
34@// Import symbols required from other files
35@// (For example tables)
36
37
38
39
40@// Set debugging level
41@//DEBUG_ON    SETL {TRUE}
42
43
44@// Guarding implementation by the processor name
45
46
47@// Import symbols required from other files
48@// (For example tables)
49    @//IMPORT  armAAC_constTable
50
51@// Input registers (core registers that FFTSTAGE expects to be set on entry)
52
@// pSrc       read pointer; FFTSTAGE loads from it with post-increment
@// pDst       write pointer; FFTSTAGE stores through it with post-increment
@// pTwiddle   twiddle read pointer; stepped inside the loop
@// subFFTNum  overwritten with 1 by FFTSTAGE; its entry value is not read there
@// subFFTSize read on entry; FFTSTAGE leaves 4 * (entry value) in it
53#define pSrc		r0
54#define pDst		r2
55#define pTwiddle	r1
56#define subFFTNum	r6
57#define subFFTSize	r7
58
59
60
61@//Output Registers
62
63
64@// Local scratch registers (all of them are overwritten by FFTSTAGE)
65
@// outPointStep  8 * subFFTSize: byte step applied to pDst between stores
@// grpCount      loop counter, starts at 4 * subFFTSize and drops by 8 per pass
@// dstStep       16 - 3 * outPointStep: pDst step applied after the fourth store
@// grpTwStep     negative pTwiddle step applied after the last twiddle load
@// stepTwiddle   pTwiddle step after the W1 load; grows by 16 every pass
@// twStep        stepTwiddle - 16: pTwiddle step after the second W2 load
@// pTmp          shares r4 with grpCount; only used once the loop has finished
@// step16/step24 constants 16 and 24 used as post-increment amounts
66#define outPointStep	r3
67#define grpCount	r4
68#define dstStep		r5
69#define grpTwStep	r8
70#define stepTwiddle	r9
71#define twStep		r10
72#define pTmp		r4
73#define step16		r11
74#define step24		r12
75
76
77@// Neon Registers
78
@// Several aliases below name the same physical register. On NEON each
@// quad register Q(n) overlays the pair D(2n), D(2n+1), so in particular:
@//   dButterfly1* / dButterfly2*  are the same registers as dXr0..dXi3 (D0-D7)
@//   qX0 (Q0)                     overlays dXr0, dXi0
@//   qT0 and qZ0 (Q7)             overlay dZr0, dZi0 (D14, D15)
@//   qT1..qT4 and qY0..qY3        are Q8..Q11 and overlay dYr0..dYi3 (D16-D23)
@//   qT5 (Q12)                    is D24, D25, which have no D alias here
@//   qZ1..qZ3 (Q13..Q15)          overlay dZr1..dZi3 (D26-D31)
@// The instruction order in FFTSTAGE relies on this sharing.

@// Input data: raw VLD4 destinations, renamed dX* after the VUZP shuffle
79#define dButterfly1Real02	D0.S32
80#define dButterfly1Imag02	D1.S32
81#define dButterfly1Real13	D2.S32
82#define dButterfly1Imag13	D3.S32
83#define dButterfly2Real02	D4.S32
84#define dButterfly2Imag02	D5.S32
85#define dButterfly2Real13	D6.S32
86#define dButterfly2Imag13	D7.S32
87#define dXr0			D0.S32
88#define dXi0			D1.S32
89#define dXr1			D2.S32
90#define dXi1			D3.S32
91#define dXr2			D4.S32
92#define dXi2			D5.S32
93#define dXr3			D6.S32
94#define dXi3			D7.S32
95
@// Results of the first add/sub level of the butterfly
96#define dYr0			D16.S32
97#define dYi0			D17.S32
98#define dYr1			D18.S32
99#define dYi1			D19.S32
100#define dYr2			D20.S32
101#define dYi2			D21.S32
102#define dYr3			D22.S32
103#define dYi3			D23.S32
104
@// Twiddle factors (real and imaginary words) and 64-bit product accumulators
105#define dW1r			D8.S32
106#define dW1i			D9.S32
107#define dW2r			D10.S32
108#define dW2i			D11.S32
109#define dW3r			D12.S32
110#define dW3i			D13.S32
111#define qT0			Q7.S64
112#define qT1			Q8.S64
113#define qT2			Q9.S64
114#define qT3			Q10.S64
115#define qT4			Q11.S64
116#define qT5			Q12.S64
117
@// Twiddle-multiplied inputs, later reused for the values that are stored
118#define dZr0			D14.S32
119#define dZi0			D15.S32
120#define dZr1			D26.S32
121#define dZi1			D27.S32
122#define dZr2			D28.S32
123#define dZi2			D29.S32
124#define dZr3			D30.S32
125#define dZi3			D31.S32
126
@// Quad-register views of the dX/dY/dZ pairs above
127#define qX0			Q0.S32
128#define qY0			Q8.S32
129#define qY1			Q9.S32
130#define qY2			Q10.S32
131#define qY3			Q11.S32
132#define qZ0			Q7.S32
133#define qZ1			Q13.S32
134#define qZ2			Q14.S32
135#define qZ3			Q15.S32
136
137
138
@//
@// FFTSTAGE scaled, inverse, name
@//
@// Emits the body of one radix-4 FFT stage on complex 32-bit data, working
@// out of place from pSrc to pDst. Every loop pass consumes 64 bytes of
@// input (two groups of four complex points), multiplies points 1..3 of
@// each group by twiddles W1..W3, runs the 4-point butterfly and writes four
@// 16-byte blocks to pDst that lie outPointStep bytes apart.
@//
@// Macro arguments:
@//   scaled   "TRUE" builds the butterfly from halving VHADD/VHSUB, any
@//            other value builds it from VADD/VSUB.
@//   inverse  "TRUE" selects the conjugated twiddle multiply and the inverse
@//            store order, any other value selects the forward form.
@//   name     suffix appended to the loop label so that every expansion of
@//            the macro gets its own label.
@//
@// Registers read on entry: pSrc, pDst, pTwiddle, subFFTSize.
@// Registers on exit:
@//   subFFTNum  = 1
@//   subFFTSize = 4 * (entry subFFTSize)
@//   pSrc, pDst, pTwiddle are adjusted by the epilogue at the end of the
@//   macro. When the loop runs (entry subFFTSize)/2 passes, i.e. the entry
@//   subFFTSize is even, that arithmetic leaves pSrc at the entry pDst,
@//   pDst at the entry pSrc and pTwiddle at its entry value.
@// Overwrites r3-r5, r8-r12, the condition flags and D0-D31.
@//
@// The loop is software pipelined: the input data and twiddles used by a
@// pass are loaded during the previous pass (or in the prologue). The loads
@// inside the loop are unconditional, so the final pass still reads 64 bytes
@// beyond the consumed input and further twiddle entries.
@//
139        .MACRO FFTSTAGE scaled, inverse , name
140
141        @// Define stack arguments
142
143
144        @// One complex point occupies 8 bytes, so
145        @// outPointStep = 8*subFFTSize is a step of subFFTSize points in bytes
146        MOV     outPointStep,subFFTSize,LSL #3
147
148        @// Set up grpCount and the updated subFFTSize right away
149
        @// Prologue twiddle loads. The first three loads read from the same
        @// address; only the VLD2 de-interleaves (even words to dW1r, odd
        @// words to dW1i). The two halves of W2 and W3 are fetched as separate
        @// 8-byte pieces and are shuffled by the VZIP at the top of the loop.
150        VLD2    {dW1r,dW1i},[pTwiddle :128]                          @// [wi|wr]
151        MOV     step16,#16
152        LSL     grpCount,subFFTSize,#2
153
154        VLD1    dW2r,[pTwiddle :64]                             @// [wi|wr]
155        MOV     subFFTNum,#1                            @//after the last stage
156
157        VLD1    dW3r,[pTwiddle :64],step16                     @// [wi|wr]
158        MOV     stepTwiddle,#0
159
160        VLD1    dW2i,[pTwiddle :64]!                            @// [wi|wr]
161        SUB     grpTwStep,stepTwiddle,#8                    @// grpTwStep = -8 to start with
162
163        @// update subFFTSize for the next stage
164        MOV     subFFTSize,grpCount
        @// After this load pTwiddle has moved by 16 + 8 - 8 = 16 bytes in total
165        VLD1    dW3i,[pTwiddle :64],grpTwStep                           @// [wi|wr]
166        MOV     dstStep,outPointStep,LSL #1
167
        @// Preload the first 64 bytes of input. VLD4 spreads consecutive words
        @// over four D registers, which puts points 0 and 2 of a group in the
        @// first two registers and points 1 and 3 in the last two.
168        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
169        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
170        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
171        MOV     step24,#24
172
173        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
174
175
176        @// Process two groups at a time
177
178grpLoop\name :
179
        @// VZIP turns the two 8-byte pieces of W2 (and W3) into one register
        @// with the low words of both pieces and one with the high words.
        @// VUZP regroups the input so that each dX register holds the same
        @// point and component of both groups (lane 0 = first group loaded).
        @// The interleaved core instructions compute the twiddle pointer steps
        @// used by the loads further down in this pass.
180        VZIP    dW2r,dW2i
181        ADD     stepTwiddle,stepTwiddle,#16                 @// increment for the next iteration
182        VZIP    dW3r,dW3i
183        ADD     grpTwStep,stepTwiddle,#4
184        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
185        SUB     twStep,stepTwiddle,#16                      @// -16+stepTwiddle
186        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
187        MOV     grpTwStep,grpTwStep,LSL #1
188        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
189        RSB     grpTwStep,grpTwStep,#0                      @// -8-2*stepTwiddle
190
191
192        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
193
194
        @// Flags set here are consumed by the BGT at the bottom of the loop;
        @// no instruction in between writes the flags.
195        SUBS    grpCount,grpCount,#8                    @// grpCount is multiplied by 4
196
        @// qT0/qT1 = 64-bit real/imaginary parts of X1 times W1. The inverse
        @// variant uses the conjugate of W1 (signs of the wi terms flipped).
197        .ifeqs  "\inverse", "TRUE"
198            VMULL   qT0,dW1r,dXr1
199            VMLAL   qT0,dW1i,dXi1                       @// real part
200            VMULL   qT1,dW1r,dXi1
201            VMLSL   qT1,dW1i,dXr1                       @// imag part
202
203        .else
204
205            VMULL   qT0,dW1r,dXr1
206            VMLSL   qT0,dW1i,dXi1                       @// real part
207            VMULL   qT1,dW1r,dXi1
208            VMLAL   qT1,dW1i,dXr1                       @// imag part
209
210        .endif
211
        @// W1 has been consumed: fetch W1 for the next pass
212        VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle      @// [wi|wr]
213
        @// qT2/qT3 = X2 times W2, with the reloads of W2 for the next pass
        @// slotted in as soon as each half is no longer needed
214        .ifeqs  "\inverse", "TRUE"
215            VMULL   qT2,dW2r,dXr2
216            VMLAL   qT2,dW2i,dXi2                       @// real part
217            VMULL   qT3,dW2r,dXi2
218            VLD1    dW2r,[pTwiddle :64],step16                  @// [wi|wr]
219            VMLSL   qT3,dW2i,dXr2                       @// imag part
220
221        .else
222
223            VMULL   qT2,dW2r,dXr2
224            VMLSL   qT2,dW2i,dXi2                       @// real part
225            VMULL   qT3,dW2r,dXi2
226            VLD1    dW2r,[pTwiddle :64],step16                  @// [wi|wr]
227            VMLAL   qT3,dW2i,dXr2                       @// imag part
228
229        .endif
230
231
        @// Rounding narrow by 31 bits brings each 64-bit product back to
        @// 32 bits (presumably Q31 twiddles; confirm against the table).
        @// qT0 must be narrowed before the VMOV below, because qZ0 is the
        @// same register as qT0.
232        VRSHRN  dZr1,qT0,#31
233        VLD1    dW2i,[pTwiddle :64],twStep                  @// [wi|wr]
234        VRSHRN  dZi1,qT1,#31
235
236        VMOV     qZ0,qX0                                @// move qX0 so as to load for the next iteration
237        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
238
239
        @// qT4/qT5 = X3 times W3, again with the W3 reloads for the next pass
240        .ifeqs  "\inverse", "TRUE"
241            VMULL   qT4,dW3r,dXr3
242            VMLAL   qT4,dW3i,dXi3                       @// real part
243            VMULL   qT5,dW3r,dXi3
244            VLD1    dW3r,[pTwiddle :64],step24
245            VMLSL   qT5,dW3i,dXr3                       @// imag part
246
247        .else
248
249            VMULL   qT4,dW3r,dXr3
250            VMLSL   qT4,dW3i,dXi3                       @// real part
251            VMULL   qT5,dW3r,dXi3
252            VLD1    dW3r,[pTwiddle :64],step24
253            VMLAL   qT5,dW3i,dXr3                       @// imag part
254
255        .endif
256
257        VRSHRN  dZr2,qT2,#31
258        VLD1    dW3i,[pTwiddle :64],grpTwStep                           @// [wi|wr]
259        VRSHRN  dZi2,qT3,#31
260
261        VRSHRN  dZr3,qT4,#31
262        VRSHRN  dZi3,qT5,#31
        @// X2 and X3 have been consumed: load the second half of the next input
263        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
264
265
        @// 4-point butterfly on qZ0..qZ3. The scaled variant uses halving
        @// operations at both levels; the unscaled variant below is the same
        @// sequence with plain VADD/VSUB.
        @// First level:  qY0 = qZ0+qZ2, qY2 = qZ0-qZ2, qY1 = qZ1+qZ3, qY3 = qZ1-qZ3
        @// Second level: qZ0 = qY2-qY1 and qZ2 = qY2+qY1 on whole Q registers,
        @//               Z1 and Z3 from qY0 combined with qY3 with its real and
        @//               imaginary halves crossed.
        @// Store order:  Z0, Z1, Z2, Z3 (forward) or Z0, Z3, Z2, Z1 (inverse).
        @// NOTE(review): the first stored value is a difference (qY2 - qY1),
        @// which would be consistent with a twiddle table that holds negated
        @// factors; this file does not show the table, so verify against it.
266        .ifeqs "\scaled", "TRUE"
267
268            @// finish first stage of 4 point FFT
269
270            VHADD    qY0,qZ0,qZ2
271            VHSUB    qY2,qZ0,qZ2
272            VHADD    qY1,qZ1,qZ3
273            VHSUB    qY3,qZ1,qZ3
274
275
276            @// finish second stage of 4 point FFT
277
278            .ifeqs  "\inverse", "TRUE"
279
280                VHSUB    qZ0,qY2,qY1
281
282                VHADD    dZr3,dYr0,dYi3
283                VST2    {dZr0,dZi0},[pDst :128],outPointStep
284                VHSUB    dZi3,dYi0,dYr3
285
286                VHADD    qZ2,qY2,qY1
287                VST2    {dZr3,dZi3},[pDst :128],outPointStep
288
289                VHSUB    dZr1,dYr0,dYi3
290                VST2    {dZr2,dZi2},[pDst :128],outPointStep
291                VHADD    dZi1,dYi0,dYr3
292
293                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
294
295
296            .else
297
298                VHSUB    qZ0,qY2,qY1
299
300                VHSUB    dZr1,dYr0,dYi3
301                VST2    {dZr0,dZi0},[pDst :128],outPointStep
302                VHADD    dZi1,dYi0,dYr3
303
304                VHADD    qZ2,qY2,qY1
305                VST2    {dZr1,dZi1},[pDst :128],outPointStep
306
307                VHADD    dZr3,dYr0,dYi3
308                VST2    {dZr2,dZi2},[pDst :128],outPointStep
309                VHSUB    dZi3,dYi0,dYr3
310
311                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
312
313
314            .endif
315
316
317
318        .else
319
320            @// finish first stage of 4 point FFT
321
322            VADD    qY0,qZ0,qZ2
323            VSUB    qY2,qZ0,qZ2
324            VADD    qY1,qZ1,qZ3
325            VSUB    qY3,qZ1,qZ3
326
327
328            @// finish second stage of 4 point FFT
329
330            .ifeqs  "\inverse", "TRUE"
331
332                VSUB    qZ0,qY2,qY1
333
334                VADD    dZr3,dYr0,dYi3
335                VST2    {dZr0,dZi0},[pDst :128],outPointStep
336                VSUB    dZi3,dYi0,dYr3
337
338                VADD    qZ2,qY2,qY1
339                VST2    {dZr3,dZi3},[pDst :128],outPointStep
340
341                VSUB    dZr1,dYr0,dYi3
342                VST2    {dZr2,dZi2},[pDst :128],outPointStep
343                VADD    dZi1,dYi0,dYr3
344
345                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
346
347
348            .else
349
350                VSUB    qZ0,qY2,qY1
351
352                VSUB    dZr1,dYr0,dYi3
353                VST2    {dZr0,dZi0},[pDst :128],outPointStep
354                VADD    dZi1,dYi0,dYr3
355
356                VADD    qZ2,qY2,qY1
357                VST2    {dZr1,dZi1},[pDst :128],outPointStep
358
359                VADD    dZr3,dYr0,dYi3
360                VST2    {dZr2,dZi2},[pDst :128],outPointStep
361                VSUB    dZi3,dYi0,dYr3
362
363                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
364
365
366            .endif
367
368        .endif
369
        @// Loop while grpCount (decremented by the SUBS above) is still positive
370        BGT     grpLoop\name
371
372
373        @// Reset and Swap pSrc and pDst for the next stage
        @// pTmp reuses r4, which is free now that grpCount is no longer needed.
        @// Each pass moved pDst by a net 16 bytes and pSrc by 64 bytes.
374        MOV     pTmp,pDst
375        SUB     pSrc,pSrc,#64                       @// Extra increment done in final iteration of the loop
376        SUB     pDst,pSrc,outPointStep,LSL #2       @// new pDst = rewound pSrc - 4*outPointStep
377        SUB     pSrc,pTmp,outPointStep
        @// subFFTSize already holds the updated (4x) value here
378        SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1
379        SUB     pTwiddle,pTwiddle,#16               @// Extra increment done in final iteration of the loop
380
381        .endm
382
383
@// Forward transform without scaling: expands FFTSTAGE with scaled "FALSE"
@// and inverse "FALSE"; the loop label suffix is fwd.
@// NOTE(review): M_START and M_END are not defined in this file (presumably
@// they come from the included headers). FFTSTAGE overwrites r3-r12 and
@// D0-D31; confirm what the r4 argument makes M_START preserve.
384        M_START armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4
385        FFTSTAGE "FALSE","FALSE",fwd
386        M_END
387
388
@// Inverse transform without scaling: expands FFTSTAGE with scaled "FALSE"
@// and inverse "TRUE"; the loop label suffix is inv.
@// NOTE(review): M_START and M_END are not defined in this file (presumably
@// they come from the included headers). FFTSTAGE overwrites r3-r12 and
@// D0-D31; confirm what the r4 argument makes M_START preserve.
389        M_START armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4
390        FFTSTAGE "FALSE","TRUE",inv
391        M_END
392
393
@// Forward transform with scaling: expands FFTSTAGE with scaled "TRUE"
@// (halving butterfly operations) and inverse "FALSE"; the loop label
@// suffix is fwdsfs.
@// NOTE(review): M_START and M_END are not defined in this file (presumably
@// they come from the included headers). FFTSTAGE overwrites r3-r12 and
@// D0-D31; confirm what the r4 argument makes M_START preserve.
394        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
395        FFTSTAGE "TRUE","FALSE",fwdsfs
396        M_END
397
398
@// Inverse transform with scaling: expands FFTSTAGE with scaled "TRUE"
@// (halving butterfly operations) and inverse "TRUE"; the loop label
@// suffix is invsfs.
@// NOTE(review): M_START and M_END are not defined in this file (presumably
@// they come from the included headers). FFTSTAGE overwrites r3-r12 and
@// D0-D31; confirm what the r4 argument makes M_START preserve.
399        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
400        FFTSTAGE "TRUE","TRUE",invsfs
401        M_END
402
403
404	.end
405