1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This file was originally licensed as follows. It has been
11@//  relicensed with permission from the copyright holders.
12
13@//
14@//
15@// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
16@// OpenMAX DL: v1.0.2
17@// Last Modified Revision:   7765
18@// Last Modified Date:       Thu, 27 Sep 2007
19@//
20@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@//
22@//
23@//
24@// Description:
25@// Compute a radix-4 FFT stage for an N-point complex signal
26@//
27@//
28
29
30@// Include standard headers
31
32#include "dl/api/arm/armCOMM_s.h"
33#include "dl/api/arm/omxtypes_s.h"
34
35
36@// Import symbols required from other files
37@// (For example tables)
38
39
40
41
42@// Set debugging level
43@//DEBUG_ON    SETL {TRUE}
44
45
46@// Guarding implementation by the processor name
47
48
49
50
51
52
53@// Guarding implementation by the processor name
54
55
56@// Import symbols required from other files
57@// (For example tables)
58    @//IMPORT  armAAC_constTable
59
60@//Input Registers
@// Core-register names for the operands of FFTSTAGE. The macro reads pSrc,
@// pDst, pTwiddle and subFFTSize. Before it finishes it has also written
@// pSrc, pDst, subFFTSize and subFFTNum; pTwiddle itself is only copied,
@// never modified, and subFFTNum is written but never read.
61
62#define pSrc                            r0
63#define pDst                            r2
64#define pTwiddle                        r1
65#define subFFTNum                       r6
66#define subFFTSize                      r7
67
68
69
70@//Output Registers
71
72
73@//Local Scratch Registers
@// outPointStep : byte stride between the four output rows written per pass
@// grpCount     : loop counter, initialised to 4*subFFTSize and decremented
@//                by 16 on every pass
@// dstStep      : post-increment used on the fourth store (16 - 3*outPointStep)
@// pw1/pw2/pw3  : cursors into the twiddle table for w^1, w^2 and w^3
@// pTmp         : shares r4 with grpCount; it is only used for the pointer
@//                exchange after the loop, once grpCount is no longer needed
74
75#define outPointStep                    r3
76#define grpCount                        r4
77#define dstStep                         r5
78#define pw1                             r8
79#define pw2                             r9
80#define pw3                             r10
81#define pTmp                            r4
82
83
84@// Neon Registers
@// Several names below deliberately refer to the same physical register;
@// the overlaps are listed next to each group.
85
@// Input data as loaded by VLD4 (D0-D7). The dButterfly* names describe the
@// registers right after the loads, the dX* names the same registers after
@// the VUZP shuffles have sorted them into points 0..3.
86#define dButterfly1Real02               D0.S16
87#define dButterfly1Imag02               D1.S16
88#define dButterfly1Real13               D2.S16
89#define dButterfly1Imag13               D3.S16
90#define dButterfly2Real02               D4.S16
91#define dButterfly2Imag02               D5.S16
92#define dButterfly2Real13               D6.S16
93#define dButterfly2Imag13               D7.S16
94#define dXr0                            D0.S16
95#define dXi0                            D1.S16
96#define dXr1                            D2.S16
97#define dXi1                            D3.S16
98#define dXr2                            D4.S16
99#define dXi2                            D5.S16
100#define dXr3                            D6.S16
101#define dXi3                            D7.S16
102
@// Twiddle registers (D8-D13), first as 32-bit lanes (one packed re/im pair
@// per lane, used by the VLD3 loads), then as 16-bit lanes.
103#define dW1rS32                         D8.S32
104#define dW1iS32                         D9.S32
105#define dW2rS32                         D10.S32
106#define dW2iS32                         D11.S32
107#define dW3rS32                         D12.S32
108#define dW3iS32                         D13.S32
109
110#define dW1r                            D8.S16
111#define dW1i                            D9.S16
112#define dW2r                            D10.S16
113#define dW2i                            D11.S16
114#define dW3r                            D12.S16
115#define dW3i                            D13.S16
116
@// Throw-away destinations for the de-interleaving twiddle loads.
@// dTmp0/dTmp1 are the same registers as dW3r/dW3i (D12/D13), and
@// dTmp2S32/dTmp3S32 are the same registers as dYr3/dYi3 (D14/D15).
117#define dTmp0                           D12.S16
118#define dTmp1                           D13.S16
119#define dTmp1S32                        D13.S32
120#define dTmp2S32                        D14.S32
121#define dTmp3S32                        D15.S32
122
@// Intermediate butterfly values. Note the non-sequential mapping:
@// Y0 -> Q9, Y1 -> Q8, Y2 -> Q10, Y3 -> Q7.
123#define dYr0                            D18.S16
124#define dYi0                            D19.S16
125#define dYr1                            D16.S16
126#define dYi1                            D17.S16
127#define dYr2                            D20.S16
128#define dYi2                            D21.S16
129#define dYr3                            D14.S16
130#define dYi3                            D15.S16
131#define qY0                             Q9.S16
132#define qY1                             Q8.S16
133#define qY2                             Q10.S16
134#define qY3                             Q7.S16
135
136#define qX0                             Q0.S16
137#define qX1                             Q1.S16
138#define qX2                             Q2.S16
139#define qX3                             Q3.S16
140
@// 32-bit product accumulators. They occupy the same Q registers as the Y
@// values (qT0=qY0, qT1=qY2, qT2=qY3, qT3=qY1), so every product has to be
@// narrowed into a Z register before the corresponding Y value is written.
141#define qT0                             Q9.S32
142#define qT1                             Q10.S32
143#define qT2                             Q7.S32
144#define qT3                             Q8.S32
145
@// Twiddled inputs (after narrowing) and, later, the final outputs (D22-D29).
146#define dZr0                            D22.S16
147#define dZi0                            D23.S16
148#define dZr1                            D24.S16
149#define dZi1                            D25.S16
150#define dZr2                            D26.S16
151#define dZi2                            D27.S16
152#define dZr3                            D28.S16
153#define dZi3                            D29.S16
154
155#define qZ0                             Q11.S16
156#define qZ1                             Q12.S16
157#define qZ2                             Q13.S16
158#define qZ3                             Q14.S16
159
160
161        .macro FFTSTAGE scaled, inverse , name
        @// FFTSTAGE: one radix-4 pass over complex S16 data, handling four
        @// groups (16 input points, 64 bytes) per loop iteration.
        @//   scaled  - TRUE selects the halving VHADD/VHSUB butterfly, any
        @//             other value the plain VADD/VSUB butterfly.
        @//   inverse - TRUE multiplies by the conjugated twiddles and stores
        @//             output rows 1 and 3 in swapped order.
        @//   name    - suffix for the loop label, so each expansion is unique.
        @// In : pSrc, pDst, pTwiddle, subFFTSize.
        @// Out: subFFTNum = 1, subFFTSize = 4 * its entry value, pSrc and pDst
        @//      rewritten for the caller (see the end of the macro).
        @// Clobbers outPointStep, grpCount/pTmp, dstStep, pw1-pw3, the flags
        @// and D0-D29; nothing is saved or restored inside the macro.
162
163        @// Define stack arguments (none: the macro body never touches the stack)
164
        @// All three twiddle cursors start at the table base; inside the loop
        @// they advance by 16, 32 and 48 bytes per pass respectively.
165        MOV     pw2,pTwiddle
166
167        MOV     pw3,pTwiddle
168        MOV     pw1,pTwiddle
169        @// One complex S16 point occupies 4 bytes, so
170        @// outPointStep = 4*subFFTSize bytes = subFFTSize points between output rows
171        MOV     outPointStep,subFFTSize,LSL #2
172
173        MOV     subFFTNum,#1                            @//after the last stage
174        LSL     grpCount,subFFTSize,#2                  @// grpCount = 4*subFFTSize; 16 is subtracted per pass
175
176
177        @// Update grpCount and subFFTSize right away
178
179        @// update subFFTSize for the next stage
180        MOV     subFFTSize,grpCount
181        MOV     dstStep,outPointStep,LSL #1                 @// dstStep = 2*outPointStep
182
183        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
184        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
185
186        @// Process 4 groups at a time
187
188grpLoop\name:
        @// Each VLD4 de-interleaves 8 complex points (2 groups) four ways, so a
        @// register holds the same component of points 0 and 2 (or 1 and 3) of
        @// two groups. The :256 qualifier requires pSrc to be 32-byte aligned.
189        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
190        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
191
192        @// Load the second twiddle for 4 groups : w^2
193        @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
        @// The four-way de-interleave leaves the even table entries in
        @// dW2r/dW2i; the odd entries land in dTmp0/dTmp1 and are discarded
        @// (those registers are overwritten by the w^3 loads below).
194        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
195
        @// The VUZP pairs split the two-group registers so that each dX
        @// register ends up with one point of all four groups.
196        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
197
198        @// Load the third twiddle for 4 groups : w^3
199        @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
        @// Loaded as 32-bit elements (one packed re/im pair each) with a
        @// three-way de-interleave: the first register of each VLD3 receives
        @// every third table entry, the other two are scratch. The second
        @// VLD3 must follow the first, because it overwrites dTmp1S32 (= dW3i)
        @// with the entries that are really wanted there.
200        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
201
202        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
203        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
204
205        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
206
207        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
208
        @// Load the first twiddle for 4 groups : w^1
        @// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
209        VLD2 {dW1r,dW1i}, [pw1 :128]!
210
211        @// Rearrange the third twiddle
        @// (split the packed re/im pairs in D12/D13 into dW3r = real parts and
        @// dW3i = imaginary parts)
212        VUZP    dW3r,dW3i
        @// Four groups are consumed per pass and grpCount is scaled by 4, hence
        @// 16. The flags set here are tested by the BGT at the end of the loop;
        @// only NEON data-processing, load and store instructions lie in between.
213        SUBS    grpCount,grpCount,#16                    @// grpCount is multiplied by 4
214
        @// z1 = x1 * w^1 (conjugated twiddle when inverse is TRUE), 32-bit products
215        .ifeqs  "\inverse", "TRUE"
216            VMULL   qT0,dXr1,dW1r
217            VMLAL   qT0,dXi1,dW1i                       @// real part
218            VMULL   qT1,dXi1,dW1r
219            VMLSL   qT1,dXr1,dW1i                       @// imag part
220
221        .else
222            VMULL   qT0,dXr1,dW1r
223            VMLSL   qT0,dXi1,dW1i                       @// real part
224            VMULL   qT1,dXi1,dW1r
225            VMLAL   qT1,dXr1,dW1i                       @// imag part
226
227        .endif
228
229        @// (w^1 was already loaded above, next to the other twiddle loads)
230        @// z2 = x2 * w^2 (conjugated twiddle when inverse is TRUE)
231
232        .ifeqs  "\inverse", "TRUE"
233            VMULL   qT2,dXr2,dW2r
234            VMLAL   qT2,dXi2,dW2i                       @// real part
235            VMULL   qT3,dXi2,dW2r
236            VMLSL   qT3,dXr2,dW2i                       @// imag part
237
238        .else
239            VMULL   qT2,dXr2,dW2r
240            VMLSL   qT2,dXi2,dW2i                       @// real part
241            VMULL   qT3,dXi2,dW2r
242            VMLAL   qT3,dXr2,dW2i                       @// imag part
243
244        .endif
245
        @// Rounding shift right by 15 narrows the 32-bit products to S16.
        @// This also frees qT0/qT1 for the w^3 products below.
246        VRSHRN  dZr1,qT0,#15
247        VRSHRN  dZi1,qT1,#15
248
249
250
        @// z3 = x3 * w^3 (conjugated twiddle when inverse is TRUE)
251        .ifeqs  "\inverse", "TRUE"
252            VMULL   qT0,dXr3,dW3r
253            VMLAL   qT0,dXi3,dW3i                       @// real part
254            VMULL   qT1,dXi3,dW3r
255            VMLSL   qT1,dXr3,dW3i                       @// imag part
256
257        .else
258            VMULL   qT0,dXr3,dW3r
259            VMLSL   qT0,dXi3,dW3i                       @// real part
260            VMULL   qT1,dXi3,dW3r
261            VMLAL   qT1,dXr3,dW3i                       @// imag part
262
263        .endif
264
        @// All four accumulators must be narrowed before the butterfly starts,
        @// because qY0..qY3 are the same Q registers as qT0..qT3.
265        VRSHRN  dZr2,qT2,#15
266        VRSHRN  dZi2,qT3,#15
267
268        VRSHRN  dZr3,qT0,#15
269        VRSHRN  dZi3,qT1,#15
270
        @// 4-point butterfly on x0, z1, z2, z3 followed by four row stores.
        @// Rows are outPointStep bytes apart; the last store uses dstStep so that
        @// pDst ends up 16 bytes (4 points) past where this pass started.
        @// NOTE(review): as written, row 0 receives x0 - z2 - (z1 + z3) and
        @// row 2 receives x0 - z2 + (z1 + z3). That is a 4-point DFT only if
        @// z1..z3 carry an extra sign flip, so presumably the twiddle table
        @// holds negated values - confirm against the table generator.
271        .ifeqs "\scaled", "TRUE"
272
273            @// finish first stage of 4 point FFT
            @// (halving adds/subs: every result is divided by 2 at each of the
            @// two butterfly stages)
274
275            VHADD    qY0,qX0,qZ2
276            VHSUB    qY2,qX0,qZ2
277            VHADD    qY1,qZ1,qZ3
278
279            VHSUB    qY3,qZ1,qZ3
280
281            @// finish second stage of 4 point FFT
282
283            VHSUB    qZ0,qY2,qY1
284            VHADD    qZ2,qY2,qY1
285
286
287            .ifeqs "\inverse", "TRUE"
288
                @// Store order for the inverse case: Z0, Z3, Z2, Z1
289                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
290                VST2    {dZr0,dZi0},[pDst :128],outPointStep
291                VHSUB    dZi3,dYi0,dYr3
292
293                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
294                VHADD    dZi1,dYi0,dYr3
295                VST2    {dZr3,dZi3},[pDst :128],outPointStep
296                VST2    {dZr2,dZi2},[pDst :128],outPointStep
297                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
298
299            .else
300
                @// Store order for the forward case: Z0, Z1, Z2, Z3
301                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
302                VHADD    dZi1,dYi0,dYr3
303
304                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
305                VST2    {dZr0,dZi0},[pDst :128],outPointStep
306                VHSUB    dZi3,dYi0,dYr3
307                VST2    {dZr1,dZi1},[pDst :128],outPointStep
308                VST2    {dZr2,dZi2},[pDst :128],outPointStep
309                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
310
311            .endif
312
313        .else
314
315            @// finish first stage of 4 point FFT
            @// (same butterfly as the scaled branch, but with non-halving VADD/VSUB)
316
317            VADD    qY0,qX0,qZ2
318            VSUB    qY2,qX0,qZ2
319            VADD    qY1,qZ1,qZ3
320
321            VSUB    qY3,qZ1,qZ3
322
323            @// finish second stage of 4 point FFT
324
325            VSUB    qZ0,qY2,qY1
326            VADD    qZ2,qY2,qY1
327
328
329            .ifeqs "\inverse", "TRUE"
330
                @// Store order for the inverse case: Z0, Z3, Z2, Z1
331                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
332                VST2    {dZr0,dZi0},[pDst :128],outPointStep
333                VSUB    dZi3,dYi0,dYr3
334
335                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
336                VADD    dZi1,dYi0,dYr3
337                VST2    {dZr3,dZi3},[pDst :128],outPointStep
338                VST2    {dZr2,dZi2},[pDst :128],outPointStep
339                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
340
341            .else
342
                @// Store order for the forward case: Z0, Z1, Z2, Z3
343                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
344                VADD    dZi1,dYi0,dYr3
345
346                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
347                VST2    {dZr0,dZi0},[pDst :128],outPointStep
348                VSUB    dZi3,dYi0,dYr3
349                VST2    {dZr1,dZi1},[pDst :128],outPointStep
350                VST2    {dZr2,dZi2},[pDst :128],outPointStep
351                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
352
353            .endif
354
355
356
357
358        .endif
359
        @// Loop while the count left by the SUBS above is still positive.
360        BGT     grpLoop\name
361
362
363        @// Reset and Swap pSrc and pDst for the next stage
        @// Per pass pSrc advanced 64 bytes and pDst a net 16 bytes. If the entry
        @// value of subFFTSize was a multiple of 4, the totals are
        @// 4*outPointStep and outPointStep, so the two subtractions below
        @// rewind both pointers to their entry values while exchanging them.
        @// pTmp reuses r4, which is free now that the loop counter is finished.
364        MOV     pTmp,pDst
365        SUB     pDst,pSrc,outPointStep,LSL #2       @// new pDst = pSrc rewound by 4*outPointStep bytes
366        SUB     pSrc,pTmp,outPointStep              @// new pSrc = old pDst rewound by outPointStep bytes
367
368        .endm
369
370
371        M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
        @// Forward stage without scaling: FFTSTAGE expanded with scaled=FALSE and
        @// inverse=FALSE; the loop label becomes grpLoopFWD.
        @// NOTE(review): M_START/M_END are defined in armCOMM_s.h (not visible
        @// here); presumably they emit the entry/exit sequence with r4 as the
        @// highest saved core register. The macro body also writes r5-r10 and
        @// D8-D15, so callers are expected to preserve those - confirm.
372        FFTSTAGE "FALSE","FALSE",FWD
373        M_END
374
375
376        M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
        @// Inverse stage without scaling: FFTSTAGE expanded with scaled=FALSE and
        @// inverse=TRUE (conjugated twiddles, output rows 1 and 3 swapped); the
        @// loop label becomes grpLoopINV.
        @// NOTE(review): M_START/M_END come from armCOMM_s.h (not visible here);
        @// the macro body writes r5-r10 and D8-D15 in addition to r4 - confirm
        @// that callers preserve them.
377        FFTSTAGE "FALSE","TRUE",INV
378        M_END
379
380
381        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
        @// Forward stage with scaling: FFTSTAGE expanded with scaled=TRUE (halving
        @// VHADD/VHSUB butterfly) and inverse=FALSE; the loop label becomes
        @// grpLoopFWDSFS.
        @// NOTE(review): M_START/M_END come from armCOMM_s.h (not visible here);
        @// the macro body writes r5-r10 and D8-D15 in addition to r4 - confirm
        @// that callers preserve them.
382        FFTSTAGE "TRUE","FALSE",FWDSFS
383        M_END
384
385
386        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
        @// Inverse stage with scaling: FFTSTAGE expanded with scaled=TRUE (halving
        @// VHADD/VHSUB butterfly) and inverse=TRUE (conjugated twiddles, output
        @// rows 1 and 3 swapped); the loop label becomes grpLoopINVSFS.
        @// NOTE(review): M_START/M_END come from armCOMM_s.h (not visible here);
        @// the macro body writes r5-r10 and D8-D15 in addition to r4 - confirm
        @// that callers preserve them.
387        FFTSTAGE "TRUE","TRUE",INVSFS
388        M_END
389
390
391
392
393
394
395    .end
396