1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This file was originally licensed as follows. It has been
11@//  relicensed with permission from the copyright holders.
12
13@//
14@//
15@// File Name:  armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
16@// OpenMAX DL: v1.0.2
17@// Last Modified Revision:   7761
18@// Last Modified Date:       Wed, 26 Sep 2007
19@//
20@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@//
22@//
23@//
24@// Description:
25@// Compute a Radix 4 FFT stage for a N point complex signal
26@//
27@//
28
29
30@// Include standard headers
31
32#include "dl/api/arm/armCOMM_s.h"
33#include "dl/api/arm/omxtypes_s.h"
34
35
36
37@// Import symbols required from other files
38@// (For example tables)
39
40
41
42
43@// Set debugging level
44@//DEBUG_ON    SETL {TRUE}
45
46
47@// Guarding implementation by the processor name
48
49
50
51    @// Guarding implementation by the processor name
52
53
54@// Import symbols required from other files
55@// (For example tables)
56
57
58@// Input Registers
59@// FFTSTAGE reads each of these before writing it, so all five must hold valid values on entry.
60#define pSrc                            r0
61#define pDst                            r2
62#define pTwiddle                        r1
63#define subFFTNum                       r6
64#define subFFTSize                      r7
65
66
67
68@// Output Registers
69@// (no aliases are defined under this heading)
70
71@// Local Scratch Registers
72@// t1 is a second name for r3 (grpCount); FFTSTAGE uses t1 only after its group loop has finished.
73#define grpCount                        r3
74#define pointStep                       r4
75#define outPointStep                    r5
76#define stepTwiddle                     r12
77#define setCount                        r14
78#define srcStep                         r8
79#define setStep                         r9
80#define dstStep                         r10
81#define twStep                          r11
82#define t1                              r3
83@// NOTE(review): r4-r11, r14 (lr) and D8-D15 are written by FFTSTAGE; the M_START lines in this file name only r4 - confirm who preserves the rest.
84@// Neon Registers
85@// Q<n> overlays D<2n> and D<2n+1>, so several of the D and Q aliases below name the same storage.
86#define dW1                             D0.S16
87#define dW2                             D1.S16
88#define dW3                             D2.S16
89@// dXrK/dXiK receive the de-interleaved real/imaginary halves of data[K] from the VLD2 loads in FFTSTAGE.
90#define dXr0                            D4.S16
91#define dXi0                            D5.S16
92#define dXr1                            D6.S16
93#define dXi1                            D7.S16
94#define dXr2                            D8.S16
95#define dXi2                            D9.S16
96#define dXr3                            D10.S16
97#define dXi3                            D11.S16
98#define dYr0                            D12.S16
99#define dYi0                            D13.S16
100#define dYr1                            D14.S16
101#define dYi1                            D15.S16
102#define dYr2                            D16.S16
103#define dYi2                            D17.S16
104#define dYr3                            D18.S16
105#define dYi3                            D19.S16
106#define qT0                             Q8.S32
107#define qT1                             Q9.S32
108#define qT2                             Q6.S32
109#define qT3                             Q7.S32
110@// qT0/qT1 (Q8,Q9) share storage with dYr2..dYi3 and qT2/qT3 (Q6,Q7) with dYr0..dYi1; FFTSTAGE narrows the 32-bit products out of qT* before it writes any Y register.
111#define dZr0                            D20.S16
112#define dZi0                            D21.S16
113#define dZr1                            D22.S16
114#define dZi1                            D23.S16
115#define dZr2                            D24.S16
116#define dZi2                            D25.S16
117#define dZr3                            D26.S16
118#define dZi3                            D27.S16
119#define qY0                             Q6.S16
120#define qY1                             Q7.S16
121#define qY2                             Q8.S16
122#define qY3                             Q9.S16
123#define qX0                             Q2.S16
124#define qZ0                             Q10.S16
125#define qZ1                             Q11.S16
126#define qZ2                             Q12.S16
127#define qZ3                             Q13.S16
128@// Pairings: qX0 = {dXr0,dXi0}; qYk = {dYrk,dYik} and qZk = {dZrk,dZik} for k = 0..3.
129
130        .macro FFTSTAGE scaled, inverse , name
131        @// Emits one radix-4 FFT pass that reads from pSrc, writes to pDst and finally swaps the two pointers.
132        @// Define stack arguments (none are defined here)
133        @// Arguments: scaled and inverse select code paths when they equal the string TRUE;
134        @//            name is appended to the grpLoop/setLoop labels so every expansion gets its own labels.
135        @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
136
137        LSL     grpCount,subFFTSize,#2                      @// grpCount = 4*subFFTSize (group counter, pre-multiplied by 4)
138        LSR     subFFTNum,subFFTNum,#2                      @// subFFTNum /= 4
139        MOV     subFFTSize,grpCount                         @// subFFTSize *= 4
140
141
142        @// pOut0+1 increments pOut0 by 4 bytes
143        @// pOut0+outPointStep == increment of 4*outPointStep bytes = size bytes
144
145        MOV     stepTwiddle,#0                              @// twiddle stride; grows by pointStep at the top of every group
146        SMULBB  outPointStep,grpCount,subFFTNum             @// outPointStep = grpCount*subFFTNum (SMULBB multiplies the low halfwords only)
147
148        @// pT0+1 increments pT0 by 4 bytes
149        @// pT0+pointStep = increment of 4*pointStep bytes = grpSize bytes
150
151        LSL     pointStep,subFFTNum,#2                      @// pointStep = 4*subFFTNum, used below as a byte stride (original note: 2*grpSize)
152        @// Preload all three twiddle registers from the start of the table (no post-increment); they serve the first group.
153        VLD1     dW1,[pTwiddle :64]                             @//[wi | wr]
154        MOV     srcStep,pointStep,LSL #1                    @// srcStep = 2*pointStep
155        VLD1     dW2,[pTwiddle :64]                             @//[wi | wr]
156        ADD     setStep,srcStep,pointStep                   @// setStep = 3*pointStep
157        VLD1     dW3,[pTwiddle :64]
158        @//RSB     setStep,setStep,#16                      @// setStep = - 3*pointStep+16
159        RSB     setStep,setStep,#0                          @// setStep = - 3*pointStep
160
161        MOV     dstStep,outPointStep,LSL #1
162        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
163        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
164
165        @// Outer loop: one iteration per group; grpCount is decremented by 4 at the bottom.
166
167grpLoop\name:
168
169        ADD      stepTwiddle,stepTwiddle,pointStep
170        ADD      pTwiddle,pTwiddle,stepTwiddle               @// set pTwiddle to the first point
171        MOV      twStep,stepTwiddle,LSL #2
172
173        SUB      twStep,stepTwiddle,twStep                   @// twStep = -3*stepTwiddle
174        @// With the three post-incremented VLD1 loads at the bottom (+step, +step, twStep) pTwiddle is back at its entry value after each group.
175
176        MOV      setCount,pointStep,LSR #2                   @// setCount = pointStep/4
177        ADD      pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
178
179        @// Loop on the sets : 4 at a time
180        @// Each VLD2 below fetches 16 bytes and splits them into four 16-bit real and four 16-bit imaginary lanes.
181setLoop\name:
182        VLD2    {dXr1,dXi1},[pSrc :128],pointStep         @//  data[1]
183        VLD2    {dXr2,dXi2},[pSrc :128],pointStep         @//  data[2]
184
185        SUBS    setCount,setCount,#4                      @// decrement the loop counter
186        @// The flags set by SUBS are consumed by the BGT at the end of the loop; no instruction in between updates them.
187        .ifeqs  "\inverse", "TRUE"
188            VMULL   qT0,dXr1,dW1[0]
189            VMLAL   qT0,dXi1,dW1[1]                       @// real part
190            VMULL   qT1,dXi1,dW1[0]
191            VMLSL   qT1,dXr1,dW1[1]                       @// imag part
192            @// inverse: re = xr*w[0] + xi*w[1], im = xi*w[0] - xr*w[1] (multiply by the conjugate of the loaded twiddle)
193        .else
194            VMULL   qT0,dXr1,dW1[0]
195            VMLSL   qT0,dXi1,dW1[1]                       @// real part
196            VMULL   qT1,dXi1,dW1[0]
197            VMLAL   qT1,dXr1,dW1[1]                       @// imag part
198            @// forward: re = xr*w[0] - xi*w[1], im = xi*w[0] + xr*w[1]
199        .endif
200        @// data[2] times dW2, same pattern, accumulated in qT2/qT3
201        .ifeqs  "\inverse", "TRUE"
202            VMULL   qT2,dXr2,dW2[0]
203            VMLAL   qT2,dXi2,dW2[1]                       @// real part
204            VMULL   qT3,dXi2,dW2[0]
205            VMLSL   qT3,dXr2,dW2[1]                       @// imag part
206
207        .else
208            VMULL   qT2,dXr2,dW2[0]
209            VMLSL   qT2,dXi2,dW2[1]                       @// real part
210            VMULL   qT3,dXi2,dW2[0]
211            VMLAL   qT3,dXr2,dW2[1]                       @// imag part
212
213        .endif
214
215        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
216
217        VRSHRN  dZr1,qT0,#15                               @// round, shift right by 15 and narrow to 16 bits: Z1 = data[1] product
218        VRSHRN  dZi1,qT1,#15                               @// qT0/qT1 are free for reuse after these two narrows
219
220        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
221        ADD     pSrc,pSrc,#16                              @// set pSrc to data[1] of the next set
222        @// data[3] times dW3, reusing qT0/qT1
223        .ifeqs  "\inverse", "TRUE"
224            VMULL   qT0,dXr3,dW3[0]
225            VMLAL   qT0,dXi3,dW3[1]                       @// real part
226            VMULL   qT1,dXi3,dW3[0]
227            VMLSL   qT1,dXr3,dW3[1]                       @// imag part
228
229        .else
230            VMULL   qT0,dXr3,dW3[0]
231            VMLSL   qT0,dXi3,dW3[1]                       @// real part
232            VMULL   qT1,dXi3,dW3[0]
233            VMLAL   qT1,dXr3,dW3[1]                       @// imag part
234
235        .endif
236
237        VRSHRN  dZr2,qT2,#15                               @// Z2 = narrowed data[2] product
238        VRSHRN  dZi2,qT3,#15
239
240
241        VRSHRN  dZr3,qT0,#15                               @// Z3 = narrowed data[3] product
242        VRSHRN  dZi3,qT1,#15
243
244        @// Butterfly on X0 (data[0], not multiplied by a twiddle) and Z1..Z3.
245        .ifeqs "\scaled", "TRUE"
246            @// Scaled variant: VHADD/VHSUB halve every sum/difference, and both butterfly stages use them.
247            @// finish first stage of 4 point FFT
248            VHADD    qY0,qX0,qZ2
249            VHSUB    qY2,qX0,qZ2
250
251            VHADD    qY1,qZ1,qZ3
252            VHSUB    qY3,qZ1,qZ3
253
254
255            @// finish second stage of 4 point FFT
256            @// NOTE(review): the signs used here (e.g. qZ0 = qY2 - qY1) match radix-4 DFT outputs only if the products Z1..Z3 are negated, e.g. by a negated twiddle table - confirm against the table generator.
257            .ifeqs  "\inverse", "TRUE"
258                @// inverse: the four stores, outPointStep bytes apart, write Z0, Z2, Z1, Z3 in that order
259                VHSUB    qZ0,qY2,qY1
260
261                VHADD    dZr2,dYr0,dYi3
262                VST2    {dZr0,dZi0},[pDst :128],outPointStep
263                VHSUB    dZi2,dYi0,dYr3
264
265                VHADD    qZ1,qY2,qY1
266                VST2    {dZr2,dZi2},[pDst :128],outPointStep
267
268                VHSUB    dZr3,dYr0,dYi3
269                VST2    {dZr1,dZi1},[pDst :128],outPointStep
270                VHADD    dZi3,dYi0,dYr3
271                VST2    {dZr3,dZi3},[pDst :128],dstStep
272                @// the dstStep post-increment (16 - 3*outPointStep) leaves pDst 16 bytes past the first store of this set
273
274            .else
275                @// forward: the four stores, outPointStep bytes apart, write Z0, Z3, Z1, Z2 in that order
276                VHSUB    qZ0,qY2,qY1
277
278                VHSUB    dZr3,dYr0,dYi3
279                VST2    {dZr0,dZi0},[pDst :128],outPointStep
280                VHADD    dZi3,dYi0,dYr3
281
282                VHADD    qZ1,qY2,qY1
283                VST2    {dZr3,dZi3},[pDst :128],outPointStep
284
285                VHADD    dZr2,dYr0,dYi3
286                VHSUB    dZi2,dYi0,dYr3
287                VST2    {dZr1,dZi1},[pDst :128],outPointStep
288                VST2    {dZr2,dZi2},[pDst :128],dstStep
289
290
291            .endif
292
293
294        .else
295            @// Unscaled variant: same data flow as the scaled branch, using plain VADD/VSUB (no halving).
296            @// finish first stage of 4 point FFT
297            VADD    qY0,qX0,qZ2
298            VSUB    qY2,qX0,qZ2
299
300            VADD    qY1,qZ1,qZ3
301            VSUB    qY3,qZ1,qZ3
302
303
304            @// finish second stage of 4 point FFT
305
306
307            .ifeqs  "\inverse", "TRUE"
308                @// inverse: store order Z0, Z2, Z1, Z3
309                VSUB    qZ0,qY2,qY1
310
311                VADD    dZr2,dYr0,dYi3
312                VST2    {dZr0,dZi0},[pDst :128],outPointStep
313                VSUB    dZi2,dYi0,dYr3
314
315                VADD    qZ1,qY2,qY1
316                VST2    {dZr2,dZi2},[pDst :128],outPointStep
317
318                VSUB    dZr3,dYr0,dYi3
319                VST2    {dZr1,dZi1},[pDst :128],outPointStep
320                VADD    dZi3,dYi0,dYr3
321                VST2    {dZr3,dZi3},[pDst :128],dstStep
322
323
324            .else
325                @// forward: store order Z0, Z3, Z1, Z2
326                VSUB    qZ0,qY2,qY1
327
328                VSUB    dZr3,dYr0,dYi3
329                VST2    {dZr0,dZi0},[pDst :128],outPointStep
330                VADD    dZi3,dYi0,dYr3
331
332                VADD    qZ1,qY2,qY1
333                VST2    {dZr3,dZi3},[pDst :128],outPointStep
334
335                VADD    dZr2,dYr0,dYi3
336                VSUB    dZi2,dYi0,dYr3
337                VST2    {dZr1,dZi1},[pDst :128],outPointStep
338                VST2    {dZr2,dZi2},[pDst :128],dstStep
339
340
341            .endif
342
343
344
345        .endif
346
347        BGT     setLoop\name                                @// repeat while setCount > 0 (flags come from the SUBS near the loop top)
348        @// Fetch the twiddles for the next group from pTwiddle, +stepTwiddle and +2*stepTwiddle; twStep then rewinds pTwiddle.
349        VLD1     dW1,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
350        SUBS    grpCount,grpCount,#4                        @// subtract 4 since grpCount multiplied by 4
351        VLD1     dW2,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
352        ADD     pSrc,pSrc,srcStep                           @// increment pSrc for the next grp
353        VLD1     dW3,[pTwiddle :64],twStep                      @//[wi | wr]
354
355        @// ADD (no S suffix) and VLD1 leave the flags from SUBS grpCount intact for the branch below.
356
357        BGT     grpLoop\name
358
359        @// t1 aliases r3 (grpCount), which is no longer needed once the group loop has exited.
360        @// Reset and Swap pSrc and pDst for the next stage
361        MOV     t1,pDst
362        SUB     pDst,pSrc,outPointStep,LSL #2           @// pDst -= size; pSrc -= 4*size bytes
363        SUB     pSrc,t1,outPointStep
364        @// Over the stage pSrc advanced 4*outPointStep bytes and pDst outPointStep bytes (assuming setCount is a positive multiple of 4),
365        @// so pDst now holds the pSrc entry value and pSrc the pDst entry value.
366        .endm
367        @// Forward radix-4 stage without scaling: FFTSTAGE with scaled=FALSE, inverse=FALSE, label suffix FWD.
368        @// M_START/M_END are not defined in this file (presumably they come from the included headers); r4 is the only register named to M_START.
369        M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
370            FFTSTAGE "FALSE","FALSE",FWD
371        M_END
372        @// Inverse radix-4 stage without scaling: FFTSTAGE with scaled=FALSE, inverse=TRUE, label suffix INV.
373        @// inverse=TRUE selects the conjugate twiddle multiply and the Z0, Z2, Z1, Z3 store order inside FFTSTAGE.
374        M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
375            FFTSTAGE "FALSE","TRUE",INV
376        M_END
377        @// Forward radix-4 stage with scaling: FFTSTAGE with scaled=TRUE, inverse=FALSE, label suffix FWDSFS.
378        @// scaled=TRUE selects the halving VHADD/VHSUB butterfly instead of VADD/VSUB inside FFTSTAGE.
379        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
380            FFTSTAGE "TRUE","FALSE",FWDSFS
381        M_END
382        @// Inverse radix-4 stage with scaling: FFTSTAGE with scaled=TRUE, inverse=TRUE, label suffix INVSFS.
383        @// Combines the conjugate twiddle multiply with the halving VHADD/VHSUB butterfly inside FFTSTAGE.
384        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
385            FFTSTAGE "TRUE","TRUE",INVSFS
386        M_END
387
388
389
390
391
392    .end
393