1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
11@//  to support float instead of SC32.
12@//
13
14@//
15@// Description:
16@// Compute the first-stage radix-8 FFT of an N-point complex signal
17@//
18@//
19
20
21@// Include standard headers
22
23#include "dl/api/arm/armCOMM_s.h"
24#include "dl/api/arm/omxtypes_s.h"
25
26@// Import symbols required from other files
27@// (For example tables)
28
29
30@// Set debugging level
31@//DEBUG_ON    SETL {TRUE}
32
33
34
35@// Guarding implementation by the processor name
36
37
38
39
40@// Guarding implementation by the processor name
41
42@//Input Registers
43@// Core registers read by the FFTSTAGE macro. pTwiddle is defined but not referenced by the macro in this file.
44#define pSrc            r0
45#define pDst            r2
46#define pTwiddle        r1
47#define subFFTNum       r6
48#define subFFTSize      r7
49@// dest buffer for the next stage (not pSrc for first stage)
50#define pPingPongBuf    r5
51
52
53@//Output Registers
54@// The macro overwrites pSrc, pDst, subFFTNum and subFFTSize before it ends.
55
56@//Local Scratch Registers
57@// grpSize/setCount share r3 and pointStep/outPointStep share r4. t0 carries the bit pattern of sqrt(1/2).
58#define grpSize         r3
59@// Reuse grpSize as setCount
60#define setCount        r3
61#define pointStep       r4
62#define outPointStep    r4
63#define setStep         r8
64#define step1           r9
65#define step2           r10
66#define t0              r11
67
68
69@// Neon Registers
70@// X bank: the eight loaded inputs. dXrN/dXiN are the two D halves of qXN (D0-D15 = Q0-Q7).
71#define dXr0    D0.F32
72#define dXi0    D1.F32
73#define dXr1    D2.F32
74#define dXi1    D3.F32
75#define dXr2    D4.F32
76#define dXi2    D5.F32
77#define dXr3    D6.F32
78#define dXi3    D7.F32
79#define dXr4    D8.F32
80#define dXi4    D9.F32
81#define dXr5    D10.F32
82#define dXi5    D11.F32
83#define dXr6    D12.F32
84#define dXi6    D13.F32
85#define dXr7    D14.F32
86#define dXi7    D15.F32
87#define qX0     Q0.F32
88#define qX1     Q1.F32
89#define qX2     Q2.F32
90#define qX3     Q3.F32
91#define qX4     Q4.F32
92#define qX5     Q5.F32
93#define qX6     Q6.F32
94#define qX7     Q7.F32
95@// U bank: first butterfly stage results. Even indices sit in Q8-Q11, odd indices in Q12-Q15.
96#define dUr0    D16.F32
97#define dUi0    D17.F32
98#define dUr2    D18.F32
99#define dUi2    D19.F32
100#define dUr4    D20.F32
101#define dUi4    D21.F32
102#define dUr6    D22.F32
103#define dUi6    D23.F32
104#define dUr1    D24.F32
105#define dUi1    D25.F32
106#define dUr3    D26.F32
107#define dUi3    D27.F32
108#define dUr5    D28.F32
109#define dUi5    D29.F32
110@// dUr7/dUi7 are D30/D31 (Q15); they do not overlap dXr7/dXi7 (D14/D15)
111#define dUr7    D30.F32
112#define dUi7    D31.F32
113#define qU0     Q8.F32
114#define qU1     Q12.F32
115#define qU2     Q9.F32
116#define qU3     Q13.F32
117#define qU4     Q10.F32
118#define qU5     Q14.F32
119#define qU6     Q11.F32
120#define qU7     Q15.F32
121@// V bank: second butterfly stage results. Same physical registers as U with the
122@// even and odd halves swapped: even V indices use Q12-Q15, odd V indices use Q8-Q11.
123#define dVr0    D24.F32
124#define dVi0    D25.F32
125#define dVr2    D26.F32
126#define dVi2    D27.F32
127#define dVr4    D28.F32
128#define dVi4    D29.F32
129#define dVr6    D30.F32
130#define dVi6    D31.F32
131#define dVr1    D16.F32
132#define dVi1    D17.F32
133#define dVr3    D18.F32
134#define dVi3    D19.F32
135#define dVr5    D20.F32
136#define dVi5    D21.F32
137#define dVr7    D22.F32
138#define dVi7    D23.F32
139#define qV0     Q12.F32
140#define qV1     Q8.F32
141#define qV2     Q13.F32
142#define qV3     Q9.F32
143#define qV4     Q14.F32
144#define qV5     Q10.F32
145#define qV6     Q15.F32
146#define qV7     Q11.F32
147@// Y bank: values that get stored. Register assignment is identical to the U bank.
148#define dYr0    D16.F32
149#define dYi0    D17.F32
150#define dYr2    D18.F32
151#define dYi2    D19.F32
152#define dYr4    D20.F32
153#define dYi4    D21.F32
154#define dYr6    D22.F32
155#define dYi6    D23.F32
156#define dYr1    D24.F32
157#define dYi1    D25.F32
158#define dYr3    D26.F32
159#define dYi3    D27.F32
160#define dYr5    D28.F32
161#define dYi5    D29.F32
162#define dYr7    D30.F32
163#define dYi7    D31.F32
164#define qY0     Q8.F32
165#define qY1     Q12.F32
166#define qY2     Q9.F32
167#define qY3     Q13.F32
168#define qY4     Q10.F32
169#define qY5     Q14.F32
170#define qY6     Q11.F32
171#define qY7     Q15.F32
172@// Temporaries: dT0/dT1 are D14/D15, the same registers as dXr7/dXi7.
173#define dT0     D14.F32
174#define dT1     D15.F32
175
176        .MACRO FFTSTAGE scaled, inverse, name
177        @// First-stage radix-8 butterflies over all sets; setCount drops by 2 per pass.
178        @// Define stack arguments
179        @// \inverse == "TRUE" selects the inverse paths; \scaled is unused in this body; \name suffixes the labels.
180        @// Update pSubFFTSize and pSubFFTNum regs
181        @// subFFTSize = 1 for the first stage; it is set to 8 below
182        MOVW    t0, 0x04f3               @// Low half word of sqrt(1/2).
183        MOV     subFFTSize,#8
184        MOVT    t0, 0x3f35               @// High half word of sqrt(1/2).
185
186        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
187        LSR     grpSize,subFFTNum,#3
188        MOV     subFFTNum,grpSize
189        @// subFFTNum now holds the same subFFTNum/8 value as grpSize/setCount.
190
191        @// pT0+1 increments pT0 by 8 bytes
192        @// pointStep is a byte stride: grpSize points * 8 bytes per point
193        @// Note: outPointStep = pointStep for firststage
194
195        MOV     pointStep,grpSize,LSL #3
196
197
198        @// Calculate the step of input data for the next set
199        @//MOV     step1,pointStep,LSL #1             @// step1 = 2*pointStep
200        VLD2    {dXr0,dXi0},[pSrc :128],pointStep     @//  data[0]
201        MOV     step1,grpSize,LSL #4                  @// step1 = 16*grpSize = 2*pointStep
202
203        MOV     step2,pointStep,LSL #3                @// step2 = 8*pointStep (adjusted below)
204        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @//  data[1]
205        SUB     step2,step2,pointStep                 @// step2 = 7*pointStep
206        @// setStep = - 7*pointStep+16
207        RSB     setStep,step2,#16
208
209        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @//  data[2]
210        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @//  data[3]
211        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @//  data[4]
212        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @//  data[5]
213        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @//  data[6]
214        @//  data[7] & update pSrc for the next set
215        @//  setStep = -7*pointStep + 16
216        VLD2    {dXr7,dXi7},[pSrc :128],setStep
217        @// grp = 0 a special case since all the twiddle factors are 1
218        @// Loop on the sets
219
220radix8fsGrpZeroSetLoop\name :
221
222        @// Decrement setcount
223        SUBS    setCount,setCount,#2
224        @// The flags set here are consumed by the BEQ and BGT further down;
225        @// no instruction in between is flag-setting.
226        @// finish first stage of 8 point FFT
227
228        VADD    qU0,qX0,qX4
229        VADD    qU2,qX1,qX5
230        VADD    qU4,qX2,qX6
231        VADD    qU6,qX3,qX7
232
233        @// finish second stage of 8 point FFT
234
235        VADD    qV0,qU0,qU4
236        VSUB    qV2,qU0,qU4
237        VADD    qV4,qU2,qU6
238        VSUB    qV6,qU2,qU6
239
240        @// finish third stage of 8 point FFT
241
242        VADD    qY0,qV0,qV4
243        VSUB    qY4,qV0,qV4
244        VST2    {dYr0,dYi0},[pDst :128],step1         @// store y0
245        @// Even outputs are stored step1 (2*pointStep) bytes apart.
246        .ifeqs  "\inverse", "TRUE"
247
248            VSUB    dYr2,dVr2,dVi6
249            VADD    dYi2,dVi2,dVr6
250
251            VADD    dYr6,dVr2,dVi6
252            VST2    {dYr2,dYi2},[pDst :128],step1     @// store y2
253            VSUB    dYi6,dVi2,dVr6
254
255            VSUB    qU1,qX0,qX4
256            VST2    {dYr4,dYi4},[pDst :128],step1     @// store y4
257
258            VSUB    qU3,qX1,qX5
259            VSUB    qU5,qX2,qX6
260            VST2    {dYr6,dYi6},[pDst :128],step1     @// store y6
261
262        .ELSE
263            @// Forward path: same four values, but the Y2/Y6 registers swap output slots.
264            VADD    dYr6,dVr2,dVi6
265            VSUB    dYi6,dVi2,dVr6
266
267            VSUB    dYr2,dVr2,dVi6
268            VST2    {dYr6,dYi6},[pDst :128],step1     @// store y2
269            VADD    dYi2,dVi2,dVr6
270
271
272            VSUB    qU1,qX0,qX4
273            VST2    {dYr4,dYi4},[pDst :128],step1     @// store y4
274            VSUB    qU3,qX1,qX5
275            VSUB    qU5,qX2,qX6
276            VST2    {dYr2,dYi2},[pDst :128],step1     @// store y6
277
278
279        .ENDIF
280
281        @// finish first stage of 8 point FFT
282
283        VSUB    qU7,qX3,qX7
284        VMOV    dT0[0], t0                            @// dT0 is D14 (dXr7); qX7 was last read just above
285
286        @// finish second stage of 8 point FFT
287
288        VSUB    dVr1,dUr1,dUi5
289        @//  data[0] for next iteration
290        VLD2    {dXr0,dXi0},[pSrc :128],pointStep
291        VADD    dVi1,dUi1,dUr5
292        VADD    dVr3,dUr1,dUi5
293        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @//  data[1]
294        VSUB    dVi3,dUi1,dUr5
295
296        VSUB    dVr5,dUr3,dUi7
297        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @//  data[2]
298        VADD    dVi5,dUi3,dUr7
299        VADD    dVr7,dUr3,dUi7
300        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @//  data[3]
301        VSUB    dVi7,dUi3,dUr7
302
303        @// finish third stage of 8 point FFT
304
305        .ifeqs  "\inverse", "TRUE"
306
307            @// calculate a*v5
308            VMUL    dT1,dVr5,dT0[0]                   @// dT1 = sqrt(1/2) * Vr5
309
310            VLD2    {dXr4,dXi4},[pSrc :128],pointStep @//  data[4]
311            VMUL    dVi5,dVi5,dT0[0]
312
313            VLD2    {dXr5,dXi5},[pSrc :128],pointStep @//  data[5]
314            VSUB    dVr5,dT1,dVi5                     @// a * V5
315            VADD    dVi5,dT1,dVi5
316
317            VLD2    {dXr6,dXi6},[pSrc :128],pointStep @//  data[6]
318
319            @// calculate  b*v7
320            VMUL    dT1,dVr7,dT0[0]
321            VMUL    dVi7,dVi7,dT0[0]
322
323            VADD    qY1,qV1,qV5
324            VSUB    qY5,qV1,qV5
325
326
327            VADD    dVr7,dT1,dVi7                     @// b * V7
328            VSUB    dVi7,dVi7,dT1
329            SUB     pDst, pDst, step2                 @// set pDst to y1
330
331            @// On the last iteration,  this will read past the end of pSrc,
332            @// so skip this read. The Z flag tested is the one from SUBS at the loop top.
333            BEQ     radix8SkipLastUpdateInv\name
334            VLD2    {dXr7,dXi7},[pSrc :128],setStep   @//  data[7]
335radix8SkipLastUpdateInv\name:
336
337            VSUB    dYr3,dVr3,dVr7
338            VSUB    dYi3,dVi3,dVi7
339            VST2    {dYr1,dYi1},[pDst :128],step1     @// store y1
340            VADD    dYr7,dVr3,dVr7
341            VADD    dYi7,dVi3,dVi7
342
343
344            VST2    {dYr3,dYi3},[pDst :128],step1     @// store y3
345            VST2    {dYr5,dYi5},[pDst :128],step1     @// store y5
346            VST2    {dYr7,dYi7},[pDst :128]           @// store y7
347            ADD pDst, pDst, #16                       @// advance past the 16 bytes just stored
348
349        .ELSE
350
351            @// calculate  b*v7
352            VMUL    dT1,dVr7,dT0[0]
353            VLD2    {dXr4,dXi4},[pSrc :128],pointStep @//  data[4]
354            VMUL    dVi7,dVi7,dT0[0]
355
356            VLD2    {dXr5,dXi5},[pSrc :128],pointStep @//  data[5]
357            VADD    dVr7,dT1,dVi7                     @// b * V7
358            VSUB    dVi7,dVi7,dT1
359
360            VLD2    {dXr6,dXi6},[pSrc :128],pointStep @//  data[6]
361
362            @// calculate a*v5
363            VMUL    dT1,dVr5,dT0[0]                   @// dT1 = sqrt(1/2) * Vr5
364            VMUL    dVi5,dVi5,dT0[0]
365
366            VADD    dYr7,dVr3,dVr7
367            VADD    dYi7,dVi3,dVi7
368            SUB     pDst, pDst, step2                 @// set pDst to y1
369
370            VSUB    dVr5,dT1,dVi5                     @// a * V5
371            VADD    dVi5,dT1,dVi5
372
373            @// On the last iteration,  this will read past the end of pSrc,
374            @// so skip this read. The Z flag tested is the one from SUBS at the loop top.
375            BEQ     radix8SkipLastUpdateFwd\name
376            VLD2    {dXr7,dXi7},[pSrc :128],setStep   @//  data[7]
377radix8SkipLastUpdateFwd\name:
378
379            VSUB    qY5,qV1,qV5
380
381            VSUB    dYr3,dVr3,dVr7
382            VST2    {dYr7,dYi7},[pDst :128],step1     @// store y1
383            VSUB    dYi3,dVi3,dVi7
384            VADD    qY1,qV1,qV5
385
386
387            VST2    {dYr5,dYi5},[pDst :128],step1     @// store y3
388            VST2    {dYr3,dYi3},[pDst :128],step1     @// store y5
389            VST2    {dYr1,dYi1},[pDst :128]!          @// store y7
390
391        .ENDIF
392
393        @// After the y7 store pDst is 7*pointStep + 16 bytes past this pass's y0 address.
394        @// update pDst for the next set
395        SUB     pDst, pDst, step2
396        BGT     radix8fsGrpZeroSetLoop\name
397
398
399        @// reset pSrc to pDst for the next stage
400        SUB     pSrc,pDst,pointStep                   @// pSrc = pDst - pointStep (bytes)
401        MOV     pDst,pPingPongBuf
402
403
404
405        .endm
406
407
408        @// Allocate stack memory required by the function
409        @// Forward entry point: expands FFTSTAGE with inverse="FALSE" and label suffix FWD.
410        @// NOTE(review): M_START/M_END are not defined in this file (presumably armCOMM_s.h); confirm what the r4 argument saves.
411        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
412            FFTSTAGE "FALSE","FALSE",FWD
413        M_END
414        @// Inverse entry point: expands FFTSTAGE with inverse="TRUE" and label suffix INV.
415        @// NOTE(review): M_START/M_END are not defined in this file (presumably armCOMM_s.h); confirm what the r4 argument saves.
416        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
417            FFTSTAGE "FALSE","TRUE",INV
418        M_END
419
420
421
422        .end
423