1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This is a modification of
11@//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
12@//  instead of SC32.
13@//
14
15@//
16@// Description:
17@// Compute the "preTwiddleRadix2" stage prior to the call to the complex FFT.
18@// It computes Z(k) = Feven(k) + j*W^(-k)*Fodd(k) for k = 0, 1, 2, ..., N/2-1.
19@//
20@//
21
22
23@// Include standard headers
24
25#include "dl/api/arm/armCOMM_s.h"
26#include "dl/api/arm/omxtypes_s.h"
27
28
29@// Import symbols required from other files
30@// (For example tables)
31
32
33@// Set debugging level
34@//DEBUG_ON    SETL {TRUE}
35
36
37
38@// Guarding implementation by the processor name
39
40
41
42      @// Guarding implementation by the processor name
43
44
45
46@//Input Registers
47@// Of these four names, FFTSTAGE in this file references only pSrc and pFFTSpec.
48#define pSrc            r0
49#define pDst            r1
50#define pFFTSpec        r2
51#define scale           r3
52
53
54@// Output registers
55#define result          r0
56
57@//Local Scratch Registers
58@// Several names share one core register. FFTSTAGE references pTwiddle, pOut, N, size, step, step1, twStep, pTwiddleTmp, argTwiddle1, pOut1.
59#define argTwiddle      r1
60#define argDst          r2
61#define argScale        r4
62#define tmpOrder        r4
63#define pTwiddle        r4
64#define pOut            r5
65#define subFFTSize      r7
66#define subFFTNum       r6
67#define N               r6
68#define order           r14
69#define diff            r9
70@// Total num of radix stages required to complete the FFT
71#define count           r8
72#define x0r             r4
73#define x0i             r5
74#define diffMinusOne    r2
75#define round           r3
76@// pOut1 shares r2 with pFFTSpec: FFTSTAGE first writes pOut1 only after its last read of pFFTSpec.
77#define pOut1           r2
78#define size            r7
79#define step            r8
80#define step1           r9
81#define twStep          r10
82#define pTwiddleTmp     r11
83#define argTwiddle1     r12
84#define zero            r14
85
86@// Neon registers
87@// Names overlap here as well: dX0/dX1/dY0/dY1 share D0..D3 with dX0r..dX1i, and dY0r..dY1i share D4..D7 with dW0r..dW1i.
88#define dX0     D0.F32
89#define dShift  D1.F32
90#define dX1     D1.F32
91#define dY0     D2.F32
92#define dY1     D3.F32
93#define dX0r    D0.F32
94#define dX0i    D1.F32
95#define dX1r    D2.F32
96#define dX1i    D3.F32
97#define dW0r    D4.F32
98#define dW0i    D5.F32
99#define dW1r    D6.F32
100#define dW1i    D7.F32
101#define dT0     D8.F32
102#define dT1     D9.F32
103#define dT2     D10.F32
104#define dT3     D11.F32
105#define qT0     D12.F32
106#define qT1     D14.F32
107#define qT2     D16.F32
108#define qT3     D18.F32
109#define dY0r    D4.F32
110#define dY0i    D5.F32
111#define dY1r    D6.F32
112#define dY1i    D7.F32
113@// Note: qT0..qT3 above name D registers (D12, D14, D16, D18) despite the q prefix; FFTSTAGE does not reference them.
114#define dY2     D4.F32
115#define dY3     D5.F32
116#define dW0     D6.F32
117#define dW1     D7.F32
118#define dW0Tmp  D10.F32
119#define dW1Neg  D11.F32
120@// half (D13) is loaded with the constant 0.5 by FFTSTAGE and used as the scalar half[0].
121#define half    D13.F32
122
123@ Structure offsets (in bytes) for the FFTSpec fields addressed through pFFTSpec
124        .set    ARMsFFTSpec_N, 0                  @// loaded into N by FFTSTAGE
125        .set    ARMsFFTSpec_pBitRev, 4            @// not referenced elsewhere in this file
126        .set    ARMsFFTSpec_pTwiddle, 8           @// loaded into pTwiddle by FFTSTAGE
127        .set    ARMsFFTSpec_pBuf, 12              @// loaded into pOut by FFTSTAGE
128
129        .macro FFTSTAGE scaled, inverse, name
130        @// Pre-twiddle stage body. Only the name argument is used (as a label suffix); scaled and inverse are never referenced.
131        @// Read the FFT size N from the spec structure (no log is computed here)
132        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
133        @// Inputs on entry: pSrc (r0) = source data, pFFTSpec (r2) = spec structure pointer
134        @// Read other structure parameters
135        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
136        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
137        @// pFFTSpec is not read again; its register (r2) is reused as pOut1 below
138        VMOV    half, 0.5
139        @// half[0] is the 1/2 factor applied to the sums and differences below
140
141        MOV     size,N,ASR #1                 @// size = N/2 (N itself is left unchanged)
142        MOV     step,N,LSL #2                 @// step = N/2 * 8 bytes
143
144
145        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
146        @// Note: W^(k) is stored as negated value and also need to
147        @// conjugate the values from the table
148
149        @// Z(0) : no need of twiddle multiply
150        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
151
152        VLD1    dX0,[pSrc],step               @// load 8 bytes at pSrc, then pSrc += step
153        ADD     pOut1,pOut,step               @// pOut1 = pOut+ N/2*8 bytes
154
155        VLD1    dX1,[pSrc]!                   @// load 8 bytes at pSrc, then pSrc += 8
156        @// twStep = 3N/8 * 8 bytes pointing to W^1
157        SUB     twStep,step,size,LSL #1
158
159        MOV     step1,size,LSL #2             @// step1 = N/4 * 8 = N/2*4 bytes
160        SUB     step1,step1,#8                @// (N/4-1)*8 bytes
161
162        VADD    dY0,dX0,dX1                   @// [b+d | a+c]
163        VSUB    dY1,dX0,dX1                   @// [b-d | a-c]
164        VMUL    dY0, dY0, half[0]             @// scale the sum by 1/2
165        VMUL    dY1, dY1, half[0]             @// scale the difference by 1/2
166
167        @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
168        VZIP    dY0,dY1
169
170        VSUB   dX0,dY0,dY1
171        SUBS   size,size,#2                   @// size = N/2 - 2; these flags feed the BLT/BEQ below
172        VADD   dX1,dY0,dY1
173
174        SUB     pSrc,pSrc,step                @// rewind the step post-increment: pSrc is now entry value + 8
175
176        VST1    dX0[0],[pOut1]!               @// store lane 0 of dX0, pOut1 advances past it
177        ADD     pTwiddleTmp,pTwiddle,#8       @// W^2
178        VST1    dX1[1],[pOut1]!               @// store lane 1 of dX1, pOut1 advances past it
179        ADD     argTwiddle1,pTwiddle,twStep   @// W^1
180        @// No flag-setting instruction has run since the SUBS above.
181        @// N/2 < 2: nothing more to do.  N/2 == 2: only the last element remains.
182        BLT     decrementScale\name
183        BEQ     lastElement\name
184
185
186        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
187        @// Note: W^k is stored as negative values in the table and also
188        @// need to conjugate the values from the table.
189        @//
190        @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
191        @// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
192
193
194        SUB     step,step,#24                 @// step = (N/2-3)*8 bytes
195evenOddButterflyLoop\name :
196
197
198        VLD1    dW0r,[argTwiddle1],step1      @// load, then argTwiddle1 += step1
199        VLD1    dW1r,[argTwiddle1]!           @// load, then argTwiddle1 += 8
200
201        VLD2    {dX0r,dX0i},[pSrc],step       @// de-interleaving load of 16 bytes, then pSrc += step
202        SUB     argTwiddle1,argTwiddle1,step1 @// net effect this iteration: argTwiddle1 += 8
203        VLD2    {dX1r,dX1i},[pSrc]!           @// de-interleaving load of 16 bytes, then pSrc += 16
204
205        SUB     step1,step1,#8                @// (N/4-2)*8 bytes
206        VLD1    dW0i,[pTwiddleTmp],step1      @// load, then pTwiddleTmp += step1
207        VLD1    dW1i,[pTwiddleTmp]!           @// load, then pTwiddleTmp += 8
208        SUB     pSrc,pSrc,step                @// net effect this iteration: pSrc += 16
209
210        SUB     pTwiddleTmp,pTwiddleTmp,step1 @// net effect this iteration: pTwiddleTmp += 8
211        VREV64  dX1r,dX1r                     @// swap the two lanes
212        VREV64  dX1i,dX1i                     @// swap the two lanes
213        SUBS    size,size,#4                  @// sets the flags tested by the BGT at the loop bottom
214
215
216        VSUB    dT2,dX0r,dX1r                 @// a-c
217        VADD    dT3,dX0i,dX1i                 @// b+d
218        VADD    dT0,dX0r,dX1r                 @// a+c
219        VSUB    dT1,dX0i,dX1i                 @// b-d
220        SUB     step1,step1,#8                @// step1 shrinks by 16 bytes in total per iteration
221
222        VMUL    dT2, dT2, half[0]
223        VMUL    dT3, dT3, half[0]
224
225        VMUL    dT0, dT0, half[0]
226        VMUL    dT1, dT1, half[0]
227
228        VZIP    dW1r,dW1i
229        VZIP    dW0r,dW0i
230
231        @// dX1r = dW1r*dT2 - dW1i*dT3 ; dX1i = dW1r*dT3 + dW1i*dT2
232        VMUL   dX1r,dW1r,dT2
233        VMUL   dX1i,dW1r,dT3
234        VMUL   dX0r,dW0r,dT2
235        VMUL   dX0i,dW0r,dT3
236
237        VMLS   dX1r,dW1i,dT3
238        VMLA   dX1i,dW1i,dT2
239        @// dX0r = dW0r*dT2 + dW0i*dT3 ; dX0i = dW0r*dT3 - dW0i*dT2
240        VMLA   dX0r,dW0i,dT3
241        VMLS   dX0i,dW0i,dT2
242
243        @// dY0r..dY1i reuse D4..D7, overwriting the twiddle values consumed above
244        VADD    dY1r,dT0,dX1i                 @// F(N/2 -1)
245        VSUB    dY1i,dX1r,dT1
246
247        VREV64  dY1r,dY1r
248        VREV64  dY1i,dY1i
249
250
251        VADD    dY0r,dT0,dX0i                 @// F(1)
252        VSUB    dY0i,dT1,dX0r
253
254
255        VST2    {dY0r,dY0i},[pOut1],step      @// interleaving store of 16 bytes, then pOut1 += step
256        VST2    {dY1r,dY1i},[pOut1]!          @// interleaving store of 16 bytes, then pOut1 += 16
257        SUB     pOut1,pOut1,step              @// net effect this iteration: pOut1 += 16
258        SUB     step,step,#32                 @// (N/2-4)*8 bytes
259
260        @// Loop while size > 0: flags come from the SUBS above, nothing in between alters them
261        BGT     evenOddButterflyLoop\name
262
263
264        @// set both the ptrs to the last element
265        SUB     pSrc,pSrc,#8
266        SUB     pOut1,pOut1,#8
267
268        @// Last element can be expanded as follows
269        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
270        @// -ve)
271        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
272        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
273        @// (a+bc, -bd)
274        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
275
276lastElement\name :
277        VLD1    dX0r,[pSrc]                   @// load the last element (a, b)
278
279        VST1    dX0r[0],[pOut1]!              @// lane 0 is stored unchanged
280        VNEG    dX0r,dX0r
281        VST1    dX0r[1],[pOut1]               @// lane 1 is stored negated
282
283
284
285decrementScale\name :
286        @// Common exit label: the macro body ends here and execution continues after the expansion
287        .endm
288
289        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe,r4
290        @// NOTE(review): M_START/M_END are not defined in this file (presumably armCOMM_s.h); the meaning of r4 should be confirmed there.
291            FFTSTAGE "FALSE","TRUE",Inv
292        M_END
293        @// FFTSTAGE ignores its first two arguments; the third (Inv) only suffixes the labels it defines.
294        .end
295