1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This file was originally licensed as follows. It has been
11@//  relicensed with permission from the copyright holders.
12@//
13
14@//
15@// File Name:  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s
16@// OpenMAX DL: v1.0.2
17@// Last Modified Revision:   7485
18@// Last Modified Date:       Fri, 21 Sep 2007
19@//
20@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@//
22@//
23@//
24@// Description:
25@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
26@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
27@// It implements both "scaled" (by 1/2) and "unscaled" versions of the above formula
28@//
29
30
31@// Include standard headers
32
33#include "dl/api/arm/armCOMM_s.h"
34#include "dl/api/arm/omxtypes_s.h"
35
36
37@// Import symbols required from other files
38@// (For example tables)
39
40
41@// Set debugging level
42@//DEBUG_ON    SETL {TRUE}
43
44
45
46@// Guarding implementation by the processor name
47
48
49
50      @// Guarding implementation by the processor name
51
52
53
54@//Input Registers
55
56#define pSrc            r0
57#define pDst            r1
58#define pFFTSpec        r2
59#define scale           r3
60
61
62@// Output registers
63#define result          r0
64
65@//Local Scratch Registers
66
67#define argTwiddle      r1
68#define argDst          r2
69#define argScale        r4
70#define tmpOrder        r4
71#define pTwiddle        r4
72#define pOut            r5
73#define subFFTSize      r7
74#define subFFTNum       r6
75#define N               r6
76#define order           r14
77#define diff            r9
78#define count           r8                   @// Total num of radix stages required to comple the FFT
79#define x0r             r4
80#define x0i             r5
81#define diffMinusOne    r2
82#define round           r3
83
84#define pOut1           r2
85#define size            r7
86#define step            r8
87#define step1           r9
88#define twStep          r10
89#define pTwiddleTmp     r11
90#define argTwiddle1     r12
91#define zero            r14
92
93@// Neon registers
94
95#define dX0     D0.S32
96#define dShift  D1.S32
97#define dX1     D1.S32
98#define dY0     D2.S32
99#define dY1     D3.S32
100#define dX0r    D0.S32
101#define dX0i    D1.S32
102#define dX1r    D2.S32
103#define dX1i    D3.S32
104#define dW0r    D4.S32
105#define dW0i    D5.S32
106#define dW1r    D6.S32
107#define dW1i    D7.S32
108#define dT0     D8.S32
109#define dT1     D9.S32
110#define dT2     D10.S32
111#define dT3     D11.S32
112#define qT0     Q6.S64
113#define qT1     Q7.S64
114#define qT2     Q8.S64
115#define qT3     Q9.S64
116#define dY0r    D4.S32
117#define dY0i    D5.S32
118#define dY1r    D6.S32
119#define dY1i    D7.S32
120
121#define dY2     D4.S32
122#define dY3     D5.S32
123#define dW0     D6.S32
124#define dW1     D7.S32
125#define dW0Tmp  D10.S32
126#define dW1Neg  D11.S32
127
128
129@ Structure offsets for the FFTSpec
@// Byte offsets of the fields read through pFFTSpec; consecutive fields are
@// 4 bytes apart. The FFTSTAGE macro in this file loads N, pTwiddle and pBuf;
@// it does not use pBitRev.
@// NOTE(review): these must match the structure definition used by the code
@// that builds the spec - confirm against that declaration.
130        .set    ARMsFFTSpec_N, 0
131        .set    ARMsFFTSpec_pBitRev, 4
132        .set    ARMsFFTSpec_pTwiddle, 8
133        .set    ARMsFFTSpec_pBuf, 12
134
135
@// ---------------------------------------------------------------------------
@// FFTSTAGE scaled, inverse, name
@//
@// Emits the body of the inverse real-FFT pre-twiddle pass
@//     Z(k) = [F(k) + F*(N/2-k)] + j*W^(-k) * [F(k) - F*(N/2-k)]
@// (F* = complex conjugate) over 32-bit complex values, 8 bytes per element.
@// The first-level sums and differences always use halving operations
@// (VHADD/VHSUB).
@//
@// Macro arguments:
@//   scaled   "TRUE"  : the final combine also uses halving operations, the
@//                      middle element is shifted right by one, and the
@//                      scale register is decremented by one on exit.
@//            other   : the final combine uses plain VADD/VSUB and scale is
@//                      left untouched.
@//   inverse  accepted for call-site symmetry; not referenced in this body.
@//   name     suffix appended to the three local labels so the macro can be
@//            expanded more than once in the same file.
@//
@// Register inputs : pSrc (r0), pFFTSpec (r2), scale (r3, scaled case only).
@// Memory output   : written through pOut1, which starts at pBuf + 4*N bytes.
@// Registers written: r0, r2, r4-r12, r3 (scaled case), D0-D11, Q6-Q9 and
@//                    the condition flags.
@// ---------------------------------------------------------------------------
136        .macro FFTSTAGE scaled, inverse, name
137
138        @// Read the transform size N from the spec structure
139        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
140
141        @// Read other structure parameters
142        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
143        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
144
145
146
147        MOV     size,N,ASR #1                    @// size = N/2; N itself is left untouched
148        MOV     step,N,LSL #2                    @// step = N/2 * 8 bytes
149
150
151        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
152        @// Note: W^(k) is stored as negated value and also need to conjugate the values from the table
153
154        @// Z(0) : no need of twiddle multiply
155        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
156
           @// Load F(0) = [b | a]; the post-increment by step leaves pSrc at F(N/2).
157        VLD1    dX0,[pSrc],step
           @// pOut1 shares r2 with pFFTSpec, so the spec pointer is dead from here on.
158        ADD     pOut1,pOut,step                  @// pOut1 = pOut+ N/2*8 bytes
159
           @// Load F(N/2) = [d | c]; pSrc ends one element past it.
160        VLD1    dX1,[pSrc]!
161        SUB     twStep,step,size,LSL #1          @// twStep = 3N/8 * 8 bytes pointing to W^1
162
163        MOV     step1,size,LSL #2                @// step1 = N/4 * 8 = N/2*4 bytes
164        SUB     step1,step1,#8                   @// (N/4-1)*8 bytes
165
166        VHADD    dY0,dX0,dX1                     @// [b+d | a+c]
167        VHSUB    dY1,dX0,dX1                     @// [b-d | a-c]
168        VZIP    dY0,dY1                          @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
169
           @// Lane 0 of (dY0 - dY1) is the real part of Z(0) and lane 1 of
           @// (dY0 + dY1) is its imaginary part.
           @// The SUBS in either arm sets the flags tested by BLT/BEQ further down;
           @// no instruction in between updates the flags.
170        .ifeqs  "\scaled", "TRUE"
171            VHSUB   dX0,dY0,dY1
172            SUBS    size,size,#2
173            VHADD   dX1,dY0,dY1
174        .else
175            VSUB   dX0,dY0,dY1
176            SUBS    size,size,#2
177            VADD   dX1,dY0,dY1
178        .endif
179
           @// Rewind pSrc so that it points at F(1).
180        SUB     pSrc,pSrc,step
181
           @// Store Z(0) as two 4-byte lanes: real from dX0[0], imaginary from dX1[1].
182        VST1    dX0[0],[pOut1]!
183        ADD     pTwiddleTmp,pTwiddle,#8                @// W^2
184        VST1    dX1[1],[pOut1]!
185        ADD     argTwiddle1,pTwiddle,twStep            @// W^1
186
187
           @// size = N/2 - 2 here: negative -> nothing else to compute,
           @// zero -> only the middle element is left.
188        BLT     decrementScale\name
189        BEQ     lastElement\name
190
191
192        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
193        @// Note: W^k is stored as negative values in the table and also need to conjugate the values from the table
194        @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) since both of them
195        @// require F(1),F(2) and F(N/2-2),F(N/2-1)
196
197
           @// step becomes the distance from the element after the low pair to the
           @// start of the matching high pair (the first VLD2/VST2 moves 16 bytes
           @// before the post-increment by step is applied).
198        SUB     step,step,#24
199evenOddButterflyLoop\name :
200
201
           @// First twiddle cursor: one D register at the cursor, one step1 bytes
           @// further on; the SUB below leaves the cursor advanced by 8 bytes net.
202        VLD1    dW0r,[argTwiddle1],step1
203        VLD1    dW1r,[argTwiddle1]!
204
           @// De-interleaving loads: real parts go to the ...r register, imaginary
           @// parts to the ...i register. Low pair first, then the mirrored high pair.
205        VLD2    {dX0r,dX0i},[pSrc],step
206        SUB     argTwiddle1,argTwiddle1,step1
207        VLD2    {dX1r,dX1i},[pSrc]!
208
           @// Second twiddle cursor, same pattern with step1 reduced by 8.
209        SUB     step1,step1,#8                          @// (N/4-2)*8 bytes
210        VLD1    dW0i,[pTwiddleTmp],step1
211        VLD1    dW1i,[pTwiddleTmp]!
           @// pSrc returns to the next unread low element.
212        SUB     pSrc,pSrc,step
213
214        SUB     pTwiddleTmp,pTwiddleTmp,step1
           @// Swap the two lanes of the high pair so lane i of dX1 is the mirror
           @// partner of lane i of dX0.
215        VREV64  dX1r,dX1r
216        VREV64  dX1i,dX1i
           @// Four elements consumed; these flags are the ones tested by the BGT
           @// that closes the loop.
217        SUBS    size,size,#4
218
219
220        VHSUB    dT2,dX0r,dX1r                            @// a-c
221        VHADD    dT3,dX0i,dX1i                            @// b+d
222        SUB     step1,step1,#8
223        VHADD    dT0,dX0r,dX1r                           @// a+c
224        VHSUB    dT1,dX0i,dX1i                            @// b-d
225
           @// Regroup the loaded twiddle words into per-lane real (..r) and
           @// imaginary (..i) registers.
226        VZIP    dW1r,dW1i
227        VZIP    dW0r,dW0i
228
229
           @// High-side product: (dT2 + j*dT3) * (dW1r + j*dW1i)
           @//   qT0 = real part, qT1 = imaginary part (64-bit accumulators).
230        VMULL   qT0,dW1r,dT2
231        VMLSL   qT0,dW1i,dT3
232        VMULL   qT1,dW1r,dT3
233        VMLAL   qT1,dW1i,dT2
234
           @// Low-side product: (dT2 + j*dT3) * (dW0r - j*dW0i), i.e. with the
           @// conjugated twiddle. qT2 = real part, qT3 = imaginary part.
235        VMULL   qT2,dW0r,dT2
236        VMLAL   qT2,dW0i,dT3
237        VMULL   qT3,dW0r,dT3
238        VMLSL   qT3,dW0i,dT2
239
240
           @// Rounding shift right by 31 and narrow back to 32 bits.
           @// NOTE(review): this implies Q31 twiddle factors - confirm against the
           @// table generator.
241        VRSHRN  dX1r,qT0,#31
242        VRSHRN  dX1i,qT1,#31
243
           @// dY1r/dY1i reuse D6/D7 (dW1r/dW1i); the high-side twiddles are no
           @// longer needed at this point.
244        .ifeqs  "\scaled", "TRUE"
245            VHADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
246            VHSUB    dY1i,dX1r,dT1
247        .else
248            VADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
249            VSUB    dY1i,dX1r,dT1
250
251        .endif
252
253
           @// Put the high-side results back into ascending memory order.
254        VREV64  dY1r,dY1r
255        VREV64  dY1i,dY1i
256
257
258        VRSHRN  dX0r,qT2,#31
259        VRSHRN  dX0i,qT3,#31
260
           @// dY0r/dY0i reuse D4/D5 (dW0r/dW0i); qT2/qT3 have already been narrowed.
261        .ifeqs  "\scaled", "TRUE"
262            VHADD    dY0r,dT0,dX0i                           @// F(1)
263            VHSUB    dY0i,dT1,dX0r
264        .else
265            VADD    dY0r,dT0,dX0i                           @// F(1)
266            VSUB    dY0i,dT1,dX0r
267        .endif
268
269
           @// Interleaving stores: low pair, jump by step to the mirrored high
           @// pair, then return to the next low output slot.
270        VST2    {dY0r,dY0i},[pOut1],step
271        VST2    {dY1r,dY1i},[pOut1]!
272        SUB     pOut1,pOut1,step
273        SUB     step,step,#32                            @// low/high pairs are 4 elements (32 bytes) closer next pass
274
275
           @// Loop while size (updated by the SUBS above) is still positive.
276        BGT     evenOddButterflyLoop\name
277
278
279        SUB     pSrc,pSrc,#8                @// set both the ptrs to the last element
280        SUB     pOut1,pOut1,#8
281
282        @// Last element can be expanded as follows
283        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as -ve)
284        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
285        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
286        @// (a+bc, -bd)
287        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
288
289lastElement\name :
290        VLD1    dX0r,[pSrc]
291
           @// Scaled variant: arithmetic shift right by one on both lanes.
292        .ifeqs  "\scaled", "TRUE"
293            VSHR    dX0r,dX0r,#1
294        .endif
295
           @// Store a from lane 0, then negate the register and store -b from lane 1.
296        VST1    dX0r[0],[pOut1]!
297        VNEG    dX0r,dX0r
298        VST1    dX0r[1],[pOut1]
299
300
301
302decrementScale\name :
303
           @// Every path through the macro reaches this point, so in the scaled
           @// variant scale is decremented exactly once per expansion.
304        .ifeqs  "\scaled", "TRUE"
305            SUB scale,scale,#1
306        .endif
307
308        .endm
309
@// Entry point for the variant that expands FFTSTAGE with scaled = "FALSE":
@// plain VADD/VSUB in the final combine, scale register left untouched.
@// Labels inside the expansion get the suffix Inv.
@// M_START / M_END come from armCOMM_s.h.
@// NOTE(review): the second M_START argument presumably limits the saved
@// core registers to r4, while FFTSTAGE also writes r5-r12 and D8-D15 -
@// confirm that callers of this "unsafe" routine do not rely on them.
310        M_START armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe,r4
311
312            FFTSTAGE "FALSE","TRUE",Inv
313        M_END
314
@// Entry point for the variant that expands FFTSTAGE with scaled = "TRUE":
@// halving operations in the final combine and scale (r3) decremented by one
@// before returning. Labels inside the expansion get the suffix InvSfs.
@// M_START / M_END come from armCOMM_s.h.
@// NOTE(review): the second M_START argument presumably limits the saved
@// core registers to r4, while FFTSTAGE also writes r5-r12 and D8-D15 -
@// confirm that callers of this "unsafe" routine do not rely on them.
315        M_START armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe,r4
316
317            FFTSTAGE "TRUE","TRUE",InvSfs
318        M_END
319
320
321        .end
322