@
@  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@
@  Use of this source code is governed by a BSD-style license
@  that can be found in the LICENSE file in the root of the source
@  tree. An additional intellectual property rights grant can be found
@  in the file PATENTS.  All contributing project authors may
@  be found in the AUTHORS file in the root of the source tree.
@
@ Some code in this file was originally from file
@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
@ follows. It has been relicensed with permission from the copyright holders.
@

@
@ OpenMAX DL: v1.0.2
@ Last Modified Revision:   7485
@ Last Modified Date:       Fri, 21 Sep 2007
@
@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@

@
@ Description:
@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
@ It computes Z(k) = Feven(k) + j*W^(-k)*Fodd(k) for k = 0,1,2,...,N/2-1.
@ Both the "scaled" (by 1/2) and the "unscaled" versions of the above
@ formula are implemented.
@

@ Project headers. This file is fed through the C preprocessor before it is
@ assembled, which is why #include and #define are used below.
#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@ Symbolic names for the ARM core registers used in this file.
@
@ Several names map to the same physical register (for example pFFTSpec,
@ argDst, diffMinusOne and pOut1 are all r2, and twStep and step2 are both
@ r10), so two aliases of one register can never hold live values at the
@ same time. Not every alias listed here is referenced by the code below.

@ Input registers
#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3

@ Output registers
#define result          r0

@ Local scratch registers
#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@ Total number of radix stages needed to complete the FFT.
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3
#define pOut1           r2
#define size            r7
#define step            r8
#define step1           r9
#define step2           r10
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@ Neon registers
@
@ Typed aliases for the D and Q registers. The suffix (.S16 / .S32) selects
@ the lane width the assembler uses for the instruction the alias appears in.
@ As with the core registers, many names share one physical register:
@   D0..D3   input data      (dX0/dX0r, dX1/dX0i, dY0/dX1r, dY1/dX1i)
@   D4..D7   twiddles, later reused for the results (dW0r..dW1i, dY0r..dY1i)
@   D8..D11  intermediate sums and differences (dT0..dT3)
@   Q6..Q9   32-bit products (qT0..qT3)
@ qX1 (Q1) is the D2:D3 pair and qY1 (Q3) is the D6:D7 pair.
#define dX0             D0.S16
#define dX0S32          D0.S32
#define dShift          D1.S16
#define dX1             D1.S16
#define dX1S32          D1.S32
#define dY0             D2.S16
#define dY1             D3.S16
#define dX0r            D0.S16
#define dX0rS32         D0.S32
#define dX0i            D1.S16
#define dX1r            D2.S16
#define dX1i            D3.S16
#define qX1             Q1.S16
#define dW0r            D4.S16
#define dW0i            D5.S16
#define dW1r            D6.S16
#define dW1i            D7.S16
#define dW0rS32         D4.S32
#define dW0iS32         D5.S32
#define dW1rS32         D6.S32
#define dW1iS32         D7.S32
#define dT0             D8.S16
#define dT1             D9.S16
#define dT2             D10.S16
#define dT3             D11.S16
#define qT0             Q6.S32
#define qT1             Q7.S32
#define qT2             Q8.S32
#define qT3             Q9.S32
#define dY0r            D4.S16
#define dY0i            D5.S16
#define dY1r            D6.S16
#define dY1i            D7.S16
#define qY1             Q3.S16
#define dY2             D4.S16
#define dY3             D5.S16
#define dW0             D6.S16
#define dW1             D7.S16
#define dW0Tmp          D10.S16
#define dW1Neg          D11.S16

@ Structure offsets for the FFTSpec
@ Byte offsets of the fields reached through pFFTSpec. The code in this file
@ reads N, pTwiddle and pBuf; pBitRev is defined but not referenced here.
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

@ -------------------------------------------------------------------------
@ FFTSTAGE scaled, inverse, name
@
@ Expands to the body of the pre-twiddle radix-2 stage on 16-bit complex
@ elements (one element = 16-bit real + 16-bit imaginary = 4 bytes).
@
@ Macro arguments:
@   scaled   "TRUE" selects the variant whose final add/subtract is halving
@            and which decrements the scale register on exit. Any other
@            string selects the variant with plain add/subtract.
@   inverse  Accepted but not referenced anywhere in this macro body.
@   name     Suffix appended to every label so that the macro can be
@            expanded more than once in the same file.
@
@ Register inputs:
@   r0 (pSrc)      source elements
@   r2 (pFFTSpec)  spec structure; N, pTwiddle and pBuf are read from it
@   r3 (scale)     only touched when scaled is "TRUE" (decremented by 1)
@
@ Output is written through pOut1, which starts at pBuf + N*2 bytes.
@
@ Registers written: r0, r2, r4-r12 (plus r3 when scaled), D0-D19 and the
@ condition flags. Nothing is saved or restored inside the macro.
@
@ Control flow:
@   N/2 > 4   Z(0), then the vector loop (4 + 4 mirrored elements per pass),
@             then the middle element.
@   N/2 <= 4  Z(0), then at most one pass of the 2 + 2 element code, then
@             the middle element. N/2 == 2 skips straight to the middle
@             element and N/2 < 2 stops after Z(0).
@ -------------------------------------------------------------------------
        .MACRO FFTSTAGE scaled, inverse, name

        @ Read the FFT size N from the spec structure
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @ Read other structure parameters
        @ (pFFTSpec is not needed after these loads; its register is reused
        @ as pOut1 below)
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        MOV     size,N,ASR #1        @ size = N/2; N itself is left unchanged
        MOV     step,N,LSL #1        @ step = N/2 * 4 bytes

        @ Process different FFT sizes with different loops.
        CMP    size,#4
        BLE    smallFFTSize\name

        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        @ Note: W^(k) is stored as negated value and also need to
        @ conjugate the values from the table.

        @ Z(0) : no need of twiddle multiply
        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
        @ Below, (a,b) is F(0) and (c,d) is F(N/2).

        VLD1    dX0S32[0],[pSrc],step
        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes

        VLD1    dX1S32[0],[pSrc]!
        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1

        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
        SUB     step1,step1,#4       @ (N/4-1)*4 bytes

        @ This first add/sub pair is halving in both variants.
        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
        VHSUB    dY1,dX0,dX1         @ [b-d | a-c]
        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]

        @ SUBS sets the flags consumed by the BLT/BEQ further down; none of
        @ the instructions in between modifies them.
        .ifeqs  "\scaled", "TRUE"
            VHSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VHADD   dX1,dY0,dY1
        .else
            VSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VADD   dX1,dY0,dY1
        .endif

        SUB     pSrc,pSrc,step       @ pSrc now points at F(1)
        VST1    dX0[0],[pOut1]!      @ real part of Z(0)
        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
        VST1    dX1[1],[pOut1]!      @ imaginary part of Z(0)
        ADD     argTwiddle1,pTwiddle,twStep            @ W^1

        @ size is at least 5 on this path, so size-2 is positive and neither
        @ branch is taken here; the same pair is live in the small-size path.
        BLT     decrementScale\name
        BEQ     lastElement\name

        @ step becomes the distance from the end of the F(1..4) load to the
        @ mirrored group F(N/2-4..N/2-1): (N/2-4)*4 - 4 = 2N - 20 bytes.
        SUB     step,step,#20
        SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
        SUB     step2, step1, #4

        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
        @ Note: W^k is stored as negative values in the table and also need to
        @ conjugate the values from the table.
        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).

evenOddButterflyLoop\name:
        @ Load 4 elements from the low end and 4 from the mirrored high end,
        @ de-interleaved into real and imaginary vectors.
        VLD2    {dX0r,dX0i},[pSrc],step
        VLD2    {dX1r,dX1i},[pSrc]!
        SUB     pSrc, pSrc, step

        VLD1    dW0r,[argTwiddle1],step1
        VREV64  qX1,qX1              @ reverse the high-end group lane order
        VLD1    dW1r,[argTwiddle1]!
        VHSUB   dT2,dX0r,dX1r                          @ a-c
        SUB     argTwiddle1, argTwiddle1, step1
        SUB     step1,step1,#16

        VLD1    dW0i,[pTwiddleTmp],step2
        VHADD   dT3,dX0i,dX1i                          @ b+d
        VLD1    dW1i,[pTwiddleTmp]!
        VHADD   dT0,dX0r,dX1r                          @ a+c
        VHSUB   dT1,dX0i,dX1i                          @ b-d
        SUB     pTwiddleTmp, pTwiddleTmp, step2
        SUB     step2,step2,#16

        @ Loop counter. The flags set here are consumed by the BGT at the
        @ bottom of the loop; nothing in between modifies them.
        SUBS    size,size,#8

        @ Rearrange the four twiddle vectors into the lane order expected by
        @ the multiplies below.
        VZIP    dW1r,dW1i
        VTRN    dW0r,dW0i
        VZIP    dW1iS32, dW1rS32

        @ 16x16 -> 32-bit complex multiplies of the twiddles with
        @ (a-c) and (b+d).
        VMULL   qT0,dW1i,dT2
        VMLSL   qT0,dW1r,dT3
        VMULL   qT1,dW1i,dT3
        VMLAL   qT1,dW1r,dT2
        VMULL   qT2,dW0r,dT2
        VMLAL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLSL   qT3,dW0i,dT2

        @ Round and narrow the products back to 16 bits (shift right by 15).
        VRSHRN  dX1r,qT0,#15
        VRSHRN  dX1i,qT1,#15
        VRSHRN  dX0r,qT2,#15
        VRSHRN  dX0i,qT3,#15

        .ifeqs  "\scaled", "TRUE"
            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
            VHSUB    dY1i,dX1r,dT1
        .else
            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
            VSUB    dY1i,dX1r,dT1
        .endif

        .ifeqs  "\scaled", "TRUE"
            VHADD    dY0r,dT0,dX0i                     @ F(1)
            VHSUB    dY0i,dT1,dX0r
        .else
            VADD    dY0r,dT0,dX0i                      @ F(1)
            VSUB    dY0i,dT1,dX0r
        .endif

        VREV64  qY1,qY1              @ undo the lane reversal before storing

        @ Store the low group, then the mirrored high group, and move both
        @ ends 16 bytes towards the middle.
        VST2    {dY0r,dY0i},[pOut1],step
        VST2    {dY1r,dY1i},[pOut1]
        ADD     pOut1,pOut1,#16
        SUB     pOut1, pOut1, step
        SUB     step,step,#32        @ the two ends are now 32 bytes closer

        BGT     evenOddButterflyLoop\name

        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
        SUB     pOut1,pOut1,#4
        B       lastElement\name

smallFFTSize\name:
        @ Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        @ Note: W^(k) is stored as negated value and also need to
        @ conjugate the values from the table.

        @ Z(0) : no need of twiddle multiply
        @ Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
        @ Below, (a,b) is F(0) and (c,d) is F(N/2).

        VLD1    dX0S32[0],[pSrc],step
        ADD     pOut1,pOut,step      @ pOut1 = pOut+ N/2*4 bytes

        VLD1    dX1S32[0],[pSrc]!
        SUB     twStep,step,size     @ twStep = 3N/8 * 4 bytes pointing to W^1

        MOV     step1,size,LSL #1    @ step1 = N/4 * 4 = N/2*2 bytes
        SUB     step1,step1,#4       @ (N/4-1)*4 bytes

        @ This first add/sub pair is halving in both variants.
        VHADD    dY0,dX0,dX1         @ [b+d | a+c]
        VHSUB    dY1,dX0,dX1         @ [b-d | a-c]
        VTRN    dY0,dY1              @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]

        @ SUBS sets the flags consumed by the BLT/BEQ further down; none of
        @ the instructions in between modifies them.
        .ifeqs  "\scaled", "TRUE"
            VHSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VHADD   dX1,dY0,dY1
        .else
            VSUB   dX0,dY0,dY1
            SUBS    size,size,#2
            VADD   dX1,dY0,dY1
        .endif

        SUB     pSrc,pSrc,step       @ pSrc now points at F(1)
        VST1    dX0[0],[pOut1]!      @ real part of Z(0)
        ADD     pTwiddleTmp,pTwiddle,#4                @ W^2
        VST1    dX1[1],[pOut1]!      @ imaginary part of Z(0)
        ADD     argTwiddle1,pTwiddle,twStep            @ W^1

        BLT     decrementScale\name  @ N/2 < 2: Z(0) was the only output
        BEQ     lastElement\name     @ N/2 == 2: only the middle element is left

        @ Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
        @ Note: W^k is stored as negative values in the table and also need to
        @ conjugate the values from the table.
        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).

        SUB     step,step,#12

        @ There is no branch back to this label: the code below runs once and
        @ then falls through to the middle element.
evenOddButterflyLoopSize4\name:
        VLD1    dW0rS32[0],[argTwiddle1],step1
        VLD1    dW1rS32[0],[argTwiddle1]!

        @ Two elements from the low end ...
        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
        SUB     pSrc,pSrc,#4
        SUB     argTwiddle1,argTwiddle1,step1
        @ ... and two from the mirrored high end.
        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
        VLD2    {dX1r[1],dX1i[1]},[pSrc]!

        SUB     step1,step1,#4                         @ (N/4-2)*4 bytes
        VLD1    dW0iS32[0],[pTwiddleTmp],step1
        VLD1    dW1iS32[0],[pTwiddleTmp]!
        SUB     pSrc,pSrc,step

        SUB     pTwiddleTmp,pTwiddleTmp,step1
        VREV32  dX1r,dX1r            @ swap the two high-end elements
        VREV32  dX1i,dX1i
        SUBS    size,size,#4

        VHSUB   dT2,dX0r,dX1r                          @ a-c
        VHADD   dT3,dX0i,dX1i                          @ b+d
        SUB     step1,step1,#4
        VHADD   dT0,dX0r,dX1r                          @ a+c
        VHSUB   dT1,dX0i,dX1i                          @ b-d

        VTRN    dW1r,dW1i
        VTRN    dW0r,dW0i

        VMULL   qT0,dW1r,dT2
        VMLSL   qT0,dW1i,dT3
        VMULL   qT1,dW1r,dT3
        VMLAL   qT1,dW1i,dT2
        VMULL   qT2,dW0r,dT2
        VMLAL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLSL   qT3,dW0i,dT2

        VRSHRN  dX1r,qT0,#15
        VRSHRN  dX1i,qT1,#15

        .ifeqs  "\scaled", "TRUE"
            VHADD    dY1r,dT0,dX1i                     @ F(N/2 -1)
            VHSUB    dY1i,dX1r,dT1
        .else
            VADD    dY1r,dT0,dX1i                      @ F(N/2 -1)
            VSUB    dY1i,dX1r,dT1
        .endif

        VREV32  dY1r,dY1r            @ undo the element swap before storing
        VREV32  dY1i,dY1i

        VRSHRN  dX0r,qT2,#15
        VRSHRN  dX0i,qT3,#15

        .ifeqs  "\scaled", "TRUE"
            VHADD    dY0r,dT0,dX0i                     @ F(1)
            VHSUB    dY0i,dT1,dX0r
        .else
            VADD    dY0r,dT0,dX0i                      @ F(1)
            VSUB    dY0i,dT1,dX0r
        .endif

        VST2    {dY0r[0],dY0i[0]},[pOut1]!
        VST2    {dY0r[1],dY0i[1]},[pOut1],step
        SUB     pOut1, #4
        VST2    {dY1r[0],dY1i[0]},[pOut1]!
        VST2    {dY1r[1],dY1i[1]},[pOut1]!
        SUB     pOut1,pOut1,step
        SUB     pSrc,pSrc,#4           @ set both the ptrs to the last element
        SUB     pOut1,pOut1,#4

        @ Last element can be expanded as follows
        @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
        @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        @ 1/2[2a+j0] - j (c-jd) [0+j2b]
        @ (a+bc, -bd)
        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement\name:
        VLD1    dX0rS32[0],[pSrc]

        .ifeqs  "\scaled", "TRUE"
            VSHR    dX0r,dX0r,#1
        .endif

        VST1    dX0r[0],[pOut1]!     @ a
        VNEG    dX0r,dX0r
        VST1    dX0r[1],[pOut1]      @ -b

decrementScale\name:
        .ifeqs  "\scaled", "TRUE"
            SUB scale,scale,#1
        .endif

        .endm

@ armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe
@ Unscaled variant: FFTSTAGE is expanded with scaled = "FALSE" and the label
@ suffix Inv, so the final add/subtract is not halving and the scale
@ register is left alone.
@ M_START and M_END are not defined in this file (presumably they come from
@ armCOMM_s.h and emit the entry and exit sequences; the r4 argument is
@ presumably the last core register to save - confirm against that header).
        M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
        FFTSTAGE "FALSE","TRUE",Inv
        M_END

@ armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe
@ Scaled variant: FFTSTAGE is expanded with scaled = "TRUE" and the label
@ suffix InvSfs, so the final add/subtract is halving and the scale register
@ (r3) is decremented by one before the function ends.
@ M_START and M_END are not defined in this file (presumably they come from
@ armCOMM_s.h and emit the entry and exit sequences; the r4 argument is
@ presumably the last core register to save - confirm against that header).
        M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
        FFTSTAGE "TRUE","TRUE",InvSfs
        M_END


        .end
