1@
2@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@
4@ Use of this source code is governed by a BSD-style license
5@ that can be found in the LICENSE file in the root of the source
6@ tree. An additional intellectual property rights grant can be found
7@ in the file PATENTS.  All contributing project authors may
8@ be found in the AUTHORS file in the root of the source tree.
9@
10@ Some code in this file was originally from file
11@ omxSP_FFTFwd_RToCCS_S32_Sfs_s.S which was licensed as follows.
12@ It has been relicensed with permission from the copyright holders.
13@
14
15@
16@ OpenMAX DL: v1.0.2
17@ Last Modified Revision:   7810
18@ Last Modified Date:       Thu, 04 Oct 2007
19@
20@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@
22
23@
24@ Description:
25@ Compute a forward FFT for a real signal, using 16 bit complex FFT routines.
26@
27
28#include "dl/api/arm/armCOMM_s.h"
29#include "dl/api/arm/omxtypes_s.h"
30
31.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
32.extern  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
33.extern  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
34.extern  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
35.extern  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
36.extern  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
37.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
38.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
39.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
40.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
41.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
42.extern  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
43.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
44.extern  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
45.extern  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
46.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
47
48@Input Registers
@ The four names below are the core registers r0-r3.
49#define pSrc            r0
50#define pDst            r1
51#define pFFTSpec        r2
52#define scale           r3
53
54@ Output registers
55#define result          r0
56
57@Local Scratch Registers
@ NOTE: many of these names share a physical register with each other or
@ with the input registers above (argTwiddle/pDst = r1,
@ argDst/pFFTSpec/diffMinusOne = r2, scale/round/step = r3,
@ argScale/pTwiddle/tmpOrder/x0r = r4, pOut/x0i/pTwiddleTmp = r5,
@ subFFTNum/N/subFFTSizeTmp/step1r = r6, count/step2/twStep = r8,
@ diff/step2r/zero = r9, step1/t0 = r10). Writing one alias overwrites
@ every other alias of the same register.
58#define argTwiddle      r1
59#define argDst          r2
60#define argScale        r4
61#define pTwiddle        r4
62#define tmpOrder        r4
63#define pOut            r5
64#define subFFTSize      r7
65#define subFFTNum       r6
66#define N               r6
67#define order           r14
68#define diff            r9
69@ Total num of radix stages to complete the FFT
70#define count           r8
71#define x0r             r4
72#define x0i             r5
73#define diffMinusOne    r2
74#define round           r3
75#define subFFTSizeTmp   r6
76#define step            r3
77#define stepr           r11
78#define step1           r10
79#define step1r          r6
80#define step2           r8
81#define step2r          r9
82#define twStep          r8
83#define zero            r9
84#define pTwiddleTmp     r5
85#define t0              r10
86
87@ Neon registers
@ Each alias carries an element-size suffix (.s16 or .s32). Several aliases
@ name the same D register (dZero/dX0r = d2, dShift/dX0i/dX1 = d3,
@ dX1r/dW0 = d4, dX1i/dW1 = d5, dW0r/dY0r = d14, dW0i/dY0i = d15,
@ dW1r/dY1r = d16, dW1i/dY1i = d17). In the NEON register file q1 is the
@ pair d2/d3, q2 is d4/d5 and q8 is d16/d17, so qShift, qX1 and qY1
@ overlap the D aliases listed for those registers.
88#define dX0             d0.s16
89#define dX0S32          d0.s32
90#define dzero           d1.s16
91#define dZero           d2.s16
92#define dShift          d3.s16
93#define qShift          q1.s16
94#define dX0r            d2.s16
95#define dX0i            d3.s16
96#define dX1r            d4.s16
97#define dX1i            d5.s16
98#define qX1             q2.s16
99#define dX0rS32         d2.s32
100#define dX0iS32         d3.s32
101#define dX1rS32         d4.s32
102#define dX1iS32         d5.s32
103#define dT0             d6.s16
104#define dT1             d7.s16
105#define dT2             d8.s16
106#define dT3             d9.s16
107#define qT0             q5.s32
108#define qT1             q6.s32
109#define qT0s            q5.s16
110#define qT1s            q6.s16
111#define dW0r            d14.s16
112#define dW0i            d15.s16
113#define dW1r            d16.s16
114#define dW1i            d17.s16
115#define dW0rS32         d14.s32
116#define dW0iS32         d15.s32
117#define dW1rS32         d16.s32
118#define dW1iS32         d17.s32
119#define dY0r            d14.s16
120#define dY0i            d15.s16
121#define dY0rS32         d14.s32
122#define dY0iS32         d15.s32
123#define dY1r            d16.s16
124#define dY1i            d17.s16
125#define qY1             q8.s16
126#define dY1rS32         d16.s32
127#define dY1iS32         d17.s32
@ NOTE(review): the two S64-named aliases below are defined with the .s32
@ suffix, and no use of them appears in the code visible in this file.
128#define dY0rS64         d14.s32
129#define dY0iS64         d15.s32
130#define qT2             q9.s32
131#define qT3             q10.s32
132#define d18s16          d18.s16
133#define d19s16          d19.s16
134#define d20s16          d20.s16
135#define d21s16          d21.s16
136@ last three elements
137#define dX1             d3.s16
138#define dW0             d4.s16
139#define dW1             d5.s16
140#define dY0             d10.s16
141#define dY1             d11.s16
142#define dY2             d12.s16
143#define dY3             d13.s16
144
145        @ Allocate stack memory required by the function
        @ diffOnStack is a 4-byte slot. The paths below store in it either
        @ scale (order 0), scale - order (orders 1..3 and the special scale
        @ case) or order - scale (general scale case).
146        M_ALLOC4        diffOnStack, 4
147
        @ ------------------------------------------------------------------
        @ omxSP_FFTFwd_RToCCS_S16_Sfs
        @
        @ Forward FFT of a real signal, computed as an N/2-point complex
        @ 16-bit FFT followed by a complex-to-real fix-up pass.
        @
        @ In:   r0 = pSrc      source data
        @       r1 = pDst      destination buffer
        @       r2 = pFFTSpec  spec structure; N, pTwiddle and pBuf are read
        @                      from it at the .set offsets below
        @       r3 = scale     requested scale factor (number of halvings)
        @ Out:  r0 = OMX_Sts_NoErr
        @
        @ M_ALLOC4, M_START, M_STR, M_LDR and M_END are macros that are not
        @ defined in this file (presumably they come from armCOMM_s.h; the
        @ r11,d15 arguments presumably select the registers saved by the
        @ prologue - confirm there).
        @ ------------------------------------------------------------------
148        @ Write function header
149        M_START     omxSP_FFTFwd_RToCCS_S16_Sfs,r11,d15
150
151        @ Structure offsets for the FFTSpec
152        .set    ARMsFFTSpec_N, 0
153        .set    ARMsFFTSpec_pBitRev, 4
154        .set    ARMsFFTSpec_pTwiddle, 8
155        .set    ARMsFFTSpec_pBuf, 12
156
157        @ Define stack arguments
158
159        @ Read the size from structure and take log
160        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
161
162        @ Read other structure parameters
        @ pFFTSpec (r2) is not read again after these loads; r2 is reused as
        @ argDst on every path below.
163        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
164        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
165
166        @ Do a N/2 point complex FFT including the scaling
167
168        MOV     N,N,ASR #1                    @ N/2 point complex FFT
169
        @ order = 31 - clz(N/2), i.e. log2 of the complex FFT length when
        @ that length is a power of two.
170        CLZ     order,N                       @ N = 2^order
171        RSB     order,order,#31
        @ subFFTSize (r7) starts at 1; it is not written again in this file
        @ before FFTEnd, so the stage routines presumably update it to the
        @ complex FFT length - confirm in the stage sources.
172        MOV     subFFTSize,#1
173
174        CMP     order,#3
175        BGT     orderGreaterthan3             @ order > 3
176
177        CMP     order,#1
178        BGE     orderGreaterthan0             @ order > 0
        @ order == 0 (flags still LT from the CMP above): no FFT stage runs.
        @ The whole scale is recorded as the pending shift for FFTEnd, one
        @ 32-bit word is copied from pSrc to the work buffer, and the work
        @ buffer becomes the source of the fix-up with pDst as destination.
179        M_STR   scale, diffOnStack,LT         @ order = 0
180        LDR     x0r,[pSrc]
181        STR     x0r,[pOut]
182        MOV     pSrc,pOut
183        MOV     argDst,pDst
184        B       FFTEnd
185
186orderGreaterthan0:
        @ Orders 1..3: the complex FFT is built from explicitly called
        @ radix-2 stages (fs, then ps for order 3, then ls for orders 2, 3).
        @ Each stage uses the scaled (Sfs) routine while scale budget is
        @ left and the unscaled routine otherwise.
187        @ set the buffers appropriately for various orders
        @ order == 2: argDst = pDst and pOut keeps the work buffer;
        @ otherwise:  argDst = work buffer and pOut = pDst.
188        CMP     order,#2
189        MOVEQ   argDst,pDst
190        MOVNE   argDst,pOut
191        MOVNE   pOut,pDst                  @ Pass 1st stage destination in RN5
        @ argTwiddle is r1, the same register as pDst, so this write has to
        @ come after the last read of pDst above.
192        MOV     argTwiddle,pTwiddle
193
        @ diff = scale - order is saved for FFTEnd; a positive value is the
        @ part of the requested scale that the stages cannot absorb.
        @ The flags from SUBS are still those tested by MOVGT, provided
        @ M_STR is a plain store (presumably - confirm the macro).
194        SUBS    diff,scale,order
195        M_STR   diff,diffOnStack
196        MOVGT   scale,order
197        @ Now scale <= order
198
199        CMP     order,#1
200        BGT     orderGreaterthan1
201        @ order = 1:
        @ scale - 1 == 0 selects the scaled first stage, scale - 1 < 0 the
        @ unscaled one.
        @ NOTE(review): the BLLT is evaluated with whatever flags the first
        @ callee returns; confirm the callee cannot return with LT set.
202        SUBS    scale,scale,#1
203        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
204        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
205        B       FFTEnd
206
207orderGreaterthan1:
        @ argScale is r4, which held pTwiddle until it was copied to
        @ argTwiddle above. The remaining scale budget is kept in r4 across
        @ the stage calls (presumably preserved by the callees - confirm).
        @ MOV does not alter the flags, so BGT tests the CMP result.
208        CMP     order,#2
209        MOV     argScale,scale
210        BGT     orderGreaterthan2
211        @ order = 2:
        @ Two stages: first (fs) and last (ls). The budget is decremented
        @ before each stage; a non-negative result selects the scaled
        @ routine.
212        SUBS    argScale,argScale,#1
213        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
214        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
215        SUBS    argScale,argScale,#1
216        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
217        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
218        B       FFTEnd
219
220orderGreaterthan2:   @ order = 3
        @ Three stages: first (fs), second-to-last (ps) and last (ls), with
        @ the same budget test before each one.
221        SUBS    argScale,argScale,#1
222        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
223        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
224        SUBS    argScale,argScale,#1
225        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
226        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
227        SUBS    argScale,argScale,#1
228        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
229        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
230        B       FFTEnd
231
232
233orderGreaterthan3:
        @ Orders above 3. Two families of paths:
        @   specialScaleCase - scale == 0 (every stage unscaled) or
        @                      scale >= order (every stage scaled);
        @   generalScaleCase - 0 < scale < order.
234        @ check scale = 0 or scale = order
235        SUBS    diff, scale, order   @ diff = scale - order
236        MOVGT   scale,order
237        BGE     specialScaleCase     @ scale >= order (now clamped to order)
238        CMP     scale,#0
239        BEQ     specialScaleCase
240        B       generalScaleCase
241
242specialScaleCase:   @ scale = 0, or, scale = order && order > 3
        @ Buffer selection keyed on bit 1 of order:
        @   bit clear: argDst = pDst and pOut keeps the work buffer;
        @   bit set:   argDst = work buffer and pOut = pDst.
243        TST     order, #2            @ Set input args to fft stages
244        MOVEQ   argDst,pDst
245        MOVNE   argDst,pOut
246        MOVNE   pOut,pDst            @ Pass the first stage destination in RN5
        @ argTwiddle is r1 (same register as pDst), hence written last.
247        MOV     argTwiddle,pTwiddle
248
        @ diff = scale - order (computed above) is saved for FFTEnd.
        @ diff >= 0 means all stages are scaled; diff < 0 can only be the
        @ scale == 0 case here, so all stages are unscaled.
249        CMP     diff,#0
250        M_STR   diff, diffOnStack
251        BGE     scaleEqualsOrder
252
253        @ check for even or odd order.
254        @ NOTE: The following combination of BL's would work fine even though
255        @ the first BL would corrupt the flags. This is because the end of the
256        @ "grpZeroSetLoop" loop inside
257        @ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
258
        @ Even order: radix-4 first stage. Odd order: radix-8 first stage.
259        TST     order,#0x00000001
260        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
261        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
262
        @ subFFTNum (r6) is not written in this file between the first
        @ stage call and this test, so it is presumably maintained by the
        @ stage routines (confirm there). Below 4: nothing left to do.
        @ Exactly 4: only the last radix-4 stage remains.
263        CMP     subFFTNum,#4
264        BLT     FFTEnd
265
266unscaledRadix4Loop:
        @ Entered with the flags of a CMP subFFTNum,#4 (from above or from
        @ the bottom of the loop body).
267        BEQ     lastStageUnscaledRadix4
268        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
269        CMP     subFFTNum,#4
270        B       unscaledRadix4Loop
271
272lastStageUnscaledRadix4:
273        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
274        B       FFTEnd
275
276scaleEqualsOrder:
        @ Same stage sequence as the unscaled path above, using the scaled
        @ (Sfs) routines for every stage.
277        @ check for even or odd order
278        @ NOTE: The following combination of BL's would work fine even though
279        @ the first BL would corrupt the flags. This is because the end of the
280        @ "grpZeroSetLoop" loop inside
281        @ armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.
        @ NOTE(review): the comment above names an SC32 routine, but the
        @ code below calls the SC16_Sfs variant; confirm the same flag
        @ guarantee holds for the routine actually called.
282
283        TST     order,#0x00000001
284        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
285        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
286
287        CMP     subFFTNum,#4
288        BLT     FFTEnd
289
290scaledRadix4Loop:
        @ Entered with the flags of a CMP subFFTNum,#4.
291        BEQ     lastStageScaledRadix4
292        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
293        CMP     subFFTNum,#4
294        B       scaledRadix4Loop
295
296lastStageScaledRadix4:
297        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
298        B       FFTEnd
299
300generalScaleCase:                        @ 0 < scale < order and order > 3
        @ Mixed scaling: the first `scale` stages are scaled radix-2 stages,
        @ the remaining order - scale levels are done unscaled, with radix-4
        @ stages when that remainder is even and radix-2 stages when it is
        @ odd.
        @
        @ Both exits of this path branch to finalComplexToRealFixup. They
        @ must not go to End (that would return the complex FFT without the
        @ real-signal fix-up) and must not go to FFTEnd (diffOnStack holds
        @ order - scale here, a positive value that FFTEnd would treat as a
        @ pending right shift).
301        @ Determine the correct destination buffer
        @ count = total number of stages that will run.
302        SUB     diff,order,scale
303        TST     diff,#0x01
304        ADDEQ   count,scale,diff,LSR #1  @ count = scale + (order - scale)/2
305        MOVNE   count,order
306        TST     count,#0x01              @ Is count even or odd ?
307
        @ Even count: argDst = pDst and pOut keeps the work buffer.
        @ Odd count:  argDst = work buffer and pOut = pDst.
308        MOVEQ   argDst,pDst              @ Set input args to fft stages
309        MOVNE   argDst,pOut
310        MOVNE   pOut,pDst                @ Pass 1st stage destination in RN5
        @ argTwiddle is r1 (same register as pDst), hence written last.
311        MOV     argTwiddle,pTwiddle
312
        @ diff = order - scale is saved so that outScale can test its
        @ parity after the stage calls (diff is r9 and is reloaded there).
313        CMP     diff,#1
314        M_STR   diff, diffOnStack
315        BEQ     scaleps                  @ scaling including a radix2_ps stage
316
        @ diff >= 2: scaled first stage, then scale - 1 scaled radix-2
        @ stages, then the unscaled stages selected at outScale.
317        MOV     argScale,scale           @ Put scale in RN4 to save and restore
318        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
319        SUBS    argScale,argScale,#1
320
321scaledRadix2Loop:
322        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
323        SUBS    argScale,argScale,#1     @ save, restore scale in scaled stages
324        BGT     scaledRadix2Loop
325        B       outScale
326
327scaleps:
        @ diff == 1: every stage except the last is scaled. One unit of the
        @ budget is reserved for the scaled ps stage, hence scale - 1.
328        SUB     argScale,scale,#1        @ order>3 and diff=1 => scale >= 3
329        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
330        SUBS    argScale,argScale,#1
331
332scaledRadix2psLoop:
333        BEQ     scaledRadix2psStage
334        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
335        SUBS    argScale,argScale,#1     @ save, restore scale in scaled stages
336        BGE     scaledRadix2psLoop
337
338scaledRadix2psStage:
339        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
340        B       generalLastStageUnscaledRadix2
341
342outScale:
        @ Even remainder: finish with unscaled radix-4 stages.
        @ Odd remainder:  finish with unscaled radix-2 stages.
343        M_LDR   diff, diffOnStack
344        @check for even or odd order
345        TST     diff,#0x00000001
346        BEQ     generalUnscaledRadix4Loop
347        B       unscaledRadix2Loop
348
349generalUnscaledRadix4Loop:
        @ subFFTNum (r6) is presumably maintained by the stage routines;
        @ when it reaches 4 only the last radix-4 stage remains.
350        CMP     subFFTNum,#4
351        BEQ     generalLastStageUnscaledRadix4
352        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
353        B       generalUnscaledRadix4Loop
354
355generalLastStageUnscaledRadix4:
356        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
357        B       finalComplexToRealFixup  @ complex FFT done, no extra shift
358
359unscaledRadix2Loop:
        @ When subFFTNum reaches 4 the last two radix-2 stages (ps, ls)
        @ remain.
360        CMP     subFFTNum,#4
361        BEQ     generalLastTwoStagesUnscaledRadix2
362        BL      armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
363        B       unscaledRadix2Loop
364
365generalLastTwoStagesUnscaledRadix2:
366        BL      armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
367generalLastStageUnscaledRadix2:
368        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
369        B       finalComplexToRealFixup  @ complex FFT done, no extra shift
370
371FFTEnd:     @ Does only the scaling
        @ Applies the part of the requested scale that the FFT stages did
        @ not absorb. diffOnStack holds scale (order 0) or scale - order on
        @ the paths that reach this label; zero or negative means nothing
        @ is left to do.
372        M_LDR   diff, diffOnStack
373        CMP     diff,#0
374        BLE     finalComplexToRealFixup
375
        @ VSHL by a register shifts right when the per-lane count is
        @ negative, so the count is negated and broadcast to every 16-bit
        @ lane of q1 (qShift; dShift is its upper half, d3).
376        RSB     diff,diff,#0               @ for right shift by a variable
377        VDUP    qShift,diff
378
379        @ save subFFTSize and use subFFTSizeTmp in the following loop
380        MOV     subFFTSizeTmp,subFFTSize   @ subFFTSizeTmp same reg as subFFTNum
381
382        @ Use parallel loads for bigger FFT size.
383        CMP     subFFTSizeTmp, #8
384        BLT     scaleLessFFTData
385
386scaleFFTData:
        @ 32 bytes per iteration (q5 and q6), counted as 8 elements of
        @ 4 bytes each. The :256 qualifier asserts a 32-byte aligned
        @ address. NEON instructions leave the flags alone, so BGT tests
        @ the SUBS result.
387        VLD1    {qT0s, qT1s},[pSrc:256]    @ pSrc contains pDst pointer
388        SUBS    subFFTSizeTmp,subFFTSizeTmp,#8
389        VSHL    qT0s,qShift
390        VSHL    qT1s,qShift
391        VST1    {qT0s, qT1s},[pSrc:256]!
392        BGT     scaleFFTData
393        B       afterScaling
394
395scaleLessFFTData:
        @ Fewer than 8 elements: one 32-bit word per iteration. Only
        @ lane 0 of d0 is loaded and stored; the whole register is shifted.
396        VLD1    {dX0S32[0]},[pSrc]         @ pSrc contains pDst pointer
397        SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
398        VSHL    dX0,dShift
399        VST1    {dX0S32[0]},[pSrc]!
400        BGT     scaleLessFFTData
401
402afterScaling:
        @ Both loops advanced pSrc by 4 bytes per element; rewind it to the
        @ start of the data.
403        SUB     pSrc,pSrc,subFFTSize,LSL #2 @ reset pSrc for final fixup
404
        @ NOTE(review): the four lines below read as leftover work notes
        @ rather than a description of the code; whether each item is
        @ still outstanding should be checked against the general scale
        @ path and the fix-up code.
405        @  change the logic so that output after scaling is in pOut and not in pDst
406        @  finally store from pOut to pDst
407        @  change branch "End" to branch "finalComplexToRealFixup" in the above
408        @  chk the code below for multiplication by j factor
409
410finalComplexToRealFixup:
        @ Converts the complex FFT result at pSrc into the spectrum of the
        @ real input, written to argDst. Reads twiddles through argTwiddle
        @ and uses subFFTSize as the complex FFT length; all three are
        @ expected to be left in place by the code that branches here.
411        @ F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
412        @ 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
413        @ 1/2[2a+j0] - j [0+j2b]
414        @ (a+b, 0)
415
416        @ F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
417        @ 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
418        @ 1/2[2a+j0] + j [0+j2b]
419        @ (a-b, 0)
420
        @ Lengths up to and including 4 take the lane-by-lane path.
421        CMP    subFFTSize,#4
422        BLE    smallFFTSize
423
424@ SubSize > 4:
425        @ F(0) and F(N/2)
        @ Lane 0 of d2/d3 receives Z(0).re / Z(0).im; lane 1 is cleared so
        @ the 32-bit stores below write (value, 0).
426        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
427        MOV     zero,#0
428        VMOV    dX0r[1],zero
429        MOV     step,subFFTSize,LSL #2        @ step = N/2 * 4 bytes
430        VMOV    dX0i[1],zero
431        SUB     twStep,step,subFFTSize        @ twStep = 3 * subFFTSize bytes
432
433        VADD    dY0r,dX0r,dX0i                @ F(0) = ((Z0.r+Z0.i) , 0)
434        MOV     step1,subFFTSize,LSL #1       @ step1 = N/2 * 2 bytes
435        VSUB    dY0i,dX0r,dX0i                @ F(N/2) = ((Z0.r-Z0.i) , 0)
        @ Loop counter: elements still to produce. The flags set here are
        @ not consumed on this path (the loop has its own SUBS).
436        SUBS    subFFTSize,subFFTSize,#2
437
        @ F(0) goes to argDst, F(N/2) to argDst + step; argDst is then
        @ brought back to argDst + 4, the slot of F(1).
438        VST1    dY0rS32[0],[argDst], step
439        ADD     pTwiddleTmp,argTwiddle,#4     @ W^2
440        VST1    dY0iS32[0],[argDst]!
441        ADD     argTwiddle,argTwiddle,twStep  @ W^1
442
443        VDUP    dzero,zero
444        SUB     argDst,argDst,step
        @ step / stepr: forward and backward strides between the low block
        @ (4 elements from the front) and the high block (4 elements from
        @ the back) for both pSrc and argDst. Each iteration the blocks
        @ move 16 bytes toward each other, so step shrinks by 32 and stepr
        @ grows by 32.
445        SUB     step,step,#20
446        RSB     stepr, step, #16
        @ step1/step1r and step2/step2r play the same role for the two
        @ twiddle pointers (argTwiddle and pTwiddleTmp), moving by 16.
        @ step1r is r6 (shared with subFFTNum), step2 is r8 (shared with
        @ twStep, already consumed) and step2r is r9 (shared with zero,
        @ already consumed by the VDUP above).
447        SUB     step1,step1,#8                @ step1 = subFFTSize*2 - 8 bytes
448        RSB     step1r,step1,#8
449
450        SUB     step2, step1, #4
451        RSB     step2r, step2, #8
452
453        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
454        @ Note: W^k is stored as negative values in the table.
455        @ Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
456        @ since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1).
457
458evenOddButterflyLoop:
        @ X0 = low block (re in d2, im in d3), X1 = high block (re in d4,
        @ im in d5). VREV64 reverses the 16-bit lanes of X1 so that lane k
        @ of X0 lines up with its mirrored element.
459        VLD2    {dX0r,dX0i},[pSrc],step
460        VLD2    {dX1r,dX1i},[pSrc],stepr
461
462        VLD1    dW0r,[argTwiddle],step1
463        SUB     step1, step1, #16
464        VREV64  qX1,qX1
465
466        VLD1    dW1r,[argTwiddle],step1r
467        ADD     step1r, step1r, #16
468        VSUB    dT2,dX0r,dX1r                 @ a-c
469
470        VLD1    dW0i,[pTwiddleTmp],step2
471        SUB     step2, step2, #16
472        VADD    dT3,dX0i,dX1i                 @ b+d
473
474        VLD1    dW1i,[pTwiddleTmp],step2r
475        ADD     step2r, step2r, #16
476
        @ Rearrange the loaded twiddle words into per-lane factor vectors.
        @ NOTE(review): the exact lane layout depends on the twiddle table
        @ format, which is not visible in this file.
477        VTRN    dW0r,dW0i
478        VZIP    dW1r, dW1i
479
        @ Loop counter update; the NEON instructions and the non-S core
        @ instructions that follow leave the flags untouched until the BGT.
480        SUBS    subFFTSize,subFFTSize,#8
481
482        VHADD   dT0,dX0r,dX1r                 @ (a+c)/2
483        VZIP    dW1iS32, dW1rS32
484        VHSUB   dT1,dX0i,dX1i                 @ (b-d)/2
485
        @ VQDMULH yields the doubled product high half; the halving
        @ add/sub instructions below undo the doubling while combining the
        @ products.
486        VQDMULH dY0,dW1i,dT2
487        VQDMULH dY1,dW1r,dT3
488        VQDMULH dY2,dW1i,dT3
489        VQDMULH dY3,dW1r,dT2
490
491        VQDMULH d18s16,dW0r,dT2
492        VQDMULH d19s16,dW0i,dT3
493        VQDMULH d20s16,dW0r,dT3
494        VQDMULH d21s16,dW0i,dT2
495
        @ dY1r/dY1i (d16/d17) and dY0r/dY0i (d14/d15) reuse the twiddle
        @ registers; every product above has been formed before they are
        @ overwritten.
496        VRHADD  dX1r, dY0, dY1
497        VHSUB   dX1i, dY2, dY3
498        VHSUB   dX0r, d18s16, d19s16
499        VADD    dY1i,dT1,dX1r
500        VRHADD  dX0i, d20s16, d21s16
501        VSUB    dY1r,dT0,dX1i                 @ F(N/2 -1)
502        VSUB    dY0r,dT0,dX0i                 @ F(1)
503        VADD    dY0i,dT1,dX0r
504
        @ The high block gets a negated imaginary part and is put back into
        @ memory order before being stored.
505        VNEG    dY1i,dY1i
506        VREV64  qY1, qY1
507
508        VST2    {dY0r,dY0i},[argDst],step
509        SUB     step,step,#32                 @ blocks are now 32 bytes closer
510        VST2    {dY1r,dY1i},[argDst],stepr
511        ADD     stepr,stepr,#32
512
513        BGT     evenOddButterflyLoop
514
        @ The middle element is rewritten by lastElement; both pointers are
        @ stepped back by one 4-byte element to address it.
515        SUB     pSrc,pSrc,#4                  @ points to the last element.
516        SUB     argDst,argDst,#4              @ points to the last element.
517
518        b lastElement
519
520smallFFTSize:
        @ Fix-up for complex FFT lengths of 4 or less, working one or two
        @ lanes at a time. The first part mirrors the F(0)/F(N/2) code of
        @ the vector path.
521
522        @ F(0) and F(N/2)
523        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
524        MOV     zero,#0
525        VMOV    dX0r[1],zero
526        MOV     step,subFFTSize,LSL #2        @ step = N/2 * 4 bytes
527        VMOV    dX0i[1],zero
528        SUB     twStep,step,subFFTSize        @ twStep = 3 * subFFTSize bytes
529
530        VADD    dY0r,dX0r,dX0i                @ F(0) = ((Z0.r+Z0.i) , 0)
531        MOV     step1,subFFTSize,LSL #1       @ step1 = N/2 * 2 bytes
532        VSUB    dY0i,dX0r,dX0i                @ F(N/2) = ((Z0.r-Z0.i) , 0)
        @ The flags set here select the exit below; the NEON stores and the
        @ ADD/SUB instructions in between do not modify them.
533        SUBS    subFFTSize,subFFTSize,#2
534
535
536        VST1    dY0rS32[0],[argDst], step
537        ADD     pTwiddleTmp,argTwiddle,#4     @ W^2
538        VST1    dY0iS32[0],[argDst]!
539        ADD     argTwiddle,argTwiddle,twStep  @ W^1
540
541        VDUP    dzero,zero
542        SUB     argDst,argDst,step
543
        @ subFFTSize was below 2: only F(0) and F(N/2) exist, finished.
        @ subFFTSize was 2: only the middle element remains.
        @ Otherwise (length 4, assuming a power of two): one butterfly pass.
544        BLT     End
545        BEQ     lastElement
546
547        SUB     step,step,#12
548        SUB     step1,step1,#4                @ step1 = subFFTSize*2 - 4 bytes
549
550        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
551
552butterflyLoopSubFFTSize4:
        @ Despite the label name there is no backward branch to it in this
        @ file; the body executes once.
        @ Twiddles are loaded one 32-bit word at a time into lane 0; the
        @ pointer is stepped forward and then pulled back by step1.
553        VLD1    dW0rS32[0], [argTwiddle],step1
554        VLD1    dW1rS32[0],[argTwiddle]!
555
        @ X0 lanes 0,1 = two consecutive elements starting at pSrc;
        @ X1 lanes 0,1 = two consecutive elements starting one element
        @ before the address reached after the step advance.
556        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
557        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
558        SUB     pSrc,pSrc,#4
559        SUB     argTwiddle,argTwiddle,step1
560        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
561        VLD2    {dX1r[1],dX1i[1]},[pSrc]!
562
563        SUB     step1,step1,#4                @ (N/4-2)*4 bytes
564        VLD1    dW0iS32[0],[pTwiddleTmp],step1
565        VLD1    dW1iS32[0],[pTwiddleTmp]!
566        SUB     pSrc,pSrc,step
567
568        SUB     pTwiddleTmp,pTwiddleTmp,step1
        @ Swap the two 16-bit lanes of X1 so that lane k of X0 pairs with
        @ its mirrored element.
569        VREV32  dX1r,dX1r
570        VREV32  dX1i,dX1i
        @ The flags from this SUBS are not tested anywhere below in this
        @ file.
571        SUBS    subFFTSize,subFFTSize,#4
572
573        VSUB    dT2,dX0r,dX1r                 @ a-c
574        SUB     step1,step1,#4
575        VADD    dT3,dX0i,dX1i                 @ b+d
576        VADD    dT0,dX0r,dX1r                 @ a+c
577        VSUB    dT1,dX0i,dX1i                 @ b-d
        @ Halving add against the all-zero d1 halves T0 and T1.
578        VHADD   dT0,dT0,dzero
579        VHADD   dT1,dT1,dzero
580
581        VTRN    dW1r,dW1i
582        VTRN    dW0r,dW0i
583
        @ 32-bit products accumulated in q5, q6, q9 and q10, then narrowed
        @ back to 16 bits with rounding (VRSHRN by 16).
584        VMULL   qT0,dW1r,dT2
585        VMLAL   qT0,dW1i,dT3
586        VMULL   qT1,dW1r,dT3
587        VMLSL   qT1,dW1i,dT2
588
589        VMULL   qT2,dW0r,dT2
590        VMLSL   qT2,dW0i,dT3
591        VMULL   qT3,dW0r,dT3
592        VMLAL   qT3,dW0i,dT2
593
594        VRSHRN  dX1r,qT0,#16
595        VRSHRN  dX1i,qT1,#16
596
597        VSUB    dY1r,dT0,dX1i                 @ F(N/2 -1)
598        VADD    dY1i,dT1,dX1r
599        VNEG    dY1i,dY1i
600
        @ Put the mirrored pair back into memory order.
601        VREV32  dY1r,dY1r
602        VREV32  dY1i,dY1i
603
604        VRSHRN  dX0r,qT2,#16
605        VRSHRN  dX0i,qT3,#16
606
607        VSUB    dY0r,dT0,dX0i                 @ F(1)
608        VADD    dY0i,dT1,dX0r
609
        @ Stores follow the same addressing pattern as the loads above.
610        VST2    {dY0r[0],dY0i[0]},[argDst]!
611        VST2    {dY0r[1],dY0i[1]},[argDst],step
612        SUB     argDst, #4
613        VST2    {dY1r[0],dY1i[0]},[argDst]!
614        VST2    {dY1r[1],dY1i[1]},[argDst]!
615        SUB     argDst,argDst,step
616        SUB     pSrc,pSrc,#4                  @ points to the last element.
617        SUB     argDst,argDst,#4              @ points to the last element.
618
619lastElement:
        @ Writes the middle output element from the complex value at pSrc.
620        @ Last element can be expanded as follows
621        @ 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
622        @ 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
623        @ 1/2[2a+j0] + j (c+jd) [0+j2b]
624        @ (a-bc, -bd)
625        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)
626
        @ One 32-bit load puts a in 16-bit lane 0 and b in lane 1 of d2.
        @ Lane 0 is stored unchanged, then the register is negated and
        @ lane 1 (now -b) is stored right after it.
627        VLD1    dX0rS32[0],[pSrc]
628        VST1    dX0r[0],[argDst]!
629        VNEG    dX0r,dX0r
630        VST1    dX0r[1],[argDst]!
631
632End:
        @ Common exit: the function always reports success.
633        @ Set return value
634        MOV     result, #OMX_Sts_NoErr
635
636        @ Write function tail
637        M_END
638
639    .end
640