1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This file was originally licensed as follows. It has been
11@//  relicensed with permission from the copyright holders.
12@//
13
14@//
15@// File Name:  omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
16@// OpenMAX DL: v1.0.2
17@// Last Modified Revision:   7810
18@// Last Modified Date:       Thu, 04 Oct 2007
19@//
20@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@//
22@//
23@//
24@// Description:
25@// Compute FFT for a real signal
26@//
27
28
29
30@// Include standard headers
31
32#include "dl/api/arm/armCOMM_s.h"
33#include "dl/api/arm/omxtypes_s.h"
34
35
36@// Import symbols required from other files
37@// (For example tables)
38
39        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
40        .extern  armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
41        .extern  armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
42        .extern  armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
43        .extern  armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
44        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
45        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe
46        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
47        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
48        .extern  armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
49
50@// Set debugging level
51@//DEBUG_ON    SETL {TRUE}
52
53
54
55@// Guarding implementation by the processor name
56
57
58
59    @// Guarding implementation by the processor name
60
61@// Import symbols required from other files
62@// (For example tables)
63        .extern  armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
64        .extern  armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
65        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
66        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
67
68
69@// Input Registers
70@// (r0..r3 are each reused under another alias further down: result, argTwiddle, argDst/diffMinusOne, step)
71#define pSrc            r0
72#define pDst            r1
73#define pFFTSpec        r2
74#define scale           r3
75
76
77@// Output registers
78#define result          r0
79
80@// Local Scratch Registers
81@// Several aliases name the same physical register (r4: argScale/tmpOrder/pTwiddle/x0r/step1, r5: pOut/x0i/pTwiddleTmp, r6: subFFTNum/N/subFFTSizeTmp, r8: count/twStep, r9: diff/zero); writing one alias overwrites the others.
82#define argTwiddle      r1
83#define argDst          r2
84#define argScale        r4
85#define tmpOrder        r4
86#define pTwiddle        r4
87#define pOut            r5
88#define subFFTSize      r7
89#define subFFTNum       r6
90#define N               r6
91#define order           r14
92#define diff            r9
93@// Total number of radix stages required to complete the FFT
94#define count           r8
95#define x0r             r4
96#define x0i             r5
97#define diffMinusOne    r2
98#define subFFTSizeTmp   r6
99#define step            r3
100#define step1           r4
101#define twStep          r8
102#define zero            r9
103#define pTwiddleTmp     r5
104#define t0              r10
105
106@// Neon registers
107@// Aliases overlap here too (d2: dZero/dX0r, d3: dShift/dX0i/dX1, d4: dX1r/dW0, d5: dX1i/dW1, d14..d17: dW*/dY*); q5 is d10:d11 and q6 is d12:d13, so qT0/qT1 overlap dY0..dY3.
108#define dX0       d0.s32
109#define dzero     d1.s32
110#define dZero     d2.s32
111#define dShift    d3.s32
112#define dX0r      d2.s32
113#define dX0i      d3.s32
114#define dX1r      d4.s32
115#define dX1i      d5.s32
116#define dT0       d6.s32
117#define dT1       d7.s32
118#define dT2       d8.s32
119#define dT3       d9.s32
120#define qT0       q5.s64
121#define qT1       q6.s64
122#define dW0r      d14.s32
123#define dW0i      d15.s32
124#define dW1r      d16.s32
125#define dW1i      d17.s32
126#define dY0r      d14.s32
127#define dY0i      d15.s32
128#define dY1r      d16.s32
129#define dY1i      d17.s32
130#define dY0rS64   d14.s64
131#define dY0iS64   d15.s64
132#define qT2       q9.s64
133#define qT3       q10.s64
134@// Aliases for the last three elements (dX1, dW0, dW1, dY0..dY3 appear to be unreferenced by the code in this file)
135#define dX1       d3.s32
136#define dW0       d4.s32
137#define dW1       d5.s32
138#define dY0       d10.s32
139#define dY1       d11.s32
140#define dY2       d12.s32
141#define dY3       d13.s32
142
143    @// Allocate stack memory required by the function
144    @// (one slot, diffOnStack: it carries the pending scaling amount across the BL calls to the stage routines)
145        M_ALLOC4        diffOnStack, 4
146    @// omxSP_FFTFwd_RToCCS_S32_Sfs: compute the FFT of a real signal. In: r0 = pSrc, r1 = pDst, r2 = pFFTSpec, r3 = scale. Out: r0 = OMX_Sts_NoErr.
147    @// Write function header
148        M_START     omxSP_FFTFwd_RToCCS_S32_Sfs,r11,d15
149    @// NOTE(review): M_START/M_END are macros defined outside this file; r11,d15 presumably name the highest core/NEON registers to preserve - confirm against the macro definition
150@ Structure offsets for the FFTSpec
151        .set    ARMsFFTSpec_N, 0
152        .set    ARMsFFTSpec_pBitRev, 4
153        .set    ARMsFFTSpec_pTwiddle, 8
154        .set    ARMsFFTSpec_pBuf, 12
155
156        @// Define stack arguments
157
158        @// Read the size from structure (its log2 is computed at sizeGreaterThanOne)
159        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
160
161        @// Read other structure parameters
162        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
163        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
164
165        @//  N=1 Treat separately
166        CMP     N,#1
167        BGT     sizeGreaterThanOne
168        VLD1    dX0[0],[pSrc]                         @// load the single 32-bit input sample
169        RSB     scale,scale,#0                        @// to use VRSHL for right shift by a variable
170        MOV     zero,#0
171        VMOV    dShift[0],scale
172        VMOV    dzero[0],zero
173        VRSHL   dX0,dShift                            @// rounding shift by -scale, i.e. a rounding right shift by scale
174        VMOV    dZero[0],zero
175        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]     @// write three words {x, 0, 0} to pDst
176
177        B       End
178
179
180
181sizeGreaterThanOne:
182        @// Do a N/2 point complex FFT including the scaling
183
184        MOV     N,N,ASR #1                          @// N/2 point complex FFT
185
186        CLZ     order,N                             @// N = 2^order
187        RSB     order,order,#31                     @// order = 31 - clz(N)
188        MOV     subFFTSize,#1
189        @//MOV     subFFTNum,N
190
191        CMP     order,#3
192        BGT     orderGreaterthan3                   @// order > 3
193
194        CMP     order,#1
195        BGE     orderGreaterthan0                   @// order > 0
196        M_STR   scale, diffOnStack,LT               @// order = 0: the whole of scale is left for FFTEnd to apply
197        VLD1    dX0,[pSrc]                          @// copy the single complex point (two words) from pSrc ...
198        VST1    dX0,[pOut]                          @// ... into pOut
199        MOV     pSrc,pOut
200        MOV     argDst,pDst
201        BLT     FFTEnd                              @// flags are still those of CMP order,#1 (assumes M_STR does not alter them)
202
203orderGreaterthan0:
204        @// set the buffers appropriately for various orders
205        CMP     order,#2
206        MOVEQ   argDst,pDst                         @// argDst is r2: pFFTSpec is overwritten from here on
207        MOVNE   argDst,pOut
208        MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
209        MOV     argTwiddle,pTwiddle                 @// argTwiddle is r1: pDst is overwritten, so it must be consumed above
210
211        SUBS     diff,scale,order                   @// diff = scale - order = scaling the stages cannot absorb
212        M_STR   diff,diffOnStack
213        MOVGT   scale,order                         @// uses the flags of the SUBS above (assumes M_STR does not alter them)
214        @// Now scale <= order
215
216        CMP     order,#1
217        BGT     orderGreaterthan1
218        SUBS    scale,scale,#1                      @// EQ: scale was 1 -> scaled stage, LT: scale was 0 -> unscaled stage
219        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
220        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe      @// order = 1; NOTE(review): if the BL above ran, this test sees the flags it returned with - confirm
221        B       FFTEnd
222
223orderGreaterthan1:
224        CMP     order,#2
225        MOV     argScale,scale                      @// argScale is r4: pTwiddle is overwritten (already copied to argTwiddle)
226        BGT     orderGreaterthan2                   @// flags from CMP order,#2 (MOV does not set flags)
227        SUBS    argScale,argScale,#1                @// one unit of scale is consumed per stage; a scaled stage is used while units remain
228        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe      @// order =2
229        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
230        SUBS    argScale,argScale,#1                @// presumably the stage routines preserve r4 - confirm against their definitions
231        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
232        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
233        B       FFTEnd
234
235orderGreaterthan2:@// order =3: first stage, one middle stage, last stage
236        SUBS    argScale,argScale,#1
237        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
238        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
239        SUBS    argScale,argScale,#1
240        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
241        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
242        SUBS    argScale,argScale,#1
243        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
244        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
245        B       FFTEnd
246
247
248
249orderGreaterthan3:
250        @// check scale = 0 or scale = order
251        SUBS    diff, scale, order                 @// scale > order
252        MOVGT   scale,order
253        BGE     specialScaleCase                   @// taken when scale >= order (scale has been clamped to order above)
254        CMP     scale,#0
255        BEQ     specialScaleCase
256        B       generalScaleCase
257
258specialScaleCase:@//  scale = 0 or scale = order  and order >= 2 (in this file it is only reached from orderGreaterthan3, i.e. order > 3)
259
260        TST     order, #2                           @// Set input args to fft stages
261        MOVEQ   argDst,pDst
262        MOVNE   argDst,pOut
263        MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
264        MOV     argTwiddle,pTwiddle
265
266        CMP      diff,#0
267        M_STR    diff, diffOnStack
268        BGE      scaleEqualsOrder                   @// flags from CMP diff,#0 (assumes M_STR does not alter them)
269
270        @//check for even or odd order
271        @// NOTE: The following combination of BL's would work fine even though the first
272        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
273        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
274
275        TST     order,#0x00000001                   @// EQ: even order -> radix-4 first stage, NE: odd order -> radix-8 first stage
276        BLEQ    armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
277        BLNE    armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
278
279        CMP        subFFTNum,#4                     @// subFFTNum (r6) is presumably updated by each stage routine - not visible in this file
280        BLT     FFTEnd
281
282
283unscaledRadix4Loop:
284        BEQ        lastStageUnscaledRadix4          @// tests the CMP subFFTNum,#4 done just before entering or re-entering the loop
285         BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
286         CMP        subFFTNum,#4
287         B        unscaledRadix4Loop
288
289lastStageUnscaledRadix4:
290        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
291        B        FFTEnd
292
293
294scaleEqualsOrder:
295        @//check for even or odd order
296        @// NOTE: The following combination of BL's would work fine even though the first
297        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
298        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
299        @// NOTE(review): the routine called below is the Sfs variant - confirm the same flag property holds for it
300        TST     order,#0x00000001
301        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
302        BLNE    armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe
303
304        CMP        subFFTNum,#4
305        BLT     FFTEnd
306
307
308scaledRadix4Loop:
309        BEQ        lastStageScaledRadix4            @// tests the CMP subFFTNum,#4 done just before entering or re-entering the loop
310         BL        armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
311         CMP        subFFTNum,#4
312         B        scaledRadix4Loop
313
314lastStageScaledRadix4:
315        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
316        B        FFTEnd
317
318generalScaleCase:@// 0 < scale < order and order >= 2 (in this file it is only reached from orderGreaterthan3, i.e. order > 3)
319        @// Determine the correct destination buffer
320        SUB     diff,order,scale                 @// here diff = order - scale = number of unscaled radix-2 equivalents left
321        TST     diff,#0x01
322        ADDEQ   count, scale,diff,lsr #1         @// count = scale + (order - scale)/2
323        MOVNE   count, order
324        TST     count, #0x01                     @// Is count even or odd ?
325
326        MOVEQ   argDst,pDst                     @// Set input args to fft stages
327        MOVNE   argDst,pOut
328        MOVNE   pOut,pDst                       @// Pass the first stage destination in RN5
329        MOV     argTwiddle,pTwiddle
330
331        M_STR   diff, diffOnStack
332
333        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
334        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
335        SUBS    argScale,argScale,#1
336
337scaledRadix2Loop:
338        BLGT    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe        @// runs only while scaled stages remain (flags from the preceding SUBS)
339        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
340        BGT     scaledRadix2Loop
341
342
343        M_LDR   diff, diffOnStack
344        @//check for even or odd order
345        TST     diff,#0x00000001                @// even diff -> finish with radix-4 stages, odd diff -> finish with radix-2 stages
346        BEQ     generalUnscaledRadix4Loop
347        B       unscaledRadix2Loop
348
349generalUnscaledRadix4Loop:
350        CMP        subFFTNum,#4
351         BEQ        generalLastStageUnscaledRadix4
352         BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
353         B        generalUnscaledRadix4Loop
354
355generalLastStageUnscaledRadix4:
356        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
357        B        finalComplexToRealFixup        @// all of scale was applied by the scaled stages, so FFTEnd is skipped
358
359
360unscaledRadix2Loop:
361        CMP        subFFTNum,#2
362         BEQ        generalLastStageUnscaledRadix2
363         BL        armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
364         B        unscaledRadix2Loop
365
366generalLastStageUnscaledRadix2:
367        BL      armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
368        B        finalComplexToRealFixup
369
370
371FFTEnd:@// Does only the scaling
372
373        M_LDR   diff, diffOnStack
374        CMP     diff,#0
375        BLE     finalComplexToRealFixup             @// nothing left to scale
376
377        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
378        VDUP    dShift,diff
379
380        @// save subFFTSize and use tmpsubfftsize in the following loop
381        MOV    subFFTSizeTmp,subFFTSize                 @// subFFTSizeTmp same reg as subFFTNum
382
383scaleFFTData:@// N = subFFTSize  ; dataptr = pDst  ; scale = diff
384        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
385        SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
386        VRSHL   dX0,dShift              @// rounding right shift of one complex point (two words) by diff
387        VST1    {dX0},[pSrc]!           @// store in place and advance by 8 bytes
388
389        BGT     scaleFFTData            @// flags from the SUBS above (NEON loads, stores and shifts do not set them)
390
391        SUB     pSrc,pSrc,subFFTSize,LSL #3             @// reset pSrc for final fixup
392
393        @//  change the logic so that output after scaling is in pOut and not in pDst
394        @//  finally store from pOut to pDst
395        @//  change branch "End" to branch "finalComplexToRealFixup" in the above
396        @//  chk the code below for multiplication by j factor
397
398finalComplexToRealFixup:
399
400
401        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
402        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
403        @// 1/2[2a+j0] - j [0+j2b]
404        @// (a+b, 0)
405
406        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
407        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
408        @// 1/2[2a+j0] + j [0+j2b]
409        @// (a-b, 0)
410
411        @// F(0) and F(N/2)
412        VLD2    {dX0r[0],dX0i[0]},[pSrc]!               @// lane 0 of dX0r/dX0i = Z(0).r / Z(0).i, pSrc advances by 8 bytes
413        MOV     zero,#0
414        VMOV    dX0r[1],zero
415        MOV     step,subFFTSize,LSL #3                  @// step = N/2 * 8 bytes
416        VMOV    dX0i[1],zero
417        SUB     twStep,step,subFFTSize,LSL #1           @// twStep = 3N/8 * 8 bytes pointing to W^1
418
419        VADD    dY0r,dX0r,dX0i                          @// F(0) = ((Z0.r+Z0.i) , 0)
420        MOV     step1,subFFTSize,LSL #2                 @// step1 = N/2 * 4 bytes
421        VSUB    dY0i,dX0r,dX0i                            @// F(N/2) = ((Z0.r-Z0.i) , 0)
422        SUBS    subFFTSize,subFFTSize,#2                @// these flags are consumed by BLT/BEQ below; nothing in between sets flags
423
424        VST1    dY0r,[argDst],step                      @// store F(0), then move argDst forward by step
425        ADD     pTwiddleTmp,argTwiddle,#8                @// W^2
426        VST1    dY0i,[argDst]!                          @// store F(N/2), argDst advances by 8 bytes
427        ADD     argTwiddle,argTwiddle,twStep             @// W^1
428
429        VDUP    dzero,zero
430        SUB     argDst,argDst,step                      @// argDst now points 8 bytes past F(0)
431
432        BLT     End                                     @// subFFTSize was below 2: only F(0) and F(N/2) exist
433        BEQ     lastElement                             @// subFFTSize was exactly 2: only the middle element remains
434        SUB     step,step,#24
435        SUB     step1,step1,#8                         @// (N/4-1)*8 bytes
436
437        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
438        @// Note: W^k is stored as negative values in the table
439        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) since both of them
440        @// require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
441
442
443evenOddButterflyLoop:
444
445
446        VLD1    dW0r,[argTwiddle],step1                 @// first twiddle pair, then jump step1 bytes ahead
447        VLD1    dW1r,[argTwiddle]!                      @// second twiddle pair, argTwiddle advances by 8 bytes
448
449        VLD2    {dX0r,dX0i},[pSrc],step                 @// two complex points from the front, split into real and imaginary lanes
450        SUB     argTwiddle,argTwiddle,step1
451        VLD2    {dX1r,dX1i},[pSrc]!                     @// two complex points from step bytes further on
452
453
454
455        SUB     step1,step1,#8                          @// (N/4-2)*8 bytes
456        VLD1    dW0i,[pTwiddleTmp],step1
457        VLD1    dW1i,[pTwiddleTmp]!
458        SUB     pSrc,pSrc,step
459
460        SUB     pTwiddleTmp,pTwiddleTmp,step1
461        VREV64  dX1r,dX1r                               @// swap the two lanes so that lane k pairs with lane N/2-k
462        VREV64  dX1i,dX1i
463        SUBS    subFFTSize,subFFTSize,#4                @// loop counter; these flags are tested by the BGT at the bottom of the loop
464
465
466
467        VSUB    dT2,dX0r,dX1r                            @// a-c
468        SUB     step1,step1,#8
469        VADD    dT3,dX0i,dX1i                            @// b+d
470        VADD    dT0,dX0r,dX1r                           @// a+c
471        VSUB    dT1,dX0i,dX1i                            @// b-d
472        VHADD   dT0,dT0,dzero                           @// halving add with zero: (a+c)/2
473        VHADD   dT1,dT1,dzero                           @// halving add with zero: (b-d)/2
474
475        VZIP    dW1r,dW1i                               @// interleave the lanes of each twiddle register pair in place
476        vzip    dW0r,dW0i
477
478
479        VMULL   qT0,dW1r,dT2                            @// qT0 = dW1r*dT2 + dW1i*dT3 (64-bit products)
480        VMLAL   qT0,dW1i,dT3
481        VMULL   qT1,dW1r,dT3                            @// qT1 = dW1r*dT3 - dW1i*dT2
482        VMLSL   qT1,dW1i,dT2
483
484        VMULL   qT2,dW0r,dT2                            @// qT2 = dW0r*dT2 - dW0i*dT3
485        VMLSL   qT2,dW0i,dT3
486        VMULL   qT3,dW0r,dT3                            @// qT3 = dW0r*dT3 + dW0i*dT2
487        VMLAL   qT3,dW0i,dT2
488
489
490        VRSHRN  dX1r,qT0,#32                            @// rounding narrow: keep the upper 32 bits of each 64-bit product sum
491        VRSHRN  dX1i,qT1,#32
492
493        VSUB    dY1r,dT0,dX1i                           @// F(N/2 -1)
494        VADD    dY1i,dT1,dX1r
495        VNEG    dY1i,dY1i
496
497        VREV64  dY1r,dY1r                               @// swap the lanes back before storing the back-half results
498        VREV64  dY1i,dY1i
499
500
501        VRSHRN  dX0r,qT2,#32
502        VRSHRN  dX0i,qT3,#32
503
504
505        VSUB    dY0r,dT0,dX0i                           @// F(1)
506        VADD    dY0i,dT1,dX0r
507
508
509        VST2    {dY0r,dY0i},[argDst],step               @// interleave real/imaginary and store the front pair, then skip step bytes
510        VST2    {dY1r,dY1i},[argDst]!                   @// store the back pair
511        SUB     argDst,argDst,step
512        SUB     step,step,#32                            @// (N/2-4)*8 bytes
513
514
515        BGT     evenOddButterflyLoop                    @// flags from SUBS subFFTSize,#4 above (no flag-setting instruction in between)
516
517        SUB     pSrc,pSrc,#8                @// set both the ptrs to the last element
518        SUB     argDst,argDst,#8
519
520
521
522        @// Last element can be expanded as follows
523        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
524        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
525        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
526        @// (a-bc, -bd)
527        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
528
529lastElement:
530        VLD1    dX0r,[pSrc]                 @// load (a, b)
531
532        VST1    dX0r[0],[argDst]!           @// store a
533        VNEG    dX0r,dX0r
534        VST1    dX0r[1],[argDst]!           @// store -b
535
536
537
538
539
540
541End:
542        @// Set return value
543        MOV     result, #OMX_Sts_NoErr
544
545        @// Write function tail
546        M_END
547
548        .end
549
550