1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
11@//  to support float instead of SC32.
12@//
13
14@// Description:
15@// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
16@// complex signal.  This handles the general stage, not the first or last
17@// stage.
18@//
19@//
20
21
22@// Include standard headers
23
24#include "dl/api/arm/armCOMM_s.h"
25#include "dl/api/arm/omxtypes_s.h"
26
27
28@// Import symbols required from other files
29@// (For example tables)
30
31
32
33@// Set debugging level
34@//DEBUG_ON    SETL {TRUE}
35
36
37
38@// Guarding implementation by the processor name
39
40
41
42
43@// Guarding implementation by the processor name
44
45
46@// Input Registers
47@// Preprocessor aliases for the core registers that FFTSTAGE reads on entry.
48#define pSrc            r0
49#define pDst            r2
50#define pTwiddle        r1
51#define subFFTNum       r6
52#define subFFTSize      r7
53@// FFTSTAGE rewrites all five of these before it finishes, so they also
54@// carry its results; see the end of the macro for the final values.
55@// Output Registers
56@// (no separate output aliases are declared)
57
58@// Local Scratch Registers
59@// Core registers that FFTSTAGE overwrites as temporaries.
60#define outPointStep    r3
61#define pointStep       r4
62#define grpCount        r5
63#define setCount        r8
64@//const           RN  9
65#define step            r10
66#define dstStep         r11
67#define pTable          r9
68#define pTmp            r9
69@// pTable and pTmp both name r9; only pTmp is referenced in the code below.
70@// Neon Registers
71@// All are 64-bit D registers typed as F32, including qT0 and qT1.
72#define dW      D0.F32
73#define dX0     D2.F32
74#define dX1     D3.F32
75#define dX2     D4.F32
76#define dX3     D5.F32
77#define dY0     D6.F32
78#define dY1     D7.F32
79#define dY2     D8.F32
80#define dY3     D9.F32
81#define qT0     D10.F32
82#define qT1     D11.F32
83
84
85        .macro FFTSTAGE scaled, inverse, name
86        @// Emits one radix-2 butterfly stage that reads through pSrc and
87        @// writes through pDst. Macro arguments:
88        @//   scaled  - not referenced anywhere in this body
89        @//   inverse - TRUE selects the first twiddle multiply below
90        @//   name    - suffix appended to both loop labels
91        @// Update grpCount and grpSize right away in order to reuse the
92        @// pGrpCount and pGrpSize regs
93        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
94        LSL     grpCount,subFFTSize,#1
95        @// grpCount = 2 * incoming subFFTSize; the group loop counts it
96        @// down in steps of 2.
97        @// pointStep = 4*grpSize for now. outPointStep is formed from this
98        @// value before pointStep is doubled to its final 8*grpSize.
99        MOV     pointStep,subFFTNum,LSL #2
100
101        @// update subFFTSize for the next stage
102        MOV     subFFTSize,grpCount
103
104        @// outPointStep = grpCount * 4*grpSize = 4*size bytes, the distance
105        @// between the two stores of a butterfly. SMULBB multiplies the signed
106        @// bottom halfwords only - assumes both fit in 16 bits, TODO confirm.
107        SMULBB  outPointStep,grpCount,pointStep
108        LSL     pointStep,pointStep,#1
109        @// pointStep = 8*grpSize: byte stride used on pSrc and pTwiddle.
110
111        RSB      step,pointStep,#16
112        RSB      dstStep,outPointStep,#16
113        @// step = 16 - pointStep and dstStep = 16 - outPointStep: each one
114        @// steps back over the preceding stride and on by 16 bytes.
115        @// Loop on the groups
116radix2GrpLoop\name :
117        MOV      setCount,pointStep,LSR #3              @// = grpSize
118        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr]
119        @// dW is loaded once per group; pTwiddle then advances by pointStep.
120
121        @// Loop on the sets
122        @// Each pass handles two complex points per input half, which is
123        @// why setCount drops by 2.
124radix2SetLoop\name :
125
126
127        @// point0: dX0-real part dX1-img part
128        VLD2    {dX0,dX1},[pSrc],pointStep
129        @// point1: dX2-real part dX3-img part
130        VLD2    {dX2,dX3},[pSrc],step
131        @// Net effect of the two post-increments: pSrc += 16.
132        SUBS    setCount,setCount,#2
133        @// These flags feed the BGT after the stores; nothing in between sets flags.
134        .ifeqs  "\inverse", "TRUE"
135            VMUL   qT0,dX2,dW[0]
136            VMLA   qT0,dX3,dW[1]                       @// real part: dX2*dW[0] + dX3*dW[1]
137            VMUL   qT1,dX3,dW[0]
138            VMLS   qT1,dX2,dW[1]                       @// imag part: dX3*dW[0] - dX2*dW[1]
139
140        .else
141
142            VMUL   qT0,dX2,dW[0]
143            VMLS   qT0,dX3,dW[1]                       @// real part: dX2*dW[0] - dX3*dW[1]
144            VMUL   qT1,dX3,dW[0]
145            VMLA   qT1,dX2,dW[1]                       @// imag part: dX3*dW[0] + dX2*dW[1]
146
147        .endif
148        @// Butterfly with T = (qT0,qT1): dY0,dY1 = point0 - T and dY2,dY3 = point0 + T.
149        VSUB    dY0,dX0,qT0
150        VSUB    dY1,dX1,qT1
151        VADD    dY2,dX0,qT0
152        VADD    dY3,dX1,qT1
153        @// The difference is stored at pDst, the sum outPointStep bytes later.
154        VST2    {dY0,dY1},[pDst],outPointStep
155        @// dstStep = -outPointStep + 16, so the two stores leave pDst += 16
156        VST2    {dY2,dY3},[pDst],dstStep
157
158        BGT     radix2SetLoop\name
159
160        SUBS    grpCount,grpCount,#2
161        ADD     pSrc,pSrc,pointStep                         @// step over the block just read as point1
162        BGT     radix2GrpLoop\name
163
164
165        @// Reset and Swap pSrc and pDst for the next stage
166        MOV     pTmp,pDst
167        @// new pDst = pSrc - 8*size bytes; new pSrc = old pDst - 4*size bytes
168        SUB     pDst,pSrc,outPointStep,LSL #1
169        SUB     pSrc,pTmp,outPointStep
170
171        @// Reset pTwiddle for the next stage
172        @// pTwiddle -= 4*size bytes
173        SUB     pTwiddle,pTwiddle,outPointStep
174
175
176        .endm
177@// Forward entry point: expands FFTSTAGE with inverse = FALSE and label
178@// suffix FWD. M_START and M_END are presumably the function-framing macros
179@// from armCOMM_s.h; the meaning of the r4 argument is not visible here - TODO confirm.
180        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
181        FFTSTAGE "FALSE","FALSE",FWD
182        M_END
183@// Inverse entry point: expands FFTSTAGE with inverse = TRUE and label
184@// suffix INV. M_START and M_END are presumably the function-framing macros
185@// from armCOMM_s.h; the meaning of the r4 argument is not visible here - TODO confirm.
186        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
187        FFTSTAGE "FALSE","TRUE",INV
188        M_END
189
190
191        .end
192