1//
2//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3//
4//  Use of this source code is governed by a BSD-style license
5//  that can be found in the LICENSE file in the root of the source
6//  tree. An additional intellectual property rights grant can be found
7//  in the file PATENTS.  All contributing project authors may
8//  be found in the AUTHORS file in the root of the source tree.
9//
10//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
11//  to support float instead of SC32.
12//
13
14// Description:
15// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
16// complex signal.  This handles the general stage, not the first or last
17// stage.
18//
19//
20
21
22// Include standard headers
23
24#include "dl/api/arm/arm64COMM_s.h"
25#include "dl/api/arm/omxtypes_s.h"
26
27
28// Import symbols required from other files
29// (For example tables)
30
31
32
33// Set debugging level
34//DEBUG_ON    SETL {TRUE}
35
36
37
38// Guarding implementation by the processor name
39
40
41
42
43// Guarding implementation by the processor name
44
45//Input Registers
46
47#define pSrc            x0
48#define pDst            x1
49#define pTwiddle        x2
50#define	pSubFFTNum	x3
51#define pSubFFTSize	x4
52
53
54//Output Registers
55
56
57//Local Scratch Registers
58
59#define subFFTNum       x5
60#define subFFTSize      x6
61#define outPointStep    x8
62#define pointStep       x9
63#define pointStep32     w9
64#define grpCount        x10
65#define grpCount32      w10
66#define setCount        x13
67#define step            x15
68#define dstStep         x11
69
70// Neon Registers
71
72#define dW      v0.2s
73#define dX0     v2.2s
74#define dX1     v3.2s
75#define dX2     v4.2s
76#define dX3     v5.2s
77#define dY0     v6.2s
78#define dY1     v7.2s
79#define dY2     v8.2s
80#define dY3     v9.2s
81#define qT0     v10.2s
82#define qT1     v11.2s
83
84        .macro FFTSTAGE scaled, inverse, name
85
86        // Define stack arguments
87
88        // Move args values into our work registers
89        ldr     subFFTNum, [pSubFFTNum]
90        ldr     subFFTSize, [pSubFFTSize]
91
92        // Update grpCount and grpSize rightaway inorder to reuse pGrpCount
93        // and pGrpSize regs
94
95        LSR     subFFTNum,subFFTNum,#1                 //grpSize
96        LSL     grpCount,subFFTSize,#1
97
98
99        // pT0+1 increments pT0 by 8 bytes
100        // pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
101        lsl     pointStep, subFFTNum, #2
102
103        // update subFFTSize for the next stage
104        MOV     subFFTSize,grpCount
105
106        // pOut0+1 increments pOut0 by 8 bytes
107        // pOut0+outPointStep == increment of 8*outPointStep bytes =
108        //    4*size bytes
109        smull   outPointStep, grpCount32, pointStep32
110
111        LSL     pointStep,pointStep,#1
112
113
114        rsb      step,pointStep,#16
115        rsb      dstStep,outPointStep,#16
116
117        // Loop on the groups
118
119radix2GrpLoop\name :
120        lsr     setCount, pointStep, #3
121        LD1     {dW},[pTwiddle],pointStep              //[wi | wr]
122
123
124        // Loop on the sets
125
126
127radix2SetLoop\name :
128
129
130        // point0: dX0-real part dX1-img part
131        LD2    {dX0,dX1},[pSrc],pointStep
132        // point1: dX2-real part dX3-img part
133        LD2    {dX2,dX3},[pSrc],step
134
135        SUBS    setCount,setCount,#2
136
137        .ifeqs  "\inverse", "TRUE"
138            fmul   qT0,dX2,dW[0]
139            fmla   qT0,dX3,dW[1]                       // real part
140            fmul   qT1,dX3,dW[0]
141            fmls   qT1,dX2,dW[1]                       // imag part
142
143        .else
144
145            fmul   qT0,dX2,dW[0]
146            fmls   qT0,dX3,dW[1]                       // real part
147            fmul   qT1,dX3,dW[0]
148            fmla   qT1,dX2,dW[1]                       // imag part
149
150        .endif
151
152        fsub    dY0,dX0,qT0
153        fsub    dY1,dX1,qT1
154        fadd    dY2,dX0,qT0
155        fadd    dY3,dX1,qT1
156
157        st2    {dY0,dY1},[pDst],outPointStep
158        // dstStep = -outPointStep + 16
159        st2    {dY2,dY3},[pDst],dstStep
160
161        BGT     radix2SetLoop\name
162
163        SUBS    grpCount,grpCount,#2
164        ADD     pSrc,pSrc,pointStep
165        BGT     radix2GrpLoop\name
166
167
168        str     subFFTNum, [pSubFFTNum]
169        str     subFFTSize, [pSubFFTSize]
170        .endm
171
172
173
174        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
175        FFTSTAGE "FALSE","FALSE",FWD
176        M_END
177
178
179
180        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
181        FFTSTAGE "FALSE","TRUE",INV
182        M_END
183
184
185        .end
186