@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute FFT for a real signal
@//
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@//        M_VARIANTS ARM1136JS

@// Import symbols required from other files
@// (For example tables)

        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp

@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name

@//    IF  ARM1136JS

@//Input Registers

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2


@// Output registers
#define result          r0

@//Local Scratch Registers

@// N=1 case
#define scaleMinusOne   r2
#define rnd             r2
#define zero            r8
#define Zero            r9


#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
#define count           r8
#define diffMinusOne    r10
#define round           r3

#define step            r3
#define step1           r6
#define twStep          r12
#define pTwiddleTmp     r14
#define t0              r12
#define t1              r14              /*@// pTwiddleTmp*/
#define t2              r0
#define t3              r1               /*@// pSrc,argTwiddle*/
#define t4              r6
#define t5              r7               /*@// step1,subFFTSize*/

#define x0r     s0
#define x0i     s1
#define y0r     s2
#define y0i     s3
#define x1r     s4
#define x1i     s5
#define w1r     s2
#define w1i     s3
#define w0r     s6
#define w0i     s7
#define y1r     s2  /*@// w1r,w1i*/
#define y1i     s3
#define st0     s8
#define st1     s9
#define st2     s10
#define st3     s11
#define st4     s12
#define st5     s13
#define half    s15




        @// Allocate stack memory required by the function



        @//--------------------------------------------------------------------
        @// omxSP_FFTFwd_RToCCS_F32_Sfs_vfp
        @//
        @// Forward FFT of a real float signal, computed as an N/2 point
        @// complex FFT followed by a complex-to-real fix-up pass.
        @//
        @// In:   r0 (pSrc)     = pointer to the real input samples
        @//       r1 (pDst)     = pointer to the output buffer
        @//       r2 (pFFTSpec) = pointer to the FFT spec structure; the fields
        @//                       read here are N (+0), pTwiddle (+8), pBuf (+12)
        @// Out:  r0 (result)   = OMX_Sts_NoErr on every path
        @//
        @// Notes:
        @//   - N <= 1 is not supported: the function returns OMX_Sts_NoErr
        @//     without writing to pDst.
        @//   - r14 is used as a scratch register (order / pTwiddleTmp).
        @//   - The "unsafe" stage routines are entered with BL and communicate
        @//     through the register aliases defined above.
        @//--------------------------------------------------------------------

        @// Write function header
        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs_vfp,r11

@ Structure offsets for FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @// Define stack arguments

        @// Setup half value (0x3f000000 is the bit pattern of 0.5f)
        movw    N, #0                               @// Use N as a temp.
        movt    N, #0x3f00
        vmov.f32 half, N

        @// Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @// N=1 Treat separately
        CMP     N,#1
        BGT     sizeGreaterThanOne
        @// N<=1 is not supported
        @// Set return value
        MOV     result, #OMX_Sts_NoErr
        B       FunctionEnd

sizeGreaterThanOne:
        @// Do a N/2 point complex FFT including the scaling

        MOV     N,N,ASR #1                          @// N/2 point complex FFT
        CLZ     order,N                             @// N = 2^order
        RSB     order,order,#31
        MOV     subFFTSize,#1
        @//MOV     subFFTNum,N


        CMP     order,#1
        BGT     orderGreaterthan1                   @// order > 1
        @// order < 1 (N/2 == 1): no FFT stage needed, just copy the single
        @// complex value into the scratch buffer and go to the fix-up.
        vldmlt.f32 pSrc, {x0r, x0i}
        vstmlt.f32 pOut, {x0r, x0i}
        MOVLT   pSrc,pOut
        MOVLT   argDst,pDst
        BLT     FFTEnd

        @// order == 1: a single radix-2 first stage
        MOV     argDst,pOut                         @// Set input args to fft stages
        MOV     pOut,pDst                           @// Set input args to fft stages
        MOV     argTwiddle,pTwiddle

        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe_vfp
        B       finalComplexToRealFixup

orderGreaterthan1:

        TST     order, #2                           @// Set input args to fft stages
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                           @// Pass the first stage dest in RN5
        MOV     argTwiddle,pTwiddle

        @//check for even or odd order

        @// NOTE: The following combination of BL's would work fine
        @// even though the first BL would corrupt the flags. This is
        @// because the end of the "grpZeroSetLoop" loop inside
        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp sets
        @// the Z flag to EQ

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe_vfp
        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe_vfp

unscaledRadix4Loop:
        CMP     subFFTNum,#1
        BEQ     FFTEnd
        BL      armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe_vfp
        B       unscaledRadix4Loop

FFTEnd:
finalComplexToRealFixup:

        @// step  = N/2 * 8 bytes
        MOV     step,subFFTSize,LSL #3
        @// twStep = 3N/8 * 8 bytes pointing to W^1
        SUB     twStep,step,subFFTSize,LSL #1
        @// step1 = N/4 * 8 = N/2*4 bytes
        MOV     step1,subFFTSize,LSL #2
        @// (N/4-1)*8 bytes
        SUB     step1,step1,#8

        @// F(0) = 1/2 [Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        @// 1/2 [(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
        @// 1/2 [2a+j0] - j [0+j2b]
        @// (a+b, 0)

        @// F(N/2) =1/2 [Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        @// 1/2 [(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
        @// 1/2 [2a+j0] + j [0+j2b]
        @// (a-b, 0)

        @// F(0) and F(N/2)
        vldm.f32 pSrc!, {x0r, x0i}
        vadd.f32 y0r,x0r,x0i                        @// F(0)   = (Z0.r+Z0.i , 0)
        vsub.f32 x0r,x0r,x0i                        @// F(N/2) = (Z0.r-Z0.i , 0)

        @// Force both imaginary parts to exactly +0.0.
        @// A self-subtract (x - x) must not be used for this: it yields NaN
        @// when x is NaN or +/-Inf, and y0i has not been written by this
        @// function yet, so its previous contents are arbitrary.
        @// r14 is free here: it is only re-initialised as pTwiddleTmp below.
        MOV     pTwiddleTmp, #0
        vmov.f32 y0i, pTwiddleTmp
        vmov.f32 x0i, pTwiddleTmp

        add     argDst, step
        vstm.f32 argDst, {x0r, x0i}                 @// {x0r,x0i}->[argDst, step]
        sub     argDst, step
        vstm.f32 argDst!, {y0r, y0i}

        SUBS    subFFTSize,subFFTSize,#2

        ADD     pTwiddleTmp,argTwiddle,#8           @// W^2
        ADD     argTwiddle,argTwiddle,twStep        @// W^1
        BLT     End                                 @// N/2 == 1: nothing left
        BEQ     lastElement                         @// N/2 == 2: only the middle bin


        @// F(k) = 1/2 [Z(k) + Z'(N/2-k)] -j*W^(k) [Z(k) - Z'(N/2-k)]
        @// Process 2 elements at a time. E.g: F(1) and F(N/2-1) since
        @// both of them require Z(1) and Z(N/2-1)

        ASR     subFFTSize,subFFTSize,#1
evenOddButterflyLoop:

        SUB     step,step,#16                       @// (N/2-2)*8 bytes

        add     pSrc, step
        vldm.f32 pSrc, {x1r, x1i}                   @// {x1r, x1i} = [pSrc, step]
        sub     pSrc, step
        vldm.f32 pSrc!, {x0r, x0i}
        add     argTwiddle, step1
        vldm.f32 argTwiddle, {w1r, w1i}             @// {w1r, w1i} = [argTwiddle, step1]
        sub     argTwiddle, step1
        vldm.f32 argTwiddle!, {w0r, w0i}            @// {w0r, w0i} = [argTwiddle], #8

        SUB     step1,step1,#8
        SUBS    subFFTSize,subFFTSize,#1            @// flags consumed by BGT at loop end

        vsub.f32 st2,x0r,x1r                        @// a-c
        vadd.f32 st3,x0i,x1i                        @// b+d
        vadd.f32 st0,x0r,x1r                        @// a+c
        vsub.f32 st1,x0i,x1i                        @// b-d

        vmul.f32 x1r,w1r,st2
        vmul.f32 x1i,w1r,st3
        vmla.f32 x1r,w1i,st3                        @// x1r = w1r*st2 + w1i*st3
        @//RSB     x1r,x1r,#0
        vmls.f32 x1i,w1i,st2                        @// x1i = w1r*st3 - w1i*st2

        vsub.f32 y1r, st0, x1i
        vadd.f32 y1i, x1r, st1
        vneg.f32 y1i, y1i

        vmul.f32 x0r,w0r,st2
        vmul.f32 x0i,w0r,st3
        vmls.f32 x0r,w0i,st3                        @// x0r = w0r*st2 - w0i*st3
        vmla.f32 x0i,w0i,st2                        @// x0i = w0r*st3 + w0i*st2

        vsub.f32 st4,st0,x0i                        @// F(1)
        vadd.f32 st5,x0r,st1


        vmul.f32 y1r, half
        vmul.f32 y1i, half
        vmul.f32 st4, half
        vmul.f32 st5, half

        add     argDst, step
        vstm.f32 argDst, {y1r, y1i}                 @// {y1r,y1i} -> [argDst,step]
        sub     argDst, step
        vstm.f32 argDst!, {st4, st5}


        MOV     t0,argTwiddle                       @// swap ptr for even and odd twiddles
        MOV     argTwiddle,pTwiddleTmp
        MOV     pTwiddleTmp,t0

        BGT     evenOddButterflyLoop

        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
        @// (a-bc, -bd)

lastElement:
        @// The code stores (x0r, -x0i), i.e. the conjugate of the loaded value
        vldm.f32 pSrc, {x0r, x0i}
        vneg.f32 x0i, x0i
        vstm.f32 argDst, {x0r, x0i}

End:
        @// Set return value
        MOV     result, #OMX_Sts_NoErr

FunctionEnd:
        @// Write function tail
        M_END

@//    ENDIF                                           @//ARM1136JS


        @// Guarding implementation by the processor name



        .end