@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute FFT for a real signal
@//
@//
@// Syntax: GNU as for ARM with NEON, run through the C preprocessor
@// (register aliases below are cpp macros).  Comments use "@//" so that
@// cpp strips the text and the assembler sees only an empty "@" comment.
@// Do not put a trailing comment on a #define line: the "@" would become
@// part of the macro body.


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)
@//
@// These are the complex FFT stage routines called from this file.
@// NOTE(review): Radix4_fs and Radix8_fs are each declared twice in the
@// list below.  The repetition is redundant; it is kept unchanged here.

        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe

@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name
@// NOTE(review): no processor guard directive follows either copy of this
@// comment in this file.



      @// Guarding implementation by the processor name

@// Import symbols required from other files
@// (For example tables)
@// Last-stage ("ls") variants of the radix-4 and radix-2 routines.
        .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
        .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe


@//Input Registers
@// r0..r3 as named by the aliases below.

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3


@// Output registers
@// result shares r0 with pSrc.
#define result          r0

@//Local Scratch Registers
@//
@// Several aliases name the same core register, so at most one meaning
@// per register can be live at any point:
@//   r1 : pDst, argTwiddle
@//   r2 : pFFTSpec, argDst
@//   r4 : argScale, tmpOrder, pTwiddle
@//   r6 : subFFTNum, N

#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
@// More core-register aliases.  As with the aliases defined earlier in
@// this file, several names share one register and the code relies on
@// the previous value being dead before the register is reused:
@//   r3 : scale (input, never read), step
@//   r4 : pTwiddle, step1, x0r
@//   r5 : pOut, pTwiddleTmp, x0i
@//   r8 : count, twStep
@//   r9 : diff, zero
@// order lives in r14 (the link register), which the BL instructions
@// below overwrite; order is still tested after the first BL in the
@// order 2 and order 3 paths.
@// NOTE(review): that only works if the stage routines or M_START keep
@// the value of order intact; neither is visible here -- confirm.
@// diff, count, x0r, x0i, diffMinusOne, subFFTSizeTmp and t0 are not
@// referenced by any instruction in this file.
#define order           r14
#define diff            r9
@// Total num of radix stages required to complete the FFT
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define subFFTSizeTmp   r6
#define step            r3
#define step1           r4
#define twStep          r8
#define zero            r9
#define pTwiddleTmp     r5
#define t0              r10

@// Neon registers
@//
@// All aliases are 64-bit d registers, including the ones with a "q"
@// prefix (qT0..qT3 are d10, d12, d18, d20).
@// Shared registers: d0 is dX0 and half; d2 is dZero and dX0r; d3 is
@// dShift, dX0i and dX1; d14..d17 hold the twiddles (dW*) on load and the
@// outputs (dY*) on store.

#define dX0     d0.f32
#define dzero   d1.f32
#define dZero   d2.f32
#define dShift  d3.f32
#define dX0r    d2.f32
#define dX0i    d3.f32
#define dX1r    d4.f32
#define dX1i    d5.f32
#define dT0     d6.f32
#define dT1     d7.f32
#define dT2     d8.f32
#define dT3     d9.f32
#define qT0     d10.f32
#define qT1     d12.f32
#define dW0r    d14.f32
#define dW0i    d15.f32
#define dW1r    d16.f32
#define dW1i    d17.f32
#define dY0r    d14.f32
#define dY0i    d15.f32
#define dY1r    d16.f32
#define dY1i    d17.f32
#define dY0rS64 d14.s64
#define dY0iS64 d15.s64
#define qT2     d18.f32
#define qT3     d20.f32
@// lastThreeelements
#define dX1     d3.f32
#define dW0     d4.f32
#define dW1     d5.f32
#define dY0     d10.f32
#define dY1     d11.f32
#define dY2     d12.f32
#define dY3     d13.f32

#define half    d0.f32

@//----------------------------------------------------------------------
@// omxSP_FFTFwd_RToCCS_F32_Sfs
@//
@// Forward FFT of a real float signal (see the file header).  The code
@// runs an N/2 point complex FFT through the external radix stage
@// routines, then applies the complex-to-real fix-up that starts at
@// FFTEnd.
@//
@// In:    r0 = pSrc, r1 = pDst, r2 = pFFTSpec
@//        r3 is aliased as scale but is never read; it is reused as step.
@// Out:   r0 = OMX_Sts_NoErr (the only value this routine returns)
@// Spec:  reads N at +0, pTwiddle at +8 and pBuf at +12 of pFFTSpec.
@// Saved: whatever M_START/M_END preserve for the r11,d15 arguments --
@//        the macros live in armCOMM_s.h, confirm there.
@//----------------------------------------------------------------------

        @// Allocate stack memory required by the function

        @// Write function header
        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15

@ Structure offsets for the FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4              @// defined, not used below
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @// Define stack arguments

        @// Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @// N=1 Treat separately
        @// Writes three floats to pDst: the input sample, then 0, then 0.
        CMP     N,#1
        BGT     sizeGreaterThanOne                  @// signed: N <= 1 falls through
        VLD1    dX0[0],[pSrc]                       @// one float into lane 0
        MOV     zero,#0
        VMOV    dzero[0],zero
        VMOV    dZero[0],zero
        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]

        B       End



sizeGreaterThanOne:
        @// Do a N/2 point complex FFT including the scaling
        @// NOTE(review): no scale factor is applied anywhere in this file;
        @// "including the scaling" may be inherited from the S32 version.

        MOV     N,N,ASR #1                          @// N/2 point complex FFT

        CLZ     order,N                             @// N = 2^order
        RSB     order,order,#31                     @// order = 31 - clz(N)
        MOV     subFFTSize,#1
        @//MOV     subFFTNum,N
        @// (subFFTNum and N are the same register, r6)

        CMP     order,#3
        BGT     orderGreaterthan3                   @// order > 3

        CMP     order,#1
        BGE     orderGreaterthan0                   @// order > 0
        @// order = 0: no stage routine is called.  One d register
        @// (8 bytes) is copied from pSrc to pOut and the fix-up then
        @// reads from pOut and writes to pDst.
        VLD1    dX0,[pSrc]
        VST1    dX0,[pOut]
        MOV     pSrc,pOut
        MOV     argDst,pDst
        @// Flags are still those of CMP order,#1 above: the four
        @// instructions in between do not update them, so this branch
        @// is always taken on this path.
        BLT     FFTEnd

orderGreaterthan0:
        @// set the buffers appropriately for various orders
        @// order = 2      : argDst = pDst, pOut keeps the spec buffer
        @// order = 1 or 3 : argDst = spec buffer, pOut = pDst
        @// NOTE(review): how the stage routines use argDst and pOut is not
        @// visible here -- confirm against their sources.
        CMP     order,#2
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        @// Pass the first stage destination in RN5
        MOVNE   pOut,pDst
        MOV     argTwiddle,pTwiddle                 @// r1 no longer holds pDst

        CMP     order,#1
        BGT     orderGreaterthan1
        @// order = 1
        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        B     FFTEnd

orderGreaterthan1:
        CMP     order,#2
        BGT     orderGreaterthan2
        @// order =2
        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        BL    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
        B     FFTEnd

orderGreaterthan2:@// order =3
        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
        BL    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
        BL    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe

        B     FFTEnd



orderGreaterthan3:
specialScaleCase:

        @// Set input args to fft stages
        @// Buffer choice depends on bit 1 of order.
        TST     order, #2
        MOVEQ   argDst,pDst                         @// bit 1 clear
        MOVNE   argDst,pOut                         @// bit 1 set
        @// Pass the first stage destination in RN5
        MOVNE   pOut,pDst
        MOV     argTwiddle,pTwiddle                 @// r1 no longer holds pDst

        @//check for even or odd order
        @// NOTE: The following combination of BL's would work fine even though
        @// the first BL would corrupt the flags. This is because the end of
        @// the "grpZeroSetLoop" loop inside
        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
        @// to EQ

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe   @// even order
        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe   @// odd order

        @// subFFTNum is read here without being written in this file
        @// since the spec load, so the stage routines presumably update
        @// it (and subFFTSize) -- confirm against their sources.
        CMP     subFFTNum,#4
        BLT     FFTEnd


unscaledRadix4Loop:
        @// Z reflects the most recent CMP subFFTNum,#4 (the one above
        @// on entry, the one at the loop bottom afterwards).
        BEQ     lastStageUnscaledRadix4
        BL      armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        B       unscaledRadix4Loop

lastStageUnscaledRadix4:
        BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
        B       FFTEnd


FFTEnd:
finalComplexToRealFixup:
        @// From here on the code reads pSrc, argDst, argTwiddle and
        @// subFFTSize as left by the path taken above.


        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] - j [0+j2b]
        @// (a+b, 0)

        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j [0+j2b]
        @// (a-b, 0)

        @// F(0) and F(N/2)
        VLD2    {dX0r[0],dX0i[0]},[pSrc]!           @// lane 0: Z(0).r and Z(0).i
        MOV     zero,#0
        VMOV    dX0r[1],zero                        @// lane 1 = 0 (imaginary part)
        MOV     step,subFFTSize,LSL #3              @// step = N/2 * 8 bytes
        VMOV    dX0i[1],zero
        @// twStep = 3N/8 * 8 bytes pointing to W^1
        SUB     twStep,step,subFFTSize,LSL #1

        VADD    dY0r,dX0r,dX0i                      @// F(0) = ((Z0.r+Z0.i) , 0)
        MOV     step1,subFFTSize,LSL #2             @// step1 = N/2 * 4 bytes
        VSUB    dY0i,dX0r,dX0i                      @// F(N/2) = ((Z0.r-Z0.i) , 0)
        @// These flags are consumed by BLT End / BEQ lastElement below;
        @// nothing in between updates them.
        SUBS    subFFTSize,subFFTSize,#2

        VST1    dY0r,[argDst],step                  @// F(0), then advance by step
        ADD     pTwiddleTmp,argTwiddle,#8           @// W^2
        VST1    dY0i,[argDst]!                      @// F(N/2), advance 8 bytes
        ADD     argTwiddle,argTwiddle,twStep        @// W^1

        VDUP    dzero,zero                          @// dzero is not read again below
        SUB     argDst,argDst,step                  @// now 8 bytes past the F(0) slot

        BLT     End                                 @// subFFTSize was below 2
        BEQ     lastElement                         @// subFFTSize was exactly 2
        SUB     step,step,#24
        SUB     step1,step1,#8                      @// (N/4-1)*8 bytes

        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
        @// Note: W^k is stored as negative values in the table
        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
        @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)

        VMOV    half, #0.5                          @// d0, previously dX0

evenOddButterflyLoop:
        @// Per iteration, net pointer movement is +8 bytes for argTwiddle
        @// and pTwiddleTmp and +16 bytes for pSrc and argDst.  step shrinks
        @// by 32 and step1 by 16 (two separate SUB #8).


        VLD1    dW0r,[argTwiddle],step1
        VLD1    dW1r,[argTwiddle]!

        VLD2    {dX0r,dX0i},[pSrc],step             @// de-interleave: real, imag
        SUB     argTwiddle,argTwiddle,step1
        VLD2    {dX1r,dX1i},[pSrc]!                 @// the pair step bytes further on



        SUB     step1,step1,#8                      @// (N/4-2)*8 bytes
        VLD1    dW0i,[pTwiddleTmp],step1
        VLD1    dW1i,[pTwiddleTmp]!
        SUB     pSrc,pSrc,step

        SUB     pTwiddleTmp,pTwiddleTmp,step1
        VREV64  dX1r,dX1r                           @// swap the two floats
        VREV64  dX1i,dX1i
        @// Loop counter; flags are consumed by BGT at the loop bottom
        @// and nothing in between updates them.
        SUBS    subFFTSize,subFFTSize,#4



        VSUB    dT2,dX0r,dX1r                       @// a-c
        SUB     step1,step1,#8
        VADD    dT0,dX0r,dX1r                       @// a+c
        VSUB    dT1,dX0i,dX1i                       @// b-d
        VADD    dT3,dX0i,dX1i                       @// b+d
        VMUL    dT0,dT0,half[0]
        VMUL    dT1,dT1,half[0]
        @// Interleave the twiddle pairs loaded through argTwiddle and
        @// pTwiddleTmp.
        VZIP    dW1r,dW1i
        VZIP    dW0r,dW0i


        @// Products of twiddles with (a-c) and (b+d), accumulated into
        @// qT0..qT3 by the VMLA/VMLS instructions that follow.
        VMUL    qT0,dW1r,dT2
        VMUL    qT1,dW1r,dT3
        VMUL    qT2,dW0r,dT2
        VMUL    qT3,dW0r,dT3

        VMLA    qT0,dW1i,dT3
        VMLS    qT1,dW1i,dT2

        VMLS    qT2,dW0i,dT3
        VMLA    qT3,dW0i,dT2


        VMUL    dX1r,qT0,half[0]
        VMUL    dX1i,qT1,half[0]

        VSUB    dY1r,dT0,dX1i                       @// F(N/2 -1)
        VADD    dY1i,dT1,dX1r
        VNEG    dY1i,dY1i

        VREV64  dY1r,dY1r                           @// swap the two floats back
        VREV64  dY1i,dY1i


        VMUL    dX0r,qT2,half[0]
        VMUL    dX0i,qT3,half[0]

        VSUB    dY0r,dT0,dX0i                       @// F(1)
        VADD    dY0i,dT1,dX0r


        VST2    {dY0r,dY0i},[argDst],step           @// interleave: real, imag
        VST2    {dY1r,dY1i},[argDst]!
        SUB     argDst,argDst,step
        SUB     step,step,#32                       @// (N/2-4)*8 bytes


        BGT     evenOddButterflyLoop                @// flags from SUBS above

        @// set both the ptrs to the last element
        SUB     pSrc,pSrc,#8
        SUB     argDst,argDst,#8



        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
        @// (a-bc, -bd)
        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement:
        VLD1    dX0r,[pSrc]                         @// (a, b)

        VST1    dX0r[0],[argDst]!                   @// store a, advance 4 bytes
        VNEG    dX0r,dX0r
        VST1    dX0r[1],[argDst]!                   @// store -b, advance 4 bytes

End:
        @// Set return value
        MOV     result, #OMX_Sts_NoErr

        @// Write function tail
        @// M_END comes from armCOMM_s.h; presumably it restores what
        @// M_START saved and returns -- confirm there.
        M_END

        .end