@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This file was originally licensed as follows. It has been
@//  relicensed with permission from the copyright holders.
@//

@//
@// File Name:  omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
@// OpenMAX DL: v1.0.2
@// Last Modified Revision:   7810
@// Last Modified Date:       Thu, 04 Oct 2007
@//
@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@//
@//
@//
@// Description:
@// Compute FFT for a real signal
@//
@// Syntax: GNU as (ARM, NEON), passed through the C preprocessor.
@// Comments use "@//" so that both the preprocessor and the assembler
@// treat them as comments.
@//



@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)

        .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe

@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name

@// Import symbols required from other files
@// (For example tables)
        .extern armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
        .extern armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe


@// Register aliases.
@// Several names below map onto the same physical register (for
@// example pFFTSpec/argDst/diffMinusOne are all r2, scale/step are
@// both r3, N/subFFTNum/subFFTSizeTmp are all r6). Writing one alias
@// destroys the others, so each alias is only meaningful in the part
@// of the function that uses it.

@//Input Registers

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3


@// Output registers
#define result          r0

@//Local Scratch Registers

#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@// Total num of radix stages required to complete the FFT
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define subFFTSizeTmp   r6
#define step            r3
#define step1           r4
#define twStep          r8
#define zero            r9
#define pTwiddleTmp     r5
#define t0              r10

@// Neon registers

#define dX0             d0.s32
#define dzero           d1.s32
#define dZero           d2.s32
#define dShift          d3.s32
#define dX0r            d2.s32
#define dX0i            d3.s32
#define dX1r            d4.s32
#define dX1i            d5.s32
#define dT0             d6.s32
#define dT1             d7.s32
#define dT2             d8.s32
#define dT3             d9.s32
#define qT0             q5.s64
#define qT1             q6.s64
#define dW0r            d14.s32
#define dW0i            d15.s32
#define dW1r            d16.s32
#define dW1i            d17.s32
#define dY0r            d14.s32
#define dY0i            d15.s32
#define dY1r            d16.s32
#define dY1i            d17.s32
#define dY0rS64         d14.s64
#define dY0iS64         d15.s64
#define qT2             q9.s64
#define qT3             q10.s64
@// last three elements
#define dX1             d3.s32
#define dW0             d4.s32
#define dW1             d5.s32
#define dY0             d10.s32
#define dY1             d11.s32
#define dY2             d12.s32
#define dY3             d13.s32


@//----------------------------------------------------------------------
@// omxSP_FFTFwd_RToCCS_S32_Sfs
@//
@// Forward FFT of a real S32 signal.
@//
@// In:   r0 = pSrc      input data
@//       r1 = pDst      output buffer
@//       r2 = pFFTSpec  spec structure; N, pTwiddle and pBuf are read
@//                      from it at the ARMsFFTSpec_* offsets below
@//       r3 = scale     scale factor (applied as a rounding right shift)
@// Out:  r0 = OMX_Sts_NoErr (every path ends at the End label)
@//
@// Flow:
@//   N <= 1 : store (x[0] >> scale, 0, 0) to pDst and return.
@//   N >  1 : run an N/2-point complex FFT through the external
@//            armSP_FFTFwd_CToC_SC32_* stage routines, apply whatever
@//            scaling is still outstanding (FFTEnd), then combine the
@//            complex result into the real-signal spectrum
@//            (finalComplexToRealFixup).
@//
@// NOTE(review): the stage routines are defined in other files. This
@// code relies on them keeping pSrc, argDst, argTwiddle, subFFTNum and
@// subFFTSize up to date between calls, and in two places on the flags
@// they return. Confirm against their sources before reordering any
@// instruction around a BL.
@//
@// "order" lives in r14 (the link register), so it is destroyed by the
@// first BL; every use of order below happens before a call is made.
@//----------------------------------------------------------------------

        @// Allocate stack memory required by the function

        M_ALLOC4        diffOnStack, 4

        @// Write function header
        @// NOTE(review): r11,d15 presumably tell the macro the highest
        @// core/NEON registers to preserve - see armCOMM_s.h.
        M_START     omxSP_FFTFwd_RToCCS_S32_Sfs,r11,d15

@// Structure offsets for the FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @// Define stack arguments

        @// Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @// N=1 Treat separately
        CMP     N,#1
        BGT     sizeGreaterThanOne
        VLD1    dX0[0],[pSrc]
        RSB     scale,scale,#0                        @// to use VRSHL for right shift by a variable
        MOV     zero,#0
        VMOV    dShift[0],scale
        VMOV    dzero[0],zero
        VRSHL   dX0,dShift
        VMOV    dZero[0],zero                         @// written after VRSHL: dZero is d2, next to dShift (d3)
        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]     @// stores (x[0] >> scale, 0, 0)

        B       End



sizeGreaterThanOne:
        @// Do a N/2 point complex FFT including the scaling

        MOV     N,N,ASR #1                          @// N/2 point complex FFT

        CLZ     order,N                             @// N = 2^order
        RSB     order,order,#31
        MOV     subFFTSize,#1
        @//MOV     subFFTNum,N

        CMP     order,#3
        BGT     orderGreaterthan3                   @// order > 3

        CMP     order,#1
        BGE     orderGreaterthan0                   @// order > 0
        @// order = 0: a single complex point, no FFT stage to run.
        @// The whole of scale is left for FFTEnd to apply. The flags
        @// from CMP order,#1 are still LT for the M_STR and the BLT
        @// because the instructions in between do not set flags.
        M_STR   scale, diffOnStack,LT               @// order = 0
        VLD1    dX0,[pSrc]
        VST1    dX0,[pOut]
        MOV     pSrc,pOut
        MOV     argDst,pDst
        BLT     FFTEnd

orderGreaterthan0:
        @// set the buffers appropriately for various orders
        CMP     order,#2
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
        MOV     argTwiddle,pTwiddle

        @// diff = scale - order is the part of scale that cannot be
        @// absorbed by the FFT stages; FFTEnd applies it when it is > 0.
        SUBS    diff,scale,order
        M_STR   diff,diffOnStack
        MOVGT   scale,order
        @// Now scale <= order

        CMP     order,#1
        BGT     orderGreaterthan1
        SUBS    scale,scale,#1
        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe   @// order = 1
        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe       @// order = 1
        B       FFTEnd

orderGreaterthan1:
        CMP     order,#2
        MOV     argScale,scale
        BGT     orderGreaterthan2
        @// Each SUBS consumes one unit of scale; while the count is
        @// still non-negative the Sfs variant of the stage is called,
        @// otherwise the plain variant.
        SUBS    argScale,argScale,#1
        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe   @// order =2
        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1
        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
        B       FFTEnd

orderGreaterthan2:                                  @// order =3
        SUBS    argScale,argScale,#1
        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1
        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1
        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
        B       FFTEnd



orderGreaterthan3:
        @// check scale = 0 or scale = order
        SUBS    diff, scale, order                  @// scale > order
        MOVGT   scale,order
        BGE     specialScaleCase                    @// scale = 0 or scale = order
        CMP     scale,#0
        BEQ     specialScaleCase
        B       generalScaleCase

specialScaleCase:                                   @// scale = 0 or scale = order and order >= 2

        TST     order, #2                           @// Set input args to fft stages
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
        MOV     argTwiddle,pTwiddle

        CMP     diff,#0
        M_STR   diff, diffOnStack
        BGE     scaleEqualsOrder

        @//check for even or odd order
        @// NOTE: The following combination of BL's would work fine even though the first
        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe
        BLNE    armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe

        CMP     subFFTNum,#4
        BLT     FFTEnd


unscaledRadix4Loop:
        @// The BEQ tests the flags of the CMP subFFTNum,#4 executed
        @// immediately before every entry to this label.
        BEQ     lastStageUnscaledRadix4
        BL      armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        B       unscaledRadix4Loop

lastStageUnscaledRadix4:
        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
        B       FFTEnd


scaleEqualsOrder:
        @//check for even or odd order
        @// NOTE: The following combination of BL's would work fine even though the first
        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
        @// NOTE(review): the call made here is the Sfs variant of that
        @// routine; presumably it leaves Z set in the same way - confirm.

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe
        BLNE    armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe

        CMP     subFFTNum,#4
        BLT     FFTEnd


scaledRadix4Loop:
        @// The BEQ tests the flags of the CMP subFFTNum,#4 executed
        @// immediately before every entry to this label.
        BEQ     lastStageScaledRadix4
        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        B       scaledRadix4Loop

lastStageScaledRadix4:
        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
        B       FFTEnd

generalScaleCase:                                   @// 0 < scale < order and order >= 2
        @// Determine the correct destination buffer
        SUB     diff,order,scale
        TST     diff,#0x01
        ADDEQ   count, scale,diff,lsr #1            @// count = scale + (order - scale)/2
        MOVNE   count, order
        TST     count, #0x01                        @// Is count even or odd ?

        MOVEQ   argDst,pDst                         @// Set input args to fft stages
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
        MOV     argTwiddle,pTwiddle

        @// Here diffOnStack holds order - scale, the number of stages
        @// left after the scaled ones. It is reloaded below only to
        @// test its parity; this path never passes through FFTEnd.
        M_STR   diff, diffOnStack

        MOV     argScale,scale                      @// Put scale in RN4 so as to save and restore
        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
        SUBS    argScale,argScale,#1

scaledRadix2Loop:
        BLGT    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1                @// save and restore scale (RN4) in the scaled stages
        BGT     scaledRadix2Loop


        M_LDR   diff, diffOnStack
        @//check for even or odd order
        TST     diff,#0x00000001
        BEQ     generalUnscaledRadix4Loop
        B       unscaledRadix2Loop

generalUnscaledRadix4Loop:
        CMP     subFFTNum,#4
        BEQ     generalLastStageUnscaledRadix4
        BL      armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
        B       generalUnscaledRadix4Loop

generalLastStageUnscaledRadix4:
        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe
        B       finalComplexToRealFixup


unscaledRadix2Loop:
        CMP     subFFTNum,#2
        BEQ     generalLastStageUnscaledRadix2
        BL      armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
        B       unscaledRadix2Loop

generalLastStageUnscaledRadix2:
        BL      armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
        B       finalComplexToRealFixup


FFTEnd:                                             @// Does only the scaling

        M_LDR   diff, diffOnStack
        CMP     diff,#0
        BLE     finalComplexToRealFixup             @// nothing left to scale

        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
        VDUP    dShift,diff

        @// save subFFTSize and use tmpsubfftsize in the following loop
        MOV     subFFTSizeTmp,subFFTSize            @// subFFTSizeTmp same reg as subFFTNum

scaleFFTData:                                       @// N = subFFTSize  ; dataptr = pDst  ; scale = diff
        @// One complex value (two S32 words) is shifted in place per
        @// iteration; the VRSHL/VST1 do not touch the flags set by SUBS.
        VLD1    {dX0},[pSrc]                        @// pSrc contains pDst pointer
        SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
        VRSHL   dX0,dShift
        VST1    {dX0},[pSrc]!

        BGT     scaleFFTData

        SUB     pSrc,pSrc,subFFTSize,LSL #3         @// reset pSrc for final fixup

        @// change the logic so that output after scaling is in pOut and not in pDst
        @// finally store from pOut to pDst
        @// change branch "End" to branch "finalComplexToRealFixup" in the above
        @// chk the code below for multiplication by j factor

finalComplexToRealFixup:


        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] - j [0+j2b]
        @// (a+b, 0)

        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j [0+j2b]
        @// (a-b, 0)

        @// F(0) and F(N/2)
        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
        MOV     zero,#0
        VMOV    dX0r[1],zero
        MOV     step,subFFTSize,LSL #3              @// step = N/2 * 8 bytes
        VMOV    dX0i[1],zero
        SUB     twStep,step,subFFTSize,LSL #1       @// twStep = 3N/8 * 8 bytes pointing to W^1

        VADD    dY0r,dX0r,dX0i                      @// F(0) = ((Z0.r+Z0.i) , 0)
        MOV     step1,subFFTSize,LSL #2             @// step1 = N/2 * 4 bytes
        VSUB    dY0i,dX0r,dX0i                      @// F(N/2) = ((Z0.r-Z0.i) , 0)
        SUBS    subFFTSize,subFFTSize,#2            @// flags consumed by BLT/BEQ below

        VST1    dY0r,[argDst],step
        ADD     pTwiddleTmp,argTwiddle,#8           @// W^2
        VST1    dY0i,[argDst]!
        ADD     argTwiddle,argTwiddle,twStep        @// W^1

        VDUP    dzero,zero
        SUB     argDst,argDst,step

        @// None of the instructions since the SUBS above set flags.
        BLT     End                                 @// subFFTSize was 1: only F(0) and F(N/2)
        BEQ     lastElement                         @// subFFTSize was 2: only the middle element is left
        SUB     step,step,#24
        SUB     step1,step1,#8                      @// (N/4-1)*8 bytes

        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
        @// Note: W^k is stored as negative values in the table
        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) since both of them
        @// require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)


evenOddButterflyLoop:


        VLD1    dW0r,[argTwiddle],step1
        VLD1    dW1r,[argTwiddle]!

        VLD2    {dX0r,dX0i},[pSrc],step
        SUB     argTwiddle,argTwiddle,step1
        VLD2    {dX1r,dX1i},[pSrc]!



        SUB     step1,step1,#8                      @// (N/4-2)*8 bytes
        VLD1    dW0i,[pTwiddleTmp],step1
        VLD1    dW1i,[pTwiddleTmp]!
        SUB     pSrc,pSrc,step

        SUB     pTwiddleTmp,pTwiddleTmp,step1
        VREV64  dX1r,dX1r
        VREV64  dX1i,dX1i
        SUBS    subFFTSize,subFFTSize,#4            @// loop counter; tested by the BGT at the bottom



        VSUB    dT2,dX0r,dX1r                       @// a-c
        SUB     step1,step1,#8
        VADD    dT3,dX0i,dX1i                       @// b+d
        VADD    dT0,dX0r,dX1r                       @// a+c
        VSUB    dT1,dX0i,dX1i                       @// b-d
        VHADD   dT0,dT0,dzero                       @// halving add with zero: (a+c)/2
        VHADD   dT1,dT1,dzero                       @// halving add with zero: (b-d)/2

        VZIP    dW1r,dW1i
        VZIP    dW0r,dW0i


        VMULL   qT0,dW1r,dT2
        VMLAL   qT0,dW1i,dT3
        VMULL   qT1,dW1r,dT3
        VMLSL   qT1,dW1i,dT2

        VMULL   qT2,dW0r,dT2
        VMLSL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLAL   qT3,dW0i,dT2


        VRSHRN  dX1r,qT0,#32                        @// keep the rounded high 32 bits of each 64-bit product
        VRSHRN  dX1i,qT1,#32

        VSUB    dY1r,dT0,dX1i                       @// F(N/2 -1)
        VADD    dY1i,dT1,dX1r
        VNEG    dY1i,dY1i

        VREV64  dY1r,dY1r
        VREV64  dY1i,dY1i


        VRSHRN  dX0r,qT2,#32
        VRSHRN  dX0i,qT3,#32


        VSUB    dY0r,dT0,dX0i                       @// F(1)
        VADD    dY0i,dT1,dX0r


        VST2    {dY0r,dY0i},[argDst],step
        VST2    {dY1r,dY1i},[argDst]!
        SUB     argDst,argDst,step
        SUB     step,step,#32                       @// (N/2-4)*8 bytes


        @// Flags are still those of the SUBS subFFTSize above; no
        @// instruction in between sets them.
        BGT     evenOddButterflyLoop

        SUB     pSrc,pSrc,#8                        @// set both the ptrs to the last element
        SUB     argDst,argDst,#8



        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
        @// (a-bc, -bd)
        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement:
        VLD1    dX0r,[pSrc]

        VST1    dX0r[0],[argDst]!                   @// real part a, stored unchanged
        VNEG    dX0r,dX0r
        VST1    dX0r[1],[argDst]!                   @// imaginary part, stored as -b






End:
        @// Set return value
        MOV     result, #OMX_Sts_NoErr

        @// Write function tail
        M_END

        .end
