@
@  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@
@  Use of this source code is governed by a BSD-style license
@  that can be found in the LICENSE file in the root of the source
@  tree. An additional intellectual property rights grant can be found
@  in the file PATENTS.  All contributing project authors may
@  be found in the AUTHORS file in the root of the source tree.
@
@  Some code in this file was originally from file
@  omxSP_FFTFwd_RToCCS_S32_Sfs_s.S which was licensed as follows.
@  It has been relicensed with permission from the copyright holders.
@

@
@ OpenMAX DL: v1.0.2
@ Last Modified Revision:   7810
@ Last Modified Date:       Thu, 04 Oct 2007
@
@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@

@
@ Description:
@ Compute a forward FFT for a real signal, using 16 bit complex FFT routines.
@

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
.extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe

@ Input registers
#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3

@ Output registers
#define result          r0

@ Local scratch registers.
@ Several aliases share one physical register; each alias is only valid in
@ the phase of the function that uses it.
#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define pTwiddle        r4
#define tmpOrder        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@ Total number of radix stages to complete the FFT
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3
#define subFFTSizeTmp   r6
#define step            r3
#define stepr           r11
#define step1           r10
#define step1r          r6
#define step2           r8
#define step2r          r9
#define twStep          r8
#define zero            r9
#define pTwiddleTmp     r5
#define t0              r10

@ Neon registers
#define dX0             d0.s16
#define dX0S32          d0.s32
#define dzero           d1.s16
#define dZero           d2.s16
#define dShift          d3.s16
#define qShift          q1.s16
#define dX0r            d2.s16
#define dX0i            d3.s16
#define dX1r            d4.s16
#define dX1i            d5.s16
#define qX1             q2.s16
#define dX0rS32         d2.s32
#define dX0iS32         d3.s32
#define dX1rS32         d4.s32
#define dX1iS32         d5.s32
#define dT0             d6.s16
#define dT1             d7.s16
#define dT2             d8.s16
#define dT3             d9.s16
#define qT0             q5.s32
#define qT1             q6.s32
#define qT0s            q5.s16
#define qT1s            q6.s16
#define dW0r            d14.s16
#define dW0i            d15.s16
#define dW1r            d16.s16
#define dW1i            d17.s16
#define dW0rS32         d14.s32
#define dW0iS32         d15.s32
#define dW1rS32         d16.s32
#define dW1iS32         d17.s32
#define dY0r            d14.s16
#define dY0i            d15.s16
#define dY0rS32         d14.s32
#define dY0iS32         d15.s32
#define dY1r            d16.s16
#define dY1i            d17.s16
#define qY1             q8.s16
#define dY1rS32         d16.s32
#define dY1iS32         d17.s32
#define dY0rS64         d14.s32
#define dY0iS64         d15.s32
#define qT2             q9.s32
#define qT3             q10.s32
#define d18s16          d18.s16
#define d19s16          d19.s16
#define d20s16          d20.s16
#define d21s16          d21.s16
@ Last three elements
#define dX1             d3.s16
#define dW0             d4.s16
#define dW1             d5.s16
#define dY0             d10.s16
#define dY1             d11.s16
#define dY2             d12.s16
#define dY3             d13.s16

@-----------------------------------------------------------------------
@ omxSP_FFTFwd_RToCCS_S16_Sfs
@
@ In:   r0 = pSrc      source pointer
@       r1 = pDst      destination pointer
@       r2 = pFFTSpec  spec structure; the fields read here are N (+0),
@                      pTwiddle (+8) and pBuf (+12)
@       r3 = scale     scale factor
@ Out:  r0 = OMX_Sts_NoErr
@
@ Method, as implemented below:
@   1. Run an N/2 point complex FFT with the SC16 radix kernels, applying
@      up to "order" scaled stages (order = log2(N/2)).
@   2. FFTEnd: if scale exceeded order, right-shift the complex result by
@      the remainder.
@   3. finalComplexToRealFixup: combine Z(k) and Z(N/2-k) with the twiddle
@      table to produce the real-input spectrum in the destination buffer.
@
@ Registers r4-r11, lr and d8-d15 are requested from M_START for saving.
@-----------------------------------------------------------------------

        @ Allocate stack memory required by the function
        M_ALLOC4        diffOnStack, 4

        @ Write function header
        M_START     omxSP_FFTFwd_RToCCS_S16_Sfs,r11,d15

        @ Structure offsets for the FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @ Define stack arguments

        @ Read the size from structure and take log
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @ Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @ Do a N/2 point complex FFT including the scaling

        MOV     N,N,ASR #1                          @ N/2 point complex FFT

        CLZ     order,N                             @ N = 2^order
        RSB     order,order,#31
        MOV     subFFTSize,#1

        CMP     order,#3
        BGT     orderGreaterthan3                   @ order > 3

        CMP     order,#1
        BGE     orderGreaterthan0                   @ order > 0
        M_STR   scale, diffOnStack,LT               @ order = 0
        LDR     x0r,[pSrc]                          @ single complex point: copy
        STR     x0r,[pOut]                          @ it to the work buffer
        MOV     pSrc,pOut
        MOV     argDst,pDst
        B       FFTEnd

orderGreaterthan0:
        @ set the buffers appropriately for various orders
        CMP     order,#2
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                           @ Pass 1st stage destination in RN5
        MOV     argTwiddle,pTwiddle

        SUBS    diff,scale,order
        M_STR   diff,diffOnStack                    @ diff > 0: shift left for FFTEnd
        MOVGT   scale,order
        @ Now scale <= order

        CMP     order,#1
        BGT     orderGreaterthan1
        @ order = 1:
        SUBS    scale,scale,#1
        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
        B       FFTEnd

orderGreaterthan1:
        CMP     order,#2
        MOV     argScale,scale
        BGT     orderGreaterthan2
        @ order = 2:
        SUBS    argScale,argScale,#1
        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1
        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
        B       FFTEnd

orderGreaterthan2:                                  @ order = 3
        SUBS    argScale,argScale,#1
        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1
        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1
        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
        B       FFTEnd


orderGreaterthan3:
        @ check scale = 0 or scale = order
        SUBS    diff, scale, order                  @ scale > order
        MOVGT   scale,order
        BGE     specialScaleCase                    @ scale = 0 or scale = order
        CMP     scale,#0
        BEQ     specialScaleCase
        B       generalScaleCase

specialScaleCase:                       @ scale = 0, or, scale = order && order > 3
        TST     order, #2                           @ Set input args to fft stages
        MOVEQ   argDst,pDst
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                   @ Pass the first stage destination in RN5
        MOV     argTwiddle,pTwiddle

        CMP     diff,#0
        M_STR   diff, diffOnStack
        BGE     scaleEqualsOrder

        @ check for even or odd order.
        @ NOTE: The following combination of BL's would work fine even though
        @ the first BL would corrupt the flags. This is because the end of the
        @ "grpZeroSetLoop" loop inside
        @ armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets Z flag to EQ.

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe

        CMP     subFFTNum,#4
        BLT     FFTEnd

unscaledRadix4Loop:
        BEQ     lastStageUnscaledRadix4
        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        B       unscaledRadix4Loop

lastStageUnscaledRadix4:
        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
        B       FFTEnd

scaleEqualsOrder:
        @ check for even or odd order
        @ NOTE: The following combination of BL's relies on the first callee
        @ returning with the Z flag set, exactly as described for the unscaled
        @ pair above.
        @ NOTE(review): the original comment named an SC32 routine here; the
        @ routine actually called is the SC16 Sfs Radix4 first stage - confirm
        @ that it also leaves Z set on return.

        TST     order,#0x00000001
        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe

        CMP     subFFTNum,#4
        BLT     FFTEnd

scaledRadix4Loop:
        BEQ     lastStageScaledRadix4
        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
        CMP     subFFTNum,#4
        B       scaledRadix4Loop

lastStageScaledRadix4:
        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
        B       FFTEnd

generalScaleCase:                       @ 0 < scale < order and order > 3
        @ Determine the correct destination buffer
        SUB     diff,order,scale
        TST     diff,#0x01
        ADDEQ   count,scale,diff,LSR #1     @ count = scale + (order - scale)/2
        MOVNE   count,order
        TST     count,#0x01                 @ Is count even or odd ?

        MOVEQ   argDst,pDst                 @ Set input args to fft stages
        MOVNE   argDst,pOut
        MOVNE   pOut,pDst                   @ Pass 1st stage destination in RN5
        MOV     argTwiddle,pTwiddle

        CMP     diff,#1
        M_STR   diff, diffOnStack           @ here diffOnStack = order - scale > 0
        BEQ     scaleps                     @ scaling including a radix2_ps stage

        MOV     argScale,scale              @ Put scale in RN4 to save and restore
        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1

scaledRadix2Loop:
        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1        @ save, restore scale in scaled stages
        BGT     scaledRadix2Loop
        B       outScale

scaleps:
        SUB     argScale,scale,#1           @ order>3 and diff=1 => scale >= 3
        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1

scaledRadix2psLoop:
        BEQ     scaledRadix2psStage
        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
        SUBS    argScale,argScale,#1        @ save, restore scale in scaled stages
        BGE     scaledRadix2psLoop

scaledRadix2psStage:
        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
        B       generalLastStageUnscaledRadix2

outScale:
        M_LDR   diff, diffOnStack
        @ check for even or odd order
        TST     diff,#0x00000001
        BEQ     generalUnscaledRadix4Loop
        B       unscaledRadix2Loop

generalUnscaledRadix4Loop:
        CMP     subFFTNum,#4
        BEQ     generalLastStageUnscaledRadix4
        BL      armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
        B       generalUnscaledRadix4Loop

generalLastStageUnscaledRadix4:
        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
        @ The complex FFT is complete and fully scaled; the real-signal fixup
        @ must still run. Do not go through FFTEnd: on this path diffOnStack
        @ holds order - scale (> 0), which FFTEnd would apply as an extra
        @ right shift.
        B       finalComplexToRealFixup

unscaledRadix2Loop:
        CMP     subFFTNum,#4
        BEQ     generalLastTwoStagesUnscaledRadix2
        BL      armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
        B       unscaledRadix2Loop

generalLastTwoStagesUnscaledRadix2:
        BL      armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
generalLastStageUnscaledRadix2:
        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
        @ Same as the radix-4 exit above: skip the FFTEnd post-shift but run
        @ the real-signal fixup.
        B       finalComplexToRealFixup

FFTEnd:                                     @ Does only the scaling
        M_LDR   diff, diffOnStack
        CMP     diff,#0
        BLE     finalComplexToRealFixup

        RSB     diff,diff,#0                @ for right shift by a variable
        VDUP    qShift,diff

        @ save subFFTSize and use subFFTSizeTmp in the following loop
        MOV     subFFTSizeTmp,subFFTSize    @ subFFTSizeTmp same reg as subFFTNum

        @ Use parallel loads for bigger FFT size.
        CMP     subFFTSizeTmp, #8
        BLT     scaleLessFFTData

scaleFFTData:                               @ 8 complex points per iteration
        VLD1    {qT0s, qT1s},[pSrc:256]     @ pSrc contains pDst pointer
        SUBS    subFFTSizeTmp,subFFTSizeTmp,#8
        VSHL    qT0s,qShift
        VSHL    qT1s,qShift
        VST1    {qT0s, qT1s},[pSrc:256]!
        BGT     scaleFFTData
        B       afterScaling

scaleLessFFTData:                           @ 1 complex point per iteration
        VLD1    {dX0S32[0]},[pSrc]          @ pSrc contains pDst pointer
        SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
        VSHL    dX0,dShift
        VST1    {dX0S32[0]},[pSrc]!
        BGT     scaleLessFFTData

afterScaling:
        SUB     pSrc,pSrc,subFFTSize,LSL #2 @ reset pSrc for final fixup

        @ The scaling above is done in place in the buffer pSrc points to; the
        @ fixup below then reads pSrc and writes argDst.
        @ TODO (kept from the original): chk the code below for multiplication
        @ by j factor

finalComplexToRealFixup:
        @ F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
        @ 1/2[(a+jb) + (a-jb)] - j [(a+jb) - (a-jb)]
        @ 1/2[2a+j0] - j [0+j2b]
        @ (a+b, 0)

        @ F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
        @ 1/2[(a+jb) + (a-jb)] + j [(a+jb) - (a-jb)]
        @ 1/2[2a+j0] + j [0+j2b]
        @ (a-b, 0)

        CMP     subFFTSize,#4
        BLE     smallFFTSize

@ subFFTSize > 4:
        @ F(0) and F(N/2)
        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
        MOV     zero,#0
        VMOV    dX0r[1],zero
        MOV     step,subFFTSize,LSL #2      @ step = N/2 * 4 bytes
        VMOV    dX0i[1],zero
        SUB     twStep,step,subFFTSize      @ twStep = 3N/8 * 8 bytes

        VADD    dY0r,dX0r,dX0i              @ F(0) = ((Z0.r+Z0.i) , 0)
        MOV     step1,subFFTSize,LSL #1     @ step1 = N/2 * 2 bytes
        VSUB    dY0i,dX0r,dX0i              @ F(N/2) = ((Z0.r-Z0.i) , 0)
        SUBS    subFFTSize,subFFTSize,#2

        VST1    dY0rS32[0],[argDst], step
        ADD     pTwiddleTmp,argTwiddle,#4   @ W^2
        VST1    dY0iS32[0],[argDst]!
        ADD     argTwiddle,argTwiddle,twStep @ W^1

        VDUP    dzero,zero
        SUB     argDst,argDst,step
        SUB     step,step,#20
        RSB     stepr, step, #16
        SUB     step1,step1,#8              @ (N/4-1)*8 bytes
        RSB     step1r,step1,#8

        SUB     step2, step1, #4
        RSB     step2r, step2, #8

        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
        @ Note: W^k is stored as negative values in the table.
        @ Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
        @ since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1).

evenOddButterflyLoop:
        VLD2    {dX0r,dX0i},[pSrc],step
        VLD2    {dX1r,dX1i},[pSrc],stepr

        VLD1    dW0r,[argTwiddle],step1
        SUB     step1, step1, #16
        VREV64  qX1,qX1

        VLD1    dW1r,[argTwiddle],step1r
        ADD     step1r, step1r, #16
        VSUB    dT2,dX0r,dX1r               @ a-c

        VLD1    dW0i,[pTwiddleTmp],step2
        SUB     step2, step2, #16
        VADD    dT3,dX0i,dX1i               @ b+d

        VLD1    dW1i,[pTwiddleTmp],step2r
        ADD     step2r, step2r, #16

        VTRN    dW0r,dW0i
        VZIP    dW1r, dW1i

        SUBS    subFFTSize,subFFTSize,#8    @ sets the flags for the loop BGT

        VHADD   dT0,dX0r,dX1r               @ (a+c)/2
        VZIP    dW1iS32, dW1rS32
        VHSUB   dT1,dX0i,dX1i               @ (b-d)/2

        VQDMULH dY0,dW1i,dT2
        VQDMULH dY1,dW1r,dT3
        VQDMULH dY2,dW1i,dT3
        VQDMULH dY3,dW1r,dT2

        VQDMULH d18s16,dW0r,dT2
        VQDMULH d19s16,dW0i,dT3
        VQDMULH d20s16,dW0r,dT3
        VQDMULH d21s16,dW0i,dT2

        VRHADD  dX1r, dY0, dY1
        VHSUB   dX1i, dY2, dY3
        VHSUB   dX0r, d18s16, d19s16
        VADD    dY1i,dT1,dX1r
        VRHADD  dX0i, d20s16, d21s16
        VSUB    dY1r,dT0,dX1i               @ F(N/2 -1)
        VSUB    dY0r,dT0,dX0i               @ F(1)
        VADD    dY0i,dT1,dX0r

        VNEG    dY1i,dY1i
        VREV64  qY1, qY1

        VST2    {dY0r,dY0i},[argDst],step
        SUB     step,step,#32               @ (N/2-4)*4 bytes
        VST2    {dY1r,dY1i},[argDst],stepr
        ADD     stepr,stepr,#32

        BGT     evenOddButterflyLoop

        SUB     pSrc,pSrc,#4                @ points to the last element.
        SUB     argDst,argDst,#4            @ points to the last element.

        B       lastElement

smallFFTSize:                               @ subFFTSize <= 4

        @ F(0) and F(N/2)
        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
        MOV     zero,#0
        VMOV    dX0r[1],zero
        MOV     step,subFFTSize,LSL #2      @ step = N/2 * 4 bytes
        VMOV    dX0i[1],zero
        SUB     twStep,step,subFFTSize      @ twStep = 3N/8 * 8 bytes

        VADD    dY0r,dX0r,dX0i              @ F(0) = ((Z0.r+Z0.i) , 0)
        MOV     step1,subFFTSize,LSL #1     @ step1 = N/2 * 2 bytes
        VSUB    dY0i,dX0r,dX0i              @ F(N/2) = ((Z0.r-Z0.i) , 0)
        SUBS    subFFTSize,subFFTSize,#2    @ flags consumed by BLT/BEQ below


        VST1    dY0rS32[0],[argDst], step
        ADD     pTwiddleTmp,argTwiddle,#4   @ W^2
        VST1    dY0iS32[0],[argDst]!
        ADD     argTwiddle,argTwiddle,twStep @ W^1

        VDUP    dzero,zero
        SUB     argDst,argDst,step

        BLT     End                         @ subFFTSize - 2 < 0: nothing left
        BEQ     lastElement                 @ subFFTSize - 2 = 0: middle element only

        SUB     step,step,#12
        SUB     step1,step1,#4              @ (N/4-1)*8 bytes

        @ F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]

        @ Despite the label name this section runs once and falls through to
        @ lastElement; there is no backward branch.
butterflyLoopSubFFTSize4:
        VLD1    dW0rS32[0], [argTwiddle],step1
        VLD1    dW1rS32[0],[argTwiddle]!

        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
        SUB     pSrc,pSrc,#4
        SUB     argTwiddle,argTwiddle,step1
        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
        VLD2    {dX1r[1],dX1i[1]},[pSrc]!

        SUB     step1,step1,#4              @ (N/4-2)*4 bytes
        VLD1    dW0iS32[0],[pTwiddleTmp],step1
        VLD1    dW1iS32[0],[pTwiddleTmp]!
        SUB     pSrc,pSrc,step

        SUB     pTwiddleTmp,pTwiddleTmp,step1
        VREV32  dX1r,dX1r
        VREV32  dX1i,dX1i
        SUBS    subFFTSize,subFFTSize,#4

        VSUB    dT2,dX0r,dX1r               @ a-c
        SUB     step1,step1,#4
        VADD    dT3,dX0i,dX1i               @ b+d
        VADD    dT0,dX0r,dX1r               @ a+c
        VSUB    dT1,dX0i,dX1i               @ b-d
        VHADD   dT0,dT0,dzero               @ halve via halving add with zero
        VHADD   dT1,dT1,dzero

        VTRN    dW1r,dW1i
        VTRN    dW0r,dW0i

        VMULL   qT0,dW1r,dT2
        VMLAL   qT0,dW1i,dT3
        VMULL   qT1,dW1r,dT3
        VMLSL   qT1,dW1i,dT2

        VMULL   qT2,dW0r,dT2
        VMLSL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLAL   qT3,dW0i,dT2

        VRSHRN  dX1r,qT0,#16
        VRSHRN  dX1i,qT1,#16

        VSUB    dY1r,dT0,dX1i               @ F(N/2 -1)
        VADD    dY1i,dT1,dX1r
        VNEG    dY1i,dY1i

        VREV32  dY1r,dY1r
        VREV32  dY1i,dY1i

        VRSHRN  dX0r,qT2,#16
        VRSHRN  dX0i,qT3,#16

        VSUB    dY0r,dT0,dX0i               @ F(1)
        VADD    dY0i,dT1,dX0r

        VST2    {dY0r[0],dY0i[0]},[argDst]!
        VST2    {dY0r[1],dY0i[1]},[argDst],step
        SUB     argDst, #4
        VST2    {dY1r[0],dY1i[0]},[argDst]!
        VST2    {dY1r[1],dY1i[1]},[argDst]!
        SUB     argDst,argDst,step
        SUB     pSrc,pSrc,#4                @ points to the last element.
        SUB     argDst,argDst,#4            @ points to the last element.

lastElement:
        @ Last element can be expanded as follows
        @ 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
        @ 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
        @ 1/2[2a+j0] + j (c+jd) [0+j2b]
        @ (a-bc, -bd)
        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)

        VLD1    dX0rS32[0],[pSrc]           @ lane 0 = a, lane 1 = b
        VST1    dX0r[0],[argDst]!           @ store a
        VNEG    dX0r,dX0r
        VST1    dX0r[1],[argDst]!           @ store -b

End:
        @ Set return value
        MOV     result, #OMX_Sts_NoErr

        @ Write function tail
        M_END

        .end