@//
@// Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@// Use of this source code is governed by a BSD-style license
@// that can be found in the LICENSE file in the root of the source
@// tree. An additional intellectual property rights grant can be found
@// in the file PATENTS. All contributing project authors may
@// be found in the AUTHORS file in the root of the source tree.
@//
@// This file was originally licensed as follows. It has been
@// relicensed with permission from the copyright holders.

@//
@//
@// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
@// OpenMAX DL: v1.0.2
@// Last Modified Revision:   7766
@// Last Modified Date:       Thu, 27 Sep 2007
@//
@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@//
@//
@//
@// Description:
@// Compute a first stage Radix 8 FFT stage for a N point complex signal
@// (SC16: complex values stored as interleaved signed Q15 real/imag pairs).
@// Out-of-place: reads from pSrc, writes the stage output to pDst, then
@// swaps the roles of the buffers for the next stage (ping-pong).
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)


@// Set debugging level
@//DEBUG_ON SETL {TRUE}



@// Guarding implementation by the processor name




@// Guarding implementation by the processor name


@//Input Registers

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7
@// dest buffer for the next stage (not pSrc for first stage)
#define pPingPongBuf    r5


@//Output Registers


@//Local Scratch Registers

#define grpSize         r3
@// Reuse grpSize as setCount
#define setCount        r3
#define pointStep       r4
#define outPointStep    r4
#define setStep         r8
#define step1           r9
#define step2           r10
#define t0              r11


@// Neon Registers
@// NOTE(review): the X (input), U/V (intermediate butterfly) and Y (output)
@// aliases below deliberately overlap in Q8-Q15 / Q0-Q7; the instruction
@// schedule in FFTSTAGE depends on these exact register assignments.

#define dXr0    D14.S16
#define dXi0    D15.S16
#define dXr1    D2.S16
#define dXi1    D3.S16
#define dXr2    D4.S16
#define dXi2    D5.S16
#define dXr3    D6.S16
#define dXi3    D7.S16
#define dXr4    D8.S16
#define dXi4    D9.S16
#define dXr5    D10.S16
#define dXi5    D11.S16
#define dXr6    D12.S16
#define dXi6    D13.S16
#define dXr7    D0.S16
#define dXi7    D1.S16
#define qX0     Q7.S16
#define qX1     Q1.S16
#define qX2     Q2.S16
#define qX3     Q3.S16
#define qX4     Q4.S16
#define qX5     Q5.S16
#define qX6     Q6.S16
#define qX7     Q0.S16

#define dUr0    D16.S16
#define dUi0    D17.S16
#define dUr2    D18.S16
#define dUi2    D19.S16
#define dUr4    D20.S16
#define dUi4    D21.S16
#define dUr6    D22.S16
#define dUi6    D23.S16
#define dUr1    D24.S16
#define dUi1    D25.S16
#define dUr3    D26.S16
#define dUi3    D27.S16
#define dUr5    D28.S16
#define dUi5    D29.S16
@// reuse dXr7 and dXi7
#define dUr7    D30.S16
#define dUi7    D31.S16
#define qU0     Q8.S16
#define qU1     Q12.S16
#define qU2     Q9.S16
#define qU3     Q13.S16
#define qU4     Q10.S16
#define qU5     Q14.S16
#define qU6     Q11.S16
#define qU7     Q15.S16



#define dVr0    D24.S16
#define dVi0    D25.S16
#define dVr2    D26.S16
#define dVi2    D27.S16
#define dVr4    D28.S16
#define dVi4    D29.S16
#define dVr6    D30.S16
#define dVi6    D31.S16
#define dVr1    D16.S16
#define dVi1    D17.S16
#define dVr3    D18.S16
#define dVi3    D19.S16
#define dVr5    D20.S16
#define dVi5    D21.S16
@// reuse dUi7
#define dVr7    D22.S16
@// reuse dUr7
#define dVi7    D23.S16
#define qV0     Q12.S16
#define qV1     Q8.S16
#define qV2     Q13.S16
#define qV3     Q9.S16
#define qV4     Q14.S16
#define qV5     Q10.S16
#define qV6     Q15.S16
#define qV7     Q11.S16



#define dYr0    D16.S16
#define dYi0    D17.S16
#define dYr2    D18.S16
#define dYi2    D19.S16
#define dYr4    D20.S16
#define dYi4    D21.S16
#define dYr6    D22.S16
#define dYi6    D23.S16
#define dYr1    D24.S16
#define dYi1    D25.S16
#define dYr3    D26.S16
#define dYi3    D27.S16
#define dYr5    D28.S16
#define dYi5    D29.S16
@// reuse dYr4 and dYi4
#define dYr7    D30.S16
#define dYi7    D31.S16
#define qY0     Q8.S16
#define qY1     Q12.S16
#define qY2     Q9.S16
#define qY3     Q13.S16
#define qY4     Q10.S16
#define qY5     Q14.S16
#define qY6     Q11.S16
#define qY7     Q15.S16


#define dT0     D0.S16
#define dT1     D1.S16


@// Define constants
        .set   ONEBYSQRT2, 0x00005A82   @// (1/sqrt(2)) in Q15 format


        @// FFTSTAGE: first-stage radix-8 butterfly over the whole signal.
        @//   scaled  = "TRUE"  -> use halving add/sub (VHADD/VHSUB) to scale
        @//                        each butterfly stage by 1/2
        @//   inverse = "TRUE"  -> inverse-FFT twiddle signs
        @//   name    = unique suffix for the loop label
        @// The loop processes 4 sets per iteration and software-pipelines:
        @// loads for the next iteration are interleaved with the arithmetic
        @// and stores of the current one, so statement order matters.
        .macro FFTSTAGE scaled, inverse , name

        @// Define stack arguments

        @// Update pSubFFTSize and pSubFFTNum regs
        MOV     subFFTSize,#8                 @// subFFTSize = 8 after the radix-8 first stage
        LDR     t0,=ONEBYSQRT2                @// t0=(1/sqrt(2)) as Q15 format

        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
        LSR     grpSize,subFFTNum,#3
        MOV     subFFTNum,grpSize


        @// pT0+1 increments pT0 by 4 bytes
        @// pT0+pointStep = increment of 4*pointStep bytes = grpSize/2 bytes
        @// Note: outPointStep = pointStep for first stage

        MOV     pointStep,grpSize,LSL #2


        @// Calculate the step of input data for the next set
        @//MOV   step1,pointStep,LSL #1       @// step1 = 2*pointStep
        VLD2    {dXr0,dXi0},[pSrc :128],pointStep     @// data[0]
        MOV     step1,grpSize,LSL #3

        MOV     step2,pointStep,LSL #3
        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @// data[1]
        SUB     step2,step2,pointStep         @// step2 = 7*pointStep
        RSB     setStep,step2,#16             @// setStep = - 7*pointStep+16



        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @// data[2]
        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @// data[3]
        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @// data[4]
        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @// data[5]
        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @// data[6]
        @// grp = 0 a special case since all the twiddle factors are 1
        @// Loop on the sets : 4 sets at a time

grpZeroSetLoop\name:
        VLD2    {dXr7,dXi7},[pSrc :128],setStep       @// data[7] & update pSrc for the next set
                                                      @// setStep = -7*pointStep + 16

        @// Decrement setcount
        SUBS    setCount,setCount,#4          @// decrement the set loop counter


        .ifeqs  "\scaled", "TRUE"
        @// Scaled variant: VHADD/VHSUB halve each result, giving a /2
        @// scaling per butterfly stage.

        @// finish first stage of 8 point FFT

        VHADD   qU0,qX0,qX4
        VHADD   qU2,qX1,qX5
        VHADD   qU4,qX2,qX6
        VHADD   qU6,qX3,qX7

        @// finish second stage of 8 point FFT

        VHADD   qV0,qU0,qU4
        VHSUB   qV2,qU0,qU4
        VHADD   qV4,qU2,qU6
        VHSUB   qV6,qU2,qU6

        @// finish third stage of 8 point FFT

        VHADD   qY0,qV0,qV4
        VHSUB   qY4,qV0,qV4
        VST2    {dYr0,dYi0},[pDst :128],step1         @// store y0

        .ifeqs  "\inverse", "TRUE"

        VHSUB   dYr2,dVr2,dVi6
        VHADD   dYi2,dVi2,dVr6

        VHADD   dYr6,dVr2,dVi6
        VST2    {dYr2,dYi2},[pDst :128],step1         @// store y2
        VHSUB   dYi6,dVi2,dVr6

        VHSUB   qU1,qX0,qX4
        VST2    {dYr4,dYi4},[pDst :128],step1         @// store y4

        VHSUB   qU3,qX1,qX5
        VHSUB   qU5,qX2,qX6
        VST2    {dYr6,dYi6},[pDst :128],step1         @// store y6

        .else

        VHADD   dYr6,dVr2,dVi6
        VHSUB   dYi6,dVi2,dVr6

        VHSUB   dYr2,dVr2,dVi6
        VST2    {dYr6,dYi6},[pDst :128],step1         @// store y2
        VHADD   dYi2,dVi2,dVr6


        VHSUB   qU1,qX0,qX4
        VST2    {dYr4,dYi4},[pDst :128],step1         @// store y4
        VHSUB   qU3,qX1,qX5
        VHSUB   qU5,qX2,qX6
        VST2    {dYr2,dYi2},[pDst :128],step1         @// store y6


        .endif

        @// finish first stage of 8 point FFT

        VHSUB   qU7,qX3,qX7
        VMOV    dT0[0],t0                     @// dT0[0] = 1/sqrt(2) (Q15)

        @// finish second stage of 8 point FFT
        @// (loads below fetch data[0..3] for the next loop iteration)

        VHSUB   dVr1,dUr1,dUi5
        VLD2    {dXr0,dXi0},[pSrc :128],pointStep     @// data[0] for next iteration
        VHADD   dVi1,dUi1,dUr5
        VHADD   dVr3,dUr1,dUi5
        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @// data[1]
        VHSUB   dVi3,dUi1,dUr5

        VHSUB   dVr5,dUr3,dUi7
        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @// data[2]
        VHADD   dVi5,dUi3,dUr7
        VHADD   dVr7,dUr3,dUi7
        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @// data[3]
        VHSUB   dVi7,dUi3,dUr7

        @// finish third stage of 8 point FFT
        @// VQRDMULH by dT0[0] is a Q15 x Q15 -> Q15 multiply by 1/sqrt(2)

        .ifeqs  "\inverse", "TRUE"

        @// calculate a*v5
        VQRDMULH  dT1,dVr5,dT0[0]             @// use dVi0 for dT1
        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @// data[4]
        VQRDMULH  dVi5,dVi5,dT0[0]

        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @// data[5]
        VSUB    dVr5,dT1,dVi5                 @// a * V5
        VADD    dVi5,dT1,dVi5

        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @// data[6]

        @// calculate b*v7
        VQRDMULH  dT1,dVr7,dT0[0]
        VQRDMULH  dVi7,dVi7,dT0[0]

        VHADD   qY1,qV1,qV5
        VHSUB   qY5,qV1,qV5


        VADD    dVr7,dT1,dVi7                 @// b * V7
        VSUB    dVi7,dVi7,dT1
        SUB     pDst, pDst, step2             @// set pDst to y1

        VHSUB   dYr3,dVr3,dVr7
        VHSUB   dYi3,dVi3,dVi7
        VST2    {dYr1,dYi1},[pDst :128],step1         @// store y1
        VHADD   dYr7,dVr3,dVr7
        VHADD   dYi7,dVi3,dVi7


        VST2    {dYr3,dYi3},[pDst :128],step1         @// store y3
        VST2    {dYr5,dYi5},[pDst :128],step1         @// store y5
#if 0
        VST2    {dYr7,dYi7},[pDst :128],#16           @// store y7
#else
        @// post-increment form kept; immediate-offset form above is disabled
        VST2    {dYr7,dYi7},[pDst :128]!              @// store y7
#endif
        .else

        @// calculate b*v7
        VQRDMULH  dT1,dVr7,dT0[0]
        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @// data[4]
        VQRDMULH  dVi7,dVi7,dT0[0]

        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @// data[5]
        VADD    dVr7,dT1,dVi7                 @// b * V7
        VSUB    dVi7,dVi7,dT1

        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @// data[6]

        @// calculate a*v5
        VQRDMULH  dT1,dVr5,dT0[0]             @// use dVi0 for dT1
        VQRDMULH  dVi5,dVi5,dT0[0]

        VHADD   dYr7,dVr3,dVr7
        VHADD   dYi7,dVi3,dVi7
        SUB     pDst, pDst, step2             @// set pDst to y1

        VSUB    dVr5,dT1,dVi5                 @// a * V5
        VADD    dVi5,dT1,dVi5

        VHSUB   qY5,qV1,qV5

        VHSUB   dYr3,dVr3,dVr7
        @// NOTE(review): forward path writes dY7,dY5,dY3,dY1 into output
        @// slots y1,y3,y5,y7 (order reversed vs the inverse path above)
        VST2    {dYr7,dYi7},[pDst :128],step1         @// store y1
        VHSUB   dYi3,dVi3,dVi7
        VHADD   qY1,qV1,qV5


        VST2    {dYr5,dYi5},[pDst :128],step1         @// store y3
        VST2    {dYr3,dYi3},[pDst :128],step1         @// store y5
#if 0
        VST2    {dYr1,dYi1},[pDst :128],#16           @// store y7
#else
        VST2    {dYr1,dYi1},[pDst :128]!              @// store y7
#endif

        .endif



        .else
        @// Unscaled variant: same butterfly network with full-precision
        @// VADD/VSUB instead of the halving forms.

        @// finish first stage of 8 point FFT

        VADD    qU0,qX0,qX4
        VADD    qU2,qX1,qX5
        VADD    qU4,qX2,qX6
        VADD    qU6,qX3,qX7

        @// finish second stage of 8 point FFT

        VADD    qV0,qU0,qU4
        VSUB    qV2,qU0,qU4
        VADD    qV4,qU2,qU6
        VSUB    qV6,qU2,qU6

        @// finish third stage of 8 point FFT

        VADD    qY0,qV0,qV4
        VSUB    qY4,qV0,qV4
        VST2    {dYr0,dYi0},[pDst :128],step1         @// store y0

        .ifeqs  "\inverse", "TRUE"

        VSUB    dYr2,dVr2,dVi6
        VADD    dYi2,dVi2,dVr6

        VADD    dYr6,dVr2,dVi6
        VST2    {dYr2,dYi2},[pDst :128],step1         @// store y2
        VSUB    dYi6,dVi2,dVr6

        VSUB    qU1,qX0,qX4
        VST2    {dYr4,dYi4},[pDst :128],step1         @// store y4

        VSUB    qU3,qX1,qX5
        VSUB    qU5,qX2,qX6
        VST2    {dYr6,dYi6},[pDst :128],step1         @// store y6

        .else

        VADD    dYr6,dVr2,dVi6
        VSUB    dYi6,dVi2,dVr6

        VSUB    dYr2,dVr2,dVi6
        VST2    {dYr6,dYi6},[pDst :128],step1         @// store y2
        VADD    dYi2,dVi2,dVr6


        VSUB    qU1,qX0,qX4
        VST2    {dYr4,dYi4},[pDst :128],step1         @// store y4
        VSUB    qU3,qX1,qX5
        VSUB    qU5,qX2,qX6
        VST2    {dYr2,dYi2},[pDst :128],step1         @// store y6


        .endif

        @// finish first stage of 8 point FFT

        VSUB    qU7,qX3,qX7
        VMOV    dT0[0],t0                     @// dT0[0] = 1/sqrt(2) (Q15)

        @// finish second stage of 8 point FFT
        @// (loads below fetch data[0..3] for the next loop iteration)

        VSUB    dVr1,dUr1,dUi5
        VLD2    {dXr0,dXi0},[pSrc :128],pointStep     @// data[0] for next iteration
        VADD    dVi1,dUi1,dUr5
        VADD    dVr3,dUr1,dUi5
        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @// data[1]
        VSUB    dVi3,dUi1,dUr5

        VSUB    dVr5,dUr3,dUi7
        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @// data[2]
        VADD    dVi5,dUi3,dUr7
        VADD    dVr7,dUr3,dUi7
        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @// data[3]
        VSUB    dVi7,dUi3,dUr7

        @// finish third stage of 8 point FFT

        .ifeqs  "\inverse", "TRUE"

        @// calculate a*v5
        VQRDMULH  dT1,dVr5,dT0[0]             @// use dVi0 for dT1
        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @// data[4]
        VQRDMULH  dVi5,dVi5,dT0[0]

        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @// data[5]
        VSUB    dVr5,dT1,dVi5                 @// a * V5
        VADD    dVi5,dT1,dVi5

        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @// data[6]

        @// calculate b*v7
        VQRDMULH  dT1,dVr7,dT0[0]
        VQRDMULH  dVi7,dVi7,dT0[0]

        VADD    qY1,qV1,qV5
        VSUB    qY5,qV1,qV5


        VADD    dVr7,dT1,dVi7                 @// b * V7
        VSUB    dVi7,dVi7,dT1
        SUB     pDst, pDst, step2             @// set pDst to y1

        VSUB    dYr3,dVr3,dVr7
        VSUB    dYi3,dVi3,dVi7
        VST2    {dYr1,dYi1},[pDst :128],step1         @// store y1
        VADD    dYr7,dVr3,dVr7
        VADD    dYi7,dVi3,dVi7


        VST2    {dYr3,dYi3},[pDst :128],step1         @// store y3
        VST2    {dYr5,dYi5},[pDst :128],step1         @// store y5
#if 0
        VST2    {dYr7,dYi7},[pDst :128],#16           @// store y7
#else
        VST2    {dYr7,dYi7},[pDst :128]!              @// store y7
#endif
        .else

        @// calculate b*v7
        VQRDMULH  dT1,dVr7,dT0[0]
        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @// data[4]
        VQRDMULH  dVi7,dVi7,dT0[0]

        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @// data[5]
        VADD    dVr7,dT1,dVi7                 @// b * V7
        VSUB    dVi7,dVi7,dT1

        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @// data[6]

        @// calculate a*v5
        VQRDMULH  dT1,dVr5,dT0[0]             @// use dVi0 for dT1
        VQRDMULH  dVi5,dVi5,dT0[0]

        VADD    dYr7,dVr3,dVr7
        VADD    dYi7,dVi3,dVi7
        SUB     pDst, pDst, step2             @// set pDst to y1

        VSUB    dVr5,dT1,dVi5                 @// a * V5
        VADD    dVi5,dT1,dVi5

        VSUB    qY5,qV1,qV5

        VSUB    dYr3,dVr3,dVr7
        @// NOTE(review): forward path writes dY7,dY5,dY3,dY1 into output
        @// slots y1,y3,y5,y7 (order reversed vs the inverse path above)
        VST2    {dYr7,dYi7},[pDst :128],step1         @// store y1
        VSUB    dYi3,dVi3,dVi7
        VADD    qY1,qV1,qV5


        VST2    {dYr5,dYi5},[pDst :128],step1         @// store y3
        VST2    {dYr3,dYi3},[pDst :128],step1         @// store y5
#if 0
        VST2    {dYr1,dYi1},[pDst :128],#16           @// store y7
#else
        VST2    {dYr1,dYi1},[pDst :128]!              @// store y7
#endif

        .endif


        .endif

        SUB     pDst, pDst, step2             @// update pDst for the next set
        BGT     grpZeroSetLoop\name


        @// reset pSrc to pDst for the next stage
        SUB     pSrc,pDst,pointStep           @// pDst -= 2*grpSize
        MOV     pDst,pPingPongBuf



        .endm


        @// Allocate stack memory required by the function
        @// Four entry points: forward/inverse x unscaled/scaled, all
        @// instantiated from the single FFTSTAGE macro above.

        M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","FALSE",FWD
        M_END


        M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","TRUE",INV
        M_END


        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
        FFTSTAGE "TRUE","FALSE",FWDSFS
        M_END


        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
        FFTSTAGE "TRUE","TRUE",INVSFS
        M_END




        .end