@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This file was originally licensed as follows. It has been
@//  relicensed with permission from the copyright holders.
@//

@//
@// File Name:  armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
@// OpenMAX DL: v1.0.2
@// Last Modified Revision:   7767
@// Last Modified Date:       Thu, 27 Sep 2007
@//
@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@//
@//
@//
@// Description:
@// Compute a Radix 4 FFT stage for a N point complex signal
@//
@// Syntax: GNU as (ARM/NEON mnemonics), run through the C preprocessor
@// for the #include and #define lines below.
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@// Import symbols required from other files
@// (For example tables)




@// Set debugging level
@//DEBUG_ON    SETL {TRUE}


@// Guarding implementation by the processor name


@// Import symbols required from other files
@// (For example tables)
    @//IMPORT  armAAC_constTable

@//Input Registers
@// NOTE: subFFTNum is listed here but FFTSTAGE only writes it (to 1);
@// subFFTSize is read on entry and then overwritten.

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7



@//Output Registers


@//Local Scratch Registers
@// grpCount and pTmp share r4: grpCount is the loop counter, pTmp is only
@// used after the loop has finished.

#define outPointStep    r3
#define grpCount        r4
#define dstStep         r5
#define grpTwStep       r8
#define stepTwiddle     r9
#define twStep          r10
#define pTmp            r4
#define step16          r11
#define step24          r12


@// Neon Registers
@// The dButterfly* names and the dX* names are two views of D0-D7.

#define dButterfly1Real02   D0.S32
#define dButterfly1Imag02   D1.S32
#define dButterfly1Real13   D2.S32
#define dButterfly1Imag13   D3.S32
#define dButterfly2Real02   D4.S32
#define dButterfly2Imag02   D5.S32
#define dButterfly2Real13   D6.S32
#define dButterfly2Imag13   D7.S32
#define dXr0                D0.S32
#define dXi0                D1.S32
#define dXr1                D2.S32
#define dXi1                D3.S32
#define dXr2                D4.S32
#define dXi2                D5.S32
#define dXr3                D6.S32
#define dXi3                D7.S32

#define dYr0                D16.S32
#define dYi0                D17.S32
#define dYr1                D18.S32
#define dYi1                D19.S32
#define dYr2                D20.S32
#define dYi2                D21.S32
#define dYr3                D22.S32
#define dYi3                D23.S32

#define dW1r                D8.S32
#define dW1i                D9.S32
#define dW2r                D10.S32
#define dW2i                D11.S32
#define dW3r                D12.S32
#define dW3i                D13.S32

@// 64-bit product accumulators. They overlap other names:
@// qT0 is Q7 (same register as qZ0 = dZr0/dZi0) and qT1..qT4 are Q8..Q11
@// (same registers as qY0..qY3), so the instruction order in the loop
@// matters and must not be rearranged casually.
#define qT0                 Q7.S64
#define qT1                 Q8.S64
#define qT2                 Q9.S64
#define qT3                 Q10.S64
#define qT4                 Q11.S64
#define qT5                 Q12.S64

#define dZr0                D14.S32
#define dZi0                D15.S32
#define dZr1                D26.S32
#define dZi1                D27.S32
#define dZr2                D28.S32
#define dZi2                D29.S32
#define dZr3                D30.S32
#define dZi3                D31.S32

#define qX0                 Q0.S32
#define qY0                 Q8.S32
#define qY1                 Q9.S32
#define qY2                 Q10.S32
#define qY3                 Q11.S32
#define qZ0                 Q7.S32
#define qZ1                 Q13.S32
#define qZ2                 Q14.S32
#define qZ3                 Q15.S32



@//---------------------------------------------------------------------
@// FFTSTAGE scaled, inverse, name
@//
@// Emits the code for one radix-4 stage. It is expanded four times by
@// the M_START/M_END entry points at the bottom of this file.
@//
@//   scaled   "TRUE"  -> the 4-point butterfly uses halving adds and
@//                       subtracts (VHADD/VHSUB);
@//            otherwise  plain VADD/VSUB are used.
@//   inverse  "TRUE"  -> twiddle products are formed as
@//                         re = wr*xr + wi*xi,  im = wr*xi - wi*xr
@//                       and results are stored in the order Z0,Z3,Z2,Z1;
@//            otherwise  re = wr*xr - wi*xi,  im = wr*xi + wi*xr
@//                       and results are stored in the order Z0,Z1,Z2,Z3.
@//   name     suffix appended to the loop label so that every expansion
@//            gets its own label (grpLoop\name).
@//
@// Reads on entry : pSrc (r0), pTwiddle (r1), pDst (r2), subFFTSize (r7)
@// Writes         : r0-r12, the condition flags (SUBS), NEON D0-D31
@// On exit        : subFFTNum = 1, subFFTSize = 4 * (subFFTSize on entry),
@//                  pSrc/pDst/pTwiddle adjusted by the code after the loop.
@//
@// Loads and stores carry alignment qualifiers (:64, :128, :256), so the
@// buffers behind pSrc, pDst and pTwiddle must satisfy them.
@//---------------------------------------------------------------------

        .MACRO FFTSTAGE scaled, inverse , name

        @// Define stack arguments


        @// pOut0+1 increments pOut0 by 8 bytes
        @// pOut0+outPointStep == increment of 8*outPointStep bytes
        MOV     outPointStep,subFFTSize,LSL #3

        @// Set up grpCount and the step registers right away, interleaved
        @// with the initial twiddle and data loads.

        VLD2    {dW1r,dW1i},[pTwiddle :128]             @// [wi|wr]
        MOV     step16,#16
        LSL     grpCount,subFFTSize,#2                  @// grpCount = 4*subFFTSize

        VLD1    dW2r,[pTwiddle :64]                     @// [wi|wr]
        MOV     subFFTNum,#1                            @//after the last stage

        VLD1    dW3r,[pTwiddle :64],step16              @// [wi|wr]
        MOV     stepTwiddle,#0

        VLD1    dW2i,[pTwiddle :64]!                    @// [wi|wr]
        SUB     grpTwStep,stepTwiddle,#8                @// grpTwStep = -8 to start with

        @// update subFFTSize for the next stage
        MOV     subFFTSize,grpCount
        VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
        MOV     dstStep,outPointStep,LSL #1

        VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
        ADD     dstStep,dstStep,outPointStep            @// dstStep = 3*outPointStep
        RSB     dstStep,dstStep,#16                     @// dstStep = - 3*outPointStep+16
        MOV     step24,#24

        VLD4    {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i


        @// Process two groups at a time

grpLoop\name :

        VZIP    dW2r,dW2i
        ADD     stepTwiddle,stepTwiddle,#16             @// increment for the next iteration
        VZIP    dW3r,dW3i
        ADD     grpTwStep,stepTwiddle,#4
        VUZP    dButterfly1Real13, dButterfly2Real13    @// B.r D.r
        SUB     twStep,stepTwiddle,#16                  @// -16+stepTwiddle
        VUZP    dButterfly1Imag13, dButterfly2Imag13    @// B.i D.i
        MOV     grpTwStep,grpTwStep,LSL #1
        VUZP    dButterfly1Real02, dButterfly2Real02    @// A.r C.r
        RSB     grpTwStep,grpTwStep,#0                  @// -8-2*stepTwiddle


        VUZP    dButterfly1Imag02, dButterfly2Imag02    @// A.i C.i


        @// grpCount is multiplied by 4, so two groups per pass == 8.
        @// The flags set here are consumed by the BGT at the bottom of the
        @// loop; the instructions in between are NEON operations.
        SUBS    grpCount,grpCount,#8

        @// X1 * W1 into 64-bit accumulators
        .ifeqs  "\inverse", "TRUE"
            VMULL   qT0,dW1r,dXr1
            VMLAL   qT0,dW1i,dXi1                       @// real part
            VMULL   qT1,dW1r,dXi1
            VMLSL   qT1,dW1i,dXr1                       @// imag part

        .else

            VMULL   qT0,dW1r,dXr1
            VMLSL   qT0,dW1i,dXi1                       @// real part
            VMULL   qT1,dW1r,dXi1
            VMLAL   qT1,dW1i,dXr1                       @// imag part

        .endif

        @// W1 is consumed; fetch the W1 pair for the next iteration
        VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle @// [wi|wr]

        @// X2 * W2; the next W2 load is interleaved with the multiply
        .ifeqs  "\inverse", "TRUE"
            VMULL   qT2,dW2r,dXr2
            VMLAL   qT2,dW2i,dXi2                       @// real part
            VMULL   qT3,dW2r,dXi2
            VLD1    dW2r,[pTwiddle :64],step16          @// [wi|wr]
            VMLSL   qT3,dW2i,dXr2                       @// imag part

        .else

            VMULL   qT2,dW2r,dXr2
            VMLSL   qT2,dW2i,dXi2                       @// real part
            VMULL   qT3,dW2r,dXi2
            VLD1    dW2r,[pTwiddle :64],step16          @// [wi|wr]
            VMLAL   qT3,dW2i,dXr2                       @// imag part

        .endif


        @// Round and narrow the 64-bit products back to 32 bits (>> 31)
        VRSHRN  dZr1,qT0,#31
        VLD1    dW2i,[pTwiddle :64],twStep              @// [wi|wr]
        VRSHRN  dZi1,qT1,#31

        @// qT0 (Q7) has been narrowed above, so Q7 can now take X0
        VMOV    qZ0,qX0                                 @// move qX0 so as to load for the next iteration
        VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i


        @// X3 * W3; the next W3 load is interleaved with the multiply
        .ifeqs  "\inverse", "TRUE"
            VMULL   qT4,dW3r,dXr3
            VMLAL   qT4,dW3i,dXi3                       @// real part
            VMULL   qT5,dW3r,dXi3
            VLD1    dW3r,[pTwiddle :64],step24
            VMLSL   qT5,dW3i,dXr3                       @// imag part

        .else

            VMULL   qT4,dW3r,dXr3
            VMLSL   qT4,dW3i,dXi3                       @// real part
            VMULL   qT5,dW3r,dXi3
            VLD1    dW3r,[pTwiddle :64],step24
            VMLAL   qT5,dW3i,dXr3                       @// imag part

        .endif

        VRSHRN  dZr2,qT2,#31
        VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
        VRSHRN  dZi2,qT3,#31

        VRSHRN  dZr3,qT4,#31
        VRSHRN  dZi3,qT5,#31
        VLD4    {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i


        .ifeqs "\scaled", "TRUE"

            @// finish first stage of 4 point FFT

            VHADD   qY0,qZ0,qZ2
            VHSUB   qY2,qZ0,qZ2
            VHADD   qY1,qZ1,qZ3
            VHSUB   qY3,qZ1,qZ3


            @// finish second stage of 4 point FFT

            .ifeqs  "\inverse", "TRUE"

                VHSUB   qZ0,qY2,qY1

                VHADD   dZr3,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VHSUB   dZi3,dYi0,dYr3

                VHADD   qZ2,qY2,qY1
                VST2    {dZr3,dZi3},[pDst :128],outPointStep

                VHSUB   dZr1,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VHADD   dZi1,dYi0,dYr3

                VST2    {dZr1,dZi1},[pDst :128],dstStep     @// dstStep = -3*outPointStep + 16


            .else

                VHSUB   qZ0,qY2,qY1

                VHSUB   dZr1,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VHADD   dZi1,dYi0,dYr3

                VHADD   qZ2,qY2,qY1
                VST2    {dZr1,dZi1},[pDst :128],outPointStep

                VHADD   dZr3,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VHSUB   dZi3,dYi0,dYr3

                VST2    {dZr3,dZi3},[pDst :128],dstStep     @// dstStep = -3*outPointStep + 16


            .endif



        .else

            @// finish first stage of 4 point FFT

            VADD    qY0,qZ0,qZ2
            VSUB    qY2,qZ0,qZ2
            VADD    qY1,qZ1,qZ3
            VSUB    qY3,qZ1,qZ3


            @// finish second stage of 4 point FFT

            .ifeqs  "\inverse", "TRUE"

                VSUB    qZ0,qY2,qY1

                VADD    dZr3,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VSUB    dZi3,dYi0,dYr3

                VADD    qZ2,qY2,qY1
                VST2    {dZr3,dZi3},[pDst :128],outPointStep

                VSUB    dZr1,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VADD    dZi1,dYi0,dYr3

                VST2    {dZr1,dZi1},[pDst :128],dstStep     @// dstStep = -3*outPointStep + 16


            .else

                VSUB    qZ0,qY2,qY1

                VSUB    dZr1,dYr0,dYi3
                VST2    {dZr0,dZi0},[pDst :128],outPointStep
                VADD    dZi1,dYi0,dYr3

                VADD    qZ2,qY2,qY1
                VST2    {dZr1,dZi1},[pDst :128],outPointStep

                VADD    dZr3,dYr0,dYi3
                VST2    {dZr2,dZi2},[pDst :128],outPointStep
                VSUB    dZi3,dYi0,dYr3

                VST2    {dZr3,dZi3},[pDst :128],dstStep     @// dstStep = -3*outPointStep + 16


            .endif

        .endif

        BGT     grpLoop\name                            @// uses the flags from SUBS grpCount above


        @// Reset and Swap pSrc and pDst for the next stage
        MOV     pTmp,pDst                               @// r4 is free again: grpCount is no longer needed
        SUB     pSrc,pSrc,#64                           @// Extra increment done in final iteration of the loop
        SUB     pDst,pSrc,outPointStep,LSL #2           @// pDst -= 4*size; pSrc -= 8*size bytes
        SUB     pSrc,pTmp,outPointStep
        SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1
        SUB     pTwiddle,pTwiddle,#16                   @// Extra increment done in final iteration of the loop

        .endm


@// Entry points. Each one expands FFTSTAGE with a different
@// (scaled, inverse) pair and a unique label suffix.
@// M_START/M_END come from armCOMM_s.h, which is not visible here; the
@// second M_START argument (r4) is passed through unchanged - presumably
@// the highest core register saved by the prologue; confirm against
@// armCOMM_s.h.

        M_START armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","FALSE",fwd
        M_END


        M_START armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4
        FFTSTAGE "FALSE","TRUE",inv
        M_END


        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
        FFTSTAGE "TRUE","FALSE",fwdsfs
        M_END


        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
        FFTSTAGE "TRUE","TRUE",invsfs
        M_END


        .end