@
@ Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Some code in this file was originally from file
@ armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S which was licensed as
@ follows. It has been relicensed with permission from the copyright holders.
@

@
@ OpenMAX DL: v1.0.2
@ Last Modified Revision: 7485
@ Last Modified Date: Fri, 21 Sep 2007
@
@ (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@

@
@ Description:
@ Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT.
@ It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation.
@ It implements both "scaled"(by 1/2) and "unscaled" versions of the above
@ formula.
@

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@ Symbolic names for the ARM core registers.  These are plain preprocessor
@ substitutions, and several names map onto the same physical register
@ (for example argScale, tmpOrder and pTwiddle are all r4, and order is r14).
@ Two names that share a register can never hold live values at the same
@ time, so check the mapping below before introducing a new use.

@//Input Registers
#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3

@ Output registers
#define result          r0

@//Local Scratch Registers
#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@ Total number of radix stages to complete the FFT.
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3
@ pOut1 shares r2 with pFFTSpec: the spec pointer is lost as soon as pOut1
@ is written, so every field must be loaded from the spec before that.
#define pOut1           r2
#define size            r7
#define step            r8
#define step1           r9
@ step2 and twStep are both r10; twStep must be consumed before step2 is set.
#define step2           r10
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@ Neon registers
@ As with the core registers, many names below overlay the same D register.
@ Q registers overlay D register pairs: qX1 (Q1) covers dX1r:dX1i (D2:D3)
@ and qY1 (Q3) covers dY1r:dY1i (D6:D7), so one Q operation acts on both.
@ dY0r..dY1i reuse D4..D7, the registers that hold the twiddles dW0r..dW1i.
#define dX0             D0.S16
#define dX0S32          D0.S32
#define dShift          D1.S16
#define dX1             D1.S16
#define dX1S32          D1.S32
#define dY0             D2.S16
#define dY1             D3.S16
#define dX0r            D0.S16
#define dX0rS32         D0.S32
#define dX0i            D1.S16
#define dX1r            D2.S16
#define dX1i            D3.S16
#define qX1             Q1.S16
#define dW0r            D4.S16
#define dW0i            D5.S16
#define dW1r            D6.S16
#define dW1i            D7.S16
#define dW0rS32         D4.S32
#define dW0iS32         D5.S32
#define dW1rS32         D6.S32
#define dW1iS32         D7.S32
#define dT0             D8.S16
#define dT1             D9.S16
#define dT2             D10.S16
#define dT3             D11.S16
#define qT0             Q6.S32
#define qT1             Q7.S32
#define qT2             Q8.S32
#define qT3             Q9.S32
#define dY0r            D4.S16
#define dY0i            D5.S16
#define dY1r            D6.S16
#define dY1i            D7.S16
#define qY1             Q3.S16
#define dY2             D4.S16
#define dY3             D5.S16
#define dW0             D6.S16
#define dW1             D7.S16
#define dW0Tmp          D10.S16
#define dW1Neg          D11.S16

        @ Structure offsets for the FFTSpec
        @ (byte offsets of the fields read through pFFTSpec)
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

        @ FFTSTAGE scaled, inverse, name
        @
        @ Emits the body of one preTwiddleRadix2 routine.
        @   scaled  - "TRUE" selects the halving forms (VHADD/VHSUB, VSHR) in
        @             the final combine steps and decrements scale by one on
        @             exit; any other value selects VADD/VSUB and leaves
        @             scale untouched.
        @   inverse - not referenced inside the macro body.
        @   name    - suffix appended to every label so the macro can be
        @             expanded more than once in the same file.
        @
        @ Registers read on entry: pSrc (r0), pFFTSpec (r2) and, in the
        @ scaled variant, scale (r3).  Results are stored through pOut1,
        @ which starts at pBuf (loaded from the spec) plus N*2 bytes.
        .MACRO FFTSTAGE scaled, inverse, name

        @ Read N from the FFTSpec structure (no log is computed here)
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @ Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        MOV     size,N,ASR #1                   @ size = N/2; N itself is left unchanged
        MOV     step,N,LSL #1                   @ step = N/2 * 4 bytes

        @ Process different FFT sizes with different loops.
        @ size <= 4 takes the lane-by-lane path at smallFFTSize; larger
        @ sizes use the full D register loop below.
        CMP     size,#4
        BLE     smallFFTSize\name

        @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
        @ Note: W^(k) is stored as negated value and also need to
        @ conjugate the values from the table.

        @ Z(0) : no need of twiddle multiply
        @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }

        VLD1    dX0S32[0],[pSrc],step           @ first complex value; pSrc then jumps by step
        ADD     pOut1,pOut,step                 @ pOut1 = pOut+ N/2*4 bytes (overwrites pFFTSpec)

        VLD1    dX1S32[0],[pSrc]!               @ complex value located step bytes further on
        SUB     twStep,step,size                @ twStep = 3N/8 * 4 bytes pointing to W^1

        MOV     step1,size,LSL #1               @ step1 = N/4 * 4 = N/2*2 bytes
        SUB     step1,step1,#4                  @ (N/4-1)*4 bytes

        VHADD   dY0,dX0,dX1                     @ [b+d | a+c]
        VHSUB   dY1,dX0,dX1                     @ [b-d | a-c]
        VTRN    dY0,dY1                         @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]

        @ The flags set by SUBS in either arm below are tested by the
        @ BLT/BEQ pair further down; nothing in between updates them.
        .ifeqs  "\scaled", "TRUE"
        VHSUB   dX0,dY0,dY1
        SUBS    size,size,#2
        VHADD   dX1,dY0,dY1
        .else
        VSUB    dX0,dY0,dY1
        SUBS    size,size,#2
        VADD    dX1,dY0,dY1
        .endif

        SUB     pSrc,pSrc,step                  @ undo the step jump made by the first load
        VST1    dX0[0],[pOut1]!
        ADD     pTwiddleTmp,pTwiddle,#4         @ W^2
        VST1    dX1[1],[pOut1]!
        ADD     argTwiddle1,pTwiddle,twStep     @ W^1 (last use of twStep before r10 becomes step2)

        BLT     decrementScale\name
        BEQ     lastElement\name

        SUB     step,step,#20
        SUB     step1,step1,#4                  @ (N/4-2)*4 bytes
        SUB     step2, step1, #4

        @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
        @ Note: W^k is stored as negative values in the table and also need to
        @ conjugate the values from the table.
        @ Each iteration loads four complex S16 values from the front of the
        @ input and four from the far end (one VLD2 of two D registers each)
        @ and decrements size by 8.  Every load/store pair below jumps forward
        @ by a stride, accesses the far end, then subtracts the stride again,
        @ so the near-end pointer nets +16 bytes per iteration while the
        @ strides shrink (step by 32, step1 and step2 by 16).

evenOddButterflyLoop\name:
        VLD2    {dX0r,dX0i},[pSrc],step
        VLD2    {dX1r,dX1i},[pSrc]!
        SUB     pSrc, pSrc, step

        VLD1    dW0r,[argTwiddle1],step1
        VREV64  qX1,qX1                         @ reverse lane order of both dX1r and dX1i
        VLD1    dW1r,[argTwiddle1]!
        VHSUB   dT2,dX0r,dX1r                   @ a-c
        SUB     argTwiddle1, argTwiddle1, step1
        SUB     step1,step1,#16

        VLD1    dW0i,[pTwiddleTmp],step2
        VHADD   dT3,dX0i,dX1i                   @ b+d
        VLD1    dW1i,[pTwiddleTmp]!
        VHADD   dT0,dX0r,dX1r                   @ a+c
        VHSUB   dT1,dX0i,dX1i                   @ b-d
        SUB     pTwiddleTmp, pTwiddleTmp, step2
        SUB     step2,step2,#16

        SUBS    size,size,#8                    @ flags are consumed by the BGT at the loop bottom

        @ Rearrange the loaded twiddles into the lane order used by the
        @ multiplies below.
        VZIP    dW1r,dW1i
        VTRN    dW0r,dW0i
        VZIP    dW1iS32, dW1rS32

        @ Widening 16x16->32 multiply-accumulate of the twiddles with the
        @ difference terms dT2 (a-c) and dT3 (b+d).
        VMULL   qT0,dW1i,dT2
        VMLSL   qT0,dW1r,dT3
        VMULL   qT1,dW1i,dT3
        VMLAL   qT1,dW1r,dT2
        VMULL   qT2,dW0r,dT2
        VMLAL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLSL   qT3,dW0i,dT2

        @ Round and narrow the 32-bit products back to 16 bits (shift by 15).
        VRSHRN  dX1r,qT0,#15
        VRSHRN  dX1i,qT1,#15
        VRSHRN  dX0r,qT2,#15
        VRSHRN  dX0i,qT3,#15

        .ifeqs  "\scaled", "TRUE"
        VHADD   dY1r,dT0,dX1i                   @ F(N/2 -1)
        VHSUB   dY1i,dX1r,dT1
        .else
        VADD    dY1r,dT0,dX1i                   @ F(N/2 -1)
        VSUB    dY1i,dX1r,dT1
        .endif

        .ifeqs  "\scaled", "TRUE"
        VHADD   dY0r,dT0,dX0i                   @ F(1)
        VHSUB   dY0i,dT1,dX0r
        .else
        VADD    dY0r,dT0,dX0i                   @ F(1)
        VSUB    dY0i,dT1,dX0r
        .endif

        VREV64  qY1,qY1                         @ reverse lane order of both dY1r and dY1i

        VST2    {dY0r,dY0i},[pOut1],step
        VST2    {dY1r,dY1i},[pOut1]
        ADD     pOut1,pOut1,#16
        SUB     pOut1, pOut1, step
        SUB     step,step,#32

        BGT     evenOddButterflyLoop\name

        SUB     pSrc,pSrc,#4                    @ set both the ptrs to the last element
        SUB     pOut1,pOut1,#4
        B       lastElement\name

smallFFTSize\name:
        @ Z(k) = 1/2 {[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]}
        @ Note: W^(k) is stored as negated value and also need to
        @ conjugate the values from the table.

        @ Z(0) : no need of twiddle multiply
        @ Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }

        VLD1    dX0S32[0],[pSrc],step           @ first complex value; pSrc then jumps by step
        ADD     pOut1,pOut,step                 @ pOut1 = pOut+ N/2*4 bytes (overwrites pFFTSpec)

        VLD1    dX1S32[0],[pSrc]!               @ complex value located step bytes further on
        SUB     twStep,step,size                @ twStep = 3N/8 * 4 bytes pointing to W^1

        MOV     step1,size,LSL #1               @ step1 = N/4 * 4 = N/2*2 bytes
        SUB     step1,step1,#4                  @ (N/4-1)*4 bytes

        VHADD   dY0,dX0,dX1                     @ [b+d | a+c]
        VHSUB   dY1,dX0,dX1                     @ [b-d | a-c]
        VTRN    dY0,dY1                         @ dY0= [a-c | a+c] ;dY1= [b-d | b+d]

        @ The flags set by SUBS in either arm below are tested by the
        @ BLT/BEQ pair further down; nothing in between updates them.
        .ifeqs  "\scaled", "TRUE"
        VHSUB   dX0,dY0,dY1
        SUBS    size,size,#2
        VHADD   dX1,dY0,dY1
        .else
        VSUB    dX0,dY0,dY1
        SUBS    size,size,#2
        VADD    dX1,dY0,dY1
        .endif

        SUB     pSrc,pSrc,step                  @ undo the step jump made by the first load
        VST1    dX0[0],[pOut1]!
        ADD     pTwiddleTmp,pTwiddle,#4         @ W^2
        VST1    dX1[1],[pOut1]!
        ADD     argTwiddle1,pTwiddle,twStep     @ W^1

        BLT     decrementScale\name             @ size-2 < 0: nothing more to compute
        BEQ     lastElement\name                @ size-2 == 0: only the last element remains

        @ Z(k) = 1/2[F(k) + F'(N/2-k)] +j*W^(-k) [F(k) - F'(N/2-k)]
        @ Note: W^k is stored as negative values in the table and also need to
        @ conjugate the values from the table.
        @ Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        @ since both of them require F(1),F(2) and F(N/2-2),F(N/2-1).
        @ Despite the label name there is no branch back to it: the code
        @ below runs once and then falls through to lastElement.

        SUB     step,step,#12

evenOddButterflyLoopSize4\name:
        VLD1    dW0rS32[0],[argTwiddle1],step1
        VLD1    dW1rS32[0],[argTwiddle1]!

        @ Two complex values from the near end go into lanes 0 and 1 of
        @ dX0r/dX0i, then two from the far end into lanes 0 and 1 of
        @ dX1r/dX1i.
        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
        VLD2    {dX0r[1],dX0i[1]},[pSrc],step
        SUB     pSrc,pSrc,#4
        SUB     argTwiddle1,argTwiddle1,step1
        VLD2    {dX1r[0],dX1i[0]},[pSrc]!
        VLD2    {dX1r[1],dX1i[1]},[pSrc]!

        SUB     step1,step1,#4                  @ (N/4-2)*4 bytes
        VLD1    dW0iS32[0],[pTwiddleTmp],step1
        VLD1    dW1iS32[0],[pTwiddleTmp]!
        SUB     pSrc,pSrc,step

        SUB     pTwiddleTmp,pTwiddleTmp,step1
        VREV32  dX1r,dX1r                       @ swap the two far-end values into reverse order
        VREV32  dX1i,dX1i
        SUBS    size,size,#4

        VHSUB   dT2,dX0r,dX1r                   @ a-c
        VHADD   dT3,dX0i,dX1i                   @ b+d
        SUB     step1,step1,#4
        VHADD   dT0,dX0r,dX1r                   @ a+c
        VHSUB   dT1,dX0i,dX1i                   @ b-d

        VTRN    dW1r,dW1i
        VTRN    dW0r,dW0i

        @ Widening 16x16->32 multiply-accumulate of the twiddles with the
        @ difference terms dT2 (a-c) and dT3 (b+d).
        VMULL   qT0,dW1r,dT2
        VMLSL   qT0,dW1i,dT3
        VMULL   qT1,dW1r,dT3
        VMLAL   qT1,dW1i,dT2
        VMULL   qT2,dW0r,dT2
        VMLAL   qT2,dW0i,dT3
        VMULL   qT3,dW0r,dT3
        VMLSL   qT3,dW0i,dT2

        @ Round and narrow the 32-bit products back to 16 bits (shift by 15).
        VRSHRN  dX1r,qT0,#15
        VRSHRN  dX1i,qT1,#15

        .ifeqs  "\scaled", "TRUE"
        VHADD   dY1r,dT0,dX1i                   @ F(N/2 -1)
        VHSUB   dY1i,dX1r,dT1
        .else
        VADD    dY1r,dT0,dX1i                   @ F(N/2 -1)
        VSUB    dY1i,dX1r,dT1
        .endif

        VREV32  dY1r,dY1r                       @ restore memory order of the far-end results
        VREV32  dY1i,dY1i

        VRSHRN  dX0r,qT2,#15
        VRSHRN  dX0i,qT3,#15

        .ifeqs  "\scaled", "TRUE"
        VHADD   dY0r,dT0,dX0i                   @ F(1)
        VHSUB   dY0i,dT1,dX0r
        .else
        VADD    dY0r,dT0,dX0i                   @ F(1)
        VSUB    dY0i,dT1,dX0r
        .endif

        @ Store pattern mirrors the lane loads above: two results at the
        @ near end, jump by step, two results at the far end, jump back.
        VST2    {dY0r[0],dY0i[0]},[pOut1]!
        VST2    {dY0r[1],dY0i[1]},[pOut1],step
        SUB     pOut1, #4
        VST2    {dY1r[0],dY1i[0]},[pOut1]!
        VST2    {dY1r[1],dY1i[1]},[pOut1]!
        SUB     pOut1,pOut1,step
        SUB     pSrc,pSrc,#4                    @ set both the ptrs to the last element
        SUB     pOut1,pOut1,#4

        @ Last element can be expanded as follows
        @ 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (W^k is stored as -ve)
        @ 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        @ 1/2[2a+j0] - j (c-jd) [0+j2b]
        @ (a+bc, -bd)
        @ Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement\name:
        VLD1    dX0rS32[0],[pSrc]               @ load one complex value (a,b) as a 32-bit lane

        .ifeqs  "\scaled", "TRUE"
        VSHR    dX0r,dX0r,#1                    @ scaled variant: halve both a and b
        .endif

        VST1    dX0r[0],[pOut1]!                @ store a
        VNEG    dX0r,dX0r
        VST1    dX0r[1],[pOut1]                 @ store -b

decrementScale\name:
        @ Every path through the macro ends here.
        .ifeqs  "\scaled", "TRUE"
        SUB     scale,scale,#1
        .endif

        .endm

        @ Two expansions of FFTSTAGE: an unscaled routine and a scaled (Sfs)
        @ one.  NOTE(review): M_START / M_END are not defined in this file;
        @ presumably they come from armCOMM_s.h and wrap the body in the
        @ function entry/exit sequence - confirm what the r4 argument saves,
        @ since the body writes core registers up to r12 and D8-D19.

        M_START armSP_FFTInv_CCSToR_S16_preTwiddleRadix2_unsafe,r4
        FFTSTAGE "FALSE","TRUE",Inv
        M_END

        M_START armSP_FFTInv_CCSToR_S16_Sfs_preTwiddleRadix2_unsafe,r4
        FFTSTAGE "TRUE","TRUE",InvSfs
        M_END


        .end