@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This file was originally licensed as follows. It has been
@//  relicensed with permission from the copyright holders.
@//

@//
@// File Name:  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s
@// OpenMAX DL: v1.0.2
@// Last Modified Revision:   7485
@// Last Modified Date:       Fri, 21 Sep 2007
@//
@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
@//
@//
@//
@// Description:
@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
@// It implements both "scaled" (by 1/2) and "unscaled" versions of the above formula
@//
@// Syntax: GNU as (ARM), fed through the C preprocessor.  Comments are written
@// as "@//" so that cpp strips the text and the assembler only sees a bare "@".
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)


@// Set debugging level
@//DEBUG_ON    SETL {TRUE}


@// Guarding implementation by the processor name
@// NOTE(review): no processor guard directive is present in this file; the
@// comment appears to be left over from an earlier version - confirm.


@// Input Registers

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3


@// Output registers
@// (shares r0 with pSrc)
#define result          r0

@// Local Scratch Registers
@// Several aliases below deliberately name the same physical register
@// (e.g. argScale/tmpOrder/pTwiddle/x0r are all r4, subFFTNum/N are both r6);
@// only one alias per register can be live at any point.

#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@// count: total num of radix stages required to complete the FFT.
@// The comment is kept off the #define line on purpose: cpp removes only the
@// "// ..." part of a trailing "@// ..." comment, which would leave the "@"
@// inside the macro body and make "count" expand to "r8 @".
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3

@// Scalar working registers used by FFTSTAGE.
@// pOut1 shares r2 with pFFTSpec, so the spec pointer is dead once pOut1
@// has been written.
#define pOut1           r2
#define size            r7
#define step            r8
#define step1           r9
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@// Neon registers
@// Many names alias the same D register; they are used in different phases
@// of the macro (e.g. dW0r/dY0r are both D4, dW1r/dY1r are both D6).

#define dX0             D0.S32
#define dShift          D1.S32
#define dX1             D1.S32
#define dY0             D2.S32
#define dY1             D3.S32
#define dX0r            D0.S32
#define dX0i            D1.S32
#define dX1r            D2.S32
#define dX1i            D3.S32
#define dW0r            D4.S32
#define dW0i            D5.S32
#define dW1r            D6.S32
#define dW1i            D7.S32
#define dT0             D8.S32
#define dT1             D9.S32
#define dT2             D10.S32
#define dT3             D11.S32
#define qT0             Q6.S64
#define qT1             Q7.S64
#define qT2             Q8.S64
#define qT3             Q9.S64
#define dY0r            D4.S32
#define dY0i            D5.S32
#define dY1r            D6.S32
#define dY1i            D7.S32

#define dY2             D4.S32
#define dY3             D5.S32
#define dW0             D6.S32
#define dW1             D7.S32
#define dW0Tmp          D10.S32
#define dW1Neg          D11.S32


@// Structure offsets for the FFTSpec
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12


@// ---------------------------------------------------------------------------
@// FFTSTAGE scaled, inverse, name
@//
@//   scaled  : "TRUE" selects the halving (VHADD/VHSUB/VSHR) variants and
@//             decrements the scale register by one at the end; any other
@//             string selects the plain VADD/VSUB variants.
@//   inverse : accepted but not referenced anywhere in the macro body.
@//   name    : suffix appended to the local labels so that the macro can be
@//             expanded more than once in this file.
@//
@//   In      : pSrc (r0) = input, pFFTSpec (r2) = spec structure,
@//             scale (r3) = scale factor (only touched when scaled == "TRUE")
@//   Reads   : N, pTwiddle and pBuf from the spec structure
@//   Writes  : results go to pBuf + N*4 bytes onwards (pOut1)
@//   Uses    : r2, r4-r12 and D0-D11, Q6-Q9 as scratch; flags
@// ---------------------------------------------------------------------------
        .macro FFTSTAGE scaled, inverse, name

        @// Fetch the transform size from the spec
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Fetch the twiddle table and scratch buffer pointers
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]


        MOV     size, N, ASR #1                 @// size = N/2 (N itself is left intact)
        MOV     step, N, LSL #2                 @// step = N*4 = N/2 * 8 bytes


        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        @// Note: W^(k) is stored negated in the table, and the table values
        @// also have to be conjugated.

        @// Z(0) needs no twiddle multiply:
        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }

        VLD1    dX0, [pSrc], step               @// F(0); pSrc += step
        ADD     pOut1, pOut, step               @// pOut1 = pOut + N/2*8 bytes (r2 no longer holds pFFTSpec)

        VLD1    dX1, [pSrc]!                    @// F(N/2); pSrc += 8
        SUB     twStep, step, size, LSL #1      @// twStep = 4N - N = 3N/8 * 8 bytes, offset of W^1

        MOV     step1, size, LSL #2             @// step1 = N/4 * 8 = N/2*4 bytes
        SUB     step1, step1, #8                @// (N/4-1)*8 bytes

        VHADD   dY0, dX0, dX1                   @// [b+d | a+c]
        VHSUB   dY1, dX0, dX1                   @// [b-d | a-c]
        VZIP    dY0, dY1                        @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]

        @// The SUBS inside either branch sets the flags that the BLT/BEQ
        @// below test; nothing between here and those branches writes flags.
        .ifeqs  "\scaled", "TRUE"
            VHSUB   dX0, dY0, dY1
            SUBS    size, size, #2
            VHADD   dX1, dY0, dY1
        .else
            VSUB    dX0, dY0, dY1
            SUBS    size, size, #2
            VADD    dX1, dY0, dY1
        .endif

        SUB     pSrc, pSrc, step                @// rewind: pSrc now points at F(1)

        VST1    dX0[0], [pOut1]!                @// store lane 0 of dX0
        ADD     pTwiddleTmp, pTwiddle, #8       @// W^2
        VST1    dX1[1], [pOut1]!                @// store lane 1 of dX1
        ADD     argTwiddle1, pTwiddle, twStep   @// W^1


        BLT     decrementScale\name             @// N/2 < 2: nothing left to do
        BEQ     lastElement\name                @// N/2 == 2: only the middle element remains


        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
        @// Note: W^k is stored negated in the table, and the table values
        @// also have to be conjugated.
        @// Four elements are handled per pass, e.g. Z(1),Z(2) together with
        @// Z(N/2-2),Z(N/2-1), because both pairs need F(1),F(2) and
        @// F(N/2-2),F(N/2-1).


        SUB     step, step, #24                 @// step = byte distance from F(1) to F(N/2-2)
evenOddButterflyLoop\name :


        VLD1    dW0r, [argTwiddle1], step1
        VLD1    dW1r, [argTwiddle1]!

        VLD2    {dX0r,dX0i}, [pSrc], step       @// low pair, de-interleaved into re/im
        SUB     argTwiddle1, argTwiddle1, step1
        VLD2    {dX1r,dX1i}, [pSrc]!            @// high pair, de-interleaved into re/im

        SUB     step1, step1, #8                @// (N/4-2)*8 bytes
        VLD1    dW0i, [pTwiddleTmp], step1
        VLD1    dW1i, [pTwiddleTmp]!
        SUB     pSrc, pSrc, step                @// net effect: pSrc advanced by 16 bytes

        SUB     pTwiddleTmp, pTwiddleTmp, step1
        VREV64  dX1r, dX1r                      @// swap the two high-pair elements
        VREV64  dX1i, dX1i
        SUBS    size, size, #4                  @// loop counter; flags consumed by BGT at the bottom


        VHSUB   dT2, dX0r, dX1r                 @// a-c
        VHADD   dT3, dX0i, dX1i                 @// b+d
        SUB     step1, step1, #8
        VHADD   dT0, dX0r, dX1r                 @// a+c
        VHSUB   dT1, dX0i, dX1i                 @// b-d

        VZIP    dW1r, dW1i
        VZIP    dW0r, dW0i


        @// 32x32 -> 64 bit twiddle products
        VMULL   qT0, dW1r, dT2
        VMLSL   qT0, dW1i, dT3
        VMULL   qT1, dW1r, dT3
        VMLAL   qT1, dW1i, dT2

        VMULL   qT2, dW0r, dT2
        VMLAL   qT2, dW0i, dT3
        VMULL   qT3, dW0r, dT3
        VMLSL   qT3, dW0i, dT2


        @// Round and narrow the 64-bit products back to 32 bits (>> 31)
        VRSHRN  dX1r, qT0, #31
        VRSHRN  dX1i, qT1, #31

        .ifeqs  "\scaled", "TRUE"
            VHADD   dY1r, dT0, dX1i             @// F(N/2 -1)
            VHSUB   dY1i, dX1r, dT1
        .else
            VADD    dY1r, dT0, dX1i             @// F(N/2 -1)
            VSUB    dY1i, dX1r, dT1

        .endif


        VREV64  dY1r, dY1r                      @// put the high pair back in memory order
        VREV64  dY1i, dY1i


        VRSHRN  dX0r, qT2, #31
        VRSHRN  dX0i, qT3, #31

        .ifeqs  "\scaled", "TRUE"
            VHADD   dY0r, dT0, dX0i             @// F(1)
            VHSUB   dY0i, dT1, dX0r
        .else
            VADD    dY0r, dT0, dX0i             @// F(1)
            VSUB    dY0i, dT1, dX0r
        .endif


        VST2    {dY0r,dY0i}, [pOut1], step      @// low pair, re/im interleaved
        VST2    {dY1r,dY1i}, [pOut1]!           @// high pair, re/im interleaved
        SUB     pOut1, pOut1, step              @// net effect: pOut1 advanced by 16 bytes
        SUB     step, step, #32                 @// (N/2-4)*8 bytes


        BGT     evenOddButterflyLoop\name


        SUB     pSrc, pSrc, #8                  @// step both pointers back onto the last element
        SUB     pOut1, pOut1, #8

        @// The last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)]  (since W^k is stored as -ve)
        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
        @// (a+bc, -bd)
        @// Since (c,d) = (0,1) for the last element, the result is just (a,-b)

lastElement\name :
        VLD1    dX0r, [pSrc]                    @// (a, b)

        .ifeqs  "\scaled", "TRUE"
            VSHR    dX0r, dX0r, #1              @// scaled variant: halve both lanes
        .endif

        VST1    dX0r[0], [pOut1]!               @// real part: a
        VNEG    dX0r, dX0r
        VST1    dX0r[1], [pOut1]                @// imaginary part: -b



decrementScale\name :

        .ifeqs  "\scaled", "TRUE"
            SUB     scale, scale, #1            @// account for the 1/2 applied by this stage
        .endif

        .endm


        @// Entry points.  M_START / M_END come from armCOMM_s.h, which is not
        @// visible here - presumably the function prologue/epilogue helpers,
        @// with r4 as the last saved register (TODO confirm against the header).

        @// Unscaled variant
        M_START armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe,r4

        FFTSTAGE "FALSE","TRUE",Inv
        M_END

        @// Scaled (Sfs) variant
        M_START armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe,r4

        FFTSTAGE "TRUE","TRUE",InvSfs
        M_END


        .end