@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of
@//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
@//  instead of SC32.
@//

@//
@// Description:
@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
@//
@// Syntax: GNU as (ARM, unified NEON mnemonics) run through the C
@// preprocessor; "@" starts an assembler comment, so every directive and
@// instruction has to sit on a physical line of its own.
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


@// Import symbols required from other files
@// (For example tables)


@// Set debugging level
@//DEBUG_ON    SETL {TRUE}


@// Guarding implementation by the processor name


@// ---------------------------------------------------------------------
@// Core register aliases.
@// These are plain cpp text substitutions.  Several aliases name the same
@// physical register (for example argScale, tmpOrder, pTwiddle and x0r
@// are all r4), so two aliases of one register can never be live at the
@// same time.  Not every alias below is referenced by the code in this
@// file.
@// ---------------------------------------------------------------------

@//Input Registers

#define pSrc            r0
#define pDst            r1
#define pFFTSpec        r2
#define scale           r3


@// Output registers
#define result          r0

@//Local Scratch Registers

#define argTwiddle      r1
#define argDst          r2
#define argScale        r4
#define tmpOrder        r4
#define pTwiddle        r4
#define pOut            r5
#define subFFTSize      r7
#define subFFTNum       r6
#define N               r6
#define order           r14
#define diff            r9
@// Total num of radix stages required to complete the FFT
#define count           r8
#define x0r             r4
#define x0i             r5
#define diffMinusOne    r2
#define round           r3

@// pOut1 shares r2 with pFFTSpec and argDst.
#define pOut1           r2
#define size            r7
#define step            r8
#define step1           r9
#define twStep          r10
#define pTwiddleTmp     r11
#define argTwiddle1     r12
#define zero            r14

@// Neon registers
@// Each alias carries its .F32 type suffix, so instructions written with
@// these names operate on 32-bit float lanes.  dShift and dX1 both name D1.

#define dX0     D0.F32
#define dShift  D1.F32
#define dX1     D1.F32
#define dY0     D2.F32
#define dY1     D3.F32
@// Split real/imaginary views used by the butterfly loop.  They overlap
@// D0-D3 named by dX0/dX1/dY0/dY1 in the preceding alias group.
#define dX0r    D0.F32
#define dX0i    D1.F32
#define dX1r    D2.F32
#define dX1i    D3.F32
@// Twiddle registers.  D4-D7 are also named dY0r..dY1i further down.
#define dW0r    D4.F32
#define dW0i    D5.F32
#define dW1r    D6.F32
#define dW1i    D7.F32
@// Temporaries for the half-scaled sums and differences.
#define dT0     D8.F32
#define dT1     D9.F32
#define dT2     D10.F32
#define dT3     D11.F32
@// Named q* but defined as D registers; not referenced by the code in
@// this file.
#define qT0     D12.F32
#define qT1     D14.F32
#define qT2     D16.F32
#define qT3     D18.F32
@// Loop outputs; same registers as dW0r..dW1i (D4-D7).
#define dY0r    D4.F32
#define dY0i    D5.F32
#define dY1r    D6.F32
#define dY1i    D7.F32

@// Not referenced by the code in this file.
#define dY2     D4.F32
#define dY3     D5.F32
#define dW0     D6.F32
#define dW1     D7.F32
#define dW0Tmp  D10.F32
#define dW1Neg  D11.F32

@// Holds the constant 0.5 (loaded by VMOV at the top of FFTSTAGE).
#define half    D13.F32

@// Structure offsets for the FFTSpec
@// Byte offsets used with LDR below.
@// NOTE(review): presumably these mirror the C-side FFT spec structure
@// with 4-byte fields - confirm against its declaration.
        .set    ARMsFFTSpec_N, 0
        .set    ARMsFFTSpec_pBitRev, 4
        .set    ARMsFFTSpec_pTwiddle, 8
        .set    ARMsFFTSpec_pBuf, 12

@// ---------------------------------------------------------------------
@// FFTSTAGE scaled, inverse, name
@//
@// Emits the pre-twiddle pass described in the file header.
@//
@// Macro arguments:
@//   scaled, inverse  accepted but never referenced in the macro body.
@//   name             suffix appended to the three local labels so that
@//                    each expansion gets unique label names.
@//
@// Register inputs:
@//   pSrc     (r0)  source pointer; advanced and rewound by the macro.
@//   pFFTSpec (r2)  spec pointer; N, pTwiddle and pBuf are loaded from
@//                  it.  r2 is then reused as pOut1, so the spec pointer
@//                  is lost after the first ADD into pOut1.
@//
@// Output:
@//   Results are stored starting at (pBuf + step), where step is N*4
@//   bytes.  The pDst and scale aliases are not read here.
@//
@// Registers written: r0, r2, r4-r12, D0-D11, D13 and the condition
@// flags (via SUBS).
@// ---------------------------------------------------------------------
        .macro FFTSTAGE scaled, inverse, name

        @// Read the FFT size N from the spec structure
        @// (no logarithm is taken in this macro)
        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]

        @// Read other structure parameters
        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]

        @// Both lanes of half become 0.5; used as half[0] scalar below
        VMOV    half, 0.5


        MOV     size,N,ASR #1                 @// preserve the contents of N
        MOV     step,N,LSL #2                 @// step = N/2 * 8 bytes


        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
        @// Note: W^(k) is stored as negated value and also need to
        @// conjugate the values from the table

        @// Z(0) : no need of twiddle multiply
        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }

        @// Load F(0), then move pSrc forward by step to F(N/2)
        VLD1    dX0,[pSrc],step
        ADD     pOut1,pOut,step               @// pOut1 = pOut+ N/2*8 bytes

        @// Load F(N/2); pSrc is now at start + step + 8
        VLD1    dX1,[pSrc]!
        @// twStep = 3N/8 * 8 bytes pointing to W^1
        SUB     twStep,step,size,LSL #1

        MOV     step1,size,LSL #2             @// step1 = N/4 * 8 = N/2*4 bytes
        SUB     step1,step1,#8                @// (N/4-1)*8 bytes

        VADD    dY0,dX0,dX1                   @// [b+d | a+c]
        VSUB    dY1,dX0,dX1                   @// [b-d | a-c]
        VMUL    dY0, dY0, half[0]
        VMUL    dY1, dY1, half[0]

        @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
        VZIP    dY0,dY1

        VSUB    dX0,dY0,dY1
        @// The flags set here are consumed by the BLT/BEQ pair below; no
        @// instruction in between writes the flags.
        SUBS    size,size,#2
        VADD    dX1,dY0,dY1

        @// Rewind pSrc to start + 8, i.e. the element after F(0)
        SUB     pSrc,pSrc,step

        @// Z(0): low lane of the difference, then high lane of the sum
        VST1    dX0[0],[pOut1]!
        ADD     pTwiddleTmp,pTwiddle,#8       @// W^2
        VST1    dX1[1],[pOut1]!
        ADD     argTwiddle1,pTwiddle,twStep   @// W^1


        @// N/2 < 2: nothing more to do.  N/2 == 2: only the middle
        @// element is left.
        BLT     decrementScale\name
        BEQ     lastElement\name


        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
        @// Note: W^k is stored as negative values in the table and also
        @// need to conjugate the values from the table.
        @//
        @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
        @// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)


        @// step becomes the distance between the front pair and the
        @// matching back pair; it shrinks by 32 bytes per iteration.
        SUB     step,step,#24
evenOddButterflyLoop\name :


        @// Twiddle pair from argTwiddle1: one load, skip step1, one load
        VLD1    dW0r,[argTwiddle1],step1
        VLD1    dW1r,[argTwiddle1]!

        @// Front pair of inputs, de-interleaved into real and imag
        VLD2    {dX0r,dX0i},[pSrc],step
        @// Net effect on argTwiddle1 for this iteration: +8 bytes
        SUB     argTwiddle1,argTwiddle1,step1
        @// Back pair of inputs, de-interleaved into real and imag
        VLD2    {dX1r,dX1i},[pSrc]!

        SUB     step1,step1,#8                @// (N/4-2)*8 bytes
        @// Twiddle pair from pTwiddleTmp, using the reduced step1
        VLD1    dW0i,[pTwiddleTmp],step1
        VLD1    dW1i,[pTwiddleTmp]!
        @// Net effect on pSrc for this iteration: +16 bytes
        SUB     pSrc,pSrc,step

        @// Net effect on pTwiddleTmp for this iteration: +8 bytes
        SUB     pTwiddleTmp,pTwiddleTmp,step1
        @// Swap the two lanes of the back pair to line up with the front
        VREV64  dX1r,dX1r
        VREV64  dX1i,dX1i
        @// Loop counter; the flags stay live until the BGT at the bottom
        @// (nothing in between writes the flags).
        SUBS    size,size,#4


        VSUB    dT2,dX0r,dX1r                 @// a-c
        VADD    dT3,dX0i,dX1i                 @// b+d
        VADD    dT0,dX0r,dX1r                 @// a+c
        VSUB    dT1,dX0i,dX1i                 @// b-d
        @// Prepare step1 for the first twiddle load of the next iteration
        SUB     step1,step1,#8

        @// Apply the common factor 1/2
        VMUL    dT2, dT2, half[0]
        VMUL    dT3, dT3, half[0]

        VMUL    dT0, dT0, half[0]
        VMUL    dT1, dT1, half[0]

        @// Interleave the lanes of each twiddle register pair
        VZIP    dW1r,dW1i
        VZIP    dW0r,dW0i


        @// Twiddle multiply; the W1 and W0 halves use opposite signs in
        @// the multiply-accumulate steps.
        VMUL    dX1r,dW1r,dT2
        VMUL    dX1i,dW1r,dT3
        VMUL    dX0r,dW0r,dT2
        VMUL    dX0i,dW0r,dT3

        VMLS    dX1r,dW1i,dT3
        VMLA    dX1i,dW1i,dT2

        VMLA    dX0r,dW0i,dT3
        VMLS    dX0i,dW0i,dT2


        @// From here on D4-D7 are reused as outputs; the twiddles they
        @// held are no longer needed in this iteration.
        VADD    dY1r,dT0,dX1i                 @// F(N/2 -1)
        VSUB    dY1i,dX1r,dT1

        @// Swap lanes back before storing the back pair
        VREV64  dY1r,dY1r
        VREV64  dY1i,dY1i


        VADD    dY0r,dT0,dX0i                 @// F(1)
        VSUB    dY0i,dT1,dX0r


        @// Store front pair, jump to the back pair, store it, rewind.
        @// Net effect on pOut1 for this iteration: +16 bytes
        VST2    {dY0r,dY0i},[pOut1],step
        VST2    {dY1r,dY1i},[pOut1]!
        SUB     pOut1,pOut1,step
        SUB     step,step,#32                 @// (N/2-4)*8 bytes


        BGT     evenOddButterflyLoop\name


        @// set both the ptrs to the last element
        SUB     pSrc,pSrc,#8
        SUB     pOut1,pOut1,#8

        @// Last element can be expanded as follows
        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
        @// -ve)
        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
        @// (a+bc, -bd)
        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)

lastElement\name :
        VLD1    dX0r,[pSrc]

        @// Store a unchanged, then negate the register and store -b
        VST1    dX0r[0],[pOut1]!
        VNEG    dX0r,dX0r
        VST1    dX0r[1],[pOut1]



        @// Common exit label; no instructions follow it inside the macro
decrementScale\name :

        .endm

        @// Entry point.  M_START and M_END are not defined in this file.
        @// NOTE(review): they presumably come from armCOMM_s.h, with the
        @// r4 argument naming the last core register to be saved -
        @// confirm.  The macro body also writes r5-r12 and D8-D13, so
        @// callers presumably have to preserve those themselves (the
        @// _unsafe suffix suggests a private calling convention) -
        @// confirm against the callers.
        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe,r4

        FFTSTAGE "FALSE","TRUE",Inv
        M_END

        .end