@//
@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
@//
@//  Use of this source code is governed by a BSD-style license
@//  that can be found in the LICENSE file in the root of the source
@//  tree. An additional intellectual property rights grant can be found
@//  in the file PATENTS.  All contributing project authors may
@//  be found in the AUTHORS file in the root of the source tree.
@//
@//  This is a modification of armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.s
@//  to support float instead of SC32.
@//

@//
@// Description:
@// Compute the first, radix-8 stage of an N point complex FFT on
@// single-precision (F32) data.
@//
@// The FFTSTAGE macro below is instantiated twice at the end of the file,
@// once for the forward transform and once for the inverse transform.
@//
@// Syntax: ARM/NEON, GNU assembler, passed through the C preprocessor
@// (register aliases are plain #define macros).
@//


@// Include standard headers

#include "dl/api/arm/armCOMM_s.h"
#include "dl/api/arm/omxtypes_s.h"

@// Import symbols required from other files
@// (For example tables)


@// Set debugging level
@//DEBUG_ON    SETL {TRUE}



@// Guarding implementation by the processor name



@//Input Registers
@//
@// pSrc          input pointer. Advanced by the post-incrementing loads and
@//               rewritten at the end of FFTSTAGE (see the macro tail).
@// pDst          output pointer for this stage. Replaced by pPingPongBuf at
@//               the end of FFTSTAGE.
@// pTwiddle      declared for symmetry with other stages; not referenced in
@//               this file.
@// subFFTNum     read on entry; FFTSTAGE leaves subFFTNum/8 in it.
@// subFFTSize    not read; FFTSTAGE sets it to 8.

#define pSrc            r0
#define pDst            r2
#define pTwiddle        r1
#define subFFTNum       r6
#define subFFTSize      r7
@// dest buffer for the next stage (not pSrc for first stage)
#define pPingPongBuf    r5


@//Output Registers


@//Local Scratch Registers
@//
@// grpSize/setCount share r3 and pointStep/outPointStep share r4.
@// outPointStep is not referenced in this file.

#define grpSize         r3
@// Reuse grpSize as setCount
#define setCount        r3
#define pointStep       r4
#define outPointStep    r4
#define setStep         r8
#define step1           r9
#define step2           r10
#define t0              r11


@// Neon Registers
@//
@// Naming: X = loaded input, U = after butterfly stage 1, V = after
@// butterfly stage 2, Y = output.  An "r" register holds real parts and an
@// "i" register holds imaginary parts; each q alias is the matching r/i pair.
@//
@// The U, V and Y sets all live in D16-D31 and overlap each other:
@//   even-index U == odd-index V == even-index Y   (Q8..Q11)
@//   odd-index U  == even-index V == odd-index Y   (Q12..Q15)
@// so the order of the arithmetic in FFTSTAGE matters: a value must be
@// consumed before the register is reused under another name.

#define dXr0    D0.F32
#define dXi0    D1.F32
#define dXr1    D2.F32
#define dXi1    D3.F32
#define dXr2    D4.F32
#define dXi2    D5.F32
#define dXr3    D6.F32
#define dXi3    D7.F32
#define dXr4    D8.F32
#define dXi4    D9.F32
#define dXr5    D10.F32
#define dXi5    D11.F32
#define dXr6    D12.F32
#define dXi6    D13.F32
#define dXr7    D14.F32
#define dXi7    D15.F32
#define qX0     Q0.F32
#define qX1     Q1.F32
#define qX2     Q2.F32
#define qX3     Q3.F32
#define qX4     Q4.F32
#define qX5     Q5.F32
#define qX6     Q6.F32
#define qX7     Q7.F32

#define dUr0    D16.F32
#define dUi0    D17.F32
#define dUr2    D18.F32
#define dUi2    D19.F32
#define dUr4    D20.F32
#define dUi4    D21.F32
#define dUr6    D22.F32
#define dUi6    D23.F32
#define dUr1    D24.F32
#define dUi1    D25.F32
#define dUr3    D26.F32
#define dUi3    D27.F32
#define dUr5    D28.F32
#define dUi5    D29.F32
@// dUr7/dUi7 are D30/D31 (Q15); they do not overlap dXr7/dXi7 (D14/D15)
#define dUr7    D30.F32
#define dUi7    D31.F32
#define qU0     Q8.F32
#define qU1     Q12.F32
#define qU2     Q9.F32
#define qU3     Q13.F32
#define qU4     Q10.F32
#define qU5     Q14.F32
#define qU6     Q11.F32
#define qU7     Q15.F32


#define dVr0    D24.F32
#define dVi0    D25.F32
#define dVr2    D26.F32
#define dVi2    D27.F32
#define dVr4    D28.F32
#define dVi4    D29.F32
#define dVr6    D30.F32
#define dVi6    D31.F32
#define dVr1    D16.F32
#define dVi1    D17.F32
#define dVr3    D18.F32
#define dVi3    D19.F32
#define dVr5    D20.F32
#define dVi5    D21.F32
#define dVr7    D22.F32
#define dVi7    D23.F32
#define qV0     Q12.F32
#define qV1     Q8.F32
#define qV2     Q13.F32
#define qV3     Q9.F32
#define qV4     Q14.F32
#define qV5     Q10.F32
#define qV6     Q15.F32
#define qV7     Q11.F32

#define dYr0    D16.F32
#define dYi0    D17.F32
#define dYr2    D18.F32
#define dYi2    D19.F32
#define dYr4    D20.F32
#define dYi4    D21.F32
#define dYr6    D22.F32
#define dYi6    D23.F32
#define dYr1    D24.F32
#define dYi1    D25.F32
#define dYr3    D26.F32
#define dYi3    D27.F32
#define dYr5    D28.F32
#define dYi5    D29.F32
#define dYr7    D30.F32
#define dYi7    D31.F32
#define qY0     Q8.F32
#define qY1     Q12.F32
#define qY2     Q9.F32
#define qY3     Q13.F32
#define qY4     Q10.F32
#define qY5     Q14.F32
#define qY6     Q11.F32
#define qY7     Q15.F32

@// Temporaries for the sqrt(1/2) multiplies.  They reuse D14/D15, i.e. the
@// dXr7/dXi7 registers, so they are only valid between the last use of X7
@// and the load of data[7] for the next iteration.
#define dT0     D14.F32
#define dT1     D15.F32

        @// ------------------------------------------------------------------
        @// FFTSTAGE scaled, inverse, name
        @//
        @//   scaled   not referenced in the macro body.
        @//   inverse  the string "TRUE" selects the inverse-transform
        @//            arithmetic, anything else selects the forward one.
        @//   name     suffix appended to the local labels so that each
        @//            instantiation gets unique label names.
        @//
        @// Registers written: pSrc, pDst, grpSize/setCount, pointStep,
        @// subFFTNum, subFFTSize, setStep, step1, step2, t0, D0-D31 and the
        @// condition flags.
        @//
        @// Every VLD2/VST2 moves two D registers (16 bytes = two complex F32
        @// values, de-interleaved into a real and an imaginary register), so
        @// one loop iteration handles two sets and setCount is decremented
        @// by 2.  All NEON accesses carry the :128 alignment qualifier.
        @// ------------------------------------------------------------------
        .MACRO FFTSTAGE scaled, inverse, name

        @// Define stack arguments

        @// Update pSubFFTSize and pSubFFTNum regs
        @// subFFTSize is set to 8 here (the incoming value is not read).
        @// t0 is built in two halves: 0x3f3504f3 is sqrt(1/2) as an F32.
        MOVW    t0, 0x04f3              @// Low half word of sqrt(1/2).
        MOV     subFFTSize,#8
        MOVT    t0, 0x3f35              @// High half word of sqrt(1/2).

        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
        LSR     grpSize,subFFTNum,#3
        MOV     subFFTNum,grpSize       @// subFFTNum = subFFTNum/8


        @// pT0+1 increments pT0 by 8 bytes
        @// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
        @// Note: outPointStep = pointStep for firststage
        @// In bytes: pointStep = 8*grpSize, the distance between data[k]
        @// and data[k+1] of one set.

        MOV     pointStep,grpSize,LSL #3


        @// Calculate the step of input data for the next set
        @// The scalar set-up below is interleaved with the first loads.
        @//MOV     step1,pointStep,LSL #1             @// step1 = 2*pointStep
        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
        MOV     step1,grpSize,LSL #4                       @// step1 = 16*grpSize = 2*pointStep

        MOV     step2,pointStep,LSL #3
        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
        SUB     step2,step2,pointStep                      @// step2 = 7*pointStep
        @// setStep = - 7*pointStep+16
        RSB     setStep,step2,#16

        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
        VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
        VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
        VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
        VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
        @//  data[7] & update pSrc for the next set
        @//  setStep = -7*pointStep + 16
        @//  (net effect of the eight loads: pSrc advances by 16 bytes)
        VLD2    {dXr7,dXi7},[pSrc :128],setStep
        @// grp = 0 a special case since all the twiddle factors are 1
        @// Loop on the sets

radix8fsGrpZeroSetLoop\name :

        @// Decrement setcount
        @// The flags set here are consumed by the BEQ in the third stage and
        @// by the BGT at the bottom of the loop; nothing in between writes
        @// the flags, and that must stay true.
        SUBS    setCount,setCount,#2


        @// finish first stage of 8 point FFT
        @// Even half: U0 = X0+X4, U2 = X1+X5, U4 = X2+X6, U6 = X3+X7

        VADD    qU0,qX0,qX4
        VADD    qU2,qX1,qX5
        VADD    qU4,qX2,qX6
        VADD    qU6,qX3,qX7

        @// finish second stage of 8 point FFT
        @// V0 = U0+U4, V2 = U0-U4, V4 = U2+U6, V6 = U2-U6

        VADD    qV0,qU0,qU4
        VSUB    qV2,qU0,qU4
        VADD    qV4,qU2,qU6
        VSUB    qV6,qU2,qU6

        @// finish third stage of 8 point FFT
        @// Y0 = V0+V4, Y4 = V0-V4
        @// The even outputs are stored with a stride of step1 = 2*pointStep,
        @// i.e. into output slots 0, 2, 4 and 6.

        VADD    qY0,qV0,qV4
        VSUB    qY4,qV0,qV4
        VST2    {dYr0,dYi0},[pDst :128],step1          @// store y0

        .ifeqs  "\inverse", "TRUE"

            @// Inverse: Y2 = V2 + j*V6, Y6 = V2 - j*V6
            @// The odd-half stage-1 differences (U1, U3, U5) are started
            @// here, interleaved with the stores.
            VSUB    dYr2,dVr2,dVi6
            VADD    dYi2,dVi2,dVr6

            VADD    dYr6,dVr2,dVi6
            VST2    {dYr2,dYi2},[pDst :128],step1          @// store y2
            VSUB    dYi6,dVi2,dVr6

            VSUB    qU1,qX0,qX4
            VST2    {dYr4,dYi4},[pDst :128],step1          @// store y4

            VSUB    qU3,qX1,qX5
            VSUB    qU5,qX2,qX6
            VST2    {dYr6,dYi6},[pDst :128],step1          @// store y6

        .ELSE

            @// Forward: the same two values are computed, but the one held
            @// in the Y6 registers (V2 - j*V6) goes to output slot 2 and the
            @// one held in the Y2 registers (V2 + j*V6) goes to output
            @// slot 6.  The store comments name the output slot.
            VADD    dYr6,dVr2,dVi6
            VSUB    dYi6,dVi2,dVr6

            VSUB    dYr2,dVr2,dVi6
            VST2    {dYr6,dYi6},[pDst :128],step1          @// store y2 (from the Y6 registers)
            VADD    dYi2,dVi2,dVr6


            VSUB    qU1,qX0,qX4
            VST2    {dYr4,dYi4},[pDst :128],step1          @// store y4
            VSUB    qU3,qX1,qX5
            VSUB    qU5,qX2,qX6
            VST2    {dYr2,dYi2},[pDst :128],step1          @// store y6 (from the Y2 registers)


        .ENDIF

        @// finish first stage of 8 point FFT
        @// Odd half: U1 = X0-X4, U3 = X1-X5, U5 = X2-X6 (above), U7 = X3-X7

        VSUB    qU7,qX3,qX7
        @// X7 has been consumed, so D14 can now hold sqrt(1/2) in lane 0
        VMOV    dT0[0], t0

        @// finish second stage of 8 point FFT
        @// V1 = U1 + j*U5, V3 = U1 - j*U5, V5 = U3 + j*U7, V7 = U3 - j*U7
        @// Loads of data[0..3] for the next iteration are interleaved; each
        @// X register is reloaded only after its last use above.

        VSUB    dVr1,dUr1,dUi5
        @//  data[0] for next iteration
        VLD2    {dXr0,dXi0},[pSrc :128],pointStep
        VADD    dVi1,dUi1,dUr5
        VADD    dVr3,dUr1,dUi5
        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
        VSUB    dVi3,dUi1,dUr5

        VSUB    dVr5,dUr3,dUi7
        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
        VADD    dVi5,dUi3,dUr7
        VADD    dVr7,dUr3,dUi7
        VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
        VSUB    dVi7,dUi3,dUr7

        @// finish third stage of 8 point FFT
        @// With c = sqrt(1/2) in dT0[0], both paths replace
        @//   V5 by c*(1+j)*V5   and   V7 by c*(1-j)*V7
        @// and then combine them with V1 and V3.

        .ifeqs  "\inverse", "TRUE"

            @// calculate a*v5
            VMUL    dT1,dVr5,dT0[0]         @// dT1 (D15) = c*Re(V5)

            VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
            VMUL    dVi5,dVi5,dT0[0]

            VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
            VSUB    dVr5,dT1,dVi5                               @// a * V5
            VADD    dVi5,dT1,dVi5

            VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]

            @// calculate  b*v7
            VMUL    dT1,dVr7,dT0[0]
            VMUL    dVi7,dVi7,dT0[0]

            @// Y1 = V1 + a*V5, Y5 = V1 - a*V5
            VADD    qY1,qV1,qV5
            VSUB    qY5,qV1,qV5


            VADD    dVr7,dT1,dVi7                               @// b * V7
            VSUB    dVi7,dVi7,dT1
            @// pDst is at slot 8 after the four even stores; back up
            @// 7*pointStep to slot 1.
            SUB     pDst, pDst, step2                           @// set pDst to y1

            @// On the last iteration, this will read past the end of pSrc,
            @// so skip this read.
            @// (BEQ tests the flags from the SUBS at the loop top.  This
            @// load also overwrites dT0/dT1, which are dead by now.)
            BEQ     radix8SkipLastUpdateInv\name
            VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
radix8SkipLastUpdateInv\name:

            @// Y3 = V3 - b*V7, Y7 = V3 + b*V7
            VSUB    dYr3,dVr3,dVr7
            VSUB    dYi3,dVi3,dVi7
            VST2    {dYr1,dYi1},[pDst :128],step1          @// store y1
            VADD    dYr7,dVr3,dVr7
            VADD    dYi7,dVi3,dVi7


            VST2    {dYr3,dYi3},[pDst :128],step1          @// store y3
            VST2    {dYr5,dYi5},[pDst :128],step1          @// store y5
            VST2    {dYr7,dYi7},[pDst :128]                @// store y7
            @// step past the 16 bytes just written (the forward path gets
            @// the same effect from the "!" writeback on its last store)
            ADD pDst, pDst, #16

        .ELSE

            @// calculate  b*v7
            VMUL    dT1,dVr7,dT0[0]
            VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
            VMUL    dVi7,dVi7,dT0[0]

            VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
            VADD    dVr7,dT1,dVi7                               @// b * V7
            VSUB    dVi7,dVi7,dT1

            VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]

            @// calculate a*v5
            VMUL    dT1,dVr5,dT0[0]         @// dT1 (D15) = c*Re(V5)
            VMUL    dVi5,dVi5,dT0[0]

            @// Y7 registers = V3 + b*V7 (stored to output slot 1 below)
            VADD    dYr7,dVr3,dVr7
            VADD    dYi7,dVi3,dVi7
            @// pDst is at slot 8 after the four even stores; back up
            @// 7*pointStep to slot 1.
            SUB     pDst, pDst, step2                           @// set pDst to y1

            VSUB    dVr5,dT1,dVi5                               @// a * V5
            VADD    dVi5,dT1,dVi5

            @// On the last iteration, this will read past the end of pSrc,
            @// so skip this read.
            @// (BEQ tests the flags from the SUBS at the loop top.  This
            @// load also overwrites dT0/dT1, which are dead by now.)
            BEQ     radix8SkipLastUpdateFwd\name
            VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
radix8SkipLastUpdateFwd\name:

            @// Y5 registers = V1 - a*V5, Y3 registers = V3 - b*V7,
            @// Y1 registers = V1 + a*V5.  As in the even half, the forward
            @// path stores them in mirrored slot order; the store comments
            @// name the output slot.
            VSUB    qY5,qV1,qV5

            VSUB    dYr3,dVr3,dVr7
            VST2    {dYr7,dYi7},[pDst :128],step1          @// store y1 (from the Y7 registers)
            VSUB    dYi3,dVi3,dVi7
            VADD    qY1,qV1,qV5


            VST2    {dYr5,dYi5},[pDst :128],step1          @// store y3 (from the Y5 registers)
            VST2    {dYr3,dYi3},[pDst :128],step1          @// store y5 (from the Y3 registers)
            VST2    {dYr1,dYi1},[pDst :128]!               @// store y7 (from the Y1 registers), pDst += 16

        .ENDIF


        @// update pDst for the next set
        @// pDst is now at slot 7 + 16 bytes; backing up 7*pointStep leaves
        @// it 16 bytes past where this iteration started.
        SUB     pDst, pDst, step2
        @// BGT still tests the flags from the SUBS at the loop top
        BGT     radix8fsGrpZeroSetLoop\name


        @// reset pSrc to pDst for the next stage
        @// pDst moved forward 16 bytes per iteration.  For an even, positive
        @// set count that totals pointStep bytes, so pSrc ends up at the
        @// start of the data written by this stage.
        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep
        MOV     pDst,pPingPongBuf                       @// next stage writes to the ping-pong buffer



        .endm


        @// Allocate stack memory required by the function

        @// Entry points.  M_START and M_END are not defined in this file;
        @// presumably they come from armCOMM_s.h and emit the function
        @// prologue/epilogue, with r4 as the register-save argument --
        @// TODO confirm against that header.

        @// Forward transform: inverse = "FALSE", labels suffixed FWD
        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
            FFTSTAGE "FALSE","FALSE",FWD
        M_END


        @// Inverse transform: inverse = "TRUE", labels suffixed INV
        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
            FFTSTAGE "FALSE","TRUE",INV
        M_END



        .end