//
//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
//
//  Use of this source code is governed by a BSD-style license
//  that can be found in the LICENSE file in the root of the source
//  tree. An additional intellectual property rights grant can be found
//  in the file PATENTS.  All contributing project authors may
//  be found in the AUTHORS file in the root of the source tree.
//
//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
//  to support float instead of SC32.
//

//
// Description:
// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
// complex signal.  This handles the general stage, not the first or last
// stage.
//
// Target / syntax: AArch64 with Advanced SIMD, GNU assembler syntax, run
// through the C preprocessor (register aliases below are cpp macros).
//
// Two entry points are generated from one macro body:
//   armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace  (forward twiddle multiply)
//   armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace  (conjugated twiddle multiply)
//
// Register contract as used by the code in this file:
//   In:    x0 = pSrc, x1 = pDst, x2 = pTwiddle,
//          x3 = pSubFFTNum, x4 = pSubFFTSize
//          (x3 and x4 are addresses; a 64-bit value is loaded from each)
//   Out:   the value at x3 is halved and the value at x4 is doubled;
//          x0, x1 and x2 are advanced by the post-indexed accesses
//   Clobb: x5, x6, x8, x9, x10, x11, x13, x15, v0, v2-v11, flags
//

// Include standard headers

#include "dl/api/arm/arm64COMM_s.h"
#include "dl/api/arm/omxtypes_s.h"


// ---------------------------------------------------------------------
// Input registers
// ---------------------------------------------------------------------

#define pSrc            x0
#define pDst            x1
#define pTwiddle        x2
#define pSubFFTNum      x3
#define pSubFFTSize     x4


// ---------------------------------------------------------------------
// Local scratch registers (all within x5-x15)
// ---------------------------------------------------------------------

#define subFFTNum       x5
#define subFFTSize      x6
#define outPointStep    x8
#define pointStep       x9
#define pointStep32     w9
#define grpCount        x10
#define grpCount32      w10
#define setCount        x13
#define step            x15
#define dstStep         x11


// ---------------------------------------------------------------------
// Neon registers
//   dW        twiddle factor, lane 0 scales by wr and lane 1 by wi
//   dX0/dX1   point0 real / imaginary parts (two butterflies per vector)
//   dX2/dX3   point1 real / imaginary parts
//   dY0..dY3  butterfly results
//   qT0/qT1   real / imaginary part of point1 times the twiddle
// v8-v11 are used here; the d11 argument given to M_START below is
// presumably what makes the prologue preserve them - confirm in
// arm64COMM_s.h.
// ---------------------------------------------------------------------

#define dW      v0.2s
#define dX0     v2.2s
#define dX1     v3.2s
#define dX2     v4.2s
#define dX3     v5.2s
#define dY0     v6.2s
#define dY1     v7.2s
#define dY2     v8.2s
#define dY3     v9.2s
#define qT0     v10.2s
#define qT1     v11.2s


// ---------------------------------------------------------------------
// FFTSTAGE scaled, inverse, name
//
//   scaled   accepted but not referenced anywhere in the body
//   inverse  the string "TRUE" selects the conjugated twiddle multiply,
//            anything else selects the forward multiply
//   name     suffix appended to the two loop labels so that the macro
//            can be expanded more than once in this file
//
// Note: rsb is not a base A64 mnemonic; it is presumably a helper macro
// from the included header that yields (immediate - register), which is
// what the "dstStep = -outPointStep + 16" remark relies on - verify.
// ---------------------------------------------------------------------

        .macro FFTSTAGE scaled, inverse, name

        // Fetch the per-stage state from memory.
        ldr     subFFTNum, [pSubFFTNum]
        ldr     subFFTSize, [pSubFFTSize]

        // Update the stage state right away, in order to reuse the
        // registers: subFFTNum becomes grpSize (half its old value),
        // grpCount is twice the old subFFTSize, and that doubled value
        // is also the subFFTSize handed to the next stage.
        lsr     subFFTNum, subFFTNum, #1
        lsl     grpCount, subFFTSize, #1
        mov     subFFTSize, grpCount

        // pointStep = 4 * grpSize at this point.
        lsl     pointStep, subFFTNum, #2

        // outPointStep = grpCount * pointStep, formed from the low 32
        // bits of each as a signed 32x32->64 product.  This is the byte
        // distance between the two outputs of one butterfly
        // (4 * size bytes).
        smull   outPointStep, grpCount32, pointStep32

        // pointStep = 8 * grpSize from here on: the byte distance between
        // the two inputs of one butterfly.
        lsl     pointStep, pointStep, #1

        // Rewind strides.  Each paired access moves 16 bytes forward in
        // total: the first access jumps ahead by the full stride, the
        // second comes back by (stride - 16).
        rsb     step, pointStep, #16
        rsb     dstStep, outPointStep, #16

        // ---- Loop on the groups -------------------------------------

radix2GrpLoop\name :
        // setCount = pointStep / 8 = grpSize butterflies in this group.
        lsr     setCount, pointStep, #3
        // One twiddle per group; the twiddle pointer strides by pointStep.
        ld1     {dW}, [pTwiddle], pointStep     // [wi | wr]

        // ---- Loop on the sets (two butterflies per pass) ------------

radix2SetLoop\name :
        // point0: dX0 = real parts, dX1 = imaginary parts
        ld2     {dX0,dX1}, [pSrc], pointStep
        // point1: dX2 = real parts, dX3 = imaginary parts
        ld2     {dX2,dX3}, [pSrc], step

        // The flags set here are consumed by the branch at the bottom of
        // this loop; nothing in between writes the flags.
        subs    setCount, setCount, #2

        // Products with lane 0 of the twiddle are common to both
        // directions; only the sign of the lane 1 terms differs.
        fmul    qT0, dX2, dW[0]
        fmul    qT1, dX3, dW[0]

        .ifeqs  "\inverse", "TRUE"
        fmla    qT0, dX3, dW[1]                 // real part
        fmls    qT1, dX2, dW[1]                 // imag part
        .else
        fmls    qT0, dX3, dW[1]                 // real part
        fmla    qT1, dX2, dW[1]                 // imag part
        .endif

        // Butterfly: the difference is written first, the sum second.
        fsub    dY0, dX0, qT0
        fsub    dY1, dX1, qT1
        fadd    dY2, dX0, qT0
        fadd    dY3, dX1, qT1

        st2     {dY0,dY1}, [pDst], outPointStep
        // dstStep = -outPointStep + 16
        st2     {dY2,dY3}, [pDst], dstStep

        b.gt    radix2SetLoop\name

        // Next group.  The source pointer has walked over the point0
        // half of this group, so skip the point1 half that was consumed
        // alongside it.  The add does not touch the flags from the subs.
        subs    grpCount, grpCount, #2
        add     pSrc, pSrc, pointStep
        b.gt    radix2GrpLoop\name

        // Publish the updated stage state for the next stage.
        str     subFFTNum, [pSubFFTNum]
        str     subFFTSize, [pSubFFTSize]
        .endm


// ---------------------------------------------------------------------
// Entry points
// ---------------------------------------------------------------------

        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace,,d11
        FFTSTAGE "FALSE","FALSE",FWD
        M_END


        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace,,d11
        FFTSTAGE "FALSE","TRUE",INV
        M_END


        .end