1/* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 * 10 */ 11 12#include <stdint.h> 13 14#include "dl/api/omxtypes.h" 15#include "dl/sp/api/mipsSP.h" 16 17/* 18 * Forward real FFT for FFT sizes larger than 16. Computed using the complex 19 * FFT for half the size. 20 */ 21OMXResult mips_FFTFwd_RToCCS_F32_complex(const OMX_F32* pSrc, 22 OMX_F32* pDst, 23 const MIPSFFTSpec_R_FC32* pFFTSpec) { 24 OMX_U32 n1_4, num_transforms, step; 25 const OMX_F32* w_re_ptr; 26 const OMX_F32* w_im_ptr; 27 OMX_U32 fft_size = 1 << pFFTSpec->order; 28 OMX_FC32* p_dst = (OMX_FC32*)pDst; 29 OMX_FC32* p_buf = (OMX_FC32*)pFFTSpec->pBuf; 30 OMX_F32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; 31 OMX_F32 w_re, w_im; 32 33 /* 34 * Loop performing sub-transforms of size 4, 35 * which contain 2 butterfly operations. 36 */ 37 num_transforms = (SUBTRANSFORM_CONST >> (17 - pFFTSpec->order)) | 1; 38 for (uint32_t n = 0; n < num_transforms; ++n) { 39 /* 40 * n is in the range (0 .. num_transforms - 1). 41 * The size of the pFFTSpec->pOffset is (((SUBTRANSFORM_CONST >> 42 * (16 - TWIDDLE_TABLE_ORDER)) | 1)). 43 */ 44 OMX_U32 offset = pFFTSpec->pOffset[n] << 2; 45 /* 46 * Offset takes it's value from pFFTSpec->pOffset table which is initialized 47 * in the omxSP_FFTInit_R_F32 function, and is constant afterwards. 48 */ 49 OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset; 50 OMX_FC32* p_tmp = p_buf + offset; 51 /* Treating the input as a complex vector. */ 52 const OMX_FC32* p_src = (const OMX_FC32*)pSrc; 53 54 tmp1 = p_src[p_bitrev[0]].Re + p_src[p_bitrev[1]].Re; 55 tmp2 = p_src[p_bitrev[2]].Re + p_src[p_bitrev[3]].Re; 56 tmp3 = p_src[p_bitrev[0]].Im + p_src[p_bitrev[1]].Im; 57 tmp4 = p_src[p_bitrev[2]].Im + p_src[p_bitrev[3]].Im; 58 59 p_tmp[0].Re = tmp1 + tmp2; 60 p_tmp[2].Re = tmp1 - tmp2; 61 p_tmp[0].Im = tmp3 + tmp4; 62 p_tmp[2].Im = tmp3 - tmp4; 63 64 tmp1 = p_src[p_bitrev[0]].Re - p_src[p_bitrev[1]].Re; 65 tmp2 = p_src[p_bitrev[2]].Re - p_src[p_bitrev[3]].Re; 66 tmp3 = p_src[p_bitrev[0]].Im - p_src[p_bitrev[1]].Im; 67 tmp4 = p_src[p_bitrev[2]].Im - p_src[p_bitrev[3]].Im; 68 69 p_tmp[1].Re = tmp1 + tmp4; 70 p_tmp[3].Re = tmp1 - tmp4; 71 p_tmp[1].Im = tmp3 - tmp2; 72 p_tmp[3].Im = tmp3 + tmp2; 73 } 74 75 /* 76 * Loop performing sub-transforms of size 8, 77 * which contain four butterfly operations. 78 */ 79 num_transforms = (num_transforms >> 1) | 1; 80 for (uint32_t n = 0; n < num_transforms; ++n) { 81 OMX_U32 offset = pFFTSpec->pOffset[n] << 3; 82 OMX_U16* p_bitrev = pFFTSpec->pBitRev + offset; 83 OMX_FC32* p_tmp = p_buf + offset; 84 const OMX_FC32* p_src = (const OMX_FC32*)pSrc; 85 86 tmp1 = p_src[p_bitrev[4]].Re + p_src[p_bitrev[5]].Re; 87 tmp2 = p_src[p_bitrev[6]].Re + p_src[p_bitrev[7]].Re; 88 tmp3 = p_src[p_bitrev[4]].Im + p_src[p_bitrev[5]].Im; 89 tmp4 = p_src[p_bitrev[6]].Im + p_src[p_bitrev[7]].Im; 90 91 tmp5 = tmp1 + tmp2; 92 tmp1 = tmp1 - tmp2; 93 tmp2 = tmp3 + tmp4; 94 tmp3 = tmp3 - tmp4; 95 96 p_tmp[4].Re = p_tmp[0].Re - tmp5; 97 p_tmp[0].Re = p_tmp[0].Re + tmp5; 98 p_tmp[4].Im = p_tmp[0].Im - tmp2; 99 p_tmp[0].Im = p_tmp[0].Im + tmp2; 100 p_tmp[6].Re = p_tmp[2].Re - tmp3; 101 p_tmp[2].Re = p_tmp[2].Re + tmp3; 102 p_tmp[6].Im = p_tmp[2].Im + tmp1; 103 p_tmp[2].Im = p_tmp[2].Im - tmp1; 104 105 tmp1 = p_src[p_bitrev[4]].Re - p_src[p_bitrev[5]].Re; 106 tmp2 = p_src[p_bitrev[6]].Re - p_src[p_bitrev[7]].Re; 107 tmp3 = p_src[p_bitrev[4]].Im - p_src[p_bitrev[5]].Im; 108 tmp4 = p_src[p_bitrev[6]].Im - p_src[p_bitrev[7]].Im; 109 110 tmp5 = SQRT1_2 * (tmp1 + tmp3); 111 tmp1 = SQRT1_2 * (tmp3 - tmp1); 112 tmp3 = SQRT1_2 * (tmp2 - tmp4); 113 tmp2 = SQRT1_2 * (tmp2 + tmp4); 114 115 tmp4 = tmp5 + tmp3; 116 tmp5 = tmp5 - tmp3; 117 tmp3 = tmp1 + tmp2; 118 tmp1 = tmp1 - tmp2; 119 120 p_tmp[5].Re = p_tmp[1].Re - tmp4; 121 p_tmp[1].Re = p_tmp[1].Re + tmp4; 122 p_tmp[5].Im = p_tmp[1].Im - tmp3; 123 p_tmp[1].Im = p_tmp[1].Im + tmp3; 124 p_tmp[7].Re = p_tmp[3].Re - tmp1; 125 p_tmp[3].Re = p_tmp[3].Re + tmp1; 126 p_tmp[7].Im = p_tmp[3].Im + tmp5; 127 p_tmp[3].Im = p_tmp[3].Im - tmp5; 128 } 129 130 step = 1 << (pFFTSpec->order - 4); 131 n1_4 = 4; /* Quarter of the sub-transform size. */ 132 /* Outer loop that loops over FFT stages. */ 133 for (uint32_t fft_stage = 4; fft_stage <= pFFTSpec->order - 1; ++fft_stage) { 134 OMX_U32 n1_2 = 2 * n1_4; 135 OMX_U32 n3_4 = 3 * n1_4; 136 num_transforms = (num_transforms >> 1) | 1; 137 /* 138 * Loop performing sub-transforms of size 16 and higher. 139 * The actual size depends on the stage. 140 */ 141 for (uint32_t n = 0; n < num_transforms; ++n) { 142 OMX_U32 offset = pFFTSpec->pOffset[n] << fft_stage; 143 OMX_FC32* p_tmp = p_buf + offset; 144 145 tmp1 = p_tmp[n1_2].Re + p_tmp[n3_4].Re; 146 tmp2 = p_tmp[n1_2].Re - p_tmp[n3_4].Re; 147 tmp3 = p_tmp[n1_2].Im + p_tmp[n3_4].Im; 148 tmp4 = p_tmp[n1_2].Im - p_tmp[n3_4].Im; 149 150 p_tmp[n1_2].Re = p_tmp[0].Re - tmp1; 151 p_tmp[n1_2].Im = p_tmp[0].Im - tmp3; 152 p_tmp[0].Re = p_tmp[0].Re + tmp1; 153 p_tmp[0].Im = p_tmp[0].Im + tmp3; 154 p_tmp[n3_4].Re = p_tmp[n1_4].Re - tmp4; 155 p_tmp[n3_4].Im = p_tmp[n1_4].Im + tmp2; 156 p_tmp[n1_4].Re = p_tmp[n1_4].Re + tmp4; 157 p_tmp[n1_4].Im = p_tmp[n1_4].Im - tmp2; 158 159 /* Twiddle table is initialized for the maximal FFT size. */ 160 w_re_ptr = pFFTSpec->pTwiddle + step; 161 w_im_ptr = 162 pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step; 163 164 /* 165 * Loop performing split-radix butterfly operations for 166 * one sub-transform. 167 */ 168 for (uint32_t i = 1; i < n1_4; ++i) { 169 w_re = *w_re_ptr; 170 w_im = *w_im_ptr; 171 172 tmp1 = w_re * p_tmp[n1_2 + i].Re + w_im * p_tmp[n1_2 + i].Im; 173 tmp2 = w_re * p_tmp[n1_2 + i].Im - w_im * p_tmp[n1_2 + i].Re; 174 tmp3 = w_re * p_tmp[n3_4 + i].Re - w_im * p_tmp[n3_4 + i].Im; 175 tmp4 = w_re * p_tmp[n3_4 + i].Im + w_im * p_tmp[n3_4 + i].Re; 176 177 tmp5 = tmp1 + tmp3; 178 tmp1 = tmp1 - tmp3; 179 tmp6 = tmp2 + tmp4; 180 tmp2 = tmp2 - tmp4; 181 182 p_tmp[n1_2 + i].Re = p_tmp[i].Re - tmp5; 183 p_tmp[n1_2 + i].Im = p_tmp[i].Im - tmp6; 184 p_tmp[i].Re = p_tmp[i].Re + tmp5; 185 p_tmp[i].Im = p_tmp[i].Im + tmp6; 186 p_tmp[n3_4 + i].Re = p_tmp[n1_4 + i].Re - tmp2; 187 p_tmp[n3_4 + i].Im = p_tmp[n1_4 + i].Im + tmp1; 188 p_tmp[n1_4 + i].Re = p_tmp[n1_4 + i].Re + tmp2; 189 p_tmp[n1_4 + i].Im = p_tmp[n1_4 + i].Im - tmp1; 190 191 w_re_ptr += step; 192 w_im_ptr -= step; 193 } 194 } 195 step >>= 1; 196 n1_4 <<= 1; 197 } 198 199 /* Additional computation to get the output for full FFT size. */ 200 w_re_ptr = pFFTSpec->pTwiddle + step; 201 w_im_ptr = pFFTSpec->pTwiddle + (OMX_U32)(1 << pFFTSpec->order - 2) - step; 202 203 for (uint32_t i = 1; i < fft_size / 8; ++i) { 204 tmp1 = p_buf[i].Re; 205 tmp2 = p_buf[i].Im; 206 tmp3 = p_buf[fft_size / 2 - i].Re; 207 tmp4 = p_buf[fft_size / 2 - i].Im; 208 209 tmp5 = tmp1 + tmp3; 210 tmp6 = tmp1 - tmp3; 211 tmp7 = tmp2 + tmp4; 212 tmp8 = tmp2 - tmp4; 213 214 tmp1 = p_buf[i + fft_size / 4].Re; 215 tmp2 = p_buf[i + fft_size / 4].Im; 216 tmp3 = p_buf[fft_size / 4 - i].Re; 217 tmp4 = p_buf[fft_size / 4 - i].Im; 218 219 w_re = *w_re_ptr; 220 w_im = *w_im_ptr; 221 222 p_dst[i].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6); 223 p_dst[i].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7); 224 p_dst[fft_size / 2 - i].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6); 225 p_dst[fft_size / 2 - i].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7); 226 227 tmp5 = tmp1 + tmp3; 228 tmp6 = tmp1 - tmp3; 229 tmp7 = tmp2 + tmp4; 230 tmp8 = tmp2 - tmp4; 231 232 p_dst[i + fft_size / 4].Re = 0.5f * (tmp5 - w_im * tmp7 - w_re * tmp6); 233 p_dst[i + fft_size / 4].Im = 0.5f * (tmp8 + w_im * tmp6 - w_re * tmp7); 234 p_dst[fft_size / 4 - i].Re = 0.5f * (tmp5 + w_im * tmp7 + w_re * tmp6); 235 p_dst[fft_size / 4 - i].Im = 0.5f * (-tmp8 + w_im * tmp6 - w_re * tmp7); 236 237 w_re_ptr += step; 238 w_im_ptr -= step; 239 } 240 tmp1 = p_buf[fft_size / 8].Re; 241 tmp2 = p_buf[fft_size / 8].Im; 242 tmp3 = p_buf[3 * fft_size / 8].Re; 243 tmp4 = p_buf[3 * fft_size / 8].Im; 244 245 tmp5 = tmp1 + tmp3; 246 tmp6 = tmp1 - tmp3; 247 tmp7 = tmp2 + tmp4; 248 tmp8 = tmp2 - tmp4; 249 250 w_re = *w_re_ptr; 251 w_im = *w_im_ptr; 252 253 p_dst[fft_size / 8].Re = 0.5f * (tmp5 + w_re * tmp7 - w_im * tmp6); 254 p_dst[fft_size / 8].Im = 0.5f * (tmp8 - w_re * tmp6 - w_im * tmp7); 255 p_dst[3 * fft_size / 8].Re = 0.5f * (tmp5 - w_re * tmp7 + w_im * tmp6); 256 p_dst[3 * fft_size / 8].Im = 0.5f * (-tmp8 - w_re * tmp6 - w_im * tmp7); 257 258 p_dst[0].Re = p_buf[0].Re + p_buf[0].Im; 259 p_dst[fft_size / 4].Re = p_buf[fft_size / 4].Re; 260 p_dst[fft_size / 2].Re = p_buf[0].Re - p_buf[0].Im; 261 p_dst[0].Im = 0.0f; 262 p_dst[fft_size / 4].Im = -p_buf[fft_size / 4].Im; 263 p_dst[fft_size / 2].Im = 0.0f; 264 265 return OMX_Sts_NoErr; 266} 267