/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 *
 */

#include "dl/api/omxtypes.h"
#include "dl/sp/src/x86/x86SP_SSE_Math.h"

// Inverse complex-to-complex radix-4 FFT stage ("ms" = middle stages),
// vectorized with SSE.
//
// Data layout (established by the loads below and by VC_LOAD_SPLIT):
// complex arrays are in split format — the n real parts are stored at
// [ptr, ptr + n) and the n imaginary parts at [ptr + n, ptr + 2n).
// Likewise, the twiddle table keeps an imaginary value at offset
// n_mul_2 (= 2 * n) past its real counterpart.

// This function handles the case when set_count = 2, in which we cannot
// unroll the set loop by 4 to meet the SSE requirement (4 elements).
//
// Instead, the set/group loops are fused and unrolled by 8 complex
// elements per iteration (two quads).  Because each group contains only
// 2 sets, each twiddle factor must be applied to exactly 2 consecutive
// elements; the _mm_load_ss + _mm_shuffle_ps(..., _MM_SHUFFLE(0,0,0,0))
// pairs below build vectors of the form {w, w, w', w'} — one twiddle
// duplicated across its 2-element set, two groups per SSE register.
//
// Arguments:
//   in      - input complex array in split format (read-only).
//   out     - output complex array in split format.
//   twiddle - twiddle table (real at tw, imaginary at tw + 2n).
//   n       - FFT length; in/out hold n complex values each.
static void InternalUnroll2Inv(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n) {
  OMX_INT i;
  OMX_INT n_by_2 = n >> 1;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;
  OMX_F32 *out0 = out;

  for (i = 0; i < n_by_2; i += 8) {
    // Twiddle pointers for the first quad: stage-k twiddle advances by
    // k * i (tw1 = twiddle + i, tw2 = twiddle + 2i, tw3 = twiddle + 3i).
    const OMX_F32 *tw1 = twiddle + i;
    const OMX_F32 *tw2 = tw1 + i;
    const OMX_F32 *tw3 = tw2 + i;
    // Twiddle pointers for the second quad, offset by one group
    // (+4, +8, +12 keep the same per-stage stride pattern).
    const OMX_F32 *tw1e = tw1 + 4;
    const OMX_F32 *tw2e = tw2 + 8;
    const OMX_F32 *tw3e = tw3 + 12;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;
    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

    // Broadcast each scalar twiddle across its 2-element set:
    // _mm_shuffle_ps(a, b, _MM_SHUFFLE(0,0,0,0)) yields {a0, a0, b0, b0}.
    // Imaginary parts live n_mul_2 floats past the real parts.
    v_tw1.real = _mm_shuffle_ps(_mm_load_ss(tw1),
                                _mm_load_ss(tw1e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw1.imag = _mm_shuffle_ps(_mm_load_ss(tw1 + n_mul_2),
                                _mm_load_ss(tw1e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.real = _mm_shuffle_ps(_mm_load_ss(tw2),
                                _mm_load_ss(tw2e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw2.imag = _mm_shuffle_ps(_mm_load_ss(tw2 + n_mul_2),
                                _mm_load_ss(tw2e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.real = _mm_shuffle_ps(_mm_load_ss(tw3),
                                _mm_load_ss(tw3e),
                                _MM_SHUFFLE(0, 0, 0, 0));
    v_tw3.imag = _mm_shuffle_ps(_mm_load_ss(tw3 + n_mul_2),
                                _mm_load_ss(tw3e + n_mul_2),
                                _MM_SHUFFLE(0, 0, 0, 0));

    __m128 xmm0;
    __m128 xmm1;
    __m128 xmm2;
    __m128 xmm3;
    __m128 xmm4;
    __m128 xmm5;
    __m128 xmm6;
    __m128 xmm7;

    // Load 16 consecutive real parts and regroup them by butterfly leg:
    // SHUFFLE(1,0,1,0) on (a, b) picks {a0, a1, b0, b1}, i.e. the first
    // pair of each quad; SHUFFLE(3,2,3,2) picks the second pair.
    const OMX_F32 *in0 = in + (i << 1);
    xmm0 = _mm_load_ps(in0);
    xmm1 = _mm_load_ps(in0 + 4);
    xmm2 = _mm_load_ps(in0 + 8);
    xmm3 = _mm_load_ps(in0 + 12);
    v_t0.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.real = _mm_shuffle_ps(xmm0, xmm2, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.real = _mm_shuffle_ps(xmm1, xmm3, _MM_SHUFFLE(3, 2, 3, 2));

    // Imaginary parts live n floats past the reals (split format);
    // regroup them identically.
    xmm4 = _mm_load_ps(in0 + n);
    xmm5 = _mm_load_ps(in0 + n + 4);
    xmm6 = _mm_load_ps(in0 + n + 8);
    xmm7 = _mm_load_ps(in0 + n + 12);
    v_t0.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
    v_t1.imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
    v_t2.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
    v_t3.imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

    // The four radix-4 outputs land n_by_4 complex elements apart.
    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                         &v_tw1, &v_tw2, &v_tw3,
                         &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }
}

// Inverse radix-4 middle-stage driver.
//
// Arguments:
//   in       - input complex array in split format (read-only).
//   out      - output complex array in split format.
//   twiddle  - twiddle table (imaginary values at +2n).
//   n        - FFT length.
//   sub_size - number of butterfly groups in this stage.
//   sub_num  - elements per group; set_count = sub_num / 4 sets are
//              processed per butterfly.
//
// The set loops below are unrolled by 4 (one SSE register of floats),
// which requires set_count >= 4; the set_count == 2 case is delegated
// to InternalUnroll2Inv above.  set_count == 1 is presumably handled by
// a different (first/last-stage) routine — NOTE(review): confirm against
// the dispatching caller.
void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n,
    OMX_INT sub_size,
    OMX_INT sub_num) {
  OMX_INT set;
  OMX_INT grp;
  OMX_INT step = sub_num >> 1;
  OMX_INT set_count = sub_num >> 2;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;

  OMX_F32 *out0 = out;

  if (set_count == 2) {
    InternalUnroll2Inv(in, out, twiddle, n);
    return;
  }

  // grp == 0
  // The group-0 twiddles are all 1 + 0j, so the twiddle-free butterfly
  // (RADIX4_BUTTERFLY_FS) is used; only the store differs between the
  // forward and inverse transforms.
  for (set = 0; set < set_count; set += 4) {
    const OMX_F32 * in0 = in + set;
    const OMX_F32 *in1 = in0 + set_count;
    const OMX_F32 *in2 = in1 + set_count;
    const OMX_F32 *in3 = in2 + set_count;

    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

    VC_LOAD_SPLIT(&v_t0, in0, n);
    VC_LOAD_SPLIT(&v_t1, in1, n);
    VC_LOAD_SPLIT(&v_t2, in2, n);
    VC_LOAD_SPLIT(&v_t3, in3, n);

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
                        &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }

  for (grp = 1; grp < sub_size; ++grp) {
    // Stage-k twiddle index is k * grp * step; imaginary parts are
    // n_mul_2 floats past the reals.  One twiddle per group, so
    // _mm_load1_ps broadcasts it across all 4 lanes.
    const OMX_F32 *tw1 = twiddle + grp * step;
    const OMX_F32 *tw2 = tw1 + grp * step;
    const OMX_F32 *tw3 = tw2 + grp * step;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;

    v_tw1.real = _mm_load1_ps(tw1);
    v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
    v_tw2.real = _mm_load1_ps(tw2);
    v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
    v_tw3.real = _mm_load1_ps(tw3);
    v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);

    for (set = 0; set < set_count; set += 4) {
      // The four butterfly legs are set_count elements apart within
      // this group's sub_num-element region.
      const OMX_F32 *in0 = in + set + grp * sub_num;
      const OMX_F32 *in1 = in0 + set_count;
      const OMX_F32 *in2 = in1 + set_count;
      const OMX_F32 *in3 = in2 + set_count;

      VC v_t0;
      VC v_t1;
      VC v_t2;
      VC v_t3;
      VC v_t4;
      VC v_t5;
      VC v_t6;
      VC v_t7;

      VC_LOAD_SPLIT(&v_t0, in0, n);
      VC_LOAD_SPLIT(&v_t1, in1, n);
      VC_LOAD_SPLIT(&v_t2, in2, n);
      VC_LOAD_SPLIT(&v_t3, in3, n);

      OMX_F32 *out1 = out0 + n_by_4;
      OMX_F32 *out2 = out1 + n_by_4;
      OMX_F32 *out3 = out2 + n_by_4;

      RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                           &v_tw1, &v_tw2, &v_tw3,
                           &v_t0, &v_t1, &v_t2, &v_t3);

      RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                                 &v_t4, &v_t5, &v_t6, &v_t7, n);

      out0 += 4;
    }
  }
}