/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_SSE

#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */

//
// SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//

/*
 * FIR inner product over a "positive" and a "negative" run of samples, computed
 * with SSE intrinsics, scaled by volume and accumulated into out[0] and out[1].
 *
 * Template parameters:
 *   CHANNELS  1 or 2 (enforced by static_assert). With 2, the sample data is
 *             treated as interleaved pairs (first of each pair -> out[0],
 *             second -> out[1]). With 1, the single sum goes to both outputs.
 *   STRIDE    not referenced by this implementation.
 *   FIXED     true:  coefsP / coefsN are used as-is; lerpP, coefsP1 and coefsN1
 *                    are never read (callers in this file pass 0 / nullptr).
 *             false: each coefficient is linearly interpolated using lerpP:
 *                    pos = coefsP  + lerpP * (coefsP1 - coefsP)
 *                    neg = coefsN1 + lerpP * (coefsN  - coefsN1)
 *
 * Parameters:
 *   out       two floats, read-modify-written:
 *             out[0] += volumeLR[0] * sum0, out[1] += volumeLR[1] * sum1.
 *   count     number of coefficients per side, consumed four per iteration.
 *             Asserted to be a positive multiple of 8.
 *   coefsP, coefsN, coefsP1, coefsN1
 *             coefficient arrays read with _mm_load_ps, i.e. they must be
 *             16-byte aligned; each is advanced by 4 floats per iteration.
 *   sP        positive-side samples, read with unaligned loads while walking
 *             towards LOWER addresses; each group of four frames is reversed
 *             before the multiply, so coefsP[0] pairs with the frame at sP.
 *   sN        negative-side samples, read with unaligned loads while walking
 *             towards HIGHER addresses.
 *   volumeLR  two floats, the gains applied to out[0] and out[1].
 */
template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessSSEIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    // Each iteration loads four frames starting at sP, so back sP up by three
    // frames: the first load then ends on the frame the caller pointed at.
    sP -= CHANNELS*(4-1);

    // Broadcast of the interpolation fraction; only read when !FIXED.
    const __m128 interp = _mm_set1_ps(lerpP);

    // Four partial sums per output; accR is only accumulated when CHANNELS == 2.
    __m128 accL = _mm_setzero_ps();
    __m128 accR = _mm_setzero_ps();

    // A pre-tested "> 0" condition runs exactly count/4 iterations for every
    // valid count and still terminates if the assertion above is compiled out
    // and count is zero, negative, or not a multiple of four.
    for (; count > 0; count -= 4) {
        __m128 posCoef = _mm_load_ps(coefsP);
        __m128 negCoef = _mm_load_ps(coefsN);
        coefsP += 4;
        coefsN += 4;

        if (!FIXED) { // interpolate
            __m128 posCoef1 = _mm_load_ps(coefsP1);
            __m128 negCoef1 = _mm_load_ps(coefsN1);
            coefsP1 += 4;
            coefsN1 += 4;

            // Calculate the final coefficient for interpolation
            // posCoef = interp * (posCoef1 - posCoef) + posCoef
            // negCoef = interp * (negCoef - negCoef1) + negCoef1
            posCoef1 = _mm_sub_ps(posCoef1, posCoef);
            negCoef = _mm_sub_ps(negCoef, negCoef1);

            posCoef1 = _mm_mul_ps(posCoef1, interp);
            negCoef = _mm_mul_ps(negCoef, interp);

            posCoef = _mm_add_ps(posCoef1, posCoef);
            negCoef = _mm_add_ps(negCoef, negCoef1);
        }
        switch (CHANNELS) {
        case 1: {
            __m128 posSamp = _mm_loadu_ps(sP);
            __m128 negSamp = _mm_loadu_ps(sN);
            sP -= 4;
            sN += 4;

            // 0x1B selects lanes 3,2,1,0: reverse the positive samples so
            // lane 0 holds the sample at the highest address.
            posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
            posSamp = _mm_mul_ps(posSamp, posCoef);
            negSamp = _mm_mul_ps(negSamp, negCoef);

            accL = _mm_add_ps(accL, posSamp);
            accL = _mm_add_ps(accL, negSamp);
        } break;
        case 2: {
            __m128 posSamp0 = _mm_loadu_ps(sP);
            __m128 posSamp1 = _mm_loadu_ps(sP+4);
            __m128 negSamp0 = _mm_loadu_ps(sN);
            __m128 negSamp1 = _mm_loadu_ps(sN+4);
            sP -= 8;
            sN += 8;

            // deinterleave everything and reverse the positives
            // 0x22 / 0x77 pick the even / odd lanes of (posSamp1, posSamp0),
            // highest address first; 0x88 / 0xDD pick the even / odd lanes of
            // (negSamp0, negSamp1) in address order.
            __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
            __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
            __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
            __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);

            posSampL = _mm_mul_ps(posSampL, posCoef);
            posSampR = _mm_mul_ps(posSampR, posCoef);
            negSampL = _mm_mul_ps(negSampL, negCoef);
            negSampR = _mm_mul_ps(negSampR, negCoef);

            accL = _mm_add_ps(accL, posSampL);
            accR = _mm_add_ps(accR, posSampR);
            accL = _mm_add_ps(accL, negSampL);
            accR = _mm_add_ps(accR, negSampR);
        } break;
        }
    }

    // multiply by volume and accumulate into out
    // vLR     = { volumeLR[0], volumeLR[1], 0, 0 }
    // outSamp = { out[0],      out[1],      0, 0 } (upper lanes come from vLR)
    __m128 vLR = _mm_setzero_ps();
    __m128 outSamp;
    vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
    outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));

    // combine and funnel down accumulator
    __m128 outAccum = _mm_setzero_ps();
    if (CHANNELS == 1) {
        // duplicate accL to both L and R:
        // after the two adds, lanes 0 and 1 both hold the sum of all four lanes
        outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
        outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
    } else if (CHANNELS == 2) {
        // accR contains R, fold in:
        // two horizontal adds leave { sum(accL), sum(accR), sum(accL), sum(accR) }
        outAccum = _mm_hadd_ps(accL, accR);
        outAccum = _mm_hadd_ps(outAccum, outAccum);
    }

    outAccum = _mm_mul_ps(outAccum, vLR);
    outSamp = _mm_add_ps(outSamp, outAccum);
    // only the low two lanes (out[0], out[1]) are written back
    _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
}

// ProcessL<1, 16>: one channel, coefficients used as-is (FIXED = true).
template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
}

// ProcessL<2, 16>: two interleaved channels, coefficients used as-is (FIXED = true).
template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
}

// Process<1, 16>: one channel, coefficients interpolated with lerpP (FIXED = false).
template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

// Process<2, 16>: two interleaved channels, coefficients interpolated with lerpP
// (FIXED = false).
template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

#endif //USE_SSE

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/