1/*
2 * Copyright (C) 2016 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
18#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
19
20namespace android {
21
22// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23
24#if USE_SSE
25
// Two-step stringification: TO_STRING() lets its argument be macro-expanded
// and then passes the result to TO_STRING2(), which applies the # operator.
#define TO_STRING2(token) #token
#define TO_STRING(token) TO_STRING2(token)
// Uncomment the pragma below to print the GCC version at compile time; the
// version may be relevant for intrinsic optimizations.
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
32
33//
34// SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
35//
36
37template <int CHANNELS, int STRIDE, bool FIXED>
38static inline void ProcessSSEIntrinsic(float* out,
39        int count,
40        const float* coefsP,
41        const float* coefsN,
42        const float* sP,
43        const float* sN,
44        const float* volumeLR,
45        float lerpP,
46        const float* coefsP1,
47        const float* coefsN1)
48{
49    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
50    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
51
52    sP -= CHANNELS*(4-1);   // adjust sP for a loop iteration of four
53
54    __m128 interp;
55    if (!FIXED) {
56        interp = _mm_set1_ps(lerpP);
57    }
58
59    __m128 accL, accR;
60    accL = _mm_setzero_ps();
61    if (CHANNELS == 2) {
62        accR = _mm_setzero_ps();
63    }
64
65    do {
66        __m128 posCoef = _mm_load_ps(coefsP);
67        __m128 negCoef = _mm_load_ps(coefsN);
68        coefsP += 4;
69        coefsN += 4;
70
71        if (!FIXED) { // interpolate
72            __m128 posCoef1 = _mm_load_ps(coefsP1);
73            __m128 negCoef1 = _mm_load_ps(coefsN1);
74            coefsP1 += 4;
75            coefsN1 += 4;
76
77            // Calculate the final coefficient for interpolation
78            // posCoef = interp * (posCoef1 - posCoef) + posCoef
79            // negCoef = interp * (negCoef - negCoef1) + negCoef1
80            posCoef1 = _mm_sub_ps(posCoef1, posCoef);
81            negCoef = _mm_sub_ps(negCoef, negCoef1);
82
83            posCoef1 = _mm_mul_ps(posCoef1, interp);
84            negCoef = _mm_mul_ps(negCoef, interp);
85
86            posCoef = _mm_add_ps(posCoef1, posCoef);
87            negCoef = _mm_add_ps(negCoef, negCoef1);
88        }
89        switch (CHANNELS) {
90        case 1: {
91            __m128 posSamp = _mm_loadu_ps(sP);
92            __m128 negSamp = _mm_loadu_ps(sN);
93            sP -= 4;
94            sN += 4;
95
96            posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
97            posSamp = _mm_mul_ps(posSamp, posCoef);
98            negSamp = _mm_mul_ps(negSamp, negCoef);
99
100            accL = _mm_add_ps(accL, posSamp);
101            accL = _mm_add_ps(accL, negSamp);
102        } break;
103        case 2: {
104            __m128 posSamp0 = _mm_loadu_ps(sP);
105            __m128 posSamp1 = _mm_loadu_ps(sP+4);
106            __m128 negSamp0 = _mm_loadu_ps(sN);
107            __m128 negSamp1 = _mm_loadu_ps(sN+4);
108            sP -= 8;
109            sN += 8;
110
111            // deinterleave everything and reverse the positives
112            __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
113            __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
114            __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
115            __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
116
117            posSampL = _mm_mul_ps(posSampL, posCoef);
118            posSampR = _mm_mul_ps(posSampR, posCoef);
119            negSampL = _mm_mul_ps(negSampL, negCoef);
120            negSampR = _mm_mul_ps(negSampR, negCoef);
121
122            accL = _mm_add_ps(accL, posSampL);
123            accR = _mm_add_ps(accR, posSampR);
124            accL = _mm_add_ps(accL, negSampL);
125            accR = _mm_add_ps(accR, negSampR);
126        } break;
127        }
128    } while (count -= 4);
129
130    // multiply by volume and save
131    __m128 vLR = _mm_setzero_ps();
132    __m128 outSamp;
133    vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
134    outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
135
136    // combine and funnel down accumulator
137    __m128 outAccum = _mm_setzero_ps();
138    if (CHANNELS == 1) {
139        // duplicate accL to both L and R
140        outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
141        outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
142    } else if (CHANNELS == 2) {
143        // accR contains R, fold in
144        outAccum = _mm_hadd_ps(accL, accR);
145        outAccum = _mm_hadd_ps(outAccum, outAccum);
146    }
147
148    outAccum = _mm_mul_ps(outAccum, vLR);
149    outSamp = _mm_add_ps(outSamp, outAccum);
150    _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
151}
152
153template<>
154inline void ProcessL<1, 16>(float* const out,
155        int count,
156        const float* coefsP,
157        const float* coefsN,
158        const float* sP,
159        const float* sN,
160        const float* const volumeLR)
161{
162    ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
163            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
164}
165
166template<>
167inline void ProcessL<2, 16>(float* const out,
168        int count,
169        const float* coefsP,
170        const float* coefsN,
171        const float* sP,
172        const float* sN,
173        const float* const volumeLR)
174{
175    ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
176            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
177}
178
179template<>
180inline void Process<1, 16>(float* const out,
181        int count,
182        const float* coefsP,
183        const float* coefsN,
184        const float* coefsP1,
185        const float* coefsN1,
186        const float* sP,
187        const float* sN,
188        float lerpP,
189        const float* const volumeLR)
190{
191    ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
192            lerpP, coefsP1, coefsN1);
193}
194
195template<>
196inline void Process<2, 16>(float* const out,
197        int count,
198        const float* coefsP,
199        const float* coefsN,
200        const float* coefsP1,
201        const float* coefsN1,
202        const float* sP,
203        const float* sN,
204        float lerpP,
205        const float* const volumeLR)
206{
207    ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
208            lerpP, coefsP1, coefsN1);
209}
210
211#endif //USE_SSE
212
213} // namespace android
214
215#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/
216