/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON

// use intrinsics if inline arm32 assembly is not possible
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif
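
// For reference, USE_NEON and USE_INLINE_ASSEMBLY are expected to come from
// the build; a hypothetical compiler invocation forcing the intrinsic paths
// might pass: -DUSE_NEON=1 -DUSE_INLINE_ASSEMBLY=0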

// the following intrinsics are available only in the ARM 64-bit ACLE
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif

#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print the GCC version, which may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */

//
// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
// Two variants are presented here:
// ARM NEON inline assembly, which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
// ARM NEON intrinsics, which can also be used by arm64 and x86/64 with a NEON header.
//
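
// For orientation, a scalar sketch of the mono dot product these kernels
// vectorize (illustrative only; the real callers live in
// AudioResamplerFirProcess.h, and sP here means the pointer as passed in,
// before the kernels' internal rewind). The positive side is traversed in
// reverse while the negative side runs forward:
//
//   int32_t accum = 0;
//   for (int i = 0; i < count; ++i) {
//       accum += sP[-i] * coefsP[i];   // positive side, reversed samples
//       accum += sN[i] * coefsN[i];    // negative side, forward samples
//   }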

// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// These are only used for inline assembly.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int16_t* coefsP1,
        const int16_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);

    int16x4_t interp;
    if (!FIXED) {
        interp = vdup_n_s16(lerpP);
        //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
        coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // Note: zeroing with veorq_s32(accum, accum) also works but triggers an
    // uninitialized-variable warning, so use vdupq_n_s32(0) instead.
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        accum2 = vdupq_n_s32(0);
    }
    do {
        int16x8_t posCoef = vld1q_s16(coefsP);
        coefsP += 8;
        int16x8_t negCoef = vld1q_s16(coefsN);
        coefsN += 8;
        if (!FIXED) { // interpolate
            int16x8_t posCoef1 = vld1q_s16(coefsP1);
            coefsP1 += 8;
            int16x8_t negCoef1 = vld1q_s16(coefsN1);
            coefsN1 += 8;

            posCoef1 = vsubq_s16(posCoef1, posCoef);
            negCoef = vsubq_s16(negCoef, negCoef1);

            posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
            negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);

            posCoef = vaddq_s16(posCoef, posCoef1);
            negCoef = vaddq_s16(negCoef, negCoef1);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}
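
// When FIXED is false, the kernel above linearly interpolates between two
// coefficient sets before the dot product. Per 16-bit lane this is roughly
// (scalar sketch, with lerpP interpreted as a Q15 fraction):
//
//   coef = coefP + (int16_t)(((int32_t)(coefP1 - coefP) * lerpP * 2 + (1 << 15)) >> 16);
//
// The negative side swaps the operand roles, computing
// negCoef1 + (negCoef - negCoef1) * lerpP with the same vsubq_s16 /
// vqrdmulhq_lane_s16 / vaddq_s16 sequence.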

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int32_t* coefsP1,
        const int32_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);

    int32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_s32(lerpP);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // Note: zeroing with veorq_s32(accum, accum) also works but triggers an
    // uninitialized-variable warning, so use vdupq_n_s32(0) instead.
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        accum2 = vdupq_n_s32(0);
    }
    do {
#ifdef vld1q_s32_x2
        int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
        coefsP += 8;
        int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
        coefsN += 8;
#else
        int32x4x2_t posCoef;
        posCoef.val[0] = vld1q_s32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_s32(coefsP);
        coefsP += 4;
        int32x4x2_t negCoef;
        negCoef.val[0] = vld1q_s32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_s32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_s32_x2
            int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
            coefsP1 += 8;
            int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
            coefsN1 += 8;
#else
            int32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            int32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_s32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_s32(coefsN1);
            coefsN1 += 4;
#endif

            posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);

            posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
            posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
            negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
            negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);

            posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
            posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
            negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // left
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            // right
            posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
            posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
            negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
            negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum2 = vaddq_s32(accum2, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum2 = vaddq_s32(accum2, posSamp1);
            accum2 = vaddq_s32(accum2, negSamp0);

            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}
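
// In the 32-bit coefficient kernel above, vshll_n_s16(x, 15) widens each Q15
// sample to a 31-bit value so the Q31 vqrdmulhq_s32 retains coefficient
// precision. Per lane the pair behaves roughly like this scalar sketch
// (rounding and saturation elided):
//
//   int32_t widened = (int32_t)samp << 15;                      // 16 -> 31 bits
//   int32_t product = (int32_t)(((int64_t)widened * coef * 2) >> 32);
//   // ~= (samp * coef) >> 16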

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);

    float32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_f32(lerpP);
        coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
    }
    float32x4_t accum, accum2;
    // Note: zeroing via a veorq idiom would trigger an uninitialized-variable
    // warning, so use vdupq_n_f32(0) instead.
    accum = vdupq_n_f32(0);
    if (CHANNELS == 2) {
        accum2 = vdupq_n_f32(0);
    }
    do {
#ifdef vld1q_f32_x2
        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
        coefsP += 8;
        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
        coefsN += 8;
#else
        float32x4x2_t posCoef;
        posCoef.val[0] = vld1q_f32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_f32(coefsP);
        coefsP += 4;
        float32x4x2_t negCoef;
        negCoef.val[0] = vld1q_f32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_f32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_f32_x2
            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
            coefsP1 += 8;
            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
            coefsN1 += 8;
#else
            float32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            float32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_f32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_f32(coefsN1);
            coefsN1 += 4;
#endif
            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
        }
        switch (CHANNELS) {
        case 1: {
#ifdef vld1q_f32_x2
            float32x4x2_t posSamp = vld1q_f32_x2(sP);
            float32x4x2_t negSamp = vld1q_f32_x2(sN);
            sN += 8;
            sP -= 8;
#else
            float32x4x2_t posSamp;
            posSamp.val[0] = vld1q_f32(sP);
            sP += 4;
            posSamp.val[1] = vld1q_f32(sP);
            sP -= 12;
            float32x4x2_t negSamp;
            negSamp.val[0] = vld1q_f32(sN);
            sN += 4;
            negSamp.val[1] = vld1q_f32(sN);
            sN += 4;
#endif
            // effectively we want a vrev128q_f32()
            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
            posSamp.val[0] = vcombine_f32(
                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
            posSamp.val[1] = vcombine_f32(
                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
        } break;
        case 2: {
            float32x4x2_t posSamp0 = vld2q_f32(sP);
            sP += 8;
            float32x4x2_t negSamp0 = vld2q_f32(sN);
            sN += 8;
            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
            posSamp0.val[0] = vcombine_f32(
                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
            posSamp0.val[1] = vcombine_f32(
                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

            float32x4x2_t posSamp1 = vld2q_f32(sP);
            sP -= 24;
            float32x4x2_t negSamp1 = vld2q_f32(sN);
            sN += 8;
            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
            posSamp1.val[0] = vcombine_f32(
                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
            posSamp1.val[1] = vcombine_f32(
                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

            // Note: speed is affected by accumulation order.
            // Also, speed appears slower using vmul/vadd instead of vmla for
            // the stereo case, and comparable for mono.

            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
    float32x2_t vLR = vld1_f32(volumeLR);
    float32x2_t outSamp = vld1_f32(out);
    // combine and funnel down accumulator
    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_f32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        outAccum = vpadd_f32(outAccum, outAccum2);
    }
    outSamp = vmla_f32(outSamp, outAccum, vLR);
    vst1_f32(out, outSamp);
}
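
// The float kernel reverses each group of four samples with vrev64q_f32
// followed by a swap of the 64-bit halves via vcombine_f32; together these
// act as the "vrev128q_f32" noted above. Index-wise (illustrative):
//
//   {s0, s1, s2, s3} --vrev64q_f32--> {s1, s0, s3, s2} --swap halves--> {s3, s2, s1, s0}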

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

#endif // USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/