/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()
//
// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
// issues with S16 coefs. Consider this later.

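// For reference, a scalar sketch of the behaviour these specializations are
// assumed to implement (mirroring the generic templates in
// AudioResamplerFirProcess.h; fixed-point scaling and saturation elided):
//
//     int32_t l = 0, r = 0;
//     for (int i = 0; i < count; ++i) {
//         l += coefsP[i] * sP[0] + coefsN[i] * sN[0];
//         if (CHANNELS == 2) {
//             r += coefsP[i] * sP[1] + coefsN[i] * sN[1];
//         }
//         sP -= CHANNELS;                  // positive side walks backwards
//         sN += CHANNELS;                  // negative side walks forwards
//     }
//     if (CHANNELS == 1) r = l;            // mono feeds both output channels
//     out[0] += l * volumeLR[0];           // volume applied in Q31,
//     out[1] += r * volumeLR[1];           // saturating accumulate into out[]
//
// Process() additionally interpolates each coefficient between two polyphases
// before use: c = cP + lerpP * (cP1 - cP), with lerpP the phase fraction.
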
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

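// In scalar terms the two macros amount to (a sketch; vqrdmulh is the Q31
// rounded doubling multiply, vqadd the saturating add):
//
//     int32_t l = horizontal_sum(q0);              // mono/left accumulator
//     int32_t r = stereo ? horizontal_sum(q4) : l; // mono replicates l
//     out[0] = sat_add(out[0], mulQ31(l, volumeLR[0]));
//     out[1] = sat_add(out[1], mulQ31(r, volumeLR[1]));
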
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
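    // Rewind the positive-side pointer: the filter walks sP backwards, so back it
    // up by (STRIDE/2 - 1) frames and load/reverse a full vector each pass.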
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
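        // vld2 de-interleaves the stereo data: left samples land in q2/q5, right in q3/q6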

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
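        // q8/q10 now hold the interpolated coefficients: c ~= cP + lerpP*(cP1 - cP),
        // with the Q15 multiply by lerpP done by vqrdmulh above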

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
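        // samples are now Q31 (sign-extended, shifted left by 15); the vqrdmulh.s32 below
        // forms the rounded doubling high half of sample*coef, i.e. a Q31 fixed-point product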

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
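        // the left channel is accumulated in q0 and the right channel in q4,
        // as ASSEMBLY_ACCUMULATE_STEREO expects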

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
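        // q8-q11 now hold the interpolated 32-bit coefficients, c ~= cP + lerpP*(cP1 - cP)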

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo samples
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void ProcessL<1, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs

        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0

        // reordering the vmlal to do d6 before d4 is slower(?)
        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}

template <>
inline void ProcessL<2, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 4 16-bits stereo samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 4 frames of the positive side

        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}

template <>
inline void Process<1, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation

        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      d4, d4                   \n"// (1) reverse s3, s2, s1, s0

        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6 before d4 is slower(?)
        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON above seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void Process<2, 8>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 4 16-bits stereo samples
        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs
        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation

        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 4 frames of the positive side

        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON seems to be slower
        "subs           %[count], %[count], #4   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void ProcessL<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      d4, d4                   \n"// reverse 4 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q14"
    );
}

template <>
inline void ProcessL<2, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"

        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs

        "vrev64.16      q2, q2                   \n"// reverse 4 frames of the positive side

        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3", "q4",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<1, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation

        "vrev64.16      d4, d4                   \n"// reverse 4 frames of the positive side

        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits

        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
        "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsP1] "+r" (coefsP1),
          [coefsN0] "+r" (coefsN),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q14"
    );
}

template <>
inline void Process<2, 8>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 8;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"
        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation

        "vrev64.16      q2, q2                   \n"// reverse 4 frames of the positive side

        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coefs
        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits

        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
        "vadd.s32       q10, q10, q11            \n"// interpolate (step3) 2nd set

        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result
        "vadd.s32       q0, q0, q14              \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result

        "subs           %[count], %[count], #4   \n"// update loop counter
        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsP1] "+r" (coefsP1),
          [coefsN0] "+r" (coefsN),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3", "q4",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}


#endif //USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
