AudioResamplerFirProcessNeon.h revision d7a77156eb13973f7fce5c9db6113bef83bc205b
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()

// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

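// Illustrative scalar sketch of what the accumulate macros above compute:
// reduce the NEON partial sums, apply the fixed-point volume with a rounding
// doubling multiply-high (vqrdmulh.s32), and saturating-add into the stereo
// output (vqadd.s32). Not compiled; the helper names below are ours and are
// not part of AudioResamplerFirProcess.h.
#if 0
static inline int32_t mulRdQ31(int32_t a, int32_t b) {
    // Rounding doubling multiply returning the high 32 bits, like vqrdmulh.s32.
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // NEON saturates this case
    return (int32_t)(((int64_t)a * b + (1LL << 30)) >> 31);
}

static inline int32_t addSatQ32(int32_t a, int32_t b) {
    // Saturating 32-bit add, like vqadd.s32.
    int64_t s = (int64_t)a + b;
    if (s > INT32_MAX) return INT32_MAX;
    if (s < INT32_MIN) return INT32_MIN;
    return (int32_t)s;
}

static inline void accumulateStereo(int32_t* out, int32_t sumL, int32_t sumR,
        const int32_t* volumeLR) {
    out[0] = addSatQ32(out[0], mulRdQ31(sumL, volumeLR[0]));
    out[1] = addSatQ32(out[1], mulRdQ31(sumR, volumeLR[1]));
    // The mono macro is the same with sumL == sumR (the vpadd replication).
}
#endif
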
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bit mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bit mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bit coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bit coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}
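
// Illustrative scalar sketch of the mono 16-bit-coefficient loop above (not
// compiled). sP/sN denote the pointers as passed in, before the internal
// sP -= CHANNELS*((STRIDE>>1)-1) adjustment needed by the vector loads; the
// vrev64.16 plus the post-loop "sub sP, #16" realize the backwards walk over
// the positive side. accumulateStereo()/mulRdQ31() refer to the sketch above.
#if 0
static inline void ProcessLMonoSketch(int32_t* const out, int count,
        const int16_t* coefsP, const int16_t* coefsN,
        const int16_t* sP, const int16_t* sN,
        const int32_t* const volumeLR)
{
    int32_t accum = 0;
    for (int i = 0; i < count; ++i) {
        accum += (int32_t)coefsP[i] * sP[-i]; // positive side, reversed (vmlal.s16)
        accum += (int32_t)coefsN[i] * sN[i];  // negative side, forward (vmlal.s16)
    }
    // apply volume and saturating-accumulate, as in ASSEMBLY_ACCUMULATE_MONO
    accumulateStereo(out, accum, accum, volumeLR);
}
#endif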

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bit stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bit stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bit coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bit coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before NEON seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}
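
// The stereo variant is the same dot product with deinterleaved frames
// (vld2.16 splits left into q2/q5 and right into q3/q6) and one accumulator
// per channel. An illustrative, non-compiled sketch of the loop body, with
// sP/sN as passed in (see the mono sketch above):
#if 0
    int32_t accumL = 0, accumR = 0;
    for (int i = 0; i < count; ++i) {
        accumL += (int32_t)coefsP[i] * sP[-2*i];     // left,  positive side
        accumR += (int32_t)coefsP[i] * sP[-2*i + 1]; // right, positive side
        accumL += (int32_t)coefsN[i] * sN[2*i];      // left,  negative side
        accumR += (int32_t)coefsN[i] * sN[2*i + 1];  // right, negative side
    }
    accumulateStereo(out, accumL, accumR, volumeLR); // ASSEMBLY_ACCUMULATE_STEREO
#endif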

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bit mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bit mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bit coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bit coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bit coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bit coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before NEON above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}
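
// Illustrative scalar sketch (not compiled) of the coefficient interpolation
// above. The vqrdmulh.s16 by d2[0] uses the low 16 bits of lerpP as the Q15
// phase fraction (per the "S32 Q15" comment). Each positive-side tap blends
// coefsP toward coefsP1, while the negative side blends coefsN1 toward
// coefsN0 (note the q10/q11 load order).
#if 0
static inline int16_t interpQ15(int16_t c0, int16_t c1, int16_t lerp) {
    int16_t diff = (int16_t)(c1 - c0);                            // vsub.s16 (modular)
    // vqrdmulh.s16: rounding doubling multiply-high by the Q15 phase
    // (NEON additionally saturates the extreme corner case).
    int32_t step = (int32_t)(((int64_t)diff * lerp * 2 + (1 << 15)) >> 16);
    return (int16_t)(c0 + step);                                  // vadd.s16
}

// per-tap use inside the mono loop sketch:
//   int16_t cP = interpQ15(coefsP[i],  coefsP1[i], (int16_t)lerpP);
//   int16_t cN = interpQ15(coefsN1[i], coefsN[i],  (int16_t)lerpP);
//   accum += (int32_t)cP * sP[-i] + (int32_t)cN * sN[i];
#endif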

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bit stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bit stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bit coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bit coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bit coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bit coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before NEON seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bit mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bit mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bit coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bit coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}
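
// In the 32-bit coefficient variants, each 16-bit sample is widened to 31
// significant bits (vshll.s16 #15) and multiplied by the 32-bit fixed-point
// coefficient with a rounding doubling multiply-high (vqrdmulh.s32), then
// accumulated with a plain wrapping add (vadd.s32). An illustrative,
// non-compiled per-tap sketch, reusing mulRdQ31() from the first sketch:
#if 0
static inline int32_t mulTapCoef32(int16_t sample, int32_t coef) {
    int32_t s = (int32_t)sample << 15;   // vshll.s16 #15: extend to 31 bits
    return mulRdQ31(s, coef);            // vqrdmulh.s32, roughly (sample * coef) >> 16
}

// mono loop body:
//   accum += mulTapCoef32(sP[-i], coefsP[i]);  // positive side (reversed)
//   accum += mulTapCoef32(sN[i],  coefsN[i]);  // negative side
#endif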

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bit stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bit stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bit coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bit coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bit mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bit mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bit coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bit coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bit coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bit coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bit stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bit stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bit coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bit coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bit coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bit coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits

        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

#endif // USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/