// AudioResamplerFirProcessNeon.h — revision d7a77156eb13973f7fce5c9db6113bef83bc205b
/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()
//
// Conventions used throughout this file:
//  - q0 holds the (mono or left-channel) accumulator; q4 holds the right-
//    channel accumulator in the stereo variants.
//  - The parenthesized numbers in the per-instruction comments, e.g. (1+4d),
//    look like hand-scheduling annotations (issue cycles + result delay) —
//    NOTE(review): inferred from context; confirm against original authors.
//  - NOTE(review): each loop decrements count by 8 and exits only on zero
//    ("subs"/"bne"), so count is assumed to be a positive multiple of 8 —
//    confirm with callers.

// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// Both macros pairwise-reduce the 4 partial sums per accumulator, apply the
// volumeLR gains with a saturating rounding-doubling multiply, then
// saturating-add into the two int32 output samples at *out.

#define ASSEMBLY_ACCUMULATE_MONO \
    "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
    "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
    "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\
    "vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\
    "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
    "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
    "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
    "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
    "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
    "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0 */\
    "vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4 */\
    "vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R */\
    "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
    "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
    "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */

// ProcessL: mono FIR pass, 16-bit coefficients, no phase interpolation.
// Multiply-accumulates 8 positive-side (time-reversed via vrev64) and 8
// negative-side 16-bit samples per iteration into q0 with vmlal.s16, then
// applies volume and writes out[] via ASSEMBLY_ACCUMULATE_MONO.
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first sample of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef
        "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
        "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
}

// ProcessL: stereo FIR pass, 16-bit coefficients, no phase interpolation.
// vld2 de-interleaves L/R frames into separate q registers; left accumulates
// into q0, right into q4; both are combined by ASSEMBLY_ACCUMULATE_STEREO.
template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first frame of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// (1) acc_L = 0
        "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right

        "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
        "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
        "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
        "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
        "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
        "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
        "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
}

// Process: mono FIR pass, 16-bit coefficients, WITH phase interpolation.
// Each iteration linearly interpolates between the two coefficient sets
// (coefsP0/coefsP1 and coefsN1/coefsN0) by lerpP before the multiply-
// accumulate: coef = c0 + lerpP * (c1 - c0), computed with vsub/vqrdmulh/vadd.
// lerpP is applied via vqrdmulh.s16 against d2[0] — per the comment below it
// is an S32 Q15 fixed-point fraction (only the low half-lanes are used).
template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first sample of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
        "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
        "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
        "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
}

// Process: stereo FIR pass, 16-bit coefficients, WITH phase interpolation.
// Same coefficient interpolation as the mono variant; samples are
// de-interleaved with vld2 and accumulated into q0 (left) / q4 (right).
template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first frame of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
        "veor q0, q0, q0 \n"// (1) acc_L = 0
        "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right

        "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
        "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
        "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
        "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
        "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
        "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
        "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
}

// ProcessL: mono FIR pass, 32-bit coefficients, no phase interpolation.
// 16-bit samples are widened to Q31 with vshll #15, multiplied against the
// 32-bit coefs with vqrdmulh.s32, and summed into q0. Note the deliberate
// pairing: reversed positive samples use the coefs in reversed q-register
// order (q12*q9, q13*q8).
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first sample of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// result, initialize to 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side

        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
        "subs %[count], %[count], #8 \n"// update loop counter

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// ProcessL: stereo FIR pass, 32-bit coefficients, no phase interpolation.
// Runs the widen/multiply/accumulate sequence twice per iteration — first for
// the left channel (q2/q5 -> q0), then for the right channel (q3/q6 -> q4) —
// reusing q12-q15 as scratch between the two halves.
template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first frame of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// result, initialize to 0
        "veor q4, q4, q4 \n"// result, initialize to 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right

        // left channel
        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        // right channel
        "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef

        "vadd.s32 q4, q4, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q4, q4, q15 \n"// accumulate result
        "vadd.s32 q4, q4, q13 \n"// accumulate result

        "subs %[count], %[count], #8 \n"// update loop counter
        "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// Process: mono FIR pass, 32-bit coefficients, WITH phase interpolation.
// Interpolates all four 32-bit coefficient q-register pairs by lerpP
// (vsub/vqrdmulh.s32 against d2[0]/vadd) before the widen-and-multiply
// sequence. q12-q15 serve first as interpolation scratch, then as the
// widened-sample registers.
template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first sample of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
        "veor q0, q0, q0 \n"// result, initialize to 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
        "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
        "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
        "vsub.s32 q15, q15, q11 \n"// interpolate (step1)

        "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)

        "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
        "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
        "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
        "vadd.s32 q11, q11, q15 \n"// interpolate (step3)

        "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side

        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
        "subs %[count], %[count], #8 \n"// update loop counter

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

// Process: stereo FIR pass, 32-bit coefficients, WITH phase interpolation.
// Combines the coefficient interpolation of the mono 32-bit variant with the
// two-channel (q0 left / q4 right) accumulation of the stereo 32-bit ProcessL.
template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    // rewind sP to the first frame of the positive-side window (read backwards)
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
        "veor q0, q0, q0 \n"// result, initialize to 0
        "veor q4, q4, q4 \n"// result, initialize to 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
        "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
        "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
        "vsub.s32 q15, q15, q11 \n"// interpolate (step1)

        "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)

        "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
        "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
        "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
        "vadd.s32 q11, q11, q15 \n"// interpolate (step3)

        "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right

        // left channel
        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        // right channel
        "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef

        "vadd.s32 q4, q4, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q4, q4, q15 \n"// accumulate result
        "vadd.s32 q4, q4, q13 \n"// accumulate result

        "subs %[count], %[count], #8 \n"// update loop counter
        "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
}

#endif //USE_NEON

}; // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/