/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON

// use intrinsics if inline arm32 assembly is not possible
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif

// the following intrinsics are available only with the ARM 64-bit ACLE
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif

#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print the GCC version, which may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */

//
// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
// Two variants are presented here:
// ARM NEON inline assembly, which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
// ARM NEON intrinsics, which can also be used by arm64 and x86/64 with a NEON header.
//

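// Conceptually, each Process() / ProcessL() specialization below computes one output
// frame of a dual-sided polyphase FIR and accumulates it into out[] after applying the
// left/right volume.  A rough scalar sketch of the mono case (illustrative only --
// pointer stepping, Q-format scaling and saturation are simplified):
//
//     accum = 0;
//     for (int i = 0; i < count; ++i) {
//         // ProcessL(): coefP = coefsP[i], coefN = coefsN[i]
//         // Process():  interpolate between the coefficient sets by the phase fraction
//         coefP = coefsP[i]  + lerpP * (coefsP1[i] - coefsP[i]);
//         coefN = coefsN1[i] + lerpP * (coefsN[i]  - coefsN1[i]);  // note reversed direction
//         accum += sP[-i] * coefP;    // positive side, samples traversed backwards
//         accum += sN[i]  * coefN;    // negative side, samples traversed forwards
//     }
//     out[0] += saturate(accum * volumeLR[0]);    // left
//     out[1] += saturate(accum * volumeLR[1]);    // right
//
// The parenthesized figures in the assembly comments (e.g. "(1)", "(2+0d)") appear to be
// rough per-instruction cycle/stall estimates.
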
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// These are only used for inline assembly.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int16_t* coefsP1,
        const int16_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);

    int16x4_t interp;
    if (!FIXED) {
        interp = vdup_n_s16(lerpP);
        //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
        coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
        int16x8_t posCoef = vld1q_s16(coefsP);
        coefsP += 8;
        int16x8_t negCoef = vld1q_s16(coefsN);
        coefsN += 8;
        if (!FIXED) { // interpolate
            int16x8_t posCoef1 = vld1q_s16(coefsP1);
            coefsP1 += 8;
            int16x8_t negCoef1 = vld1q_s16(coefsN1);
            coefsN1 += 8;

            posCoef1 = vsubq_s16(posCoef1, posCoef);
            negCoef = vsubq_s16(negCoef, negCoef1);

            posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
            negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);

            posCoef = vaddq_s16(posCoef, posCoef1);
            negCoef = vaddq_s16(negCoef, negCoef1);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
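            // vrev64q_s16 only reverses within each 64-bit half, so the full
            // 128-bit reversal of the positive-side samples is completed in the
            // dot product below by pairing the low half of the reversed samples
            // with the high half of the positive coefficients (and vice versa),
            // hence the "reversed" / "r" comments.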

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int32_t* coefsP1,
        const int32_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);

    int32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_s32(lerpP);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
#ifdef vld1q_s32_x2
        int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
        coefsP += 8;
        int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
        coefsN += 8;
#else
        int32x4x2_t posCoef;
        posCoef.val[0] = vld1q_s32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_s32(coefsP);
        coefsP += 4;
        int32x4x2_t negCoef;
        negCoef.val[0] = vld1q_s32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_s32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_s32_x2
            int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
            coefsP1 += 8;
            int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
            coefsN1 += 8;
#else
            int32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            int32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_s32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_s32(coefsN1);
            coefsN1 += 4;
#endif

            posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);

            posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
            posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
            negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
            negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);

            posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
            posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
            negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // left
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            // right
            posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
            posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
            negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
            negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum2 = vaddq_s32(accum2, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum2 = vaddq_s32(accum2, posSamp1);
            accum2 = vaddq_s32(accum2, negSamp0);

            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);

    float32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_f32(lerpP);
        coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
    }
    float32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_f32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_f32(0);
    }
    do {
#ifdef vld1q_f32_x2
        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
        coefsP += 8;
        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
        coefsN += 8;
#else
        float32x4x2_t posCoef;
        posCoef.val[0] = vld1q_f32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_f32(coefsP);
        coefsP += 4;
        float32x4x2_t negCoef;
        negCoef.val[0] = vld1q_f32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_f32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_f32_x2
            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
            coefsP1 += 8;
            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
            coefsN1 += 8;
#else
            float32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            float32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_f32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_f32(coefsN1);
            coefsN1 += 4;
#endif
            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
        }
        switch (CHANNELS) {
        case 1: {
#ifdef vld1q_f32_x2
            float32x4x2_t posSamp = vld1q_f32_x2(sP);
            float32x4x2_t negSamp = vld1q_f32_x2(sN);
            sN += 8;
            sP -= 8;
#else
            float32x4x2_t posSamp;
            posSamp.val[0] = vld1q_f32(sP);
            sP += 4;
            posSamp.val[1] = vld1q_f32(sP);
            sP -= 12;
            float32x4x2_t negSamp;
            negSamp.val[0] = vld1q_f32(sN);
            sN += 4;
            negSamp.val[1] = vld1q_f32(sN);
            sN += 4;
#endif
            // effectively we want a vrev128q_f32()
            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
            posSamp.val[0] = vcombine_f32(
                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
            posSamp.val[1] = vcombine_f32(
                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
        } break;
        case 2: {
            float32x4x2_t posSamp0 = vld2q_f32(sP);
            sP += 8;
            float32x4x2_t negSamp0 = vld2q_f32(sN);
            sN += 8;
            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
            posSamp0.val[0] = vcombine_f32(
                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
            posSamp0.val[1] = vcombine_f32(
                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

            float32x4x2_t posSamp1 = vld2q_f32(sP);
            sP -= 24;
            float32x4x2_t negSamp1 = vld2q_f32(sN);
            sN += 8;
            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
            posSamp1.val[0] = vcombine_f32(
                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
            posSamp1.val[1] = vcombine_f32(
                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

            // Note: speed is affected by accumulation order.
            // Also, speed appears slower using vmul/vadd instead of vmla for
            // the stereo case, and comparable for mono.

            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
    float32x2_t vLR = vld1_f32(volumeLR);
    float32x2_t outSamp = vld1_f32(out);
    // combine and funnel down accumulator
    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_f32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        outAccum = vpadd_f32(outAccum, outAccum2);
    }
    outSamp = vmla_f32(outSamp, outAccum, vLR);
    vst1_f32(out, outSamp);
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
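        // Register aliasing reminder (dN are the halves of qN): d4,d5 = q2
        // (reversed positive samples), d6,d7 = q3 (negative samples),
        // d16,d17 = q8 (positive coefs), d20,d21 = q10 (negative coefs).
        // Crossing low sample halves with high coef halves (and vice versa)
        // completes the 128-bit reversal, as in the intrinsic path above.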
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON instructions above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON instructions seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else

    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before the NEON instructions above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
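
        // Three-step coefficient interpolation, equivalent to
        //   q8  = q8  + lerpP * (q9  - q8)    (positive side)
        //   q10 = q10 + lerpP * (q11 - q10)   (negative side)
        // with the phase fraction in d2[0] and vqrdmulh performing the
        // Q15 fractional multiply.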

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM instructions before the NEON instructions seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
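
        // The 32-bit coefficient paths (intrinsic and assembly) widen each 16-bit
        // sample with a left shift of 15, then use vqrdmulh.s32 -- the saturating,
        // rounding, doubling multiply returning the high half of the 64-bit
        // product -- in place of the vmlal used by the 16-bit coefficient paths.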

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d10, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d11, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d7, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d12, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d13, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
        "veor           q0, q0, q0                    \n"// result, initialize to 0
        "veor           q4, q4, q4                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d10, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d11, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "vshll.s16      q12, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d7, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d12, #15                 \n"// extend samples to 31 bits
        "vshll.s16      q15, d13, #15                 \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q4, q4, q15                   \n"// accumulate result
        "vadd.s32       q4, q4, q13                   \n"// accumulate result

        "subs           %[count], %[count], #8        \n"// update loop counter
        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

#endif //USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/