186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung/* 286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * Copyright (C) 2013 The Android Open Source Project 386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * 486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * Licensed under the Apache License, Version 2.0 (the "License"); 586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * you may not use this file except in compliance with the License. 686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * You may obtain a copy of the License at 786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * 886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * http://www.apache.org/licenses/LICENSE-2.0 986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * 1086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * Unless required by applicable law or agreed to in writing, software 1186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * distributed under the License is distributed on an "AS IS" BASIS, 1286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * See the License for the specific language governing permissions and 1486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung * limitations under the License. 
1586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung */ 1686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 1786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H 1886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H 1986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 2086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungnamespace android { 2186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 2286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h 2386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 2486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#if USE_NEON 256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// use intrinsics if inline arm32 assembly is not possible 276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#if !USE_INLINE_ASSEMBLY 286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#define USE_INTRINSIC 296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// following intrinsics available only on ARM 64 bit ACLE 326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifndef __aarch64__ 336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#undef vld1q_f32_x2 346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#undef vld1q_s32_x2 356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#define TO_STRING2(x) #x 386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#define TO_STRING(x) TO_STRING2(x) 396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// uncomment to print GCC version, may be relevant for intrinsic optimizations 406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \ 
416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung "." TO_STRING(__GNUC_MINOR__) \ 426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung "." TO_STRING(__GNUC_PATCHLEVEL__)) */ 436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// 456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h 466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// 476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// Two variants are presented here: 486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32. 496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header. 5086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung// 5186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 5286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out. 536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung// These are only used for inline assembly. 
5486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#define ASSEMBLY_ACCUMULATE_MONO \ 5586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\ 5686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\ 5786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\ 5886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\ 5986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\ 6086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\ 6186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */ 6286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 6386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#define ASSEMBLY_ACCUMULATE_STEREO \ 6486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes*/\ 6586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output*/\ 6686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0*/\ 6786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4*/\ 6886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R*/\ 6986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume*/\ 7086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating)*/\ 7186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vst1.s32 {d3}, %[out] \n"/* (2+2d)store result*/ 7286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 736b667dde03a5707285a2ff76ada525075d4c60efAndy 
Hungtemplate <int CHANNELS, int STRIDE, bool FIXED> 746b667dde03a5707285a2ff76ada525075d4c60efAndy Hungstatic inline void ProcessNeonIntrinsic(int32_t* out, 756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int count, 766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* coefsP, 776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* coefsN, 786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* sP, 796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* sN, 806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int32_t* volumeLR, 816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung uint32_t lerpP, 826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* coefsP1, 836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* coefsN1) 846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{ 856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8 866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2); 876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= CHANNELS*((STRIDE>>1)-1); 896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16); 906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16); 916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x4_t interp; 936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (!FIXED) { 946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung interp = vdup_n_s16(lerpP); 956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0); 966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16); 
976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16); 986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t accum, accum2; 1006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // warning uninitialized if we use veorq_s32 1016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // (alternative to below) accum = veorq_s32(accum, accum); 1026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vdupq_n_s32(0); 1036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (CHANNELS == 2) { 1046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // (alternative to below) accum2 = veorq_s32(accum2, accum2); 1056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vdupq_n_s32(0); 1066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 1076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung do { 1086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t posCoef = vld1q_s16(coefsP); 1096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP += 8; 1106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t negCoef = vld1q_s16(coefsN); 1116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN += 8; 1126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (!FIXED) { // interpolate 1136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t posCoef1 = vld1q_s16(coefsP1); 1146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 += 8; 1156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t negCoef1 = vld1q_s16(coefsN1); 1166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 += 8; 1176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1 = vsubq_s16(posCoef1, posCoef); 1196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef = vsubq_s16(negCoef, negCoef1); 1206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1 = 
vqrdmulhq_lane_s16(posCoef1, interp, 0); 1226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0); 1236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef = vaddq_s16(posCoef, posCoef1); 1256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef = vaddq_s16(negCoef, negCoef1); 1266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 1276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung switch (CHANNELS) { 1286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung case 1: { 1296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t posSamp = vld1q_s16(sP); 1306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t negSamp = vld1q_s16(sN); 1316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 8; 1326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp = vrev64q_s16(posSamp); 1336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // dot product 1356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed 1366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed 1376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef)); 1386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef)); 1396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= 8; 1406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } break; 1416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung case 2: { 1426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8x2_t posSamp = vld2q_s16(sP); 1436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8x2_t negSamp = vld2q_s16(sN); 1446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 16; 
1456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[0] = vrev64q_s16(posSamp.val[0]); 1466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[1] = vrev64q_s16(posSamp.val[1]); 1476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // dot product 1496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r 1506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r 1516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r 1526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r 1536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef)); 1546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef)); 1556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef)); 1566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef)); 1576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= 16; 1586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 1596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } break; 1606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } while (count -= 8); 1616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // multiply by volume and save 1636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8); 1646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t vLR = 
vld1_s32(volumeLR); 1656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t outSamp = vld1_s32(out); 1666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // combine and funnel down accumulator 1676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); 1686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (CHANNELS == 1) { 1696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // duplicate accum to both L and R 1706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = vpadd_s32(outAccum, outAccum); 1716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } else if (CHANNELS == 2) { 1726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // accum2 contains R, fold in 1736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); 1746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = vpadd_s32(outAccum, outAccum2); 1756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 1766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = vqrdmulh_s32(outAccum, vLR); 1776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outSamp = vqadd_s32(outSamp, outAccum); 1786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vst1_s32(out, outSamp); 1796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung} 1806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1816b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate <int CHANNELS, int STRIDE, bool FIXED> 1826b667dde03a5707285a2ff76ada525075d4c60efAndy Hungstatic inline void ProcessNeonIntrinsic(int32_t* out, 1836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int count, 1846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int32_t* coefsP, 1856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int32_t* coefsN, 1866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* sP, 1876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int16_t* sN, 
1886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int32_t* volumeLR, 1896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung uint32_t lerpP, 1906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int32_t* coefsP1, 1916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const int32_t* coefsN1) 1926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{ 1936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8 1946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2); 1956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 1966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= CHANNELS*((STRIDE>>1)-1); 1976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16); 1986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16); 1996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t interp; 2016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (!FIXED) { 2026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung interp = vdup_n_s32(lerpP); 2036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16); 2046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16); 2056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 2066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t accum, accum2; 2076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // warning uninitialized if we use veorq_s32 2086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // (alternative to below) accum = veorq_s32(accum, accum); 2096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vdupq_n_s32(0); 2106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (CHANNELS == 2) { 
2116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // (alternative to below) accum2 = veorq_s32(accum2, accum2); 2126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vdupq_n_s32(0); 2136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 2146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung do { 2156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_s32_x2 2166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t posCoef = vld1q_s32_x2(coefsP); 2176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP += 8; 2186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t negCoef = vld1q_s32_x2(coefsN); 2196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN += 8; 2206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 2216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t posCoef; 2226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[0] = vld1q_s32(coefsP); 2236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP += 4; 2246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[1] = vld1q_s32(coefsP); 2256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP += 4; 2266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t negCoef; 2276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[0] = vld1q_s32(coefsN); 2286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN += 4; 2296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[1] = vld1q_s32(coefsN); 2306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN += 4; 2316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 2326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (!FIXED) { // interpolate 2336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_s32_x2 2346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1); 2356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 += 8; 2366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t negCoef1 = 
vld1q_s32_x2(coefsN1); 2376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 += 8; 2386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 2396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t posCoef1; 2406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[0] = vld1q_s32(coefsP1); 2416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 += 4; 2426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[1] = vld1q_s32(coefsP1); 2436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 += 4; 2446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4x2_t negCoef1; 2456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef1.val[0] = vld1q_s32(coefsN1); 2466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 += 4; 2476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef1.val[1] = vld1q_s32(coefsN1); 2486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 += 4; 2496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 2506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]); 2526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]); 2536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]); 2546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]); 2556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0); 2576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0); 2586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0); 2596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[1] = 
vqrdmulhq_lane_s32(negCoef.val[1], interp, 0); 2606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]); 2626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]); 2636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]); 2646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]); 2656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 2666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung switch (CHANNELS) { 2676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung case 1: { 2686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t posSamp = vld1q_s16(sP); 2696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8_t negSamp = vld1q_s16(sN); 2706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 8; 2716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp = vrev64q_s16(posSamp); 2726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15); 2746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15); 2756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15); 2766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15); 2776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // dot product 2796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed 2806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed 2816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp0 = 
vqrdmulhq_s32(negSamp0, negCoef.val[0]); 2826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]); 2836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vaddq_s32(accum, posSamp0); 2856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp0 = vaddq_s32(negSamp0, negSamp1); 2866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vaddq_s32(accum, posSamp1); 2876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vaddq_s32(accum, negSamp0); 2886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= 8; 2906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } break; 2916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung case 2: { 2926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8x2_t posSamp = vld2q_s16(sP); 2936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int16x8x2_t negSamp = vld2q_s16(sN); 2946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 16; 2956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[0] = vrev64q_s16(posSamp.val[0]); 2966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[1] = vrev64q_s16(posSamp.val[1]); 2976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 2986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // left 2996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15); 3006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15); 3016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15); 3026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15); 3036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // dot product 
3056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed 3066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed 3076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]); 3086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]); 3096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vaddq_s32(accum, posSamp0); 3116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp0 = vaddq_s32(negSamp0, negSamp1); 3126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vaddq_s32(accum, posSamp1); 3136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vaddq_s32(accum, negSamp0); 3146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // right 3166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15); 3176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15); 3186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15); 3196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15); 3206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // dot product 3226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed 3236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed 3246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]); 3256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]); 
3266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vaddq_s32(accum2, posSamp0); 3286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp0 = vaddq_s32(negSamp0, negSamp1); 3296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vaddq_s32(accum2, posSamp1); 3306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vaddq_s32(accum2, negSamp0); 3316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= 16; 3336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } break; 3346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 3356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } while (count -= 8); 3366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // multiply by volume and save 3386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8); 3396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t vLR = vld1_s32(volumeLR); 3406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t outSamp = vld1_s32(out); 3416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // combine and funnel down accumulator 3426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum)); 3436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (CHANNELS == 1) { 3446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // duplicate accum to both L and R 3456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = vpadd_s32(outAccum, outAccum); 3466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } else if (CHANNELS == 2) { 3476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // accum2 contains R, fold in 3486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); 3496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = 
vpadd_s32(outAccum, outAccum2); 3506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 3516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = vqrdmulh_s32(outAccum, vLR); 3526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outSamp = vqadd_s32(outSamp, outAccum); 3536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vst1_s32(out, outSamp); 3546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung} 3556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3566b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate <int CHANNELS, int STRIDE, bool FIXED> 3576b667dde03a5707285a2ff76ada525075d4c60efAndy Hungstatic inline void ProcessNeonIntrinsic(float* out, 3586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int count, 3596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsP, 3606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsN, 3616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* sP, 3626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* sN, 3636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* volumeLR, 3646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float lerpP, 3656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsP1, 3666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsN1) 3676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{ 3686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8 3696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2); 3706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= CHANNELS*((STRIDE>>1)-1); 3726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP = (const float*)__builtin_assume_aligned(coefsP, 16); 3736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN = (const float*)__builtin_assume_aligned(coefsN, 16); 
3746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 3756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x2_t interp; 3766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (!FIXED) { 3776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung interp = vdup_n_f32(lerpP); 3786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16); 3796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16); 3806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 3816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4_t accum, accum2; 3826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // warning uninitialized if we use veorq_s32 3836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // (alternative to below) accum = veorq_s32(accum, accum); 3846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vdupq_n_f32(0); 3856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (CHANNELS == 2) { 3866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // (alternative to below) accum2 = veorq_s32(accum2, accum2); 3876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vdupq_n_f32(0); 3886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 3896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung do { 3906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_f32_x2 3916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posCoef = vld1q_f32_x2(coefsP); 3926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP += 8; 3936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negCoef = vld1q_f32_x2(coefsN); 3946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN += 8; 3956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 3966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posCoef; 3976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[0] = vld1q_f32(coefsP); 3986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP += 4; 
3996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[1] = vld1q_f32(coefsP); 4006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP += 4; 4016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negCoef; 4026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[0] = vld1q_f32(coefsN); 4036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN += 4; 4046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[1] = vld1q_f32(coefsN); 4056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN += 4; 4066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 4076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (!FIXED) { // interpolate 4086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_f32_x2 4096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1); 4106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 += 8; 4116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1); 4126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 += 8; 4136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 4146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posCoef1; 4156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[0] = vld1q_f32(coefsP1); 4166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 += 4; 4176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[1] = vld1q_f32(coefsP1); 4186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsP1 += 4; 4196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negCoef1; 4206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef1.val[0] = vld1q_f32(coefsN1); 4216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 += 4; 4226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef1.val[1] = vld1q_f32(coefsN1); 4236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung coefsN1 += 4; 4246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 
4256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]); 4266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]); 4276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]); 4286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]); 4296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 4306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0); 4316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0); 4326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev 4336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev 4346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 4356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung switch (CHANNELS) { 4366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung case 1: { 4376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_f32_x2 4386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posSamp = vld1q_f32_x2(sP); 4396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negSamp = vld1q_f32_x2(sN); 4406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 8; 4416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= 8; 4426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 4436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posSamp; 4446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[0] = vld1q_f32(sP); 4456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP += 4; 4466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[1] = vld1q_f32(sP); 
4476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= 12; 4486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negSamp; 4496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp.val[0] = vld1q_f32(sN); 4506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 4; 4516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung negSamp.val[1] = vld1q_f32(sN); 4526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 4; 4536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 4546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // effectively we want a vrev128q_f32() 4556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[0] = vrev64q_f32(posSamp.val[0]); 4566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[1] = vrev64q_f32(posSamp.val[1]); 4576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[0] = vcombine_f32( 4586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0])); 4596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp.val[1] = vcombine_f32( 4606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1])); 4616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 4626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]); 4636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]); 4646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]); 4656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]); 4666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } break; 4676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung case 2: { 4686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posSamp0 = vld2q_f32(sP); 4696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP += 8; 
4706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negSamp0 = vld2q_f32(sN); 4716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 8; 4726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]); 4736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]); 4746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0.val[0] = vcombine_f32( 4756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0])); 4766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp0.val[1] = vcombine_f32( 4776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1])); 4786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 4796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t posSamp1 = vld2q_f32(sP); 4806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sP -= 24; 4816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x4x2_t negSamp1 = vld2q_f32(sN); 4826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung sN += 8; 4836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]); 4846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]); 4856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1.val[0] = vcombine_f32( 4866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0])); 4876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung posSamp1.val[1] = vcombine_f32( 4886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1])); 4896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 4906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // Note: speed is affected by accumulation order. 
4916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // Also, speed appears slower using vmul/vadd instead of vmla for 4926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // stereo case, comparable for mono. 4936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 4946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]); 4956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]); 4966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]); 4976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]); 4986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 4996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed 5006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed 5016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed 5026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed 5036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } break; 5046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 5056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } while (count -= 8); 5066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 5076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // multiply by volume and save 5086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8); 5096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x2_t vLR = vld1_f32(volumeLR); 5106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x2_t outSamp = vld1_f32(out); 5116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // combine and funnel down accumulator 
5126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)); 5136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung if (CHANNELS == 1) { 5146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // duplicate accum to both L and R 5156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = vpadd_f32(outAccum, outAccum); 5166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } else if (CHANNELS == 2) { 5176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung // accum2 contains R, fold in 5186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2)); 5196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outAccum = vpadd_f32(outAccum, outAccum2); 5206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung } 5216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung outSamp = vmla_f32(outSamp, outAccum, vLR); 5226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung vst1_f32(out, outSamp); 5236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung} 5246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 52586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 52686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out, 52786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 52886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 52986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 53086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 53186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 53286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 53386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 5346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC 5356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 
5366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); 5376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 53886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 53986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 54086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 54186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 54286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 54386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 54486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 54586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 54686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples 54786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples 54886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs 54986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs 55086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 55186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 55286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 55386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // reordering the vmal to do d6, d7 before d4, d5 is slower(?) 
55486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef 55586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef 55686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples 55786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples 55886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 55986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM instructions before neon above seems to be slower 56086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 56186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples 56286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 56386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 56486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 56586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 56686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 56786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 56886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 56986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 57086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 57186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 57286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 57386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 57486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 57586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 57686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 57786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q10" 
57886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 5796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 58086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 58186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 58286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 58386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out, 58486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 58586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 58686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 58786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 58886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 58986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 59086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 5916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC 5926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 5936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); 5946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 59586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 59686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 59786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 59886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 59986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (1) acc_L = 0 60086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 60186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 60286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 60386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
604d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames 605d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames 60686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs 60786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs 60886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 609d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left 610d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right 61186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 61286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left 61386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left 61486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right 61586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right 61686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left 61786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left 61886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right 61986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right 62086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 62186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM before neon seems to be slower 62286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 
62386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples 62486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 62586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 62686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 62786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 62886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 62986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 63086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 63186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 63286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 63386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 63486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 63586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 63686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 63786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 63886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 63986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 64086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q10" 64186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 6426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 64386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 64486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 64586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 64686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out, 64786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 64886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 64986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 65086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* 
coefsP1, 65186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN1, 65286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 65386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 65486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 65586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 65686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 6576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC 6586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, 6596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung lerpP, coefsP1, coefsN1); 6606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 6616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 66286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 66386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 66486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 66586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 66686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15 66786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 66886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 66986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 67086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 67186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples 67286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples 67386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! 
\n"// (1) load 8 16-bits coefs 67486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation 67586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs 67686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation 67786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 67886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs 67986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets 68086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 68186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs 68286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs 68386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 68486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 68586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 68686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set 68786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set 68886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 68986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // reordering the vmal to do d6, d7 before d4, d5 is slower(?) 
69086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef 69186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef 69286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples 69386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples 69486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 69586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM instructions before neon above seems to be slower 69686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 69786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples 69886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 69986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 70086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 70186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 70286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 70386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 70486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 70586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 70686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 70786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 70886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 70986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 71086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 71186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 71286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 71386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
[vLR] "r" (volumeLR) 71486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 71586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 71686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11" 71786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 7186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 71986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 72086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 72186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 72286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out, 72386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 72486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 72586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 72686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP1, 72786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN1, 72886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 72986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 73086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 73186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 73286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 7336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC 7346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, 7356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung lerpP, coefsP1, coefsN1); 7366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 73786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 73886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 73986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 
74086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 74186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 74286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (1) acc_L = 0 74386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 74486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 74586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 74686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 747d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames 748d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames 74986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs 75086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation 75186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs 75286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q11}, [%[coefsN0]:128]! 
\n"// (1) load 8 16-bits coefs for interpolation 75386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 75486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs 75586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets 75686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 75786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs 75886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs 75986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 760d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left 761d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right 76286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 76386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set 76486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set 76586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 76686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left 76786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left 76886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right 76986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right 77086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left 77186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left 
77286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right 77386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right 77486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 77586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM before neon seems to be slower 77686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 77786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples 77886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 77986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 78086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 78186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 78286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 78386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 78486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 78586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 78686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 78786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 78886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 78986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 79086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 79186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 79286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 79386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 79486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 79586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 79686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 
79786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11" 79886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 7996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 80086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 80186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 80286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 80386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out, 80486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 80586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 80686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 80786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 80886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 80986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 81086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 8116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC 8126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 8136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); 8146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 81586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 81686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 81786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 81886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 81986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 82086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 82186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 82286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 82386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
"vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples 82486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples 82586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs 82686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs 82786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 828d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side 82986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 83086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 83186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 83286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 83386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 83486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits 83586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 836d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples 837d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples 838d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples 839d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples 84086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 84186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 84286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 84386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q15 \n"// accumulate result 84486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 
q0, q0, q13 \n"// accumulate result 84586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 84686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 84786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 84886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 84986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 85086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 85186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 85286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 85386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 85486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 85586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 85686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 85786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 85886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 85986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 86086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 86186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 86286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 86386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 86486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 8656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 86686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 86786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 86886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 86986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out, 87086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 87186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 
87286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 87386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 87486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 87586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 87686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 8776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC 8786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR, 8796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/); 8806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 88186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 88286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 88386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 88486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 88586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 88686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// result, initialize to 0 88786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 88886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 88986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 890d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames 891d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames 892d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs 893d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld1.32 {q10, q11}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs 89486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 895d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left 896d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right 89786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 89886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 89986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 90086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 90186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits 90286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits 90386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 904d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef 905d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef 906d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef 907d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef 90886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 90986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 91086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 911d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q0, q0, q15 \n"// accumulate result 912d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q0, q0, q13 \n"// accumulate result 91386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 91486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits 
91586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits 91686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 91786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits 91886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits 91986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 920d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef 921d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef 922d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef 923d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef 92486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 92586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q12 \n"// accumulate result 92686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 927d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q4, q4, q15 \n"// accumulate result 928d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q4, q4, q13 \n"// accumulate result 92986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 93086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 93186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples 93286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 93386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 93486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 93586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 93686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 93786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 
93886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 93986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 94086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 94186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 94286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 94386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 94486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 94586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 94686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 94786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 94886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 94986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 9506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 95186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 95286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 95386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 95486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out, 95586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 95686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 95786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 95886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP1, 95986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN1, 96086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 96186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 96286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 96386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 96486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 9656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef 
USE_INTRINSIC 9666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, 9676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung lerpP, coefsP1, coefsN1); 9686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 96986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 97086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 97186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 97286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 97386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 97486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 97586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 97686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 97786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 97886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples 97986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples 98086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs 98186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs 98286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs 98386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q14, q15}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs 98486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 98586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q12, q12, q8 \n"// interpolate (step1) 98686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q13, q13, q9 \n"// interpolate (step1) 98786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q14, q14, q10 \n"// interpolate (step1) 98886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q15, q15, q11 \n"// interpolate (step1) 98986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 99086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) 99186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) 99286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) 99386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) 99486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 99586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q8, q8, q12 \n"// interpolate (step3) 99686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q9, q9, q13 \n"// interpolate (step3) 99786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q10, q10, q14 \n"// interpolate (step3) 99886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q11, q11, q15 \n"// interpolate (step3) 99986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 1000d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side 100186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 100286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 100386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 100486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 100586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
"vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 100686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits 100786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 100886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef 100986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 101086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 101186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 101286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 101386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 101486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 101586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q15 \n"// accumulate result 101686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q13 \n"// accumulate result 101786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 101886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 101986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 102086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 102186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 102286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 102386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 102486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 102586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 102686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 102786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" 
(coefsP), 102886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 102986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 103086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 103186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 103286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 103386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 103486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 103586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 103686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 103786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 103886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 103986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 10406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 104186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 104286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 104386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 104486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out, 104586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 104686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 104786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 104886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP1, 104986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN1, 105086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 105186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 105286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 105386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 105486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 
10556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC 10566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, 10576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung lerpP, coefsP1, coefsN1); 10586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else 105986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 106086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 106186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 106286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 106386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 106486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 106586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// result, initialize to 0 106686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 106786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 106886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 1069d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames 1070d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames 107186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs 107286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs 107386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs 107486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q14, q15}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs 107586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 107686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q12, q12, q8 \n"// interpolate (step1) 107786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q13, q13, q9 \n"// interpolate (step1) 107886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q14, q14, q10 \n"// interpolate (step1) 107986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q15, q15, q11 \n"// interpolate (step1) 108086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 108186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) 108286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) 108386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) 108486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) 108586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 108686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q8, q8, q12 \n"// interpolate (step3) 108786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q9, q9, q13 \n"// interpolate (step3) 108886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q10, q10, q14 \n"// interpolate (step3) 108986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q11, q11, q15 \n"// interpolate (step3) 109086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 1091d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left 1092d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right 109386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 109486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 109586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples 
to 31 bits 109686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 109786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits 109886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits 109986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 110086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef 110186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 110286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 110386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 110486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 110586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 110686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 1107d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q0, q0, q15 \n"// accumulate result 1108d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q0, q0, q13 \n"// accumulate result 110986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 111086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits 111186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits 111286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 111386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits 111486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits 111586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 111686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply 
samples by interpolated coef 111786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 111886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 111986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 112086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 112186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q12 \n"// accumulate result 112286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 1123d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q4, q4, q15 \n"// accumulate result 1124d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung "vadd.s32 q4, q4, q13 \n"// accumulate result 112586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 112686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 112786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples 112886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 112986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 113086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 113186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 113286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 113386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 113486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 113586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 113686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 113786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 113886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 113986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" 
(sP), 114086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 114186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 114286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 114386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 114486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 114586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 114686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 114786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 114886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 11496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif 11506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}

/*
 * ProcessL<1, 16>, float sample / float coefficient specialization.
 *
 * Takes one positive-side and one negative-side coefficient table and forwards
 * everything to ProcessNeonIntrinsic<1, 16, true>.  The three trailing
 * arguments of that helper are filled with placeholders: lerpP is 0 and both
 * secondary coefficient tables are NULL.
 *
 * NOTE(review): <1, 16> is presumably <channel count, coefficient stride>, and
 * the `true` flag presumably selects the path that does not interpolate
 * between coefficient tables - confirm against ProcessNeonIntrinsic.
 */
template<>
inline void ProcessL<1, 16>(float* const out, int count,
        const float* coefsP, const float* coefsN,
        const float* sP, const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, true>(
            out,
            count,
            coefsP,
            coefsN,
            sP,
            sN,
            volumeLR,
            0 /*lerpP*/,
            NULL /*coefsP1*/,
            NULL /*coefsN1*/);
}

/*
 * ProcessL<2, 16>, float sample / float coefficient specialization.
 *
 * Same forwarding as the <1, 16> case but instantiates
 * ProcessNeonIntrinsic<2, 16, true>; lerpP is again passed as 0 and the
 * secondary coefficient tables as NULL.
 */
template<>
inline void ProcessL<2, 16>(float* const out, int count,
        const float* coefsP, const float* coefsN,
        const float* sP, const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, true>(
            out,
            count,
            coefsP,
            coefsN,
            sP,
            sN,
            volumeLR,
            0 /*lerpP*/,
            NULL /*coefsP1*/,
            NULL /*coefsN1*/);
}

/*
 * Process<1, 16>, float sample / float coefficient specialization.
 *
 * Unlike ProcessL, the caller supplies a second pair of coefficient tables
 * (coefsP1 / coefsN1) and a lerpP value; all of them are handed unchanged to
 * ProcessNeonIntrinsic<1, 16, false>.  Note the argument order differs from
 * this function's own parameter order: the helper takes lerpP and the
 * secondary tables last.
 */
template<>
inline void Process<1, 16>(float* const out, int count,
        const float* coefsP, const float* coefsN,
        const float* coefsP1, const float* coefsN1,
        const float* sP, const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, false>(
            out,
            count,
            coefsP,
            coefsN,
            sP,
            sN,
            volumeLR,
            lerpP,
            coefsP1,
            coefsN1);
}
11936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung 11946b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate<> 11956b667dde03a5707285a2ff76ada525075d4c60efAndy Hunginline void Process<2, 16>(float* const out, 11966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung int count, 11976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsP, 11986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsN, 11996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsP1, 12006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* coefsN1, 12016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* sP, 12026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* sN, 12036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung float lerpP, 12046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung const float* const volumeLR) 12056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{ 12066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR, 12076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung lerpP, coefsP1, coefsN1); 120886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 120986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 121086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif //USE_NEON 121186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 121263238efb0d674758902918e3cdaac322126484b7Glenn Kasten} // namespace android 121386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 121486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/ 1215