/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
1786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
1886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
1986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
2086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungnamespace android {
2186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
2286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
2386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
2486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#if USE_NEON
256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
// Use the intrinsic implementation whenever inline arm32 assembly is not possible.
// An undefined USE_INLINE_ASSEMBLY evaluates to 0 here, which selects intrinsics.
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif

// The following intrinsics are available only on ARM 64 bit ACLE.
// Removing any macro definition of them elsewhere makes the "#ifdef vld1q_s32_x2"
// checks in this file take their two-load fallback path on other targets.
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif

// Two-level stringification: TO_STRING(x) macro-expands x before turning it into a
// string literal, TO_STRING2(x) stringifies the argument exactly as written.
#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
//
// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
// Two variants are presented here:
//  - ARM NEON inline assembly, which appears up to 10-15% faster than intrinsics (gcc 4.9)
//    for arm32.
//  - ARM NEON intrinsics, which can also be used by arm64 and x86/64 with a NEON header.
//
5186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Macro to save a mono accumulator sample held in q0 (d0/d1) as stereo out.
// Only used for inline assembly.
//
// The expansion is a sequence of assembly source lines that
//   - loads the two 32 bit volumes from [%[vLR]] (with a 64 bit alignment hint) into d2,
//   - loads the current two 32 bit output values from %[out] into d3,
//   - pairwise-adds the four partial sums of q0 down to one sum present in both lanes of d0,
//   - applies the volume with vqrdmulh.s32 and adds the result to d3 with saturation,
//   - stores d3 back to %[out].
// The enclosing asm statement must provide the [vLR] and [out] operands; d0-d3 are modified.
// The parenthesized notes in the per-line comments are kept from the original author
// (presumably issue cycles and result delay - not verified here).
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
6286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Macro to save a stereo accumulator sample held in q0 (d0/d1) and q4 (d8/d9) as stereo out.
// Only used for inline assembly.
//
// The expansion is a sequence of assembly source lines that
//   - loads the two 32 bit volumes from [%[vLR]] (with a 64 bit alignment hint) into d2,
//   - loads the current two 32 bit output values from %[out] into d3,
//   - pairwise-adds the four partial sums of q0 and of q4, then combines both totals into
//     the two lanes of d0,
//   - applies the volume with vqrdmulh.s32 and adds the result to d3 with saturation,
//   - stores d3 back to %[out].
// The enclosing asm statement must provide the [vLR] and [out] operands; d0, d2, d3 and d8
// are modified.
// The parenthesized notes in the per-line comments are kept from the original author
// (presumably issue cycles and result delay - not verified here).
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
7286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
736b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate <int CHANNELS, int STRIDE, bool FIXED>
746b667dde03a5707285a2ff76ada525075d4c60efAndy Hungstatic inline void ProcessNeonIntrinsic(int32_t* out,
756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int count,
766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* coefsP,
776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* coefsN,
786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* sP,
796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* sN,
806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int32_t* volumeLR,
816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        uint32_t lerpP,
826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* coefsP1,
836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* coefsN1)
846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{
856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);
916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int16x4_t interp;
936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (!FIXED) {
946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        interp = vdup_n_s16(lerpP);
956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x4_t accum, accum2;
1006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // warning uninitialized if we use veorq_s32
1016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // (alternative to below) accum = veorq_s32(accum, accum);
1026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    accum = vdupq_n_s32(0);
1036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (CHANNELS == 2) {
1046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
1056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        accum2 = vdupq_n_s32(0);
1066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
1076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    do {
1086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int16x8_t posCoef = vld1q_s16(coefsP);
1096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP += 8;
1106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int16x8_t negCoef = vld1q_s16(coefsN);
1116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN += 8;
1126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        if (!FIXED) { // interpolate
1136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8_t posCoef1 = vld1q_s16(coefsP1);
1146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsP1 += 8;
1156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8_t negCoef1 = vld1q_s16(coefsN1);
1166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsN1 += 8;
1176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1 = vsubq_s16(posCoef1, posCoef);
1196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef = vsubq_s16(negCoef, negCoef1);
1206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
1226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);
1236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef = vaddq_s16(posCoef, posCoef1);
1256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef = vaddq_s16(negCoef, negCoef1);
1266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        }
1276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        switch (CHANNELS) {
1286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        case 1: {
1296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8_t posSamp = vld1q_s16(sP);
1306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8_t negSamp = vld1q_s16(sN);
1316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 8;
1326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp = vrev64q_s16(posSamp);
1336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // dot product
1356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
1366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
1376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
1386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
1396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP -= 8;
1406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        } break;
1416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        case 2: {
1426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8x2_t posSamp = vld2q_s16(sP);
1436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8x2_t negSamp = vld2q_s16(sN);
1446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 16;
1456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
1466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
1476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // dot product
1496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
1506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
1516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
1526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
1536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
1546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
1556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
1566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
1576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP -= 16;
1586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        }
1596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        } break;
1606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    } while (count -= 8);
1616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // multiply by volume and save
1636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
1646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x2_t vLR = vld1_s32(volumeLR);
1656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x2_t outSamp = vld1_s32(out);
1666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // combine and funnel down accumulator
1676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
1686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (CHANNELS == 1) {
1696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // duplicate accum to both L and R
1706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        outAccum = vpadd_s32(outAccum, outAccum);
1716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    } else if (CHANNELS == 2) {
1726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // accum2 contains R, fold in
1736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
1746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        outAccum = vpadd_s32(outAccum, outAccum2);
1756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
1766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    outAccum = vqrdmulh_s32(outAccum, vLR);
1776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    outSamp = vqadd_s32(outSamp, outAccum);
1786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    vst1_s32(out, outSamp);
1796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}
1806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1816b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate <int CHANNELS, int STRIDE, bool FIXED>
1826b667dde03a5707285a2ff76ada525075d4c60efAndy Hungstatic inline void ProcessNeonIntrinsic(int32_t* out,
1836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int count,
1846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int32_t* coefsP,
1856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int32_t* coefsN,
1866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* sP,
1876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int16_t* sN,
1886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int32_t* volumeLR,
1896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        uint32_t lerpP,
1906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int32_t* coefsP1,
1916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const int32_t* coefsN1)
1926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{
1936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
1946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
1956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
1966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
1976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
1986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);
1996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x2_t interp;
2016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (!FIXED) {
2026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        interp = vdup_n_s32(lerpP);
2036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
2046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
2056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
2066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x4_t accum, accum2;
2076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // warning uninitialized if we use veorq_s32
2086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // (alternative to below) accum = veorq_s32(accum, accum);
2096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    accum = vdupq_n_s32(0);
2106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (CHANNELS == 2) {
2116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
2126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        accum2 = vdupq_n_s32(0);
2136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
2146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    do {
2156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_s32_x2
2166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
2176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP += 8;
2186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
2196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN += 8;
2206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
2216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int32x4x2_t posCoef;
2226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        posCoef.val[0] = vld1q_s32(coefsP);
2236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP += 4;
2246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        posCoef.val[1] = vld1q_s32(coefsP);
2256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP += 4;
2266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int32x4x2_t negCoef;
2276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        negCoef.val[0] = vld1q_s32(coefsN);
2286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN += 4;
2296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        negCoef.val[1] = vld1q_s32(coefsN);
2306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN += 4;
2316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
2326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        if (!FIXED) { // interpolate
2336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_s32_x2
2346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
2356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsP1 += 8;
2366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
2376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsN1 += 8;
2386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
2396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4x2_t posCoef1;
2406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[0] = vld1q_s32(coefsP1);
2416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsP1 += 4;
2426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[1] = vld1q_s32(coefsP1);
2436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsP1 += 4;
2446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4x2_t negCoef1;
2456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef1.val[0] = vld1q_s32(coefsN1);
2466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsN1 += 4;
2476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef1.val[1] = vld1q_s32(coefsN1);
2486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsN1 += 4;
2496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
2506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
2526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
2536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
2546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);
2556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
2576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
2586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
2596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);
2606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
2626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
2636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
2646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
2656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        }
2666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        switch (CHANNELS) {
2676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        case 1: {
2686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8_t posSamp = vld1q_s16(sP);
2696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8_t negSamp = vld1q_s16(sN);
2706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 8;
2716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp = vrev64q_s16(posSamp);
2726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
2746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
2756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
2766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);
2776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // dot product
2796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
2806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
2816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
2826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
2836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vaddq_s32(accum, posSamp0);
2856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp0 = vaddq_s32(negSamp0, negSamp1);
2866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vaddq_s32(accum, posSamp1);
2876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vaddq_s32(accum, negSamp0);
2886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP -= 8;
2906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        } break;
2916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        case 2: {
2926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8x2_t posSamp = vld2q_s16(sP);
2936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int16x8x2_t negSamp = vld2q_s16(sN);
2946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 16;
2956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
2966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
2976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
2986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // left
2996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
3006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
3016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
3026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);
3036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
3046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // dot product
3056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
3066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
3076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
3086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
3096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
3106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vaddq_s32(accum, posSamp0);
3116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp0 = vaddq_s32(negSamp0, negSamp1);
3126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vaddq_s32(accum, posSamp1);
3136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vaddq_s32(accum, negSamp0);
3146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
3156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // right
3166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
3176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
3186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
3196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);
3206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
3216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // dot product
3226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
3236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
3246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
3256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);
3266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
3276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vaddq_s32(accum2, posSamp0);
3286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp0 = vaddq_s32(negSamp0, negSamp1);
3296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vaddq_s32(accum2, posSamp1);
3306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vaddq_s32(accum2, negSamp0);
3316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
3326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP -= 16;
3336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        } break;
3346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        }
3356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    } while (count -= 8);
3366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
3376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // multiply by volume and save
3386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
3396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x2_t vLR = vld1_s32(volumeLR);
3406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x2_t outSamp = vld1_s32(out);
3416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // combine and funnel down accumulator
3426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
3436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (CHANNELS == 1) {
3446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // duplicate accum to both L and R
3456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        outAccum = vpadd_s32(outAccum, outAccum);
3466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    } else if (CHANNELS == 2) {
3476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // accum2 contains R, fold in
3486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
3496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        outAccum = vpadd_s32(outAccum, outAccum2);
3506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
3516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    outAccum = vqrdmulh_s32(outAccum, vLR);
3526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    outSamp = vqadd_s32(outSamp, outAccum);
3536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    vst1_s32(out, outSamp);
3546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}
3556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
// Float NEON-intrinsic FIR kernel.
//
// Forms the dot product of the coefficients with the samples on either side
// (sP is walked downwards, sN upwards), multiplies the reduced sums by
// volumeLR and adds them into the two floats at out.
//
// Template parameters:
//   CHANNELS  1 or 2 (compile-time asserted). For 1 the single sum is written
//             to both output lanes; for 2, accum and accum2 hold the two
//             de-interleaved channels.
//   STRIDE    only used to rewind sP by CHANNELS*((STRIDE>>1)-1) floats.
//   FIXED     true: coefsP/coefsN are used directly and lerpP, coefsP1 and
//             coefsN1 are never read. false: coefficients are interpolated.
//
// Parameters:
//   out       two floats that are loaded, incremented and stored back.
//   count     coefficients consumed per side; asserted > 0 and a multiple of 8.
//   coefsP    coefficients paired with the samples read backwards from sP.
//   coefsN    coefficients paired with the samples read forwards from sN.
//   sP, sN    sample pointers; per loop iteration sP moves down and sN moves
//             up by 8 frames.
//   volumeLR  two floats multiplied lane-wise into the result.
//   lerpP     interpolation weight (read only when FIXED is false).
//   coefsP1, coefsN1
//             second coefficient tables (read only when FIXED is false).
//
// All coefficient pointers that are read are declared 16-byte aligned, and
// volumeLR 8-byte aligned, via __builtin_assume_aligned.
3566b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate <int CHANNELS, int STRIDE, bool FIXED>
3576b667dde03a5707285a2ff76ada525075d4c60efAndy Hungstatic inline void ProcessNeonIntrinsic(float* out,
3586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int count,
3596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP,
3606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN,
3616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sP,
3626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sN,
3636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* volumeLR,
3646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float lerpP,
3656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP1,
3666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN1)
3676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{
3686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
3696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
3706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
    // Rewind sP so the forward vector loads below can be used for the
    // backward-running side. With STRIDE == 16 this is 7 frames, so the first
    // 8-frame load ends on the frame sP pointed to on entry.
3716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
3726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
3736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);
3746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
    // interp is only assigned, and only read, when FIXED is false.
3756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    float32x2_t interp;
3766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (!FIXED) {
3776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        interp = vdup_n_f32(lerpP);
3786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
3796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
3806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
    // accum2 is only initialized, and only read, when CHANNELS == 2.
    // NOTE(review): the veorq_s32 remarks below describe an integer zeroing
    // idiom although these accumulators are float - presumably carried over
    // from the integer variant of this function.
3816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    float32x4_t accum, accum2;
3826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // warning uninitialized if we use veorq_s32
3836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // (alternative to below) accum = veorq_s32(accum, accum);
3846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    accum = vdupq_n_f32(0);
3856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (CHANNELS == 2) {
3866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
3876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        accum2 = vdupq_n_f32(0);
3886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
    // Each iteration consumes 8 coefficients and 8 frames from each side.
3896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    do {
        // The _x2 loads are used only when vld1q_f32_x2 is visible as a
        // preprocessor macro; otherwise two plain 4-lane loads are issued.
        // Both paths advance coefsP and coefsN by 8 floats.
3906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_f32_x2
3916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
3926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP += 8;
3936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
3946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN += 8;
3956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
3966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float32x4x2_t posCoef;
3976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        posCoef.val[0] = vld1q_f32(coefsP);
3986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP += 4;
3996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        posCoef.val[1] = vld1q_f32(coefsP);
4006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsP += 4;
4016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float32x4x2_t negCoef;
4026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        negCoef.val[0] = vld1q_f32(coefsN);
4036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN += 4;
4046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        negCoef.val[1] = vld1q_f32(coefsN);
4056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        coefsN += 4;
4066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
4076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        if (!FIXED) { // interpolate
4086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_f32_x2
4096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
4106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsP1 += 8;
4116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
4126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsN1 += 8;
4136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
4146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t posCoef1;
4156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[0] = vld1q_f32(coefsP1);
4166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsP1 += 4;
4176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[1] = vld1q_f32(coefsP1);
4186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsP1 += 4;
4196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t negCoef1;
4206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef1.val[0] = vld1q_f32(coefsN1);
4216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsN1 += 4;
4226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef1.val[1] = vld1q_f32(coefsN1);
4236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            coefsN1 += 4;
4246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
            // Differences first, then a fused multiply-add with lerpP:
            //   posCoef = coefsP  + lerpP * (coefsP1 - coefsP)
            //   negCoef = coefsN1 + lerpP * (coefsN  - coefsN1)
            // i.e. the two sides interpolate in opposite directions.
4256b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
4266b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
4276b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
4286b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);
4296b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
4306b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
4316b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
4326b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
4336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
4346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        }
4356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        switch (CHANNELS) {
4366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        case 1: {
            // Both load paths have the same net effect: sP -= 8, sN += 8.
4376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef vld1q_f32_x2
4386b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t posSamp = vld1q_f32_x2(sP);
4396b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t negSamp = vld1q_f32_x2(sN);
4406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 8;
4416b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP -= 8;
4426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
4436b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t posSamp;
4446b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[0] = vld1q_f32(sP);
4456b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP += 4;
4466b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[1] = vld1q_f32(sP);
4476b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP -= 12;
4486b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t negSamp;
4496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp.val[0] = vld1q_f32(sN);
4506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 4;
4516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            negSamp.val[1] = vld1q_f32(sN);
4526b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 4;
4536b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
            // vrev64q swaps the lanes inside each 64-bit half; exchanging the
            // two halves afterwards completes the 4-lane reversal.
4546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // effectively we want a vrev128q_f32()
4556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
4566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
4576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[0] = vcombine_f32(
4586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
4596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp.val[1] = vcombine_f32(
4606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));
4616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
            // posSamp.val[0] came from the lower addresses, so it pairs with
            // the second coefficient vector (and val[1] with the first): the
            // 8-sample group is reversed as a whole.
4626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
4636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
4646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
4656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
4666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        } break;
4676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        case 2: {
            // vld2q de-interleaves 8 floats: val[0] receives the even-indexed
            // elements and val[1] the odd-indexed ones, i.e. one channel each.
            // Net pointer movement per iteration: sP -= 16 floats (+8, -24),
            // sN += 16 floats.
4686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t posSamp0 = vld2q_f32(sP);
4696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP += 8;
4706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t negSamp0 = vld2q_f32(sN);
4716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 8;
            // Same 4-lane reversal as the mono case, applied per channel.
4726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
4736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
4746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0.val[0] = vcombine_f32(
4756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
4766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp0.val[1] = vcombine_f32(
4776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));
4786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
4796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t posSamp1 = vld2q_f32(sP);
4806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sP -= 24;
4816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            float32x4x2_t negSamp1 = vld2q_f32(sN);
4826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            sN += 8;
4836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
4846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
4856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1.val[0] = vcombine_f32(
4866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
4876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            posSamp1.val[1] = vcombine_f32(
4886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));
4896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
4906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // Note: speed is affected by accumulation order.
4916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // Also, speed appears slower using vmul/vadd instead of vmla for
4926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            // stereo case, comparable for mono.
4936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
            // accum takes the val[0] channel, accum2 the val[1] channel.
4946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
4956b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
4966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
4976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);
4986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
4996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
5006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
5016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
5026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
5036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        } break;
5046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        }
    // Terminates when count reaches exactly 0, hence the multiple-of-8 assert.
5056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    } while (count -= 8);
5066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
5076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // multiply by volume and save
5086b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
5096b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    float32x2_t vLR = vld1_f32(volumeLR);
5106b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    float32x2_t outSamp = vld1_f32(out);
    // Two pairwise adds reduce each 4-lane accumulator to a single lane:
    // lane 0 is the sum of accum, lane 1 the sum of accum2 (stereo) or the
    // sum of accum again (mono).
5116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    // combine and funnel down accumulator
5126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
5136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    if (CHANNELS == 1) {
5146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // duplicate accum to both L and R
5156b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        outAccum = vpadd_f32(outAccum, outAccum);
5166b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    } else if (CHANNELS == 2) {
5176b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        // accum2 contains R, fold in
5186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
5196b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        outAccum = vpadd_f32(outAccum, outAccum2);
5206b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    }
    // out[0..1] += outAccum * vLR (accumulates into the existing output).
5216b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    outSamp = vmla_f32(outSamp, outAccum, vLR);
5226b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    vst1_f32(out, outSamp);
5236b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}
5246b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
// ProcessL specialization for <1, 16> on int16_t samples and coefficients
// (mono, stride 16 - see the CHANNELS/STRIDE constants in the body).
//
// This entry point takes no interpolation arguments. When USE_INTRINSIC is
// defined it forwards to ProcessNeonIntrinsic<1, 16, true> with lerpP = 0 and
// null second coefficient tables; otherwise it runs the hand-written NEON
// assembly loop below.
//
//   out       output location handed to the accumulate step.
//   count     coefficients per side; the loop subtracts 8 per pass and exits
//             when the result is zero, so it must be a positive multiple of 8.
//   coefsP/N  coefficient tables, loaded with a :128 alignment qualifier.
//   sP, sN    sample pointers; sP is walked downwards, sN upwards.
//   volumeLR  passed to the accumulate step as [vLR].
52586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
52686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out,
52786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
52886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
52986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
53086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
53186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
53286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
53386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
5346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
5356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
5366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
5376b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
53886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
53986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
    // Rewind sP by 7 samples so the forward 8-sample load in the loop ends on
    // the sample sP pointed to on entry.
54086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
    // Register use in the loop: q0 = 32-bit accumulator, q2 = samples from sP,
    // q3 = samples from sN, q8 = coefsP, q10 = coefsN.
54186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
54286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
54386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
54486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
54586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // sP is loaded without writeback (it is decremented manually below);
        // sN and both coefficient pointers post-increment.
54686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
54786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
54886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
54986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
55086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vrev64 only reverses within each 64-bit half; the full 8-sample
        // reversal is completed by pairing d4 with d17 and d5 with d16 below.
55186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
55286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
55386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
55486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
55586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
55686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
55786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
55886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // "subs" sets the flags tested by "bne"; the plain "sub" on sP leaves
        // them untouched. #16 is a byte count: 8 int16 samples.
55986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM instructions before neon above seems to be slower
56086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
56186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
56286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
56386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
56486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
56586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Macro defined elsewhere in this file; presumably reduces q0, applies
        // [vLR] and accumulates into [out] - confirm against its definition.
56686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung         ASSEMBLY_ACCUMULATE_MONO
56786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // NOTE(review): out[0] is declared output-only ("=Uv"). If
        // ASSEMBLY_ACCUMULATE_MONO also reads the previous value at out, "+Uv"
        // would be the accurate constraint - confirm against the macro.
56886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
56986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
57086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
57186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
57286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
57386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
57486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
        // q1 is not touched by the loop above; presumably it is used by the
        // accumulate macro.
57586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
57686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
57786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q10"
57886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
5796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
58086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
58186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// ProcessL specialization for <2, 16> on int16_t samples and coefficients
// (stereo, stride 16 - see the CHANNELS/STRIDE constants in the body).
//
// This entry point takes no interpolation arguments. When USE_INTRINSIC is
// defined it forwards to ProcessNeonIntrinsic<2, 16, true> with lerpP = 0 and
// null second coefficient tables; otherwise it runs the hand-written NEON
// assembly loop below.
//
//   out       output location handed to the accumulate step.
//   count     coefficients per side; the loop subtracts 8 per pass and exits
//             when the result is zero, so it must be a positive multiple of 8.
//   coefsP/N  coefficient tables, loaded with a :128 alignment qualifier.
//   sP, sN    interleaved stereo sample pointers; sP is walked downwards,
//             sN upwards.
//   volumeLR  passed to the accumulate step as [vLR].
58286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
58386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out,
58486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
58586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
58686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
58786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
58886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
58986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
59086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
5916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
5926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
5936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
5946b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
59586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
59686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
    // Rewind sP by 7 stereo frames (14 int16) so the forward 8-frame load in
    // the loop ends on the frame sP pointed to on entry.
59786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
    // Register use in the loop: q0/q4 = 32-bit accumulators for the two
    // channels, q2/q3 = de-interleaved samples from sP, q5/q6 = de-interleaved
    // samples from sN, q8 = coefsP, q10 = coefsN.
59886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
59986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (1) acc_L = 0
60086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
60186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
60286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
60386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vld2 de-interleaves: the first register of each pair receives the
        // even-indexed int16 values and the second the odd-indexed ones.
        // sP is loaded without writeback (it is decremented manually below).
604d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
605d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
60686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
60786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
60886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vrev64 only reverses within each 64-bit half; the full 8-sample
        // reversal is completed by pairing the low d-register with d17 and the
        // high d-register with d16 in the multiplies below.
609d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
610d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right
61186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
61286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
61386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
61486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
61586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
61686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
61786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
61886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
61986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
62086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // "subs" sets the flags tested by "bne"; the plain "sub" on sP leaves
        // them untouched. #32 is a byte count: 8 stereo frames of int16.
62186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM before neon seems to be slower
62286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
62386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
62486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
62586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
62686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
62786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Macro defined elsewhere in this file; presumably reduces q0 and q4,
        // applies [vLR] and accumulates into [out] - confirm against its
        // definition.
62886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
62986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // NOTE(review): out[0] is declared output-only ("=Uv"). If
        // ASSEMBLY_ACCUMULATE_STEREO also reads the previous value at out,
        // "+Uv" would be the accurate constraint - confirm against the macro.
63086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out] "=Uv" (out[0]),
63186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count] "+r" (count),
63286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
63386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
63486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP] "+r" (sP),
63586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN] "+r" (sN)
63686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR] "r" (volumeLR)
        // q1 is not touched by the loop above; presumably it is used by the
        // accumulate macro.
63786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
63886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
63986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
64086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q10"
64186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung     );
6426b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
64386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
64486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
64586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
64686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out,
64786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
64886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
64986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
65086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP1,
65186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN1,
65286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
65386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
65486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
65586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
65686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
6576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
6586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
6596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            lerpP, coefsP1, coefsN1);
6606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
6616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
66286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
66386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
66486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
66586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
66686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
66786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
66886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
66986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
67086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
67186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
67286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
67386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
67486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
67586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
67686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
67786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
67886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
67986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
68086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
68186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
68286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
68386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
68486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
68586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
68686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
68786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
68886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
68986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
69086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
69186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
69286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
69386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
69486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
69586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM instructions before neon above seems to be slower
69686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
69786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
69886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
69986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
70086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
70186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
70286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
70386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
70486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
70586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
70686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
70786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
70886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
70986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
71086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
71186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
71286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
71386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
71486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
71586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
71686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11"
71786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
7186b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
71986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
72086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Process<2, 16> (16-bit coefficients): stereo FIR inner step with linear interpolation
// between two coefficient phases.
//
// With USE_INTRINSIC defined this simply forwards to ProcessNeonIntrinsic<2, 16, false>.
// Otherwise the hand-written NEON loop below is used. Per pass it consumes 8 stereo frames
// from each side and 8 coefficients from each of the four tables:
//   positive coef = P0 + lerp * (P1 - P0)      (coefsP,  coefsP1)
//   negative coef = N1 + lerp * (N0 - N1)      (coefsN1, coefsN)
// and multiply-accumulates into q0 (left) and q4 (right) as 32-bit lanes.
//
// out      - bound to the [out] memory operand (out[0]); only referenced by
//            ASSEMBLY_ACCUMULATE_STEREO, which is defined outside this block.
// count    - decremented by 8 per pass; the loop body always runs once and exits only when
//            the counter reaches exactly 0, so it must be a positive multiple of 8.
// coefsP, coefsN, coefsP1, coefsN1
//          - coefficient tables, read forward 8 at a time with a :128 alignment qualifier,
//            i.e. the pointers are expected to be 16-byte aligned.
// sP       - positive-side samples, read backward (8 frames per pass).
// sN       - negative-side samples, read forward with post-increment.
// lerpP    - interpolation fraction; it is moved into d2[0] and used as a 16-bit scalar by
//            vqrdmulh.s16, so only its low 16 bits take part.
// volumeLR - bound to the [vLR] register operand; only referenced by
//            ASSEMBLY_ACCUMULATE_STEREO (presumably the left/right volume pair - confirm
//            against the macro definition).
72186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
72286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out,
72386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
72486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
72586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
72686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP1,
72786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN1,
72886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
72986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
73086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
73186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
73286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
7336b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
7346b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
7356b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            lerpP, coefsP1, coefsN1);
7366b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
73786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
73886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
    // Step sP back by 7 frames (2 * 7 = 14 int16 values) so that the first vld2 reads the
    // 8 frames ending at the caller's sP frame.
73986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
74086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
74186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
74286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (1) acc_L = 0
74386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
74486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
74586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
74686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vld2.16 de-interleaves: q2/q5 receive the even-indexed (left) samples and
        // q3/q6 the odd-indexed (right) samples. sP has no writeback here; it is moved
        // backward explicitly at the bottom of the loop.
747d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
748d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
74986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
75086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
75186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
75286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
75386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
75486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
75586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coefs
75686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
75786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
75886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
75986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
760d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
761d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right
76286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
76386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
76486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
76586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vrev64.16 only reverses within each 64-bit half, so the positive-side products
        // pair the halves crosswise (d4/d6 with d17, d5/d7 with d16) to complete the
        // 8-element reversal. The negative side is used in natural order.
76686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
76786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
76886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
76986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
77086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
77186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
77286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
77386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
77486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // The plain "sub" does not update the condition flags, so the "bne" below still
        // tests the result of "subs" on the loop counter.
77586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM before neon seems to be slower
77686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
77786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
77886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
77986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
78086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
78186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Macro defined outside this block; presumably reduces q0/q4, applies [vLR] and
        // accumulates into [out] - confirm against its definition.
78286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
78386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
78486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out] "=Uv" (out[0]),
78586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count] "+r" (count),
78686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
78786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
78886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
78986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
79086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP] "+r" (sP),
79186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN] "+r" (sN)
79286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
79386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR] "r" (volumeLR)
79486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
79586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
79686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
79786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11"
79886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
7996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
80086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
80186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// ProcessL<1, 16> (32-bit coefficients): mono FIR inner step without coefficient
// interpolation - only coefsP and coefsN are read.
//
// With USE_INTRINSIC defined this forwards to ProcessNeonIntrinsic<1, 16, true> with
// lerpP = 0 and NULL interpolation tables. Otherwise the NEON loop below consumes 8 samples
// from each side and 8 coefficients from each table per pass, accumulating into q0.
//
// out      - bound to the [out] memory operand (out[0]); only referenced by
//            ASSEMBLY_ACCUMULATE_MONO, which is defined outside this block.
// count    - decremented by 8 per pass; the loop body always runs once and exits only when
//            the counter reaches exactly 0, so it must be a positive multiple of 8.
// coefsP, coefsN
//          - 32-bit coefficient tables, read forward 8 at a time with a :128 alignment
//            qualifier, i.e. the pointers are expected to be 16-byte aligned.
// sP       - positive-side samples, read backward (8 samples per pass).
// sN       - negative-side samples, read forward with post-increment.
// volumeLR - bound to the [vLR] register operand; only referenced by
//            ASSEMBLY_ACCUMULATE_MONO (presumably the left/right volume pair - confirm
//            against the macro definition).
80286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
80386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out,
80486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
80586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
80686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
80786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
80886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
80986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
81086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
8116b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
8126b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
8136b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
8146b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
81586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
81686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
    // Step sP back by 7 samples so that the first vld1 reads the 8 samples ending at the
    // caller's sP sample.
81786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
81886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
81986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
82086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
82186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
82286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // sP has no writeback here; it is moved backward explicitly at the bottom of the loop.
82386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
82486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
82586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
82686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
82786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
828d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side
82986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Widening the sample with a left shift of 15 and then applying vqrdmulh.s32
        // (doubling, rounding, keep the high 32 bits) yields roughly (sample * coef) >> 16.
83086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
83186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits
83286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
83386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
83486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
83586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vrev64.16 only reverses within each 64-bit half, so the positive side pairs the
        // halves crosswise (q12 from d4 with q9, q13 from d5 with q8) to complete the
        // 8-element reversal. The negative side is used in natural order.
836d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
837d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
838d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
839d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples
84086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
84186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
84286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
84386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q15                   \n"// accumulate result
84486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q13                   \n"// accumulate result
84586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
84686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
84786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
84886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
84986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
85086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Macro defined outside this block; presumably reduces q0, applies [vLR] and
        // accumulates into [out] - confirm against its definition.
85186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
85286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
85386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
85486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
85586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
85686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
85786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
85886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
85986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
86086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
86186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
86286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
86386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
86486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
8656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
86686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
86786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// ProcessL<2, 16> (32-bit coefficients): stereo FIR inner step without coefficient
// interpolation - only coefsP and coefsN are read.
//
// With USE_INTRINSIC defined this forwards to ProcessNeonIntrinsic<2, 16, true> with
// lerpP = 0 and NULL interpolation tables. Otherwise the NEON loop below consumes 8 stereo
// frames from each side and 8 coefficients from each table per pass, accumulating the left
// channel into q0 and the right channel into q4.
//
// out      - bound to the [out] memory operand (out[0]); only referenced by
//            ASSEMBLY_ACCUMULATE_STEREO, which is defined outside this block.
// count    - decremented by 8 per pass; the loop body always runs once and exits only when
//            the counter reaches exactly 0, so it must be a positive multiple of 8.
// coefsP, coefsN
//          - 32-bit coefficient tables, read forward 8 at a time with a :128 alignment
//            qualifier, i.e. the pointers are expected to be 16-byte aligned.
// sP       - positive-side samples, read backward (8 frames per pass).
// sN       - negative-side samples, read forward with post-increment.
// volumeLR - bound to the [vLR] register operand; only referenced by
//            ASSEMBLY_ACCUMULATE_STEREO (presumably the left/right volume pair - confirm
//            against the macro definition).
86886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
86986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out,
87086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
87186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
87286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
87386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
87486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
87586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
87686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
8776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
8786b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
8796b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
8806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
88186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
88286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
    // Step sP back by 7 frames (2 * 7 = 14 int16 values) so that the first vld2 reads the
    // 8 frames ending at the caller's sP frame.
88386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
88486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
88586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
88686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4                    \n"// result, initialize to 0
88786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
88886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
88986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vld2.16 de-interleaves: q2/q5 receive the even-indexed (left) samples and
        // q3/q6 the odd-indexed (right) samples. sP has no writeback here; it is moved
        // backward explicitly at the bottom of the loop.
890d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
891d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
892d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
893d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
89486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
895d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
896d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right
89786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Left channel. Widening the sample with a left shift of 15 and then applying
        // vqrdmulh.s32 (doubling, rounding, keep the high 32 bits) yields roughly
        // (sample * coef) >> 16.
89886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
89986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
90086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
90186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
90286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
90386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vrev64.16 only reverses within each 64-bit half, so the positive side pairs the
        // halves crosswise (q12 with q9, q13 with q8) to complete the 8-element reversal.
904d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
905d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
906d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
907d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
90886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
90986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
91086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
911d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q0, q0, q15                   \n"// accumulate result
912d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q0, q0, q13                   \n"// accumulate result
91386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Right channel: same sequence, reusing q12-q15 as temporaries and the same
        // coefficients (q8-q11), accumulating into q4.
91486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
91586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
91686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
91786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
91886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
91986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
920d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by coef
921d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by coef
922d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by coef
923d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by coef
92486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
92586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q12                   \n"// accumulate result
92686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
927d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q4, q4, q15                   \n"// accumulate result
928d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q4, q4, q13                   \n"// accumulate result
92986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // The plain "sub" does not update the condition flags, so the "bne" below still
        // tests the result of "subs" on the loop counter.
93086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
93186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
93286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
93386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
93486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Macro defined outside this block; presumably reduces q0/q4, applies [vLR] and
        // accumulates into [out] - confirm against its definition.
93586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
93686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
93786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
93886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
93986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
94086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
94186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
94286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
94386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
94486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
94586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
94686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
94786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
94886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
94986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
9506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
95186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
95286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
95386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
95486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out,
95586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
95686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
95786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
95886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP1,
95986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN1,
96086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
96186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
96286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
96386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
96486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
9656b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
9666b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
9676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            lerpP, coefsP1, coefsN1);
9686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
96986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
97086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
97186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
97286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
97386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
97486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
97586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
97686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
97786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
97886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
97986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
98086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
98186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
98286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
98386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
98486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
98586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
98686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
98786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
98886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
98986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
99086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
99186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
99286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
99386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
99486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
99586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
99686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
99786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
99886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
99986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
1000d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side
100186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
100286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
100386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
100486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
100586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
100686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits
100786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
100886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
100986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
101086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
101186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
101286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
101386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
101486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
101586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q15                   \n"// accumulate result
101686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q13                   \n"// accumulate result
101786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
101886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
101986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
102086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
102186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
102286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
102386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
102486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
102586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
102686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
102786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
102886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
102986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
103086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
103186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
103286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
103386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
103486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
103586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
103686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
103786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
103886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
103986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
10406b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
104186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
104286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Specialization of Process for 2 channels with a stride of 16, operating on
// int16_t samples with int32_t coefficients, volume and output.
//
// With USE_INTRINSIC defined the work is forwarded to
// ProcessNeonIntrinsic<2, 16, false>; otherwise the hand-written NEON loop
// below runs. Per iteration that loop:
//   - loads 8 stereo frames from sP and 8 from sN (sN post-increments, sP is
//     stepped backward by 32 bytes),
//   - loads 8 coefficients from each of coefsP, coefsP1, coefsN and coefsN1
//     (all four pointers post-increment),
//   - blends coefsP toward coefsP1, and coefsN1 toward coefsN, using lerpP,
//   - multiplies the samples by the blended coefficients and adds the
//     products into q0 (first vld2 lane, left) and q4 (second lane, right).
// ASSEMBLY_ACCUMULATE_STEREO (defined outside this block) then runs once after
// the loop with the out and volumeLR operands; presumably it reduces q0/q4,
// applies the volume and accumulates into out -- confirm against the macro.
//
// Preconditions visible from the loop itself:
//   - count must be a positive multiple of 8: the body runs before count is
//     tested, count drops by 8 per pass, and the loop only exits when it
//     reaches exactly zero.
//   - the coefficient loads carry a :128 alignment qualifier, so the four
//     coefficient pointers are expected to be 16-byte aligned.
104386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
104486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out,
104586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
104686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
104786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
104886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP1,
104986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN1,
105086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
105186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
105286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
105386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
105486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
10556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#ifdef USE_INTRINSIC
10566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
10576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            lerpP, coefsP1, coefsN1);
10586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#else
105986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
106086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
    // Step sP back by 7 frames (14 int16_t) so that the first 8-frame load
    // ends on the frame sP originally pointed at.
106186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
106286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
        // d2[0] keeps lerpP for the whole loop; q0 and q4 are the two
        // per-channel running sums.
106386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
106486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
106586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4                    \n"// result, initialize to 0
106686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
106786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
106886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vld2 de-interleaves: q2/q5 receive the first element of every frame,
        // q3/q6 the second. sP has no writeback here; it is moved by the
        // explicit sub at the bottom of the loop.
1069d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 8 16-bits stereo frames
1070d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 8 16-bits stereo frames
107186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
107286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
107386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
107486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
107586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Differences: q12/q13 = coefsP1 - coefsP0, q14/q15 = coefsN0 - coefsN1.
107686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
107786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
107886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
107986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
108086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Scale each difference by lerpP (saturating rounding doubling
        // multiply, high half).
108186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
108286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
108386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
108486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
108586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Blended coefficients: q8/q9 start from coefsP0, q10/q11 start from
        // coefsN1; both reuse the same lerpP.
108686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
108786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
108886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
108986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
109086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // vrev64.16 only reverses within each 64-bit half. The swapped pairing
        // further down (d4/d6 with q9, d5/d7 with q8) completes the reversal,
        // so the first coefficient meets the highest-address sP frame.
1091d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 samples of positive left
1092d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vrev64.16      q3, q3                        \n"// reverse 8 samples of positive right
109386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // First channel: d4/d5 come from sP, d10/d11 from sN.
109486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
109586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
109686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
109786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
109886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
109986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
110086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
110186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
110286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
110386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
110486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // All four product vectors end up added into q0.
110586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
110686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
1107d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q0, q0, q15                   \n"// accumulate result
1108d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q0, q0, q13                   \n"// accumulate result
110986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Second channel: same sequence with d6/d7 (sP) and d12/d13 (sN),
        // reusing the blended coefficients and summing into q4.
111086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
111186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
111286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
111386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
111486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
111586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
111686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
111786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
111886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
111986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
112086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
112186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q12                   \n"// accumulate result
112286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
1123d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q4, q4, q15                   \n"// accumulate result
1124d7a77156eb13973f7fce5c9db6113bef83bc205bAndy Hung        "vadd.s32       q4, q4, q13                   \n"// accumulate result
112586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // 32 bytes = 8 stereo int16_t frames. The bne below tests the flags
        // produced by subs; the intervening sub carries no s suffix.
112686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
112786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
112886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
112986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
113086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
113186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
113286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
        // Outputs: out[0] as a memory operand; the counter and the six
        // pointers are read-write because the loop advances them.
113386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
113486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
113586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
113686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
113786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
113886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
113986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
114086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
        // Inputs: read-only register operands.
114186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
114286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
        // Clobbers: condition flags, memory, and every q register named in the
        // loop above (q1 is listed because d2 holds lerpP).
114386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
114486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
114586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
114686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
114786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
114886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
11496b667dde03a5707285a2ff76ada525075d4c60efAndy Hung#endif
11506b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}
11516b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
11526b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate<>
11536b667dde03a5707285a2ff76ada525075d4c60efAndy Hunginline void ProcessL<1, 16>(float* const out,
11546b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int count,
11556b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP,
11566b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN,
11576b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sP,
11586b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sN,
11596b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* const volumeLR)
11606b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{
11616b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
11626b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
11636b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}
11646b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
11656b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate<>
11666b667dde03a5707285a2ff76ada525075d4c60efAndy Hunginline void ProcessL<2, 16>(float* const out,
11676b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int count,
11686b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP,
11696b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN,
11706b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sP,
11716b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sN,
11726b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* const volumeLR)
11736b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{
11746b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
11756b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
11766b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}
11776b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
11786b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate<>
11796b667dde03a5707285a2ff76ada525075d4c60efAndy Hunginline void Process<1, 16>(float* const out,
11806b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int count,
11816b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP,
11826b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN,
11836b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP1,
11846b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN1,
11856b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sP,
11866b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sN,
11876b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float lerpP,
11886b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* const volumeLR)
11896b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{
11906b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
11916b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            lerpP, coefsP1, coefsN1);
11926b667dde03a5707285a2ff76ada525075d4c60efAndy Hung}
11936b667dde03a5707285a2ff76ada525075d4c60efAndy Hung
11946b667dde03a5707285a2ff76ada525075d4c60efAndy Hungtemplate<>
11956b667dde03a5707285a2ff76ada525075d4c60efAndy Hunginline void Process<2, 16>(float* const out,
11966b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        int count,
11976b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP,
11986b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN,
11996b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsP1,
12006b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* coefsN1,
12016b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sP,
12026b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* sN,
12036b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        float lerpP,
12046b667dde03a5707285a2ff76ada525075d4c60efAndy Hung        const float* const volumeLR)
12056b667dde03a5707285a2ff76ada525075d4c60efAndy Hung{
12066b667dde03a5707285a2ff76ada525075d4c60efAndy Hung    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
12076b667dde03a5707285a2ff76ada525075d4c60efAndy Hung            lerpP, coefsP1, coefsN1);
120886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
120986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
121086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif //USE_NEON
121186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
121263238efb0d674758902918e3cdaac322126484b7Glenn Kasten} // namespace android
121386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
121486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
1215