/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
1786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
1886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
1986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
2086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungnamespace android {
2186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
2386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
2486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()
//
// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
// issues with S16 coefs. Consider this later.
3186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// Each macro expands to instruction text that is appended to the end of an asm()
// statement; that statement must supply the %[vLR] (address of the volumes) and
// %[out] (output memory) operands.
//
// Mono: q0 holds four 32-bit partial sums. They are added together, the single
// sum is replicated into both lanes of d0, multiplied by the two volumes
// (saturating rounding doubling multiply, high half) and added with saturation
// to the two 32-bit words at %[out]. Registers written: d0, d2, d3.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load both volumes (64-bit aligned) */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load of the two output words */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) pairwise add: 4 partial sums -> 2 */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) final sum, replicated for L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
4186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Stereo variant of the accumulate-to-output epilogue, appended to the end of an
// asm() statement that supplies the %[vLR] and %[out] operands.
//
// q0 holds four 32-bit partial sums for the left channel and q4 (d8/d9) four for
// the right channel. Each set is reduced by pairwise adds, the two totals are
// packed into d0 as {L, R}, multiplied by the two volumes (saturating rounding
// doubling multiply, high half) and added with saturation to the two 32-bit
// words at %[out]. Registers written: d0, d2, d3, d8.
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load both volumes (64-bit aligned) */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load of the two output words */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) pairwise add the 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) pairwise add the 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */
5186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
5286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
5386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out,
5486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
5586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
5686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
5786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
5886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
5986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
6086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
6186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
6286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
6386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
6486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
6586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
6686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
6786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
6886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
6986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
7086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
7186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
7286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
7386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
7486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
7586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
7686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
7786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
7886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
7986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
8086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
8186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
8286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM instructions before neon above seems to be slower
8386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
8486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
8586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
8686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
8786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
8886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
8986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung         ASSEMBLY_ACCUMULATE_MONO
9086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
9186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
9286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
9386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
9486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
9586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
9686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
9786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
9886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
9986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
10086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q10"
10186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
10286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
10386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
10486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
10586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out,
10686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
10786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
10886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
10986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
11086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
11186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
11286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
11386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
11486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
11586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
11686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
11786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (1) acc_L = 0
11886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
11986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
12086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
12186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
12286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
12386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
12486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
12586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs
12686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
12786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
12886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse right positive
12986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
13086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
13186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
13286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
13386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
13486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
13586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
13686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
13786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
13886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
13986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM before neon seems to be slower
14086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
14186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
14286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
14386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
14486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
14586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
14686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
14786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
14886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out] "=Uv" (out[0]),
14986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count] "+r" (count),
15086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
15186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
15286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP] "+r" (sP),
15386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN] "+r" (sN)
15486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR] "r" (volumeLR)
15586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
15686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
15786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
15886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q10"
15986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung     );
16086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
16186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
16286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
16386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out,
16486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
16586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
16686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
16786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP1,
16886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN1,
16986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
17086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
17186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
17286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
17386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
17486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
17586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
17686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
17786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
17886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
17986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
18086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
18186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
18286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
18386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
18486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
18586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
18686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
18786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
18886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
18986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
19086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
19186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
19286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
19386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
19486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
19586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
19686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4
19786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
19886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
19986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
20086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
20186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
20286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
20386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
20486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
20586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples
20686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
20786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM instructions before neon above seems to be slower
20886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
20986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples
21086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
21186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
21286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
21386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
21486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
21586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
21686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
21786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
21886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
21986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
22086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
22186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
22286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
22386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
22486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
22586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
22686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
22786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
22886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11"
22986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
23086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
23186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
23286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
23386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out,
23486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
23586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
23686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
23786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP1,
23886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN1,
23986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
24086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
24186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
24286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
24386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
24486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
24586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
24686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
24786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
24886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
24986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (1) acc_L = 0
25086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
25186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
25286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
25386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
25486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples
25586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples
25686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
25786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
25886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
25986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation
26086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
26186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
26286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets
26386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
26486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
26586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
26686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
26786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse 8 frames of the left positive
26886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q3, q3                   \n"// (1) reverse 8 frames of the right positive
26986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
27086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
27186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set
27286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
27386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
27486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
27586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
27686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
27786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
27886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
27986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
28086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right
28186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
28286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM before neon seems to be slower
28386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8   \n"// (1) update loop counter
28486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples
28586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
28686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
28786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
28886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
28986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
29086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
29186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out] "=Uv" (out[0]),
29286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count] "+r" (count),
29386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
29486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
29586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
29686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
29786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP] "+r" (sP),
29886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN] "+r" (sN)
29986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
30086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR] "r" (volumeLR)
30186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
30286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
30386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
30486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11"
30586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
30686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
30786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
30886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
30986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out,
31086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
31186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
31286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
31386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
31486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
31586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
31686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
31786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
31886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
31986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
32086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
32186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
32286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
32386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
32486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
32586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
32686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
32786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
32886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
32986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
33086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
33186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
33286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
33386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits
33486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
33586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
33686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits
33786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
33886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
33986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
34086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
34186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
34286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
34386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
34486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
34586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q15                   \n"// accumulate result
34686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q13                   \n"// accumulate result
34786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
34886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
34986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
35086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
35186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
35286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
35386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
35486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
35586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
35686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
35786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
35886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
35986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
36086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
36186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
36286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
36386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
36486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
36586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
36686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
36786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
36886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
36986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
37086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out,
37186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
37286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
37386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
37486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
37586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
37686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
37786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
37886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
37986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
38086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
38186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
38286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
38386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4                    \n"// result, initialize to 0
38486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
38586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
38686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
38786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
38886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
38986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 4 32-bits coefs
39086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs
39186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
39286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
39386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
39486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
39586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
39686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
39786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
39886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
39986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
40086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
40186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
40286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
40386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
40486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
40586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
40686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
40786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
40886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
40986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
41086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
41186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
41286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
41386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
41486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
41586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
41686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
41786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
41886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
41986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
42086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
42186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
42286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q12                   \n"// accumulate result
42386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
42486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
42586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
42686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
42786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
42886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
42986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
43086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
43186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
43286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
43386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
43486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
43586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
43686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
43786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
43886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
43986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
44086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
44186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
44286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
44386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
44486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
44586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
44686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
44786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
44886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
44986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
45086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out,
45186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
45286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
45386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
45486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP1,
45586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN1,
45686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
45786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
45886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
45986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
46086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
46186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
46286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
46386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
46486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
46586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
46686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
46786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
46886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
46986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
47086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
47186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
47286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
47386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
47486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
47586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
47686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
47786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
47886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
47986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
48086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
48186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
48286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
48386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
48486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
48586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
48686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
48786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
48886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
48986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
49086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
49186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
49286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
49386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
49486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
49586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
49686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
49786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d6, #15                 \n"// extend samples to 31 bits
49886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d7, #15                 \n"// extend samples to 31 bits
49986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
50086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
50186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
50286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
50386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
50486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
50586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
50686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
50786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q15                   \n"// accumulate result
50886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q13                   \n"// accumulate result
50986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
51086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
51186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
51286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
51386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
51486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
51586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
51686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
51786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
51886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
51986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
52086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
52186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
52286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
52386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
52486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
52586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
52686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
52786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
52886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
52986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
53086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
53186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
53286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
53386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
53486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
53586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out,
53686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
53786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
53886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
53986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP1,
54086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN1,
54186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
54286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
54386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
54486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
54586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
54686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
54786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 16;
54886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
54986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
55086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]               \n"// load the positive phase
55186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0                    \n"// result, initialize to 0
55286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4                    \n"// result, initialize to 0
55386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
55486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                           \n"
55586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
55686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q2, q3}, [%[sP]]             \n"// load 4 16-bits stereo samples
55786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {q5, q6}, [%[sN]]!            \n"// load 4 16-bits stereo samples
55886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
55986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
56086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
56186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs
56286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
56386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q12, q12, q8                  \n"// interpolate (step1)
56486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q13, q13, q9                  \n"// interpolate (step1)
56586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q14, q14, q10                 \n"// interpolate (step1)
56686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q15, q15, q11                 \n"// interpolate (step1)
56786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
56886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, d2[0]               \n"// interpolate (step2)
56986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, d2[0]               \n"// interpolate (step2)
57086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, d2[0]               \n"// interpolate (step2)
57186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, d2[0]               \n"// interpolate (step2)
57286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
57386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q8, q8, q12                   \n"// interpolate (step3)
57486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q9, q9, q13                   \n"// interpolate (step3)
57586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q10, q10, q14                 \n"// interpolate (step3)
57686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q11, q11, q15                 \n"// interpolate (step3)
57786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
57886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                        \n"// reverse 8 frames of the positive side
57986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q3, q3                        \n"// reverse 8 frames of the positive side
58086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
58186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d4, #15                 \n"// extend samples to 31 bits
58286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d5, #15                 \n"// extend samples to 31 bits
58386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
58486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d10, #15                \n"// extend samples to 31 bits
58586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d11, #15                \n"// extend samples to 31 bits
58686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
58786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
58886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
58986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
59086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
59186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
59286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12                   \n"// accumulate result
59386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
59486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q15                   \n"// (+1) accumulate result
59586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q13                   \n"// (+1) accumulate result
59686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
59786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12,  d6, #15                 \n"// extend samples to 31 bits
59886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13,  d7, #15                 \n"// extend samples to 31 bits
59986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
60086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14,  d12, #15                \n"// extend samples to 31 bits
60186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15,  d13, #15                \n"// extend samples to 31 bits
60286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
60386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples by interpolated coef
60486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples by interpolated coef
60586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples by interpolated coef
60686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples by interpolated coef
60786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
60886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q12                   \n"// accumulate result
60986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q13, q13, q14                 \n"// accumulate result
61086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q15                   \n"// (+1) accumulate result
61186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q13                   \n"// (+1) accumulate result
61286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
61386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #8        \n"// update loop counter
61486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #32             \n"// move pointer to next set of samples
61586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
61686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                            \n"// loop
61786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
61886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
61986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
62086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
62186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
62286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
62386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
62486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
62586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
62686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
62786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
62886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
62986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
63086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
63186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
63286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
63386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
63486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
63586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
63686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
63786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// ProcessL specialization for CHANNELS == 1, STRIDE == 8 with 16-bit coefficients
// (single coefficient set, no interpolation).
//
// Each loop iteration handles 4 taps on each side of the filter:
//   - 4 samples are loaded at sP (sP is then moved back by 8 bytes) and reversed,
//   - 4 samples are loaded at sN (post-incremented),
//   - 4 coefficients are loaded from each of coefsP and coefsN (post-incremented),
//   - both products are widened and accumulated into the four 32-bit lanes of q0.
//
// out      : only referenced through ASSEMBLY_ACCUMULATE_MONO (operand %[out]).
// count    : decremented by 4 per iteration; the loop exits only when it reaches
//            exactly zero, so it must be a positive multiple of 4.
// coefsP/N : loaded with a :64 alignment qualifier, i.e. must be 8-byte aligned.
// volumeLR : only referenced through ASSEMBLY_ACCUMULATE_MONO (operand %[vLR]).
//
// NOTE(review): ASSEMBLY_ACCUMULATE_MONO is defined outside this section; presumably
// it reduces q0, applies the volume and accumulates into out - confirm at the macro.
63886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
63986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 8>(int32_t* const out,
64086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
64186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
64286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
64386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
64486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
64586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
64686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
64786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
64886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
64986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples: the 4-sample load ends at the original sP
65086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
65186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
65286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
65386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
65486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
65586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples (positive side)
65686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples (negative side)
65786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive side)
65886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs (negative side)
65986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
66086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      d4, d4                   \n"// (1) reverse the 4 positive samples: s3, s2, s1, s0
66186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
66286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // reordering the vmlal to do d6 before d4 is slower(?)
66386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed)samples by coef
66486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
66586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
66686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM instructions before neon above seems to be slower
66786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// (1) update loop counter (sets the flags for bne)
66886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #8         \n"// (0) move pointer to next set of samples (4 samples back)
66986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
67086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
67186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
67286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
67386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
67486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
67586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
67686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
67786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
67886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
67986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
68086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
68186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
68286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
68386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
68486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q10"
68586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
68686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
68786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// ProcessL specialization for CHANNELS == 2, STRIDE == 8 with 16-bit coefficients
// (single coefficient set, no interpolation).
//
// Each loop iteration handles 4 stereo frames on each side of the filter:
//   - vld2.16 de-interleaves 8 16-bit values at sP into d4/d5 and at sN into d6/d7
//     (even elements / odd elements, commented below as left / right),
//   - 4 coefficients are loaded from each of coefsP and coefsN and shared by both
//     channels,
//   - the products are widened and accumulated into q0 (left) and q4 (right).
//
// out      : only referenced through ASSEMBLY_ACCUMULATE_STEREO (operand %[out]).
// count    : decremented by 4 per iteration; the loop exits only when it reaches
//            exactly zero, so it must be a positive multiple of 4.
// coefsP/N : loaded with a :64 alignment qualifier, i.e. must be 8-byte aligned.
// volumeLR : only referenced through ASSEMBLY_ACCUMULATE_STEREO (operand %[vLR]).
//
// NOTE(review): ASSEMBLY_ACCUMULATE_STEREO is defined outside this section; presumably
// it reduces q0/q4, applies the volumes and accumulates into out - confirm at the macro.
68886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
68986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 8>(int32_t* const out,
69086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
69186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
69286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
69386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
69486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
69586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
69686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
69786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
69886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
69986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames: the 4-frame load ends at the original frame
70086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
70186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (1) acc_L = 0
70286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
70386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
70486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
70586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
70686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d4, d5}, [%[sP]]        \n"// (2+0d) load 8 16-bits stereo samples (4 frames)
70786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (2) load 8 16-bits stereo samples (4 frames)
70886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive side)
70986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d20}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs (negative side)
71086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
71186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse the 4 positive frames in d4 (left) and d5 (right)
71286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
71386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
71486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
71586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
71686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
71786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
71886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM before neon seems to be slower
71986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// (1) update loop counter (sets the flags for bne)
72086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples (4 frames back)
72186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
72286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
72386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
72486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
72586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
72686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
72786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out] "=Uv" (out[0]),
72886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count] "+r" (count),
72986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
73086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
73186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP] "+r" (sP),
73286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN] "+r" (sN)
73386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR] "r" (volumeLR)
73486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
73586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
73686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
73786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q10"
73886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung     );
73986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
74086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Process specialization for CHANNELS == 1, STRIDE == 8 with 16-bit coefficients,
// linearly interpolating between two coefficient sets before filtering.
//
// Each loop iteration handles 4 taps on each side of the filter. The coefficients
// actually applied are computed as:
//   positive side: coefsP  + (coefsP1 - coefsP)  * lerpP
//   negative side: coefsN1 + (coefsN  - coefsN1) * lerpP
// where the multiply is vqrdmulh.s16 by the scalar d2[0], i.e. the low 16 bits of
// lerpP. The 4 samples loaded at sP are reversed, then both sides are widened and
// accumulated into the four 32-bit lanes of q0.
//
// out       : only referenced through ASSEMBLY_ACCUMULATE_MONO (operand %[out]).
// count     : decremented by 4 per iteration; the loop exits only when it reaches
//             exactly zero, so it must be a positive multiple of 4.
// coefsP/N, coefsP1/N1 : loaded with a :64 alignment qualifier (8-byte aligned).
// lerpP     : interpolation factor moved into d2[0] before the loop.
// volumeLR  : only referenced through ASSEMBLY_ACCUMULATE_MONO (operand %[vLR]).
//
// NOTE(review): the negative side uses coefsN1 as the base and coefsN as the target,
// the opposite order from the positive side - presumably because lerpP is the
// positive-side phase; confirm against the caller.
// NOTE(review): ASSEMBLY_ACCUMULATE_MONO is defined outside this section; presumably
// it reduces q0, applies the volume and accumulates into out - confirm at the macro.
74186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
74286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 8>(int32_t* const out,
74386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
74486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
74586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
74686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP1,
74786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN1,
74886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
74986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
75086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
75186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
75286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
75386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
75486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
75586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples: the 4-sample load ends at the original sP
75686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
75786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
75886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0
75986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
76086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
76186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
76286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d4}, [%[sP]]            \n"// (2+0d) load 4 16-bits mono samples (positive side)
76386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d6}, [%[sN]]!           \n"// (2) load 4 16-bits mono samples (negative side)
76486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive base)
76586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
76686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs (negative base is coefsN1)
76786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
76886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
76986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
77086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs
77186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
77286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
77386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
77486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
77586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      d4, d4                   \n"// (1) reverse the 4 positive samples: s3, s2, s1, s0
77686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
77786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       d16, d16, d17            \n"// (1+2d) interpolate (step3) 1st set
77886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
77986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
78086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // reordering the vmlal to do d6 before d4 is slower(?)
78186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d16              \n"// (1+0d) multiply (reversed)by coef
78286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
78386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
78486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM instructions before neon above seems to be slower
78586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// (1) update loop counter (sets the flags for bne)
78686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #8        \n"// move pointer to next set of samples (4 samples back)
78786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
78886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
78986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
79086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
79186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
79286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
79386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
79486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
79586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
79686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
79786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
79886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
79986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
80086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
80186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
80286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
80386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
80486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
80586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11"
80686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
80786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
80886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// Process specialization for CHANNELS == 2, STRIDE == 8 with 16-bit coefficients,
// linearly interpolating between two coefficient sets before filtering.
//
// Each loop iteration handles 4 stereo frames on each side of the filter. The
// coefficients actually applied (shared by both channels) are computed as:
//   positive side: coefsP  + (coefsP1 - coefsP)  * lerpP
//   negative side: coefsN1 + (coefsN  - coefsN1) * lerpP
// where the multiply is vqrdmulh.s16 by the scalar d2[0], i.e. the low 16 bits of
// lerpP. vld2.16 de-interleaves the samples into d4/d5 (sP) and d6/d7 (sN); the
// products are widened and accumulated into q0 (left) and q4 (right).
//
// out       : only referenced through ASSEMBLY_ACCUMULATE_STEREO (operand %[out]).
// count     : decremented by 4 per iteration; the loop exits only when it reaches
//             exactly zero, so it must be a positive multiple of 4.
// coefsP/N, coefsP1/N1 : loaded with a :64 alignment qualifier (8-byte aligned).
// lerpP     : interpolation factor moved into d2[0] before the loop.
// volumeLR  : only referenced through ASSEMBLY_ACCUMULATE_STEREO (operand %[vLR]).
//
// NOTE(review): the negative side uses coefsN1 as the base and coefsN as the target,
// the opposite order from the positive side - presumably because lerpP is the
// positive-side phase; confirm against the caller.
// NOTE(review): ASSEMBLY_ACCUMULATE_STEREO is defined outside this section; presumably
// it reduces q0/q4, applies the volumes and accumulates into out - confirm at the macro.
80986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
81086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 8>(int32_t* const out,
81186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
81286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP,
81386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN,
81486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsP1,
81586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* coefsN1,
81686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
81786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
81886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
81986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
82086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
82186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
82286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
82386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames: the 4-frame load ends at the original frame
82486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
82586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
82686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// (1) acc_L = 0
82786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0
82886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
82986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
83086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
83186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d4, d5}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo samples (4 frames)
83286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d6, d7}, [%[sN]]!       \n"// (3) load 8 16-bits stereo samples (4 frames)
83386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d16}, [%[coefsP0]:64]!  \n"// (1) load 4 16-bits coefs (positive base)
83486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d17}, [%[coefsP1]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
83586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d20}, [%[coefsN1]:64]!  \n"// (1) load 4 16-bits coefs (negative base is coefsN1)
83686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d21}, [%[coefsN0]:64]!  \n"// (1) load 4 16-bits coefs for interpolation
83786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
83886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       d17, d17, d16            \n"// (1) interpolate (step1) 1st set of coefs
83986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s16       d21, d21, d20            \n"// (1) interpolate (step1) 2nd set of coefs
84086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
84186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   d17, d17, d2[0]          \n"// (2) interpolate (step2) 1st set of coefs
84286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s16   d21, d21, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs
84386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
84486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (1) reverse the 4 positive frames in d4 (left) and d5 (right)
84586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
84686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       d16, d16, d17            \n"// (1+1d) interpolate (step3) 1st set
84786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s16       d20, d20, d21            \n"// (1+1d) interpolate (step3) 2nd set
84886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
84986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d4, d16              \n"// (1) multiply (reversed) samples left
85086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d5, d16              \n"// (1) multiply (reversed) samples right
85186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q0, d6, d20              \n"// (1) multiply samples left
85286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmlal.s16      q4, d7, d20              \n"// (1) multiply samples right
85386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
85486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // moving these ARM before neon seems to be slower
85586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// (1) update loop counter (sets the flags for bne)
85686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples (4 frames back)
85786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
85886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        // sP used after branch (warning)
85986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
86086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
86186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
86286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
86386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out] "=Uv" (out[0]),
86486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count] "+r" (count),
86586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
86686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
86786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
86886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
86986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP] "+r" (sP),
87086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN] "+r" (sN)
87186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
87286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR] "r" (volumeLR)
87386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
87486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
87586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q4", "q5", "q6",
87686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11"
87786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
87886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
87986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// ProcessL specialization for CHANNELS == 1, STRIDE == 8 with 32-bit coefficients
// (single coefficient set, no interpolation).
//
// Each loop iteration handles 4 taps on each side of the filter:
//   - 4 16-bit samples are loaded at sP (sP is then moved back by 8 bytes) and
//     reversed; 4 more are loaded at sN (post-incremented),
//   - 4 32-bit coefficients are loaded from each of coefsP and coefsN,
//   - samples are widened to 32 bits and shifted left by 15 (vshll #15), multiplied
//     by the coefficients with vqrdmulh.s32, and added into q0 with vadd.s32
//     (a non-saturating add).
//
// out      : only referenced through ASSEMBLY_ACCUMULATE_MONO (operand %[out]).
// count    : decremented by 4 per iteration; the loop exits only when it reaches
//            exactly zero, so it must be a positive multiple of 4.
// coefsP/N : loaded with a :128 alignment qualifier, i.e. must be 16-byte aligned.
// volumeLR : only referenced through ASSEMBLY_ACCUMULATE_MONO (operand %[vLR]).
//
// NOTE(review): ASSEMBLY_ACCUMULATE_MONO is defined outside this section; presumably
// it reduces q0, applies the volume and accumulates into out - confirm at the macro.
88086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
88186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 8>(int32_t* const out,
88286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
88386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
88486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
88586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
88686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
88786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
88886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
88986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
89086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
89186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 samples: the 4-sample load ends at the original sP
89286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
89386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// result, initialize to 0
89486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
89586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
89686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
89786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples (positive side)
89886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples (negative side)
89986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs (positive side)
90086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs (negative side)
90186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
90286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      d4, d4                   \n"// reverse the 4 samples of the positive side
90386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
90486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12, d4, #15             \n"// (stall) extend samples to 31 bits
90586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
90686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
90786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
90886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
90986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
91086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12              \n"// accumulate result
91186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q14              \n"// (stall) accumulate result
91286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
91386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// update loop counter (sets the flags for bne)
91486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples (4 samples back)
91586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
91686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
91786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
91886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
91986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
92086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out] "=Uv" (out[0]),
92186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count] "+r" (count),
92286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
92386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
92486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP] "+r" (sP),
92586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN] "+r" (sN)
92686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR] "r" (volumeLR)
92786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
92886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
92986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
93086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q14"
93186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
93286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
93386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
// ProcessL specialization for CHANNELS == 2, STRIDE == 8 with 32-bit coefficients
// (single coefficient set, no interpolation).
//
// Each loop iteration handles 4 stereo frames on each side of the filter:
//   - vld2.16 de-interleaves 8 16-bit values at sP into d4/d5 and at sN into d6/d7,
//   - 4 32-bit coefficients are loaded from each of coefsP and coefsN and shared by
//     both channels,
//   - samples are widened to 32 bits and shifted left by 15 (vshll #15), multiplied
//     by the coefficients with vqrdmulh.s32, and added with vadd.s32 (a
//     non-saturating add) into q0 (from d4/d6) and q4 (from d5/d7).
//
// out      : only referenced through ASSEMBLY_ACCUMULATE_STEREO (operand %[out]).
// count    : decremented by 4 per iteration; the loop exits only when it reaches
//            exactly zero, so it must be a positive multiple of 4.
// coefsP/N : loaded with a :128 alignment qualifier, i.e. must be 16-byte aligned.
// volumeLR : only referenced through ASSEMBLY_ACCUMULATE_STEREO (operand %[vLR]).
//
// NOTE(review): ASSEMBLY_ACCUMULATE_STEREO is defined outside this section; presumably
// it reduces q0/q4, applies the volumes and accumulates into out - confirm at the macro.
93486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
93586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 8>(int32_t* const out,
93686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
93786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
93886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
93986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
94086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
94186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
94286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
94386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
94486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
94586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1); // back up 3 frames: the 4-frame load ends at the original frame
94686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
94786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// result, initialize to 0
94886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// result, initialize to 0
94986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
95086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
95186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
95286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 stereo frames (8 16-bits samples)
95386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 stereo frames (8 16-bits samples)
95486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs (positive side)
95586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs (negative side)
95686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
95786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// reverse the 4 frames of the positive side (d4 and d5)
95886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
95986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
96086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
96186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
96286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
96386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
96486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
96586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by coef
96686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
96786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
96886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by coef
96986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
97086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12              \n"// accumulate result
97186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q13              \n"// accumulate result
97286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q14              \n"// accumulate result
97386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q15              \n"// accumulate result
97486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
97586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// update loop counter (sets the flags for bne)
97686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples (4 frames back)
97786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
97886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
97986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
98086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
98186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
98286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
98386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
98486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
98586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
98686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
98786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
98886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [vLR]     "r" (volumeLR)
98986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
99086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3", "q4",
99186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
99286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
99386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
99486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
99586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
99686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
99786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 8>(int32_t* const out,
99886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
99986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
100086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
100186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP1,
100286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN1,
100386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
100486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
100586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
100686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
100786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
100886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 1; // template specialization does not preserve params
100986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
101086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
101186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
101286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
101386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// result, initialize to 0
101486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
101586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
101686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
101786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d4}, [%[sP]]            \n"// load 4 16-bits mono samples
101886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.16        {d6}, [%[sN]]!           \n"// load 4 16-bits mono samples
101986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
102086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
102186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
102286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
102386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
102486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      d4, d4                   \n"// reverse 2 frames of the positive side
102586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
102686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
102786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
102886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
102986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
103086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
103186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q11, q11, d2[0]          \n"// interpolate (step2) 2nd set of coefs
103286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
103386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
103486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
103586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
103686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
103786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
103886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
103986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
104086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12              \n"// accumulate result
104186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q14              \n"// accumulate result
104286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
104386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// update loop counter
104486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #8         \n"// move pointer to next set of samples
104586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
104686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
104786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
104886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_MONO
104986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
105086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
105186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
105286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
105386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
105486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
105586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
105686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
105786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
105886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
105986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
106086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
106186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3",
106286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
106386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q14"
106486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
106586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
106686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
106786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <>
106886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline
106986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungvoid Process<2, 8>(int32_t* const out,
107086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        int count,
107186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP,
107286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN,
107386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsP1,
107486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* coefsN1,
107586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sP,
107686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int16_t* sN,
107786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        uint32_t lerpP,
107886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        const int32_t* const volumeLR)
107986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{
108086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int CHANNELS = 2; // template specialization does not preserve params
108186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    const int STRIDE = 8;
108286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    sP -= CHANNELS*((STRIDE>>1)-1);
108386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    asm (
108486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
108586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q0, q0, q0               \n"// result, initialize to 0
108686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "veor           q4, q4, q4               \n"// result, initialize to 0
108786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
108886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "1:                                      \n"
108986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d4, d5}, [%[sP]]        \n"// load 4 16-bits stereo samples
109086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld2.16        {d6, d7}, [%[sN]]!       \n"// load 4 16-bits stereo samples
109186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q8}, [%[coefsP0]:128]!  \n"// load 4 32-bits coefs
109286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q9}, [%[coefsP1]:128]!  \n"// load 4 32-bits coefs for interpolation
109386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs
109486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vld1.32        {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation
109586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
109686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vrev64.16      q2, q2                   \n"// (reversed) 2 frames of the positive side
109786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
109886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q9, q9, q8               \n"// interpolate (step1) 1st set of coefs
109986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vsub.s32       q11, q11, q10            \n"// interpolate (step1) 2nd set of coets
110086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
110186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits
110286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
110386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q9, q9, d2[0]            \n"// interpolate (step2) 1st set of coefs
110486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q11, q11, d2[1]          \n"// interpolate (step3) 2nd set of coefs
110586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
110686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits
110786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
110886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q8, q8, q9               \n"// interpolate (step3) 1st set
110986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q10, q10, q11            \n"// interpolate (step4) 2nd set
111086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
111186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q12, q12, q8             \n"// multiply samples by interpolated coef
111286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
111386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
111486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vqrdmulh.s32   q15, q15, q10            \n"// multiply samples by interpolated coef
111586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
111686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q12              \n"// accumulate result
111786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q13              \n"// accumulate result
111886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q0, q0, q14              \n"// accumulate result
111986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "vadd.s32       q4, q4, q15              \n"// accumulate result
112086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
112186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "subs           %[count], %[count], #4   \n"// update loop counter
112286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
112386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
112486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        "bne            1b                       \n"// loop
112586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
112686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        ASSEMBLY_ACCUMULATE_STEREO
112786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
112886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [out]     "=Uv" (out[0]),
112986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [count]   "+r" (count),
113086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP0] "+r" (coefsP),
113186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsP1] "+r" (coefsP1),
113286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN0] "+r" (coefsN),
113386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [coefsN1] "+r" (coefsN1),
113486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sP]      "+r" (sP),
113586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [sN]      "+r" (sN)
113686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : [lerpP]   "r" (lerpP),
113786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          [vLR]     "r" (volumeLR)
113886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung        : "cc", "memory",
113986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q0", "q1", "q2", "q3", "q4",
114086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q8", "q9", "q10", "q11",
114186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung          "q12", "q13", "q14", "q15"
114286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung    );
114386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}
114486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
114586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif //USE_NEON
114686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
114786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}; // namespace android
114886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung
114986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
1150