/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON
//
// NEON specializations are enabled for Process() and ProcessL()
//
// TODO: Stride 16 and Stride 8 can be combined with one pass stride 8 (if necessary)
// and looping stride 16 (or vice versa). This has some polyphase coef data alignment
// issues with S16 coefs. Consider this later.

// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
/*
 * ASSEMBLY_ACCUMULATE_MONO
 *
 * Assembly epilogue (a sequence of adjacent string literals) appended to a
 * NEON inner loop that has left four 32-bit partial sums in q0.
 *
 * Requires the enclosing asm statement to define:
 *   %[vLR] - register holding the address of two 32-bit volumes
 *            (loaded with a ":64" alignment qualifier, i.e. 8-byte aligned)
 *   %[out] - memory operand addressing two consecutive 32-bit outputs
 *
 * Effect: the four partial sums are added together, the total is replicated
 * into both lanes, multiplied by the per-channel volume (vqrdmulh.s32) and
 * added with saturation to the two values at %[out].
 *
 * Registers written: d0, d2, d3 (d1 is read), so "q0" and "q1" must be listed
 * as clobbers by the user.
 */
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

/*
 * ASSEMBLY_ACCUMULATE_STEREO
 *
 * Same contract as ASSEMBLY_ACCUMULATE_MONO, but for two accumulators:
 * q0 holds four partial sums that end up in lane 0 and q4 holds four partial
 * sums that end up in lane 1 of the value added to %[out].
 *
 * Registers written: d0, d2, d3, d8 (d1 and d9 are read), so "q0", "q1" and
 * "q4" must be listed as clobbers by the user.
 */
#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

Hungtemplate <> 5386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out, 5486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 5586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 5686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 5786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 5886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 5986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 6086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 6186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 6286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 6386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 6486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 6586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 6686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 6786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 6886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 6986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples 7086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples 7186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs 7286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN0]:128]! 
\n"// (1) load 8 16-bits coefs 7386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 7486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 7586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 7686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // reordering the vmal to do d6, d7 before d4, d5 is slower(?) 7786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed)samples by coef 7886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed)samples by coef 7986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples 8086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples 8186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 8286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM instructions before neon above seems to be slower 8386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 8486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples 8586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 8686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 8786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 8886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 8986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 9086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 9186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 9286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 9386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 9486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 9586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 
9686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 9786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 9886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 9986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 10086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q10" 10186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 10286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 10386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 10486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 10586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out, 10686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 10786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 10886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 10986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 11086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 11186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 11286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 11386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 11486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 11586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 11686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 11786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (1) acc_L = 0 11886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 11986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 12086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 12186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 12286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo 
samples 12386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples 12486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs 12586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs 12686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 12786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive 12886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q3, q3 \n"// (0 combines+) reverse right positive 12986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 13086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left 13186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left 13286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right 13386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right 13486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left 13586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left 13686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right 13786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right 13886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 13986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM before neon seems to be slower 14086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 14186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set 
of samples 14286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 14386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 14486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 14586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 14686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 14786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 14886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 14986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 15086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 15186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 15286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 15386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 15486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 15586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 15686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 15786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 15886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q10" 15986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 16086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 16186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 16286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 16386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out, 16486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 16586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 16686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 16786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP1, 16886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN1, 16986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 
17086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 17186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 17286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 17386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 17486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 17586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 17686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 17786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 17886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15 17986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 18086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 18186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 18286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 18386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples 18486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples 18586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs 18686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation 18786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs 18886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q11}, [%[coefsN0]:128]! 
\n"// (1) load 8 16-bits coefs for interpolation 18986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 19086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs 19186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets 19286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 19386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs 19486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs 19586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 19686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 19786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 19886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set 19986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set 20086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 20186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // reordering the vmal to do d6, d7 before d4, d5 is slower(?) 
20286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef 20386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef 20486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples 20586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples 20686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 20786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM instructions before neon above seems to be slower 20886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 20986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples 21086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 21186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 21286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 21386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 21486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 21586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 21686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 21786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 21886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 21986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 22086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 22186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 22286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 22386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 22486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 22586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
[vLR] "r" (volumeLR) 22686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 22786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 22886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11" 22986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 23086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 23186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 23286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 23386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out, 23486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 23586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 23686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 23786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP1, 23886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN1, 23986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 24086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 24186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 24286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 24386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 24486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 24586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 24686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 24786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 24886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 24986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (1) acc_L = 0 25086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 25186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
25286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 25386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 25486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples 25586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples 25686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs 25786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation 25886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs 25986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation 26086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 26186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs 26286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coets 26386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 26486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs 26586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs 26686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 26786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive 26886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q3, q3 \n"// (1) reverse 8 frames of the right positive 26986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 27086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set 27186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 
q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set 27286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 27386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left 27486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left 27586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right 27686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right 27786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left 27886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left 27986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right 28086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right 28186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 28286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM before neon seems to be slower 28386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// (1) update loop counter 28486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples 28586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 28686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 28786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 28886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 28986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 29086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 29186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 29286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 
29386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 29486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 29586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 29686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 29786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 29886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 29986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 30086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 30186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 30286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 30386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 30486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11" 30586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 30686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 30786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 30886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 30986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 16>(int32_t* const out, 31086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 31186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 31286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 31386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 31486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 31586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 31686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 31786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 31886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 31986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= 
CHANNELS*((STRIDE>>1)-1); 32086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 32186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 32286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 32386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 32486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 32586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples 32686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples 32786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs 32886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs 32986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 33086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side 33186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 33286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 33386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 33486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 33586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 33686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits 33786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 33886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef 33986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 34086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 34186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
"vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 34286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 34386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 34486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 34586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q15 \n"// accumulate result 34686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q13 \n"// accumulate result 34786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 34886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 34986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 35086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 35186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 35286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 35386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 35486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 35586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 35686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 35786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 35886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 35986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 36086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 36186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 36286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 36386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 36486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 36586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 36686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 
36786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 36886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 36986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 37086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 16>(int32_t* const out, 37186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 37286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 37386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 37486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 37586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 37686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 37786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 37886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 37986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 38086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 38186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 38286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 38386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// result, initialize to 0 38486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 38586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 38686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 38786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples 38886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples 38986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs 39086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10, q11}, [%[coefsN0]:128]! 
\n"// load 4 32-bits coefs 39186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 39286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side 39386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side 39486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 39586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 39686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 39786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 39886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits 39986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits 40086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 40186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef 40286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 40386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 40486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 40586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 40686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 40786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 40886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result 40986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result 41086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 41186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d6, #15 
\n"// extend samples to 31 bits 41286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits 41386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 41486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits 41586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits 41686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 41786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef 41886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 41986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 42086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 42186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 42286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q12 \n"// accumulate result 42386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 42486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result 42586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result 42686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 42786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 42886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples 42986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 43086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 43186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 43286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 
43386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 43486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 43586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 43686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 43786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 43886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 43986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 44086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 44186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 44286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 44386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 44486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 44586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 44686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 44786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 44886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 44986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 45086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 16>(int32_t* const out, 45186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 45286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 45386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 45486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP1, 45586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN1, 45686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 45786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 45886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 45986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 46086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 
46186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 46286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 46386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 46486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 46586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 46686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 46786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 46886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 46986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 47086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples 47186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples 47286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs 47386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs 47486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs 47586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q14, q15}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs 47686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 47786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q12, q12, q8 \n"// interpolate (step1) 47886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q13, q13, q9 \n"// interpolate (step1) 47986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q14, q14, q10 \n"// interpolate (step1) 48086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q15, q15, q11 \n"// interpolate (step1) 48186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 48286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) 48386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) 48486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) 48586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) 48686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 48786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q8, q8, q12 \n"// interpolate (step3) 48886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q9, q9, q13 \n"// interpolate (step3) 48986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q10, q10, q14 \n"// interpolate (step3) 49086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q11, q11, q15 \n"// interpolate (step3) 49186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 49286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side 49386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 49486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 49586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 49686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 49786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 
q14, d6, #15 \n"// extend samples to 31 bits 49886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits 49986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 50086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef 50186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 50286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 50386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 50486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 50586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 50686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 50786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q15 \n"// accumulate result 50886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q13 \n"// accumulate result 50986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 51086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 51186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 51286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 51386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 51486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 51586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 51686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 51786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 51886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 51986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 
52086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 52186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 52286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 52386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 52486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 52586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 52686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 52786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 52886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 52986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 53086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 53186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 53286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 53386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 53486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 53586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 16>(int32_t* const out, 53686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 53786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 53886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 53986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP1, 54086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN1, 54186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 54286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 54386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 54486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 54586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 54686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve 
params 54786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 16; 54886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 54986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 55086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 55186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 55286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// result, initialize to 0 55386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 55486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 55586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 55686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q2, q3}, [%[sP]] \n"// load 4 16-bits stereo samples 55786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {q5, q6}, [%[sN]]! \n"// load 4 16-bits stereo samples 55886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs 55986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs 56086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs 56186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q14, q15}, [%[coefsN0]:128]! 
\n"// load 8 32-bits coefs 56286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 56386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q12, q12, q8 \n"// interpolate (step1) 56486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q13, q13, q9 \n"// interpolate (step1) 56586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q14, q14, q10 \n"// interpolate (step1) 56686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q15, q15, q11 \n"// interpolate (step1) 56786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 56886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2) 56986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2) 57086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2) 57186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2) 57286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 57386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q8, q8, q12 \n"// interpolate (step3) 57486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q9, q9, q13 \n"// interpolate (step3) 57586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q10, q10, q14 \n"// interpolate (step3) 57686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q11, q11, q15 \n"// interpolate (step3) 57786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 57886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// reverse 8 frames of the positive side 57986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q3, q3 \n"// reverse 8 frames of the positive side 58086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 58186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 58286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 
58386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 58486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits 58586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits 58686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 58786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef 58886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 58986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 59086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 59186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 59286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 59386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 59486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q15 \n"// (+1) accumulate result 59586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q13 \n"// (+1) accumulate result 59686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 59786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits 59886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits 59986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 60086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits 60186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits 60286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 60386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by 
interpolated coef 60486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 60586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 60686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef 60786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 60886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q12 \n"// accumulate result 60986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q13, q13, q14 \n"// accumulate result 61086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q15 \n"// (+1) accumulate result 61186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q13 \n"// (+1) accumulate result 61286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 61386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #8 \n"// update loop counter 61486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples 61586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 61686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 61786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 61886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 61986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 62086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 62186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 62286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 62386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 62486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 62586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 62686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 
62786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 62886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 62986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 63086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 63186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 63286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 63386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 63486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 63586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 63686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 63786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 63886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 63986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 8>(int32_t* const out, 64086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 64186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 64286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 64386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 64486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 64586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 64686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 64786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 64886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 64986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 65086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 65186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 65286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 65386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 
65486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 65586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples 65686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples 65786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs 65886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 4 16-bits coefs 65986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 66086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 d4, d4 \n"// (1) reversed s3, s2, s1, s0, s7, s6, s5, s4 66186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 66286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // reordering the vmal to do d6, d7 before d4, d5 is slower(?) 66386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed)samples by coef 66486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples 66586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 66686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM instructions before neon above seems to be slower 66786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// (1) update loop counter 66886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #8 \n"// (0) move pointer to next set of samples 66986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 67086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 67186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 67286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 67386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 67486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 67586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 
67686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 67786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 67886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 67986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 68086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 68186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 68286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 68386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 68486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q10" 68586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 68686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 68786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 68886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 68986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 8>(int32_t* const out, 69086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 69186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 69286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 69386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 69486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 69586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 69686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 69786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 69886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 69986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 70086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 70186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (1) acc_L = 0 70286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// (0 
combines+) acc_R = 0 70386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 70486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 70586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 70686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d4, d5}, [%[sP]] \n"// (2+0d) load 8 16-bits stereo samples 70786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d6, d7}, [%[sN]]! \n"// (2) load 8 16-bits stereo samples 70886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs 70986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d20}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs 71086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 71186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive 71286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 71386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left 71486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right 71586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left 71686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right 71786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 71886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM before neon seems to be slower 71986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// (1) update loop counter 72086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples 72186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 72286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 72386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 72486eae0e5931103e040ac2cdd023ef5db252e09f6Andy 
Hung 72586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 72686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 72786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 72886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 72986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 73086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 73186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 73286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 73386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 73486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 73586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 73686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 73786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q10" 73886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 73986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 74086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 74186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 74286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 8>(int32_t* const out, 74386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 74486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 74586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 74686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP1, 74786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN1, 74886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 74986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 75086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 75186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 75286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 
75386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 75486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 75586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 75686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 75786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15 75886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0 75986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 76086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 76186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 76286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d4}, [%[sP]] \n"// (2+0d) load 4 16-bits mono samples 76386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d6}, [%[sN]]! \n"// (2) load 4 16-bits mono samples 76486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 4 16-bits coefs 76586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 4 16-bits coefs for interpolation 76686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 4 16-bits coefs 76786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d21}, [%[coefsN0]:64]! 
\n"// (1) load 4 16-bits coefs for interpolation 76886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 76986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs 77086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets 77186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 77286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs 77386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs 77486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 77586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 d4, d4 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4 77686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 77786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 d16, d16, d17 \n"// (1+2d) interpolate (step3) 1st set 77886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set 77986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 78086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // reordering the vmal to do d6, d7 before d4, d5 is slower(?) 
78186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d16 \n"// (1+0d) multiply (reversed)by coef 78286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples 78386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 78486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM instructions before neon above seems to be slower 78586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// (1) update loop counter 78686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples 78786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 78886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 78986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 79086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 79186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 79286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 79386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 79486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 79586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 79686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 79786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 79886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 79986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 80086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 80186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 80286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 80386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 80486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 80586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", 
"q11" 80686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 80786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 80886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 80986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 81086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<2, 8>(int32_t* const out, 81186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 81286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP, 81386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN, 81486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsP1, 81586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* coefsN1, 81686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 81786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 81886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 81986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 82086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 82186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 82286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 82386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 82486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 82586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 82686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// (1) acc_L = 0 82786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0 82886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 82986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 83086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 83186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d4, d5}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo samples 
83286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d6, d7}, [%[sN]]! \n"// (3) load 8 16-bits stereo samples 83386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d16}, [%[coefsP0]:64]! \n"// (1) load 8 16-bits coefs 83486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d17}, [%[coefsP1]:64]! \n"// (1) load 8 16-bits coefs for interpolation 83586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d20}, [%[coefsN1]:64]! \n"// (1) load 8 16-bits coefs 83686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d21}, [%[coefsN0]:64]! \n"// (1) load 8 16-bits coefs for interpolation 83786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 83886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 d17, d17, d16 \n"// (1) interpolate (step1) 1st set of coefs 83986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s16 d21, d21, d20 \n"// (1) interpolate (step1) 2nd set of coets 84086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 84186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 d17, d17, d2[0] \n"// (2) interpolate (step2) 1st set of coefs 84286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s16 d21, d21, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs 84386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 84486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (1) reverse 8 frames of the left positive 84586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 84686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 d16, d16, d17 \n"// (1+1d) interpolate (step3) 1st set 84786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s16 d20, d20, d21 \n"// (1+1d) interpolate (step3) 2nd set 84886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 84986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d4, d16 \n"// (1) multiply (reversed) samples left 85086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d5, d16 \n"// (1) multiply (reversed) samples right 
85186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q0, d6, d20 \n"// (1) multiply samples left 85286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmlal.s16 q4, d7, d20 \n"// (1) multiply samples right 85386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 85486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // moving these ARM before neon seems to be slower 85586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// (1) update loop counter 85686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 85786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 85886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung // sP used after branch (warning) 85986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 86086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 86186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 86286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 86386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 86486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 86586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 86686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 86786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 86886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 86986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 87086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 87186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 87286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 87386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 87486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 87586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q4", "q5", "q6", 
87686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11" 87786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 87886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 87986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 88086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 88186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<1, 8>(int32_t* const out, 88286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 88386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 88486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 88586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 88686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 88786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 88886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 88986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 89086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 89186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 89286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 89386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 89486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 89586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 89686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 89786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples 89886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples 89986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs 90086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10}, [%[coefsN0]:128]! 
\n"// load 4 32-bits coefs 90186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 90286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side 90386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 90486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// (stall) extend samples to 31 bits 90586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 90686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 90786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef 90886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 90986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 91086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 91186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q14 \n"// (stall) accumulate result 91286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 91386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// update loop counter 91486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples 91586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 91686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 91786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 91886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 91986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 92086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 92186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 92286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 92386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 92486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 
92586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 92686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 92786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 92886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 92986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 93086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q14" 93186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 93286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 93386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 93486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 93586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void ProcessL<2, 8>(int32_t* const out, 93686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 93786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 93886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 93986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 94086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 94186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 94286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 94386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 94486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 94586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 94686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 94786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 94886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// result, initialize to 0 94986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 95086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 95186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 
95286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples 95386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples 95486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs 95586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs 95686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 95786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// reverse 2 frames of the positive side 95886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 95986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 96086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 96186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 96286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 96386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits 96486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 96586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by coef 96686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef 96786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef 96886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by coef 96986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 97086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 97186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q13 \n"// accumulate result 97286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q14 
\n"// accumulate result 97386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q15 \n"// accumulate result 97486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 97586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// update loop counter 97686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 97786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 97886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 97986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 98086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 98186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 98286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 98386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 98486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 98586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 98686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 98786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 98886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [vLR] "r" (volumeLR) 98986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 99086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", "q4", 99186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 99286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 99386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 99486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 99586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 99686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 99786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline void Process<1, 8>(int32_t* const out, 99886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 99986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const 
int32_t* coefsP, 100086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 100186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP1, 100286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN1, 100386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 100486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 100586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 100686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 100786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 100886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 1; // template specialization does not preserve params 100986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 101086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 101186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 101286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 101386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 101486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 101586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 101686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 101786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d4}, [%[sP]] \n"// load 4 16-bits mono samples 101886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.16 {d6}, [%[sN]]! \n"// load 4 16-bits mono samples 101986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs 102086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q9}, [%[coefsP1]:128]! \n"// load 4 32-bits coefs for interpolation 102186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10}, [%[coefsN1]:128]! 
\n"// load 4 32-bits coefs 102286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation 102386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 102486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 d4, d4 \n"// reverse 2 frames of the positive side 102586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 102686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs 102786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets 102886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 102986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 103086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs 103186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q11, q11, d2[0] \n"// interpolate (step2) 2nd set of coefs 103286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 103386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 103486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set 103586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set 103686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 103786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef 103886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 103986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 104086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 104186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q14 \n"// accumulate result 
104286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 104386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// update loop counter 104486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #8 \n"// move pointer to next set of samples 104586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 104686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 104786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 104886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_MONO 104986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 105086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 105186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 105286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 105386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 105486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 105586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 105686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 105786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 105886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 105986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 106086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 106186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", 106286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 106386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q14" 106486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 106586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 106686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 106786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hungtemplate <> 106886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hunginline 106986eae0e5931103e040ac2cdd023ef5db252e09f6Andy 
Hungvoid Process<2, 8>(int32_t* const out, 107086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung int count, 107186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP, 107286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN, 107386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsP1, 107486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* coefsN1, 107586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sP, 107686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int16_t* sN, 107786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung uint32_t lerpP, 107886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int32_t* const volumeLR) 107986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung{ 108086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int CHANNELS = 2; // template specialization does not preserve params 108186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung const int STRIDE = 8; 108286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung sP -= CHANNELS*((STRIDE>>1)-1); 108386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung asm ( 108486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vmov.32 d2[0], %[lerpP] \n"// load the positive phase 108586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q0, q0, q0 \n"// result, initialize to 0 108686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "veor q4, q4, q4 \n"// result, initialize to 0 108786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 108886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "1: \n" 108986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d4, d5}, [%[sP]] \n"// load 4 16-bits stereo samples 109086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld2.16 {d6, d7}, [%[sN]]! \n"// load 4 16-bits stereo samples 109186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q8}, [%[coefsP0]:128]! \n"// load 4 32-bits coefs 109286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q9}, [%[coefsP1]:128]! 
\n"// load 4 32-bits coefs for interpolation 109386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q10}, [%[coefsN1]:128]! \n"// load 4 32-bits coefs 109486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vld1.32 {q11}, [%[coefsN0]:128]! \n"// load 4 32-bits coefs for interpolation 109586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 109686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vrev64.16 q2, q2 \n"// (reversed) 2 frames of the positive side 109786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 109886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q9, q9, q8 \n"// interpolate (step1) 1st set of coefs 109986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vsub.s32 q11, q11, q10 \n"// interpolate (step1) 2nd set of coets 110086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits 110186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits 110286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 110386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q9, q9, d2[0] \n"// interpolate (step2) 1st set of coefs 110486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q11, q11, d2[1] \n"// interpolate (step3) 2nd set of coefs 110586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits 110686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits 110786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 110886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q8, q8, q9 \n"// interpolate (step3) 1st set 110986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q10, q10, q11 \n"// interpolate (step4) 2nd set 111086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 111186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q12, q12, q8 \n"// multiply samples by interpolated coef 
111286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef 111386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef 111486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vqrdmulh.s32 q15, q15, q10 \n"// multiply samples by interpolated coef 111586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 111686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q12 \n"// accumulate result 111786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q13 \n"// accumulate result 111886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q0, q0, q14 \n"// accumulate result 111986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "vadd.s32 q4, q4, q15 \n"// accumulate result 112086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 112186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "subs %[count], %[count], #4 \n"// update loop counter 112286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples 112386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 112486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "bne 1b \n"// loop 112586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 112686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ASSEMBLY_ACCUMULATE_STEREO 112786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 112886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [out] "=Uv" (out[0]), 112986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [count] "+r" (count), 113086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP0] "+r" (coefsP), 113186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsP1] "+r" (coefsP1), 113286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN0] "+r" (coefsN), 113386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [coefsN1] "+r" (coefsN1), 113486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sP] "+r" (sP), 
113586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [sN] "+r" (sN) 113686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : [lerpP] "r" (lerpP), 113786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung [vLR] "r" (volumeLR) 113886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung : "cc", "memory", 113986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q0", "q1", "q2", "q3", "q4", 114086eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q8", "q9", "q10", "q11", 114186eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung "q12", "q13", "q14", "q15" 114286eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung ); 114386eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung} 114486eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 114586eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif //USE_NEON 114686eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 114786eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung}; // namespace android 114886eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung 114986eae0e5931103e040ac2cdd023ef5db252e09f6Andy Hung#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/ 1150