/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>

#include "macros.h"
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "pitch.h"

#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
#include <smmintrin.h>
#include "x86cpu.h"
44c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 45c91ee5b5642fcc4969150f73d5f6848f88bf1638flimopus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, 46c91ee5b5642fcc4969150f73d5f6848f88bf1638flim int N) 47c91ee5b5642fcc4969150f73d5f6848f88bf1638flim{ 48c91ee5b5642fcc4969150f73d5f6848f88bf1638flim opus_int i, dataSize16; 49c91ee5b5642fcc4969150f73d5f6848f88bf1638flim opus_int32 sum; 50c91ee5b5642fcc4969150f73d5f6848f88bf1638flim __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; 51c91ee5b5642fcc4969150f73d5f6848f88bf1638flim __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; 52c91ee5b5642fcc4969150f73d5f6848f88bf1638flim __m128i inVec1_3210, inVec2_3210; 53c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 54c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum = 0; 55c91ee5b5642fcc4969150f73d5f6848f88bf1638flim dataSize16 = N & ~15; 56c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 57c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc1 = _mm_setzero_si128(); 58c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc2 = _mm_setzero_si128(); 59c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 60c91ee5b5642fcc4969150f73d5f6848f88bf1638flim for (i=0;i<dataSize16;i+=16) { 61c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); 62c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); 63c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 64c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); 65c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); 66c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 67c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); 68c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); 69c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 
70c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc1 = _mm_add_epi32(acc1, inVec1_76543210); 71c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); 72c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 73c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 74c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc1 = _mm_add_epi32(acc1, acc2); 75c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 76c91ee5b5642fcc4969150f73d5f6848f88bf1638flim if (N - i >= 8) 77c91ee5b5642fcc4969150f73d5f6848f88bf1638flim { 78c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); 79c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); 80c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 81c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); 82c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 83c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc1 = _mm_add_epi32(acc1, inVec1_76543210); 84c91ee5b5642fcc4969150f73d5f6848f88bf1638flim i += 8; 85c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 86c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 87c91ee5b5642fcc4969150f73d5f6848f88bf1638flim if (N - i >= 4) 88c91ee5b5642fcc4969150f73d5f6848f88bf1638flim { 89c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); 90c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); 91c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 92c91ee5b5642fcc4969150f73d5f6848f88bf1638flim inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); 93c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 94c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc1 = _mm_add_epi32(acc1, inVec1_3210); 95c91ee5b5642fcc4969150f73d5f6848f88bf1638flim i += 4; 96c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 97c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 98c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc1 = _mm_add_epi32(acc1, 
_mm_unpackhi_epi64(acc1, acc1)); 99c91ee5b5642fcc4969150f73d5f6848f88bf1638flim acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); 100c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 101c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum += _mm_cvtsi128_si32(acc1); 102c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 103c91ee5b5642fcc4969150f73d5f6848f88bf1638flim for (;i<N;i++) 104c91ee5b5642fcc4969150f73d5f6848f88bf1638flim { 105c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum = silk_SMLABB(sum, x[i], y[i]); 106c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 107c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 108c91ee5b5642fcc4969150f73d5f6848f88bf1638flim return sum; 109c91ee5b5642fcc4969150f73d5f6848f88bf1638flim} 110c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 111c91ee5b5642fcc4969150f73d5f6848f88bf1638flimvoid xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) 112c91ee5b5642fcc4969150f73d5f6848f88bf1638flim{ 113c91ee5b5642fcc4969150f73d5f6848f88bf1638flim int j; 114c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 115c91ee5b5642fcc4969150f73d5f6848f88bf1638flim __m128i vecX, vecX0, vecX1, vecX2, vecX3; 116c91ee5b5642fcc4969150f73d5f6848f88bf1638flim __m128i vecY0, vecY1, vecY2, vecY3; 117c91ee5b5642fcc4969150f73d5f6848f88bf1638flim __m128i sum0, sum1, sum2, sum3, vecSum; 118c91ee5b5642fcc4969150f73d5f6848f88bf1638flim __m128i initSum; 119c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 120c91ee5b5642fcc4969150f73d5f6848f88bf1638flim celt_assert(len >= 3); 121c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 122c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum0 = _mm_setzero_si128(); 123c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum1 = _mm_setzero_si128(); 124c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum2 = _mm_setzero_si128(); 125c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum3 = _mm_setzero_si128(); 126c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 127c91ee5b5642fcc4969150f73d5f6848f88bf1638flim for (j=0;j<(len-7);j+=8) 
128c91ee5b5642fcc4969150f73d5f6848f88bf1638flim { 129c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); 130c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); 131c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); 132c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); 133c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); 134c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 135c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); 136c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); 137c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); 138c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); 139c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 140c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 141c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); 142c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); 143c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 144c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); 145c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); 146c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 147c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); 148c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); 149c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 150c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( 
sum3, sum3)); 151c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); 152c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 153c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), 154c91ee5b5642fcc4969150f73d5f6848f88bf1638flim _mm_unpacklo_epi32(sum2, sum3)); 155c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 156c91ee5b5642fcc4969150f73d5f6848f88bf1638flim for (;j<(len-3);j+=4) 157c91ee5b5642fcc4969150f73d5f6848f88bf1638flim { 158c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); 159c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX0 = _mm_shuffle_epi32(vecX, 0x00); 160c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX1 = _mm_shuffle_epi32(vecX, 0x55); 161c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX2 = _mm_shuffle_epi32(vecX, 0xaa); 162c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX3 = _mm_shuffle_epi32(vecX, 0xff); 163c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 164c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); 165c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); 166c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); 167c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); 168c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 169c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum0 = _mm_mullo_epi32(vecX0, vecY0); 170c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum1 = _mm_mullo_epi32(vecX1, vecY1); 171c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum2 = _mm_mullo_epi32(vecX2, vecY2); 172c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum3 = _mm_mullo_epi32(vecX3, vecY3); 173c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 174c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum0 = _mm_add_epi32(sum0, sum1); 175c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum2 = _mm_add_epi32(sum2, sum3); 
176c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecSum = _mm_add_epi32(vecSum, sum0); 177c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecSum = _mm_add_epi32(vecSum, sum2); 178c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 179c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 180c91ee5b5642fcc4969150f73d5f6848f88bf1638flim for (;j<len;j++) 181c91ee5b5642fcc4969150f73d5f6848f88bf1638flim { 182c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); 183c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecX0 = _mm_shuffle_epi32(vecX, 0x00); 184c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 185c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); 186c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 187c91ee5b5642fcc4969150f73d5f6848f88bf1638flim sum0 = _mm_mullo_epi32(vecX0, vecY0); 188c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vecSum = _mm_add_epi32(vecSum, sum0); 189c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 190c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 191c91ee5b5642fcc4969150f73d5f6848f88bf1638flim initSum = _mm_loadu_si128((__m128i *)(&sum[0])); 192c91ee5b5642fcc4969150f73d5f6848f88bf1638flim initSum = _mm_add_epi32(initSum, vecSum); 193c91ee5b5642fcc4969150f73d5f6848f88bf1638flim _mm_storeu_si128((__m128i *)sum, initSum); 194c91ee5b5642fcc4969150f73d5f6848f88bf1638flim} 195c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#endif 196