/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
27c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
28c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#ifdef HAVE_CONFIG_H
29c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "config.h"
30c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#endif
31c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
32c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include <xmmintrin.h>
33c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include <emmintrin.h>
34c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
35c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "macros.h"
36c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "celt_lpc.h"
37c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "stack_alloc.h"
38c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "mathops.h"
39c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "pitch.h"
40c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
41c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
42c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include <smmintrin.h>
43c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "x86cpu.h"
44c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
45c91ee5b5642fcc4969150f73d5f6848f88bf1638flimopus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
46c91ee5b5642fcc4969150f73d5f6848f88bf1638flim      int N)
47c91ee5b5642fcc4969150f73d5f6848f88bf1638flim{
48c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    opus_int  i, dataSize16;
49c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    opus_int32 sum;
50c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
51c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
52c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    __m128i inVec1_3210, inVec2_3210;
53c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
54c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum = 0;
55c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    dataSize16 = N & ~15;
56c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
57c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    acc1 = _mm_setzero_si128();
58c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    acc2 = _mm_setzero_si128();
59c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
60c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    for (i=0;i<dataSize16;i+=16) {
61c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
62c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
63c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
64c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
65c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
66c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
67c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
68c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
69c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
70c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
71c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
72c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    }
73c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
74c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    acc1 = _mm_add_epi32(acc1, acc2);
75c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
76c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    if (N - i >= 8)
77c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    {
78c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
79c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
80c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
81c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
82c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
83c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
84c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        i += 8;
85c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    }
86c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
87c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    if (N - i >= 4)
88c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    {
89c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
90c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
91c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
92c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
93c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
94c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        acc1 = _mm_add_epi32(acc1, inVec1_3210);
95c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        i += 4;
96c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    }
97c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
98c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
99c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
100c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
101c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum += _mm_cvtsi128_si32(acc1);
102c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
103c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    for (;i<N;i++)
104c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    {
105c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum = silk_SMLABB(sum, x[i], y[i]);
106c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    }
107c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
108c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    return sum;
109c91ee5b5642fcc4969150f73d5f6848f88bf1638flim}
110c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
111c91ee5b5642fcc4969150f73d5f6848f88bf1638flimvoid xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
112c91ee5b5642fcc4969150f73d5f6848f88bf1638flim{
113c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    int j;
114c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
115c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    __m128i vecX, vecX0, vecX1, vecX2, vecX3;
116c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    __m128i vecY0, vecY1, vecY2, vecY3;
117c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    __m128i sum0, sum1, sum2, sum3, vecSum;
118c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    __m128i initSum;
119c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
120c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    celt_assert(len >= 3);
121c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
122c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum0 = _mm_setzero_si128();
123c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum1 = _mm_setzero_si128();
124c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum2 = _mm_setzero_si128();
125c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum3 = _mm_setzero_si128();
126c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
127c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    for (j=0;j<(len-7);j+=8)
128c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    {
129c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
130c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
131c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
132c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
133c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
134c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
135c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
136c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
137c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
138c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
139c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    }
140c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
141c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
142c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
143c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
144c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
145c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
146c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
147c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
148c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
149c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
150c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
151c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
152c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
153c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
154c91ee5b5642fcc4969150f73d5f6848f88bf1638flim          _mm_unpacklo_epi32(sum2, sum3));
155c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
156c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    for (;j<(len-3);j+=4)
157c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    {
158c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
159c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
160c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX1 = _mm_shuffle_epi32(vecX, 0x55);
161c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
162c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX3 = _mm_shuffle_epi32(vecX, 0xff);
163c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
164c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
165c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
166c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
167c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
168c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
169c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum0 = _mm_mullo_epi32(vecX0, vecY0);
170c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum1 = _mm_mullo_epi32(vecX1, vecY1);
171c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum2 = _mm_mullo_epi32(vecX2, vecY2);
172c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum3 = _mm_mullo_epi32(vecX3, vecY3);
173c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
174c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum0 = _mm_add_epi32(sum0, sum1);
175c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum2 = _mm_add_epi32(sum2, sum3);
176c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecSum = _mm_add_epi32(vecSum, sum0);
177c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecSum = _mm_add_epi32(vecSum, sum2);
178c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    }
179c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
180c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    for (;j<len;j++)
181c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    {
182c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
183c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
184c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
185c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
186c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
187c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        sum0 = _mm_mullo_epi32(vecX0, vecY0);
188c91ee5b5642fcc4969150f73d5f6848f88bf1638flim        vecSum = _mm_add_epi32(vecSum, sum0);
189c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    }
190c91ee5b5642fcc4969150f73d5f6848f88bf1638flim
191c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
192c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    initSum = _mm_add_epi32(initSum, vecSum);
193c91ee5b5642fcc4969150f73d5f6848f88bf1638flim    _mm_storeu_si128((__m128i *)sum, initSum);
194c91ee5b5642fcc4969150f73d5f6848f88bf1638flim}
195c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#endif
196