/* Copyright (c) 2007-2008 CSIRO
   Copyright (c) 2007-2009 Xiph.Org Foundation
   Copyright (c) 2007-2016 Jean-Marc Valin */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include "celt_lpc.h"
#include "stack_alloc.h"
#include "mathops.h"
#include "vq.h"
#include "x86cpu.h"


#ifndef FIXED_POINT

opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
{
   int i, j;
   int pulsesLeft;
   float xy, yy;
   VARDECL(celt_norm, y);
   VARDECL(celt_norm, X);
   VARDECL(float, signy);
   __m128 signmask;
   __m128 sums;
   __m128i fours;
   SAVE_STACK;

   (void)arch;
   /* All bits set to zero, except for the sign bit. */
   signmask = _mm_set_ps1(-0.f);
   fours = _mm_set_epi32(4, 4, 4, 4);
   ALLOC(y, N+3, celt_norm);
   ALLOC(X, N+3, celt_norm);
   ALLOC(signy, N+3, float);

   OPUS_COPY(X, _X, N);
   X[N] = X[N+1] = X[N+2] = 0;
   sums = _mm_setzero_ps();
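   /* First pass: record the sign of each sample, take its absolute value,
      accumulate partial L1 sums of X, and clear y/iy. */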
   for (j=0;j<N;j+=4)
   {
      __m128 x4, s4;
      x4 = _mm_loadu_ps(&X[j]);
      s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
      /* Get rid of the sign */
      x4 = _mm_andnot_ps(signmask, x4);
      sums = _mm_add_ps(sums, x4);
      /* Clear y and iy in case we don't do the projection. */
      _mm_storeu_ps(&y[j], _mm_setzero_ps());
      _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
      _mm_storeu_ps(&X[j], x4);
      _mm_storeu_ps(&signy[j], s4);
   }
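   /* Horizontal add: after these two shuffles every lane of sums holds the
      total L1 norm of X. */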
   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
   sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));

   xy = yy = 0;

   pulsesLeft = K;

   /* Do a pre-search by projecting on the pyramid */
   if (K > (N>>1))
   {
      __m128i pulses_sum;
      __m128 yy4, xy4;
      __m128 rcp4;
      opus_val32 sum = _mm_cvtss_f32(sums);
      /* If X is too small, just replace it with a pulse at 0 */
      /* Prevents infinities and NaNs from causing too many pulses
         to be allocated. 64 is an approximation of infinity here. */
      if (!(sum > EPSILON && sum < 64))
      {
         X[0] = QCONST16(1.f,14);
         j=1; do
            X[j]=0;
         while (++j<N);
         sums = _mm_set_ps1(1.f);
      }
      /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
      rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
      xy4 = yy4 = _mm_setzero_ps();
      pulses_sum = _mm_setzero_si128();
      for (j=0;j<N;j+=4)
      {
         __m128 rx4, x4, y4;
         __m128i iy4;
         x4 = _mm_loadu_ps(&X[j]);
         rx4 = _mm_mul_ps(x4, rcp4);
         iy4 = _mm_cvttps_epi32(rx4);
         pulses_sum = _mm_add_epi32(pulses_sum, iy4);
         _mm_storeu_si128((__m128i*)&iy[j], iy4);
         y4 = _mm_cvtepi32_ps(iy4);
         xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
         yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
         /* double the y[] vector so we don't have to do it in the search loop. */
         _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
      }
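      /* Horizontally reduce the pulse count and the xy/yy accumulators
         computed by the projection. */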
      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
      pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
      pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
      xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
      xy = _mm_cvtss_f32(xy4);
      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
      yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
      yy = _mm_cvtss_f32(yy4);
   }
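   /* Poison the padding lanes: a large negative X and a large y guarantee
      the extra positions can never win the search below. */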
   X[N] = X[N+1] = X[N+2] = -100;
   y[N] = y[N+1] = y[N+2] = 100;
   celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");

   /* This should never happen, but just in case it does (e.g. on silence)
      we fill the first bin with pulses. */
   if (pulsesLeft > N+3)
   {
      opus_val16 tmp = (opus_val16)pulsesLeft;
      yy = MAC16_16(yy, tmp, tmp);
      yy = MAC16_16(yy, tmp, y[0]);
      iy[0] += pulsesLeft;
      pulsesLeft=0;
   }

   for (i=0;i<pulsesLeft;i++)
   {
      int best_id;
      __m128 xy4, yy4;
      __m128 max, max2;
      __m128i count;
      __m128i pos;
      /* The squared magnitude term gets added anyway, so we might as well
         add it outside the loop */
      yy = ADD16(yy, 1);
      xy4 = _mm_load1_ps(&xy);
      yy4 = _mm_load1_ps(&yy);
      max = _mm_setzero_ps();
      pos = _mm_setzero_si128();
      count = _mm_set_epi32(3, 2, 1, 0);
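      /* Score every position: (xy + X[j]) * rsqrt(yy + y[j]) estimates the
         gain of placing the next pulse at j. y[] already holds doubled
         values and the +1 for the new pulse was folded into yy above. */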
      for (j=0;j<N;j+=4)
      {
         __m128 x4, y4, r4;
         x4 = _mm_loadu_ps(&X[j]);
         y4 = _mm_loadu_ps(&y[j]);
         x4 = _mm_add_ps(x4, xy4);
         y4 = _mm_add_ps(y4, yy4);
         y4 = _mm_rsqrt_ps(y4);
         r4 = _mm_mul_ps(x4, y4);
         /* Update the index of the max. */
         pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
         /* Update the max. */
         max = _mm_max_ps(max, r4);
         /* Update the indices (+4) */
         count = _mm_add_epi32(count, fours);
      }
      /* Horizontal max */
      max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
      max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
      /* Now that max2 contains the max at all positions, look at which value(s) of the
         partial max is equal to the global max. */
      pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
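      /* Reduce the surviving indices to a single value; ties between equal
         maxima resolve to the highest position. */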
      pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
      pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
      best_id = _mm_cvtsi128_si32(pos);

      /* Updating the sums of the new pulse(s) */
      xy = ADD32(xy, EXTEND32(X[best_id]));
      /* We're multiplying y[j] by two so we don't have to do it here */
      yy = ADD16(yy, y[best_id]);

      /* Only now that we've made the final choice, update y/iy */
      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
      y[best_id] += 2;
      iy[best_id]++;
   }

   /* Put the original sign back */
   for (j=0;j<N;j+=4)
   {
      __m128i y4;
      __m128i s4;
      y4 = _mm_loadu_si128((__m128i*)&iy[j]);
      s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
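      /* signy[j] is all ones for samples that were negative: (iy + s4) ^ s4
         negates those lanes (two's complement) and leaves the rest alone. */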
      y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
      _mm_storeu_si128((__m128i*)&iy[j], y4);
   }
   RESTORE_STACK;
   return yy;
}

#endif
