vq_sse.h revision 98913fed6520d8849fb2e246be943e04474aefa4
/* Copyright (C) 2004 Jean-Marc Valin */
/**
   @file vq_sse.h
   @brief SSE-optimized vq routine
*/
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   - Neither the name of the Xiph.org Foundation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

352fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath#define OVERRIDE_VQ_NBEST
36c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkermanvoid vq_nbest(spx_word16_t *_in, const __m128 *codebook, int len, int entries, __m128 *E, int N, int *nbest, spx_word32_t *best_dist, char *stack)
372fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath{
38c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   int i,j,k,used;
39c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   VARDECL(float *dist);
40c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   VARDECL(__m128 *in);
41c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   __m128 half;
42c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   used = 0;
43c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   ALLOC(dist, entries, float);
44c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   half = _mm_set_ps1(.5f);
45c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   ALLOC(in, len, __m128);
46c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   for (i=0;i<len;i++)
47c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      in[i] = _mm_set_ps1(_in[i]);
48c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   for (i=0;i<entries>>2;i++)
49c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   {
50c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      __m128 d = _mm_mul_ps(E[i], half);
51c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      for (j=0;j<len;j++)
52c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         d = _mm_sub_ps(d, _mm_mul_ps(in[j], *codebook++));
53c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      _mm_storeu_ps(dist+4*i, d);
54c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   }
55c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   for (i=0;i<entries;i++)
56c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   {
57c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      if (i<N || dist[i]<best_dist[N-1])
580de75164d873fde8b0a69ae124cf7b4b2ea22251Roland McGrath      {
59c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         for (k=N-1; (k >= 1) && (k > used || dist[i] < best_dist[k-1]); k--)
60c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         {
61c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman            best_dist[k]=best_dist[k-1];
62c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman            nbest[k] = nbest[k-1];
630de75164d873fde8b0a69ae124cf7b4b2ea22251Roland McGrath         }
64c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         best_dist[k]=dist[i];
65c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         nbest[k]=i;
66c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         used++;
67c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      }
68c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   }
69c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman}
70c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman
71c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman
722fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath
732fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath
749c555e71d22a9aa3baf02a49a989ee184a7b09beRoland McGrath#define OVERRIDE_VQ_NBEST_SIGN
75c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkermanvoid vq_nbest_sign(spx_word16_t *_in, const __m128 *codebook, int len, int entries, __m128 *E, int N, int *nbest, spx_word32_t *best_dist, char *stack)
762fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath{
77c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   int i,j,k,used;
78c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   VARDECL(float *dist);
79c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   VARDECL(__m128 *in);
80c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   __m128 half;
81c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   used = 0;
82c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   ALLOC(dist, entries, float);
83c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   half = _mm_set_ps1(.5f);
84c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   ALLOC(in, len, __m128);
852fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath   for (i=0;i<len;i++)
86c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      in[i] = _mm_set_ps1(_in[i]);
87c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   for (i=0;i<entries>>2;i++)
88c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   {
89c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      __m128 d = _mm_setzero_ps();
90c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      for (j=0;j<len;j++)
91c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         d = _mm_add_ps(d, _mm_mul_ps(in[j], *codebook++));
92c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      _mm_storeu_ps(dist+4*i, d);
93c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   }
94c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   for (i=0;i<entries;i++)
95c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   {
96c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      int sign;
97c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      if (dist[i]>0)
98c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      {
99c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         sign=0;
100c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         dist[i]=-dist[i];
101c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      } else
102c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      {
103c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         sign=1;
104c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      }
105c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      dist[i] += .5f*((float*)E)[i];
106e9052fd332c315dc9ecd0867ca37fc68754ce18eRoland McGrath      if (i<N || dist[i]<best_dist[N-1])
107c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      {
108c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         for (k=N-1; (k >= 1) && (k > used || dist[i] < best_dist[k-1]); k--)
109c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         {
110c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman            best_dist[k]=best_dist[k-1];
111c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman            nbest[k] = nbest[k-1];
112c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         }
113c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         best_dist[k]=dist[i];
114c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         nbest[k]=i;
115c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         used++;
116c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman         if (sign)
117c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman            nbest[k]+=entries;
118c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman      }
119c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman   }
120c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman}
1212fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath