vq_sse.h revision 98913fed6520d8849fb2e246be943e04474aefa4
176baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman/* Copyright (C) 2004 Jean-Marc Valin */ 276baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman/** 376baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman @file vq_sse.h 476baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman @brief SSE-optimized vq routine 576baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman*/ 676baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman/* 776baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman Redistribution and use in source and binary forms, with or without 876baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman modification, are permitted provided that the following conditions 976baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman are met: 1076baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman 1176baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman - Redistributions of source code must retain the above copyright 1276baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman notice, this list of conditions and the following disclaimer. 1376baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman 1476baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman - Redistributions in binary form must reproduce the above copyright 1576baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman notice, this list of conditions and the following disclaimer in the 1676baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman documentation and/or other materials provided with the distribution. 1776baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman 1876baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman - Neither the name of the Xiph.org Foundation nor the names of its 1976baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman contributors may be used to endorse or promote products derived from 2076baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman this software without specific prior written permission. 2176baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman 2276baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 2376baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 2476baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 2576baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 2676baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 2776baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 2876baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 2976baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 3076baf7c9f6dd61a15524ad43c1b690c252cf5b7Wichert Akkerman LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 319c555e71d22a9aa3baf02a49a989ee184a7b09beRoland McGrath NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman*/ 342fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath 352fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath#define OVERRIDE_VQ_NBEST 36c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkermanvoid vq_nbest(spx_word16_t *_in, const __m128 *codebook, int len, int entries, __m128 *E, int N, int *nbest, spx_word32_t *best_dist, char *stack) 372fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath{ 38c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman int i,j,k,used; 39c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman VARDECL(float *dist); 40c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman VARDECL(__m128 *in); 41c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman __m128 half; 42c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman used = 0; 43c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman ALLOC(dist, entries, float); 44c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman half = _mm_set_ps1(.5f); 45c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman ALLOC(in, len, __m128); 46c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (i=0;i<len;i++) 47c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman in[i] = _mm_set_ps1(_in[i]); 48c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (i=0;i<entries>>2;i++) 49c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 50c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman __m128 d = _mm_mul_ps(E[i], half); 51c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (j=0;j<len;j++) 52c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman d = _mm_sub_ps(d, _mm_mul_ps(in[j], *codebook++)); 53c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman _mm_storeu_ps(dist+4*i, d); 54c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 55c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (i=0;i<entries;i++) 56c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 57c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman if (i<N || dist[i]<best_dist[N-1]) 580de75164d873fde8b0a69ae124cf7b4b2ea22251Roland McGrath { 59c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (k=N-1; (k >= 1) && (k > used || dist[i] < best_dist[k-1]); k--) 60c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 61c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman best_dist[k]=best_dist[k-1]; 62c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman nbest[k] = nbest[k-1]; 630de75164d873fde8b0a69ae124cf7b4b2ea22251Roland McGrath } 64c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman best_dist[k]=dist[i]; 65c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman nbest[k]=i; 66c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman used++; 67c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 68c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 69c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman} 70c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman 71c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman 722fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath 732fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath 749c555e71d22a9aa3baf02a49a989ee184a7b09beRoland McGrath#define OVERRIDE_VQ_NBEST_SIGN 75c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkermanvoid vq_nbest_sign(spx_word16_t *_in, const __m128 *codebook, int len, int entries, __m128 *E, int N, int *nbest, spx_word32_t *best_dist, char *stack) 762fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath{ 77c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman int i,j,k,used; 78c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman VARDECL(float *dist); 79c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman VARDECL(__m128 *in); 80c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman __m128 half; 81c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman used = 0; 82c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman ALLOC(dist, entries, float); 83c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman half = _mm_set_ps1(.5f); 84c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman ALLOC(in, len, __m128); 852fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath for (i=0;i<len;i++) 86c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman in[i] = _mm_set_ps1(_in[i]); 87c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (i=0;i<entries>>2;i++) 88c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 89c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman __m128 d = _mm_setzero_ps(); 90c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (j=0;j<len;j++) 91c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman d = _mm_add_ps(d, _mm_mul_ps(in[j], *codebook++)); 92c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman _mm_storeu_ps(dist+4*i, d); 93c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 94c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (i=0;i<entries;i++) 95c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 96c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman int sign; 97c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman if (dist[i]>0) 98c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 99c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman sign=0; 100c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman dist[i]=-dist[i]; 101c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } else 102c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 103c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman sign=1; 104c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 105c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman dist[i] += .5f*((float*)E)[i]; 106e9052fd332c315dc9ecd0867ca37fc68754ce18eRoland McGrath if (i<N || dist[i]<best_dist[N-1]) 107c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 108c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman for (k=N-1; (k >= 1) && (k > used || dist[i] < best_dist[k-1]); k--) 109c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman { 110c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman best_dist[k]=best_dist[k-1]; 111c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman nbest[k] = nbest[k-1]; 112c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 113c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman best_dist[k]=dist[i]; 114c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman nbest[k]=i; 115c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman used++; 116c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman if (sign) 117c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman nbest[k]+=entries; 118c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 119c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman } 120c792698a99b640e7d256e8692c992bd967f0c5b2Wichert Akkerman} 1212fe7b13b2f893f7647408af98576affe2b9b123dRoland McGrath