/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes */ 32ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 33ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifdef HAVE_CONFIG_H 34ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes# include <config.h> 35ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif 36ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 37ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "private/cpu.h" 38ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 39ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifndef FLAC__INTEGER_ONLY_LIBRARY 40ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifndef FLAC__NO_ASM 41ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN 42ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "private/lpc.h" 43ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifdef FLAC__SSE2_SUPPORTED 44ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 45ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "FLAC/assert.h" 46ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "FLAC/format.h" 47ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 48ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include <emmintrin.h> /* SSE2 */ 49ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 50ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); 51ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr; 52ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 53ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); 
54ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); 55ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 56ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("sse2") 57ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 58ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{ 59ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes int i; 60ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__int32 sum; 61ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 62ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 63ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order > 0); 64ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order <= 32); 65ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 66ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order <= 12) { 67ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 8) { 68ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 10) { 69ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 12) { 70ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 71ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 72ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 73ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, 
_MM_SHUFFLE(0,0,0,0)); 74ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 75ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 76ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 77ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 78ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 79ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); 80ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); 81ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); 82ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0)); 83ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 84ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 85ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 86ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12))); 87ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull); 88ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott 
Hughes mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); 89ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); 90ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 91ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 92ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 93ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 94ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 95ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 96ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 97ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 98ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 99ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 100ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 101ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 
102ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 11 */ 103ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 104ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 105ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 106ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 107ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 108ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 109ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 110ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 111ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 112ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); 113ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); 114ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); 115ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
116ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 117ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 118ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); 119ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); 120ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); 121ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 122ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 123ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 124ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 125ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 126ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 127ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 128ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 129ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott 
Hughes summ = _mm_sra_epi32(summ, cnt); 130ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 131ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 132ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 133ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 134ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 135ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 10) { 136ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 137ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 138ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 139ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 140ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 141ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 142ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 143ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 144ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 145ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = 
_mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); 146ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); 147ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 148ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 149ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 150ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); 151ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); 152ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 153ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 154ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 155ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 156ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 157ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 158ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 159ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const 
__m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 160ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 161ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 162ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 163ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 164ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 9 */ 165ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8; 166ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 167ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 168ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 169ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 170ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 171ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 172ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 173ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 174ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, 
_MM_SHUFFLE(0,0,0,0)); 175ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 176ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 177ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 178ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); 179ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 180ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 181ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 182ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 183ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 184ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 185ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 186ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 187ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 188ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 
189ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 190ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 191ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 192ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 193ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else if(order > 4) { 194ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 6) { 195ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 8) { 196ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4, q5, q6, q7; 197ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 198ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 199ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 200ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 201ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 202ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 203ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 204ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 205ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 206ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 
207ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 208ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); 209ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 210ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 211ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 212ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 213ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 214ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 215ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 216ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 217ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 218ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 219ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 220ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 7 */ 221ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4, q5, q6; 222ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & 
qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 223ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 224ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 225ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 226ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 227ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 228ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 229ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 230ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 231ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 232ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); 233ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 234ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 235ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 236ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = 
_mm_add_epi32(summ, mull); 237ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 238ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 239ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 240ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 241ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 242ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 243ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 244ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 245ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 6) { 246ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4, q5; 247ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 248ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 249ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 250ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 251ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 252ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 
253ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 254ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 255ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 256ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); 257ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 258ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 259ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 260ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 261ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 262ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 263ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 264ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 265ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 266ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 5 */ 267ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3, q4; 268ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 269ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = 
_mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 270ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 271ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 272ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 273ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 274ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 275ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 276ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); 277ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 278ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 279ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 280ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 281ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 282ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 283ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 284ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 285ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 
286ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 287ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 288ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 2) { 289ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 4) { 290ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2, q3; 291ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 292ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 293ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 294ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 295ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 296ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 297ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 298ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); 299ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 300ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 301ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 302ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 303ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 304ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 305ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 306ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 3 */ 307ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1, q2; 308ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 309ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 310ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 311ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 312ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 313ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 314ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); 315ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 316ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 317ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 318ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 319ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 320ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 321ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 
322ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 323ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 2) { 324ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0, q1; 325ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 326ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 327ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 328ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 329ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ, mull; 330ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); 331ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 332ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 333ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 334ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 335ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 336ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 1 */ 337ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i q0; 338ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 339ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 340ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 341ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i summ; 
342ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); 343ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm_sra_epi32(summ, cnt); 344ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 345ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 346ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 347ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 348ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 349ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(; i < (int)data_len; i++) { 350ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 351ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 352ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 12: sum += qlp_coeff[11] * data[i-12]; 353ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 11: sum += qlp_coeff[10] * data[i-11]; 354ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 10: sum += qlp_coeff[ 9] * data[i-10]; 355ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 9: sum += qlp_coeff[ 8] * data[i- 9]; 356ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 8: sum += qlp_coeff[ 7] * data[i- 8]; 357ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 7: sum += qlp_coeff[ 6] * data[i- 7]; 358ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 6: sum += qlp_coeff[ 5] * data[i- 6]; 359ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 5: sum += qlp_coeff[ 4] * data[i- 5]; 360ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 4: sum += qlp_coeff[ 3] * data[i- 4]; 361ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 3: sum += qlp_coeff[ 2] * data[i- 3]; 362ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 2: sum += qlp_coeff[ 1] * data[i- 2]; 
363ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 1: sum += qlp_coeff[ 0] * data[i- 1]; 364ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 365ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (sum >> lp_quantization); 366ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 367ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 368ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order > 12 */ 369ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 370ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 371ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 372ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 32: sum += qlp_coeff[31] * data[i-32]; 373ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 31: sum += qlp_coeff[30] * data[i-31]; 374ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 30: sum += qlp_coeff[29] * data[i-30]; 375ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 29: sum += qlp_coeff[28] * data[i-29]; 376ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 28: sum += qlp_coeff[27] * data[i-28]; 377ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 27: sum += qlp_coeff[26] * data[i-27]; 378ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 26: sum += qlp_coeff[25] * data[i-26]; 379ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 25: sum += qlp_coeff[24] * data[i-25]; 380ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 24: sum += qlp_coeff[23] * data[i-24]; 381ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 23: sum += qlp_coeff[22] * data[i-23]; 382ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 22: sum += qlp_coeff[21] * data[i-22]; 383ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 21: sum += qlp_coeff[20] * data[i-21]; 384ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 20: 
sum += qlp_coeff[19] * data[i-20]; 385ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 19: sum += qlp_coeff[18] * data[i-19]; 386ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 18: sum += qlp_coeff[17] * data[i-18]; 387ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 17: sum += qlp_coeff[16] * data[i-17]; 388ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 16: sum += qlp_coeff[15] * data[i-16]; 389ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 15: sum += qlp_coeff[14] * data[i-15]; 390ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 14: sum += qlp_coeff[13] * data[i-14]; 391ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 13: sum += qlp_coeff[12] * data[i-13]; 392ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[11] * data[i-12]; 393ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[10] * data[i-11]; 394ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 9] * data[i-10]; 395ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 8] * data[i- 9]; 396ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 7] * data[i- 8]; 397ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 6] * data[i- 7]; 398ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 5] * data[i- 6]; 399ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 4] * data[i- 5]; 400ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 3] * data[i- 4]; 401ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 2] * data[i- 3]; 402ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 1] * data[i- 2]; 403ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 0] * data[i- 1]; 404ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 405ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = 
data[i] - (sum >> lp_quantization); 406ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 407ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 408ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes} 409ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 410ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("sse2") 411ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 412ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{ 413ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes int i; 414ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 415ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order > 0); 416ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order <= 32); 417ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 418ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order <= 12) { 419ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 8) { /* order == 9, 10, 11, 12 */ 420ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 10) { /* order == 11, 12 */ 421ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 12) { 422ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; 423ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0] 424ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2] 425ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4] 426ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 
q[7] q[6] 427ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8] 428ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10] 429ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 430ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0] 431ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2] 432ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4] 433ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6] 434ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8] 435ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10] 436ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 437ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 438ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 439ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[11] * data[i-12]; 440ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[10] * data[i-11]; 441ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12] 442ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11] 443ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */ 
444ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 445ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[9] * data[i-10]; 446ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[8] * data[i-9]; 447ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10)); 448ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 449ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm4); 450ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 451ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 452ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[7] * data[i-8]; 453ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[6] * data[i-7]; 454ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 455ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 456ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm3); 457ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 458ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 459ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[5] * data[i-6]; 460ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[4] * data[i-5]; 461ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 462ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 463ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm2); 464ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 
465ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 466ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 467ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 468ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 469ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 470ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm1); 471ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 472ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 473ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 474ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 475ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 476ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 477ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 478ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 479ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 480ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 481ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 482ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 483ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 484ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 11 */ 485ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; 486ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 
487ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 488ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 489ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 490ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); 491ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]); 492ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 493ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 494ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 495ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 496ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 497ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); 498ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 499ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 500ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 501ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = qlp_coeff[10] * data[i-11]; 502ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_cvtsi32_si128(data[i-11]); 503ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm5); 504ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 505ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[9] * data[i-10]; 506ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[8] * data[i-9]; 507ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = 
_mm_loadl_epi64((const __m128i*)(data+i-10)); 508ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 509ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm4); 510ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 511ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 512ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[7] * data[i-8]; 513ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[6] * data[i-7]; 514ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 515ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 516ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm3); 517ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 518ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 519ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[5] * data[i-6]; 520ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[4] * data[i-5]; 521ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 522ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 523ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm2); 524ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 525ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 526ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 527ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 528ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 
529ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 530ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm1); 531ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 532ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 533ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 534ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 535ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 536ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 537ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 538ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 539ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 540ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 541ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 542ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 543ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 544ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 545ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 9, 10 */ 546ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 10) { 547ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7; 548ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 549ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 550ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 
551ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 552ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); 553ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 554ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 555ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 556ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 557ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 558ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); 559ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 560ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 561ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 562ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[9] * data[i-10]; 563ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[8] * data[i-9]; 564ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10)); 565ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 566ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm4); 567ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 568ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[7] * data[i-8]; 569ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[6] * data[i-7]; 570ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 571ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = 
_mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 572ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm3); 573ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 574ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 575ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[5] * data[i-6]; 576ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[4] * data[i-5]; 577ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 578ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 579ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm2); 580ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 581ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 582ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 583ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 584ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 585ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 586ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm1); 587ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 588ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 589ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 590ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 591ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 592ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 
593ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 594ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 595ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 596ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 597ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 598ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 599ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 600ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 9 */ 601ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7; 602ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 603ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 604ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 605ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 606ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]); 607ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 608ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 609ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 610ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 611ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 612ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 613ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 
614ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 615ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = qlp_coeff[8] * data[i-9]; 616ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_cvtsi32_si128(data[i-9]); 617ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm4); 618ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 619ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[7] * data[i-8]; 620ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[6] * data[i-7]; 621ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 622ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 623ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm3); 624ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 625ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 626ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[5] * data[i-6]; 627ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[4] * data[i-5]; 628ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 629ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 630ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm2); 631ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 632ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 633ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 634ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 635ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const 
__m128i*)(data+i-4)); 636ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 637ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm1); 638ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 639ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 640ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 641ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 642ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 643ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 644ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 645ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 646ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 647ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 648ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 649ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 650ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 651ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 652ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 653ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else if(order > 4) { /* order == 5, 6, 7, 8 */ 654ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 6) { /* order == 7, 8 */ 655ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 8) { 656ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7; 657ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 658ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott 
Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 659ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 660ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 661ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 662ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 663ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 664ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 665ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 666ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 667ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 668ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 669ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[7] * data[i-8]; 670ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[6] * data[i-7]; 671ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 672ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 673ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm3); 674ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 675ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[5] * data[i-6]; 676ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[4] * data[i-5]; 677ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 678ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 
679ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm2); 680ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 681ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 682ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 683ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 684ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 685ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 686ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm1); 687ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 688ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 689ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 690ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 691ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 692ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 693ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 694ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 695ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 696ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 697ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 698ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 699ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 700ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 7 */ 701ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7; 702ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 703ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 704ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 705ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]); 706ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 707ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 708ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 709ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 710ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 711ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 712ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 713ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = qlp_coeff[6] * data[i-7]; 714ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_cvtsi32_si128(data[i-7]); 715ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm3); 716ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 717ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[5] * data[i-6]; 718ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[4] * data[i-5]; 719ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 720ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 721ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm2); 722ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = 
_mm_add_epi32(xmm7, xmm6); 723ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 724ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 725ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 726ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 727ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 728ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm1); 729ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 730ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 731ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 732ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 733ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 734ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 735ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 736ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 737ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 738ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 739ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 740ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 741ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 742ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 743ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 5, 6 */ 744ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 6) { 745ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, 
xmm2, xmm6, xmm7; 746ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 747ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 748ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 749ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 750ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 751ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 752ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 753ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 754ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 755ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 756ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[5] * data[i-6]; 757ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[4] * data[i-5]; 758ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 759ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 760ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm2); 761ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 762ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 763ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 764ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 765ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 766ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
xmm6 = _mm_mul_epu32(xmm6, xmm1); 767ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 768ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 769ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 770ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 771ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 772ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 773ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 774ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 775ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 776ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 777ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 778ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 779ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 780ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 5 */ 781ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm2, xmm6, xmm7; 782ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 783ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 784ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]); 785ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 786ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 787ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 788ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
789ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 790ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 791ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = qlp_coeff[4] * data[i-5]; 792ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_cvtsi32_si128(data[i-5]); 793ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm2); 794ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 795ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 796ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 797ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 798ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 799ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm1); 800ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 801ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 802ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 803ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 804ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 805ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 806ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 807ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 808ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 809ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 810ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 
811ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 812ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 813ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 814ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 815ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 1, 2, 3, 4 */ 816ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 2) { /* order == 3, 4 */ 817ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 4) { 818ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm6, xmm7; 819ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 820ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 821ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 822ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 823ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 824ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 825ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 826ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 827ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[3] * data[i-4]; 828ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[2] * data[i-3]; 829ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 830ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 831ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm1); 832ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 833ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 
834ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 835ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 836ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 837ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 838ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 839ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 840ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 841ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 842ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 843ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 844ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 3 */ 845ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm6, xmm7; 846ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 847ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]); 848ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 849ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 850ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 851ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 852ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 853ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = qlp_coeff[2] * data[i-3]; 854ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_cvtsi32_si128(data[i-3]); 855ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm1); 856ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
857ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 858ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 859ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 860ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 861ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_mul_epu32(xmm6, xmm0); 862ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, xmm6); 863ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 864ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 865ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 866ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 867ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 868ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 869ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 1, 2 */ 870ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 2) { 871ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm7; 872ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 873ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 874ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 875ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 876ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum = 0; 877ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[1] * data[i-2]; 878ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes //sum += qlp_coeff[0] * data[i-1]; 879ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_loadl_epi64((const 
__m128i*)(data+i-2)); 880ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 881ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_mul_epu32(xmm7, xmm0); 882ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 883ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8)); 884ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes RESIDUAL32_RESULT(xmm7); 885ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 886ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 887ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 1 */ 888ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) 889ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization); 890ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 891ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 892ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 893ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 894ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order > 12 */ 895ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__int32 sum; 896ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 897ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 898ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 899ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 32: sum += qlp_coeff[31] * data[i-32]; 900ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 31: sum += qlp_coeff[30] * data[i-31]; 901ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 30: sum += qlp_coeff[29] * data[i-30]; 902ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 29: sum += qlp_coeff[28] * data[i-29]; 903ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott 
Hughes case 28: sum += qlp_coeff[27] * data[i-28]; 904ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 27: sum += qlp_coeff[26] * data[i-27]; 905ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 26: sum += qlp_coeff[25] * data[i-26]; 906ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 25: sum += qlp_coeff[24] * data[i-25]; 907ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 24: sum += qlp_coeff[23] * data[i-24]; 908ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 23: sum += qlp_coeff[22] * data[i-23]; 909ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 22: sum += qlp_coeff[21] * data[i-22]; 910ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 21: sum += qlp_coeff[20] * data[i-21]; 911ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 20: sum += qlp_coeff[19] * data[i-20]; 912ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 19: sum += qlp_coeff[18] * data[i-19]; 913ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 18: sum += qlp_coeff[17] * data[i-18]; 914ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 17: sum += qlp_coeff[16] * data[i-17]; 915ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 16: sum += qlp_coeff[15] * data[i-16]; 916ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 15: sum += qlp_coeff[14] * data[i-15]; 917ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 14: sum += qlp_coeff[13] * data[i-14]; 918ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 13: sum += qlp_coeff[12] * data[i-13]; 919ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[11] * data[i-12]; 920ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[10] * data[i-11]; 921ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 9] * data[i-10]; 922ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 8] * data[i- 9]; 
923ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 7] * data[i- 8]; 924ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 6] * data[i- 7]; 925ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 5] * data[i- 6]; 926ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 4] * data[i- 5]; 927ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 3] * data[i- 4]; 928ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 2] * data[i- 3]; 929ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 1] * data[i- 2]; 930ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 0] * data[i- 1]; 931ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 932ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (sum >> lp_quantization); 933ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 934ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 935ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes} 936ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 937ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */ 938ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 939ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("sse2") 940ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) 941ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{ 942ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if (order < 8 || order > 12) { 943ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data); 
944ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes return; 945ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 946ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if (data_len == 0) 947ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes return; 948ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 949ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order >= 8); 950ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order <= 12); 951ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 952ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 8) { /* order == 9, 10, 11, 12 */ 953ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__int32 curr; 954ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; 955ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); 956ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); 957ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... 
*/ 958ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) /* ...and zero them out */ 959ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes { 960ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 9: 961ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break; 962ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 10: 963ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break; 964ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 11: 965ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break; 966ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 967ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_setzero_si128(); 968ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_packs_epi32(xmm0, xmm6); 969ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_packs_epi32(xmm1, xmm2); 970ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 971ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_loadu_si128((const __m128i*)(data-12)); 972ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_loadu_si128((const __m128i*)(data-8)); 973ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_loadu_si128((const __m128i*)(data-4)); 974ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3)); 975ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3)); 976ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3)); 977ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_packs_epi32(xmm4, xmm2); 978ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_packs_epi32(xmm3, xmm5); 
979ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 980ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_slli_si128(xmm1, 2); 981ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14)); 982ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2 = _mm_slli_si128(xmm0, 2); 983ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 984ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes /* xmm0, xmm1: qlp_coeff 985ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm2, xmm7: qlp_coeff << 16 bit 986ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3, xmm4: data */ 987ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 988ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_madd_epi16(xmm4, xmm1); 989ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_madd_epi16(xmm3, xmm0); 990ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, xmm5); 991ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); 992ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); 993ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 994ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes DATA16_RESULT(xmm6); 995ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 996ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes data_len--; 997ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 998ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(data_len % 2) { 999ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_srli_si128(xmm3, 14); 1000ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_slli_si128(xmm4, 2); 1001ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_slli_si128(xmm3, 2); 1002ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_or_si128(xmm4, xmm6); 
1003ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_insert_epi16(xmm3, curr, 0); 1004ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1005ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_madd_epi16(xmm4, xmm1); 1006ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_madd_epi16(xmm3, xmm0); 1007ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, xmm5); 1008ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); 1009ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); 1010ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1011ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes DATA16_RESULT(xmm6); 1012ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1013ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes data_len--; 1014ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1015ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1016ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes while(data_len) { /* data_len is a multiple of 2 */ 1017ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes /* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */ 1018ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_srli_si128(xmm3, 12); 1019ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_slli_si128(xmm4, 4); 1020ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_slli_si128(xmm3, 4); 1021ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm4 = _mm_or_si128(xmm4, xmm6); 1022ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_insert_epi16(xmm3, curr, 1); 1023ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1024ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_madd_epi16(xmm4, xmm7); 1025ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = 
_mm_madd_epi16(xmm3, xmm2); 1026ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, xmm5); 1027ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); 1028ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); 1029ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1030ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes DATA16_RESULT(xmm6); 1031ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1032ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_insert_epi16(xmm3, curr, 0); 1033ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1034ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm5 = _mm_madd_epi16(xmm4, xmm1); 1035ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_madd_epi16(xmm3, xmm0); 1036ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, xmm5); 1037ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); 1038ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); 1039ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1040ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes DATA16_RESULT(xmm6); 1041ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1042ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes data_len-=2; 1043ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1044ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } /* endif(order > 8) */ 1045ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else 1046ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes { 1047ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__int32 curr; 1048ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i xmm0, xmm1, xmm3, xmm6; 1049ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = 
_mm_loadu_si128((const __m128i*)(qlp_coeff+0)); 1050ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); 1051ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm0 = _mm_packs_epi32(xmm0, xmm1); 1052ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1053ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_loadu_si128((const __m128i*)(data-8)); 1054ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_loadu_si128((const __m128i*)(data-4)); 1055ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3)); 1056ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3)); 1057ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_packs_epi32(xmm3, xmm1); 1058ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1059ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes /* xmm0: qlp_coeff 1060ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3: data */ 1061ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1062ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_madd_epi16(xmm3, xmm0); 1063ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); 1064ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); 1065ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1066ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes DATA16_RESULT(xmm6); 1067ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1068ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes data_len--; 1069ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1070ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes while(data_len) { 1071ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm3 = _mm_slli_si128(xmm3, 2); 1072ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
xmm3 = _mm_insert_epi16(xmm3, curr, 0); 1073ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1074ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_madd_epi16(xmm3, xmm0); 1075ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); 1076ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); 1077ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1078ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes DATA16_RESULT(xmm6); 1079ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1080ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes data_len--; 1081ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1082ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1083ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes} 1084ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1085ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */ 1086ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1087ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__SSE2_SUPPORTED */ 1088ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ 1089ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__NO_ASM */ 1090ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__INTEGER_ONLY_LIBRARY */ 1091