/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

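/* Result macros: each extracts the low dword of the accumulator register and
 * applies the final arithmetic shift by lp_quantization.  The *16 variants
 * also step the data/residual pointers themselves (DATA16_RESULT drives the
 * serial restore loop at the bottom of this file); the *32 variants index by
 * the loop counter i. */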
#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
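
	/* For order <= 12 the loop below is fully unrolled per order, with each
	 * coefficient splatted across a register as the 32-bit pattern [0|coeff].
	 * _mm_madd_epi16 against four consecutive 32-bit samples then multiplies
	 * each sample's low 16 bits by the coefficient and adds a zero-weighted
	 * high half, i.e. it yields coeff * data[k] in each lane; this is valid
	 * because this _16 variant is only for samples and quantized coefficients
	 * that fit in 16 bits.  cnt holds lp_quantization for _mm_sra_epi32. */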
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
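		/* handle the last data_len%4 residuals that the vector loops above
		 * did not reach; every case of the switch intentionally falls through
		 * so that all `order' taps are accumulated */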
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}
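
/* The general (full 32-bit) variant below cannot use _mm_madd_epi16, so it
 * computes one sample per iteration: coefficients sit pairwise in the even
 * 32-bit lanes of xmm registers and are multiplied with _mm_mul_epu32 (odd
 * orders load the last coefficient alone via _mm_cvtsi32_si128).  The
 * multiply is unsigned, but only the low dword of each 64-bit product is
 * ever read (the odd lanes are discarded by the final horizontal add), and
 * the low 32 bits of a product are identical for signed and unsigned
 * operands, so the result is correct for signed data as well. */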
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0  0  d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]  0  d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */

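/* Restores the signal by running the predictor serially (each output sample
 * feeds the next prediction).  The history is kept packed as 16-bit values in
 * xmm registers and every newly reconstructed sample is shifted back in with
 * _mm_insert_epi16; orders outside 8..12 fall back to the plain C routine. */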
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	if (order < 8 || order > 12) {
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}
	if (data_len == 0)
		return;

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 12);

	if(order > 8) { /* order == 9, 10, 11, 12 */
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
		switch(order) /* ...and zero them out */
		{
		case 9:
			xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
		case 10:
			xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
		case 11:
			xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
		}
		xmm2 = _mm_setzero_si128();
		xmm0 = _mm_packs_epi32(xmm0, xmm6);
		xmm1 = _mm_packs_epi32(xmm1, xmm2);

		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm4 = _mm_packs_epi32(xmm4, xmm2);
		xmm3 = _mm_packs_epi32(xmm3, xmm5);

		xmm7 = _mm_slli_si128(xmm1, 2);
		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
		xmm2 = _mm_slli_si128(xmm0, 2);

		/* xmm0, xmm1: qlp_coeff
		   xmm2, xmm7: qlp_coeff << 16 bit
		   xmm3, xmm4: data */

		xmm5 = _mm_madd_epi16(xmm4, xmm1);
		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, xmm5);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		if(data_len % 2) {
			xmm6 = _mm_srli_si128(xmm3, 14);
			xmm4 = _mm_slli_si128(xmm4, 2);
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}
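
		/* The main loop does two samples per iteration: the first dot product
		 * uses the coefficient copies pre-shifted left by one 16-bit lane
		 * (xmm2:xmm7), which saves one _mm_slli_si128 of the data registers
		 * per sample; the second uses the unshifted coefficients (xmm0:xmm1)
		 * once the first result has been inserted. */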
		while(data_len) { /* data_len is a multiple of 2 */
			/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
			xmm6 = _mm_srli_si128(xmm3, 12);
			xmm4 = _mm_slli_si128(xmm4, 4);
			xmm3 = _mm_slli_si128(xmm3, 4);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 1);

			xmm5 = _mm_madd_epi16(xmm4, xmm7);
			xmm6 = _mm_madd_epi16(xmm3, xmm2);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len-=2;
		}
	} /* endif(order > 8) */
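	/* order == 8: after _mm_packs_epi32 all eight 16-bit coefficients fit in
	 * a single register, so each sample needs only one _mm_madd_epi16. */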
	else
	{
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm3, xmm6;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm0 = _mm_packs_epi32(xmm0, xmm1);

		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_packs_epi32(xmm3, xmm1);

		/* xmm0: qlp_coeff
		   xmm3: data */

		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		while(data_len) {
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}
	}
}

#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */