1/* libFLAC - Free Lossless Audio Codec library 2 * Copyright (C) 2000-2009 Josh Coalson 3 * Copyright (C) 2011-2016 Xiph.Org Foundation 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * - Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * - Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * - Neither the name of the Xiph.org Foundation nor the names of its 17 * contributors may be used to endorse or promote products derived from 18 * this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33#ifdef HAVE_CONFIG_H 34# include <config.h> 35#endif 36 37#include "private/cpu.h" 38 39#ifndef FLAC__INTEGER_ONLY_LIBRARY 40#ifndef FLAC__NO_ASM 41#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN 42#include "private/lpc.h" 43#ifdef FLAC__SSE4_1_SUPPORTED 44 45#include "FLAC/assert.h" 46#include "FLAC/format.h" 47 48#include <smmintrin.h> /* SSE4.1 */ 49 50#if defined FLAC__CPU_IA32 /* unused for x64 */ 51 52#define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt)) 53#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization)) 54 55FLAC__SSE_TARGET("sse4.1") 56void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 57{ 58 int i; 59 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 60 61 FLAC__ASSERT(order > 0); 62 FLAC__ASSERT(order <= 32); 63 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */ 64 65 if(order <= 12) { 66 if(order > 8) { /* order == 9, 10, 11, 12 */ 67 if(order > 10) { /* order == 11, 12 */ 68 if(order == 12) { 69 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; 70 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0] 71 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2] 72 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4] 73 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6] 74 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8] 75 xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10] 76 77 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0] 78 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2] 79 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4] 80 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6] 81 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8] 82 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10] 83 84 for(i = 0; i < (int)data_len; i++) { 85 //sum = 0; 86 //sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; 87 //sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; 88 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12] 89 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11] 90 xmm7 = _mm_mul_epi32(xmm7, xmm5); 91 92 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10]; 93 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9]; 94 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10)); 95 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 96 xmm6 = _mm_mul_epi32(xmm6, xmm4); 97 xmm7 = _mm_add_epi64(xmm7, xmm6); 98 99 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8]; 100 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7]; 101 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 102 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 103 xmm6 = _mm_mul_epi32(xmm6, xmm3); 104 xmm7 = _mm_add_epi64(xmm7, xmm6); 105 106 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6]; 107 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5]; 108 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 109 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 110 xmm6 = _mm_mul_epi32(xmm6, xmm2); 111 xmm7 = _mm_add_epi64(xmm7, xmm6); 112 113 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 114 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 115 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 116 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 117 xmm6 = _mm_mul_epi32(xmm6, xmm1); 118 xmm7 = _mm_add_epi64(xmm7, xmm6); 119 120 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 121 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 122 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 123 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 124 xmm6 = _mm_mul_epi32(xmm6, xmm0); 125 xmm7 = _mm_add_epi64(xmm7, xmm6); 126 127 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 128 RESIDUAL64_RESULT1(xmm7); 129 } 130 } 131 else { /* order == 11 */ 132 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; 133 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 134 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 135 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 136 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 137 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); 138 xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]); 139 140 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 141 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 142 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 143 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 144 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); 145 146 for(i = 0; i < (int)data_len; i++) { 147 //sum = 0; 148 //sum = qlp_coeff[10] * (FLAC__int64)data[i-11]; 149 xmm7 = _mm_cvtsi32_si128(data[i-11]); 150 xmm7 = _mm_mul_epi32(xmm7, xmm5); 151 152 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10]; 153 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9]; 154 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10)); 155 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 156 xmm6 = _mm_mul_epi32(xmm6, xmm4); 157 xmm7 = _mm_add_epi64(xmm7, xmm6); 158 159 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8]; 160 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7]; 161 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 162 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 163 xmm6 = _mm_mul_epi32(xmm6, xmm3); 164 xmm7 = _mm_add_epi64(xmm7, xmm6); 165 166 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6]; 167 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5]; 168 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 169 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 170 xmm6 = _mm_mul_epi32(xmm6, xmm2); 171 xmm7 = _mm_add_epi64(xmm7, xmm6); 172 173 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 174 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 175 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 176 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 177 xmm6 = _mm_mul_epi32(xmm6, xmm1); 178 xmm7 = _mm_add_epi64(xmm7, xmm6); 179 180 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 181 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 182 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 183 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 184 xmm6 = _mm_mul_epi32(xmm6, xmm0); 185 xmm7 = _mm_add_epi64(xmm7, xmm6); 186 187 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 188 RESIDUAL64_RESULT1(xmm7); 189 } 190 } 191 } 192 else { /* order == 9, 10 */ 193 if(order == 10) { 194 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7; 195 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 196 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 197 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 198 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 199 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); 200 201 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 202 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 203 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 204 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 205 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); 206 207 for(i = 0; i < (int)data_len; i++) { 208 //sum = 0; 209 //sum += qlp_coeff[9] * (FLAC__int64)data[i-10]; 210 //sum += qlp_coeff[8] * (FLAC__int64)data[i-9]; 211 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10)); 212 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 213 xmm7 = _mm_mul_epi32(xmm7, xmm4); 214 215 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8]; 216 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7]; 217 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 218 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 219 xmm6 = _mm_mul_epi32(xmm6, xmm3); 220 xmm7 = _mm_add_epi64(xmm7, xmm6); 221 222 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6]; 223 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5]; 224 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 225 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 226 xmm6 = _mm_mul_epi32(xmm6, xmm2); 227 xmm7 = _mm_add_epi64(xmm7, xmm6); 228 229 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 230 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 231 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 232 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 233 xmm6 = _mm_mul_epi32(xmm6, xmm1); 234 xmm7 = _mm_add_epi64(xmm7, xmm6); 235 236 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 237 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 238 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 239 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 240 xmm6 = _mm_mul_epi32(xmm6, xmm0); 241 xmm7 = _mm_add_epi64(xmm7, xmm6); 242 243 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 244 RESIDUAL64_RESULT(xmm7); 245 } 246 } 247 else { /* order == 9 */ 248 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7; 249 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 250 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 251 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 252 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 253 xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]); 254 255 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 256 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 257 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 258 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 259 260 for(i = 0; i < (int)data_len; i++) { 261 //sum = 0; 262 //sum = qlp_coeff[8] * (FLAC__int64)data[i-9]; 263 xmm7 = _mm_cvtsi32_si128(data[i-9]); 264 xmm7 = _mm_mul_epi32(xmm7, xmm4); 265 266 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8]; 267 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7]; 268 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 269 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 270 xmm6 = _mm_mul_epi32(xmm6, xmm3); 271 xmm7 = _mm_add_epi64(xmm7, xmm6); 272 273 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6]; 274 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5]; 275 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 276 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 277 xmm6 = _mm_mul_epi32(xmm6, xmm2); 278 xmm7 = _mm_add_epi64(xmm7, xmm6); 279 280 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 281 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 282 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 283 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 284 xmm6 = _mm_mul_epi32(xmm6, xmm1); 285 xmm7 = _mm_add_epi64(xmm7, xmm6); 286 287 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 288 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 289 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 290 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 291 xmm6 = _mm_mul_epi32(xmm6, xmm0); 292 xmm7 = _mm_add_epi64(xmm7, xmm6); 293 294 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 295 RESIDUAL64_RESULT(xmm7); 296 } 297 } 298 } 299 } 300 else if(order > 4) { /* order == 5, 6, 7, 8 */ 301 if(order > 6) { /* order == 7, 8 */ 302 if(order == 8) { 303 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7; 304 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 305 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 306 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 307 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 308 309 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 310 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 311 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 312 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); 313 314 for(i = 0; i < (int)data_len; i++) { 315 //sum = 0; 316 //sum += qlp_coeff[7] * (FLAC__int64)data[i-8]; 317 //sum += qlp_coeff[6] * (FLAC__int64)data[i-7]; 318 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8)); 319 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 320 xmm7 = _mm_mul_epi32(xmm7, xmm3); 321 322 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6]; 323 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5]; 324 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 325 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 326 xmm6 = _mm_mul_epi32(xmm6, xmm2); 327 xmm7 = _mm_add_epi64(xmm7, xmm6); 328 329 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 330 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 331 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 332 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 333 xmm6 = _mm_mul_epi32(xmm6, xmm1); 334 xmm7 = _mm_add_epi64(xmm7, xmm6); 335 336 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 337 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 338 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 339 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 340 xmm6 = _mm_mul_epi32(xmm6, xmm0); 341 xmm7 = _mm_add_epi64(xmm7, xmm6); 342 343 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 344 RESIDUAL64_RESULT(xmm7); 345 } 346 } 347 else { /* order == 7 */ 348 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7; 349 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 350 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 351 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 352 xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]); 353 354 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 355 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 356 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 357 358 for(i = 0; i < (int)data_len; i++) { 359 //sum = 0; 360 //sum = qlp_coeff[6] * (FLAC__int64)data[i-7]; 361 xmm7 = _mm_cvtsi32_si128(data[i-7]); 362 xmm7 = _mm_mul_epi32(xmm7, xmm3); 363 364 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6]; 365 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5]; 366 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 367 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 368 xmm6 = _mm_mul_epi32(xmm6, xmm2); 369 xmm7 = _mm_add_epi64(xmm7, xmm6); 370 371 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 372 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 373 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 374 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 375 xmm6 = _mm_mul_epi32(xmm6, xmm1); 376 xmm7 = _mm_add_epi64(xmm7, xmm6); 377 378 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 379 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 380 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 381 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 382 xmm6 = _mm_mul_epi32(xmm6, xmm0); 383 xmm7 = _mm_add_epi64(xmm7, xmm6); 384 385 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 386 RESIDUAL64_RESULT(xmm7); 387 } 388 } 389 } 390 else { /* order == 5, 6 */ 391 if(order == 6) { 392 __m128i xmm0, xmm1, xmm2, xmm6, xmm7; 393 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 394 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 395 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 396 397 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 398 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 399 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); 400 401 for(i = 0; i < (int)data_len; i++) { 402 //sum = 0; 403 //sum += qlp_coeff[5] * (FLAC__int64)data[i-6]; 404 //sum += qlp_coeff[4] * (FLAC__int64)data[i-5]; 405 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6)); 406 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 407 xmm7 = _mm_mul_epi32(xmm7, xmm2); 408 409 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 410 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 411 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 412 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 413 xmm6 = _mm_mul_epi32(xmm6, xmm1); 414 xmm7 = _mm_add_epi64(xmm7, xmm6); 415 416 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 417 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 418 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 419 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 420 xmm6 = _mm_mul_epi32(xmm6, xmm0); 421 xmm7 = _mm_add_epi64(xmm7, xmm6); 422 423 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 424 RESIDUAL64_RESULT(xmm7); 425 } 426 } 427 else { /* order == 5 */ 428 __m128i xmm0, xmm1, xmm2, xmm6, xmm7; 429 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 430 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 431 xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]); 432 433 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 434 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 435 436 for(i = 0; i < (int)data_len; i++) { 437 //sum = 0; 438 //sum = qlp_coeff[4] * (FLAC__int64)data[i-5]; 439 xmm7 = _mm_cvtsi32_si128(data[i-5]); 440 xmm7 = _mm_mul_epi32(xmm7, xmm2); 441 442 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 443 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 444 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 445 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 446 xmm6 = _mm_mul_epi32(xmm6, xmm1); 447 xmm7 = _mm_add_epi64(xmm7, xmm6); 448 449 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 450 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 451 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 452 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 453 xmm6 = _mm_mul_epi32(xmm6, xmm0); 454 xmm7 = _mm_add_epi64(xmm7, xmm6); 455 456 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 457 RESIDUAL64_RESULT(xmm7); 458 } 459 } 460 } 461 } 462 else { /* order == 1, 2, 3, 4 */ 463 if(order > 2) { /* order == 3, 4 */ 464 if(order == 4) { 465 __m128i xmm0, xmm1, xmm6, xmm7; 466 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 467 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 468 469 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 470 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); 471 472 for(i = 0; i < (int)data_len; i++) { 473 //sum = 0; 474 //sum += qlp_coeff[3] * (FLAC__int64)data[i-4]; 475 //sum += qlp_coeff[2] * (FLAC__int64)data[i-3]; 476 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4)); 477 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 478 xmm7 = _mm_mul_epi32(xmm7, xmm1); 479 480 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 481 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 482 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 483 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 484 xmm6 = _mm_mul_epi32(xmm6, xmm0); 485 xmm7 = _mm_add_epi64(xmm7, xmm6); 486 487 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 488 RESIDUAL64_RESULT(xmm7); 489 } 490 } 491 else { /* order == 3 */ 492 __m128i xmm0, xmm1, xmm6, xmm7; 493 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 494 xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]); 495 496 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 497 498 for(i = 0; i < (int)data_len; i++) { 499 //sum = 0; 500 //sum = qlp_coeff[2] * (FLAC__int64)data[i-3]; 501 xmm7 = _mm_cvtsi32_si128(data[i-3]); 502 xmm7 = _mm_mul_epi32(xmm7, xmm1); 503 504 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 505 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 506 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 507 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1)); 508 xmm6 = _mm_mul_epi32(xmm6, xmm0); 509 xmm7 = _mm_add_epi64(xmm7, xmm6); 510 511 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 512 RESIDUAL64_RESULT(xmm7); 513 } 514 } 515 } 516 else { /* order == 1, 2 */ 517 if(order == 2) { 518 __m128i xmm0, xmm7; 519 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 520 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); 521 522 for(i = 0; i < (int)data_len; i++) { 523 //sum = 0; 524 //sum += qlp_coeff[1] * (FLAC__int64)data[i-2]; 525 //sum += qlp_coeff[0] * (FLAC__int64)data[i-1]; 526 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2)); 527 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); 528 xmm7 = _mm_mul_epi32(xmm7, xmm0); 529 530 xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8)); 531 RESIDUAL64_RESULT(xmm7); 532 } 533 } 534 else { /* order == 1 */ 535 __m128i xmm0, xmm7; 536 xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]); 537 538 for(i = 0; i < (int)data_len; i++) { 539 //sum = qlp_coeff[0] * (FLAC__int64)data[i-1]; 540 xmm7 = _mm_cvtsi32_si128(data[i-1]); 541 xmm7 = _mm_mul_epi32(xmm7, xmm0); 542 RESIDUAL64_RESULT(xmm7); 543 } 544 } 545 } 546 } 547 } 548 else { /* order > 12 */ 549 FLAC__int64 sum; 550 for(i = 0; i < (int)data_len; i++) { 551 sum = 0; 552 switch(order) { 553 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; 554 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; 555 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; 556 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; 557 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; 558 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; 559 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; 560 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; 561 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; 562 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; 563 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; 564 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; 565 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; 566 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; 567 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; 568 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; 569 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; 570 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; 571 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; 572 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13]; 573 sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; 574 sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; 575 sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; 576 sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; 577 sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; 578 sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; 579 sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; 580 sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; 581 sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; 582 sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; 583 sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; 584 sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; 585 } 586 residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); 587 } 588 } 589} 590 591FLAC__SSE_TARGET("sse4.1") 592void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) 593{ 594 int i; 595 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 596 597 if (!data_len) 598 return; 599 600 FLAC__ASSERT(order > 0); 601 FLAC__ASSERT(order <= 32); 602 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */ 603 604 if(order <= 12) { 605 if(order > 8) { /* order == 9, 10, 11, 12 */ 606 if(order > 10) { /* order == 11, 12 */ 607 __m128i qlp[6], dat[6]; 608 __m128i summ, temp; 609 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0] 610 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2] 611 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4] 612 qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6] 613 qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8] 614 if (order == 12) 615 qlp[5] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10] 616 else 617 qlp[5] = _mm_cvtsi32_si128(qlp_coeff[10]); // 0 0 0 q[10] 618 619 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); // 0 q[0] 0 q[1] 620 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); // 0 q[2] 0 q[3] 621 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1)); // 0 q[4] 0 q[5] 622 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1)); // 0 q[5] 0 q[7] 623 qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1)); // 0 q[8] 0 q[9] 624 qlp[5] = _mm_shuffle_epi32(qlp[5], _MM_SHUFFLE(2,0,3,1)); // 0 q[10] 0 q[11] 625 626 dat[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-12))); // ? d[i-11] ? d[i-12] 627 dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10))); // ? d[i-9] ? d[i-10] 628 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 ))); // ? d[i-7] ? d[i-8] 629 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 ))); // ? d[i-5] ? d[i-6] 630 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); // ? d[i-3] ? d[i-4] 631 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); // ? d[i-1] ? d[i-2] 632 633 summ = _mm_mul_epi32(dat[5], qlp[5]) ; 634 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4])); 635 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); 636 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); 637 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 638 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 639 640 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64 641 summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32 642 temp = _mm_cvtsi32_si128(residual[0]); // 0 0 0 r[i] 643 temp = _mm_add_epi32(temp, summ); // ? ? ? d[i] 644 data[0] = _mm_cvtsi128_si32(temp); 645 646 for(i = 1; i < (int)data_len; i++) { 647 dat[5] = _mm_alignr_epi8(dat[4], dat[5], 8); // ? d[i-10] ? d[i-11] 648 dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8); // ? d[i-8] ? d[i-9] 649 dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8); // ? d[i-6] ? d[i-7] 650 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8); // ? d[i-4] ? d[i-5] 651 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8); // ? d[i-2] ? d[i-3] 652 dat[0] = _mm_alignr_epi8(temp, dat[0], 8); // ? d[i ] ? d[i-1] 653 654 summ = _mm_mul_epi32(dat[5], qlp[5]) ; 655 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4])); 656 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); 657 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); 658 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 659 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 660 661 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64 662 summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32 663 temp = _mm_cvtsi32_si128(residual[i]); // 0 0 0 r[i] 664 temp = _mm_add_epi32(temp, summ); // ? ? ? d[i] 665 data[i] = _mm_cvtsi128_si32(temp); 666 } 667 } 668 else { /* order == 9, 10 */ 669 __m128i qlp[5], dat[5]; 670 __m128i summ, temp; 671 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 672 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 673 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 674 qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 675 if (order == 10) 676 qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); 677 else 678 qlp[4] = _mm_cvtsi32_si128(qlp_coeff[8]); 679 680 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); 681 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); 682 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1)); 683 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1)); 684 qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1)); 685 686 dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10))); 687 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 ))); 688 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 ))); 689 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); 690 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); 691 692 summ = _mm_mul_epi32(dat[4], qlp[4]) ; 693 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); 694 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); 695 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 696 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 697 698 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 699 summ = _mm_srl_epi64(summ, cnt); 700 temp = _mm_cvtsi32_si128(residual[0]); 701 temp = _mm_add_epi32(temp, summ); 702 data[0] = _mm_cvtsi128_si32(temp); 703 704 for(i = 1; i < (int)data_len; i++) { 705 dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8); 706 dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8); 707 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8); 708 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8); 709 dat[0] = _mm_alignr_epi8(temp, dat[0], 8); 710 711 summ = _mm_mul_epi32(dat[4], qlp[4]) ; 712 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); 713 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); 714 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 715 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 716 717 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 718 summ = _mm_srl_epi64(summ, cnt); 719 temp = _mm_cvtsi32_si128(residual[i]); 720 temp = _mm_add_epi32(temp, summ); 721 data[i] = _mm_cvtsi128_si32(temp); 722 } 723 } 724 } 725 else if(order > 4) { /* order == 5, 6, 7, 8 */ 726 if(order > 6) { /* order == 7, 8 */ 727 __m128i qlp[4], dat[4]; 728 __m128i summ, temp; 729 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 730 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 731 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 732 if (order == 8) 733 qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); 734 else 735 qlp[3] = _mm_cvtsi32_si128(qlp_coeff[6]); 736 737 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); 738 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); 739 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1)); 740 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1)); 741 742 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 ))); 743 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 ))); 744 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); 745 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); 746 747 summ = _mm_mul_epi32(dat[3], qlp[3]) ; 748 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); 749 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 750 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 751 752 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 753 summ = _mm_srl_epi64(summ, cnt); 754 temp = _mm_cvtsi32_si128(residual[0]); 755 temp = _mm_add_epi32(temp, summ); 756 data[0] = _mm_cvtsi128_si32(temp); 757 758 for(i = 1; i < (int)data_len; i++) { 759 dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8); 760 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8); 761 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8); 762 dat[0] = _mm_alignr_epi8(temp, dat[0], 8); 763 764 summ = _mm_mul_epi32(dat[3], qlp[3]) ; 765 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); 766 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 767 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 768 769 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 770 summ = _mm_srl_epi64(summ, cnt); 771 temp = _mm_cvtsi32_si128(residual[i]); 772 temp = _mm_add_epi32(temp, summ); 773 data[i] = _mm_cvtsi128_si32(temp); 774 } 775 } 776 else { /* order == 5, 6 */ 777 __m128i qlp[3], dat[3]; 778 __m128i summ, temp; 779 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 780 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 781 if (order == 6) 782 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); 783 else 784 qlp[2] = _mm_cvtsi32_si128(qlp_coeff[4]); 785 786 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); 787 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); 788 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1)); 789 790 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 ))); 791 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); 792 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); 793 794 summ = _mm_mul_epi32(dat[2], qlp[2]) ; 795 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 796 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 797 798 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 799 summ = _mm_srl_epi64(summ, cnt); 800 temp = _mm_cvtsi32_si128(residual[0]); 801 temp = _mm_add_epi32(temp, summ); 802 data[0] = _mm_cvtsi128_si32(temp); 803 804 for(i = 1; i < (int)data_len; i++) { 805 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8); 806 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8); 807 dat[0] = _mm_alignr_epi8(temp, dat[0], 8); 808 809 summ = _mm_mul_epi32(dat[2], qlp[2]) ; 810 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); 811 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 812 813 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 814 summ = _mm_srl_epi64(summ, cnt); 815 temp = _mm_cvtsi32_si128(residual[i]); 816 temp = _mm_add_epi32(temp, summ); 817 data[i] = _mm_cvtsi128_si32(temp); 818 } 819 } 820 } 821 else { /* order == 1, 2, 3, 4 */ 822 if(order > 2) { /* order == 3, 4 */ 823 __m128i qlp[2], dat[2]; 824 __m128i summ, temp; 825 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); 826 if (order == 4) 827 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); 828 else 829 qlp[1] = _mm_cvtsi32_si128(qlp_coeff[2]); 830 831 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); 832 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); 833 834 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); 835 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); 836 837 summ = _mm_mul_epi32(dat[1], qlp[1]) ; 838 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 839 840 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 841 summ = _mm_srl_epi64(summ, cnt); 842 temp = _mm_cvtsi32_si128(residual[0]); 843 temp = _mm_add_epi32(temp, summ); 844 data[0] = _mm_cvtsi128_si32(temp); 845 846 for(i = 1; i < (int)data_len; i++) { 847 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8); 848 dat[0] = _mm_alignr_epi8(temp, dat[0], 8); 849 850 summ = _mm_mul_epi32(dat[1], qlp[1]) ; 851 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); 852 853 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 854 summ = _mm_srl_epi64(summ, cnt); 855 temp = _mm_cvtsi32_si128(residual[i]); 856 temp = _mm_add_epi32(temp, summ); 857 data[i] = _mm_cvtsi128_si32(temp); 858 } 859 } 860 else { /* order == 1, 2 */ 861 if(order == 2) { 862 __m128i qlp0, dat0; 863 __m128i summ, temp; 864 qlp0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff)); 865 qlp0 = _mm_shuffle_epi32(qlp0, _MM_SHUFFLE(2,0,3,1)); 866 867 dat0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); 868 869 summ = _mm_mul_epi32(dat0, qlp0) ; 870 871 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 872 summ = _mm_srl_epi64(summ, cnt); 873 temp = _mm_cvtsi32_si128(residual[0]); 874 temp = _mm_add_epi32(temp, summ); 875 data[0] = _mm_cvtsi128_si32(temp); 876 877 for(i = 1; i < (int)data_len; i++) { 878 dat0 = _mm_alignr_epi8(temp, dat0, 8); 879 880 summ = _mm_mul_epi32(dat0, qlp0) ; 881 882 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); 883 summ = _mm_srl_epi64(summ, cnt); 884 temp = _mm_cvtsi32_si128(residual[i]); 885 temp = _mm_add_epi32(temp, summ); 886 data[i] = _mm_cvtsi128_si32(temp); 887 } 888 } 889 else { /* order == 1 */ 890 __m128i qlp0; 891 __m128i summ, temp; 892 qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]); 893 temp = _mm_cvtsi32_si128(data[-1]); 894 895 summ = _mm_mul_epi32(temp, qlp0); 896 summ = _mm_srl_epi64(summ, cnt); 897 temp = _mm_cvtsi32_si128(residual[0]); 898 temp = _mm_add_epi32(temp, summ); 899 data[0] = _mm_cvtsi128_si32(temp); 900 901 for(i = 1; i < (int)data_len; i++) { 902 summ = _mm_mul_epi32(temp, qlp0) ; 903 summ = _mm_srl_epi64(summ, cnt); 904 temp = _mm_cvtsi32_si128(residual[i]); 905 temp = _mm_add_epi32(temp, summ); 906 data[i] = _mm_cvtsi128_si32(temp); 907 } 908 } 909 } 910 } 911 } 912 else { /* order > 12 */ 913 FLAC__int64 sum; 914 for(i = 0; i < (int)data_len; i++) { 915 sum = 0; 916 switch(order) { 917 case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32]; 918 case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; 919 case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; 920 case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; 921 case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; 922 case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; 923 case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; 924 case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; 925 case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; 926 case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; 927 case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; 928 case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; 929 case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; 930 case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; 931 case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; 932 case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; 933 case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; 934 case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15]; 935 case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; 936 case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13]; 937 sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; 938 sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; 939 sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; 940 sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; 941 sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; 942 sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; 943 sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; 944 sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; 945 sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; 946 sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; 947 sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; 948 sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; 949 } 950 data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); 951 } 952 } 953} 954 955#endif /* defined FLAC__CPU_IA32 */ 956 957FLAC__SSE_TARGET("sse4.1") 958void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 959{ 960 int i; 961 FLAC__int32 sum; 962 __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 963 964 FLAC__ASSERT(order > 0); 965 FLAC__ASSERT(order <= 32); 966 967 if(order <= 12) { 968 if(order > 8) { 969 if(order > 10) { 970 if(order == 12) { 971 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 972 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 973 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 974 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 975 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 976 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 977 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 978 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 979 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 980 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); 981 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); 982 q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); 983 q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0)); 984 985 for(i = 0; i < (int)data_len-3; i+=4) { 986 __m128i summ, mull; 987 summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12))); 988 mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull); 989 mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); 990 mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); 991 mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 992 mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 993 mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 994 mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 995 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 996 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 997 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 998 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 999 summ = _mm_sra_epi32(summ, cnt); 1000 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1001 } 1002 } 1003 else { /* order == 11 */ 1004 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 1005 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1006 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1007 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1008 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1009 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 1010 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 1011 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 1012 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 1013 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); 1014 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); 1015 q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0)); 1016 1017 for(i = 0; i < (int)data_len-3; i+=4) { 1018 __m128i summ, mull; 1019 summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); 1020 mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull); 1021 mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); 1022 mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 1023 mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 1024 mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 1025 mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 1026 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 1027 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1028 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1029 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1030 summ = _mm_sra_epi32(summ, cnt); 1031 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1032 } 1033 } 1034 } 1035 else { 1036 if(order == 10) { 1037 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 1038 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1039 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1040 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1041 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1042 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 1043 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 1044 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 1045 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 1046 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); 1047 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0)); 1048 1049 for(i = 0; i < (int)data_len-3; i+=4) { 1050 __m128i summ, mull; 1051 summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); 1052 mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull); 1053 mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 1054 mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 1055 mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 1056 mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 1057 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 1058 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1059 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1060 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1061 summ = _mm_sra_epi32(summ, cnt); 1062 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1063 } 1064 } 1065 else { /* order == 9 */ 1066 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8; 1067 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1068 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1069 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1070 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1071 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 1072 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 1073 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 1074 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 1075 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0)); 1076 1077 for(i = 0; i < (int)data_len-3; i+=4) { 1078 __m128i summ, mull; 1079 summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); 1080 mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull); 1081 mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 1082 mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 1083 mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 1084 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 1085 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1086 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1087 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1088 summ = _mm_sra_epi32(summ, cnt); 1089 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1090 } 1091 } 1092 } 1093 } 1094 else if(order > 4) { 1095 if(order > 6) { 1096 if(order == 8) { 1097 __m128i q0, q1, q2, q3, q4, q5, q6, q7; 1098 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1099 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1100 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1101 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1102 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 1103 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 1104 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 1105 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0)); 1106 1107 for(i = 0; i < (int)data_len-3; i+=4) { 1108 __m128i summ, mull; 1109 summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); 1110 mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull); 1111 mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 1112 mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 1113 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 1114 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1115 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1116 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1117 summ = _mm_sra_epi32(summ, cnt); 1118 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1119 } 1120 } 1121 else { /* order == 7 */ 1122 __m128i q0, q1, q2, q3, q4, q5, q6; 1123 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1124 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1125 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1126 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1127 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 1128 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 1129 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0)); 1130 1131 for(i = 0; i < (int)data_len-3; i+=4) { 1132 __m128i summ, mull; 1133 summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); 1134 mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull); 1135 mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 1136 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 1137 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1138 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1139 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1140 summ = _mm_sra_epi32(summ, cnt); 1141 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1142 } 1143 } 1144 } 1145 else { 1146 if(order == 6) { 1147 __m128i q0, q1, q2, q3, q4, q5; 1148 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1149 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1150 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1151 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1152 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 1153 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0)); 1154 1155 for(i = 0; i < (int)data_len-3; i+=4) { 1156 __m128i summ, mull; 1157 summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); 1158 mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull); 1159 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 1160 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1161 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1162 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1163 summ = _mm_sra_epi32(summ, cnt); 1164 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1165 } 1166 } 1167 else { /* order == 5 */ 1168 __m128i q0, q1, q2, q3, q4; 1169 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1170 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1171 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1172 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1173 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0)); 1174 1175 for(i = 0; i < (int)data_len-3; i+=4) { 1176 __m128i summ, mull; 1177 summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); 1178 mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull); 1179 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1180 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1181 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1182 summ = _mm_sra_epi32(summ, cnt); 1183 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1184 } 1185 } 1186 } 1187 } 1188 else { 1189 if(order > 2) { 1190 if(order == 4) { 1191 __m128i q0, q1, q2, q3; 1192 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1193 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1194 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1195 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0)); 1196 1197 for(i = 0; i < (int)data_len-3; i+=4) { 1198 __m128i summ, mull; 1199 summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); 1200 mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull); 1201 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1202 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1203 summ = _mm_sra_epi32(summ, cnt); 1204 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1205 } 1206 } 1207 else { /* order == 3 */ 1208 __m128i q0, q1, q2; 1209 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1210 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1211 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0)); 1212 1213 for(i = 0; i < (int)data_len-3; i+=4) { 1214 __m128i summ, mull; 1215 summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); 1216 mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull); 1217 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1218 summ = _mm_sra_epi32(summ, cnt); 1219 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1220 } 1221 } 1222 } 1223 else { 1224 if(order == 2) { 1225 __m128i q0, q1; 1226 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1227 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0)); 1228 1229 for(i = 0; i < (int)data_len-3; i+=4) { 1230 __m128i summ, mull; 1231 summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); 1232 mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull); 1233 summ = _mm_sra_epi32(summ, cnt); 1234 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1235 } 1236 } 1237 else { /* order == 1 */ 1238 __m128i q0; 1239 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0)); 1240 1241 for(i = 0; i < (int)data_len-3; i+=4) { 1242 __m128i summ; 1243 summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); 1244 summ = _mm_sra_epi32(summ, cnt); 1245 _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ)); 1246 } 1247 } 1248 } 1249 } 1250 for(; i < (int)data_len; i++) { 1251 sum = 0; 1252 switch(order) { 1253 case 12: sum += qlp_coeff[11] * data[i-12]; 1254 case 11: sum += qlp_coeff[10] * data[i-11]; 1255 case 10: sum += qlp_coeff[ 9] * data[i-10]; 1256 case 9: sum += qlp_coeff[ 8] * data[i- 9]; 1257 case 8: sum += qlp_coeff[ 7] * data[i- 8]; 1258 case 7: sum += qlp_coeff[ 6] * data[i- 7]; 1259 case 6: sum += qlp_coeff[ 5] * data[i- 6]; 1260 case 5: sum += qlp_coeff[ 4] * data[i- 5]; 1261 case 4: sum += qlp_coeff[ 3] * data[i- 4]; 1262 case 3: sum += qlp_coeff[ 2] * data[i- 3]; 1263 case 2: sum += qlp_coeff[ 1] * data[i- 2]; 1264 case 1: sum += qlp_coeff[ 0] * data[i- 1]; 1265 } 1266 residual[i] = data[i] - (sum >> lp_quantization); 1267 } 1268 } 1269 else { /* order > 12 */ 1270 for(i = 0; i < (int)data_len; i++) { 1271 sum = 0; 1272 switch(order) { 1273 case 32: sum += qlp_coeff[31] * data[i-32]; 1274 case 31: sum += qlp_coeff[30] * data[i-31]; 1275 case 30: sum += qlp_coeff[29] * data[i-30]; 1276 case 29: sum += qlp_coeff[28] * data[i-29]; 1277 case 28: sum += qlp_coeff[27] * data[i-28]; 1278 case 27: sum += qlp_coeff[26] * data[i-27]; 1279 case 26: sum += qlp_coeff[25] * data[i-26]; 1280 case 25: sum += qlp_coeff[24] * data[i-25]; 1281 case 24: sum += qlp_coeff[23] * data[i-24]; 1282 case 23: sum += qlp_coeff[22] * data[i-23]; 1283 case 22: sum += qlp_coeff[21] * data[i-22]; 1284 case 21: sum += qlp_coeff[20] * data[i-21]; 1285 case 20: sum += qlp_coeff[19] * data[i-20]; 1286 case 19: sum += qlp_coeff[18] * data[i-19]; 1287 case 18: sum += qlp_coeff[17] * data[i-18]; 1288 case 17: sum += qlp_coeff[16] * data[i-17]; 1289 case 16: sum += qlp_coeff[15] * data[i-16]; 1290 case 15: sum += qlp_coeff[14] * data[i-15]; 1291 case 14: sum += qlp_coeff[13] * data[i-14]; 1292 case 13: sum += qlp_coeff[12] * data[i-13]; 1293 sum += qlp_coeff[11] * data[i-12]; 1294 sum += qlp_coeff[10] * data[i-11]; 1295 sum += qlp_coeff[ 9] * data[i-10]; 1296 sum += qlp_coeff[ 8] * data[i- 9]; 1297 sum += qlp_coeff[ 7] * data[i- 8]; 1298 sum += qlp_coeff[ 6] * data[i- 7]; 1299 sum += qlp_coeff[ 5] * data[i- 6]; 1300 sum += qlp_coeff[ 4] * data[i- 5]; 1301 sum += qlp_coeff[ 3] * data[i- 4]; 1302 sum += qlp_coeff[ 2] * data[i- 3]; 1303 sum += qlp_coeff[ 1] * data[i- 2]; 1304 sum += qlp_coeff[ 0] * data[i- 1]; 1305 } 1306 residual[i] = data[i] - (sum >> lp_quantization); 1307 } 1308 } 1309} 1310 1311#endif /* FLAC__SSE4_1_SUPPORTED */ 1312#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ 1313#endif /* FLAC__NO_ASM */ 1314#endif /* FLAC__INTEGER_ONLY_LIBRARY */ 1315