/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <xmmintrin.h> /* SSE */

/* SSE implementations of FLAC's windowed autocorrelation:
 * each routine fills autoc[k] (k = 0..lag-1) with the sum over i of
 * data[i] * data[i+k], where windows that run past the end of the
 * buffer are truncated (handled by the tail loops below).
 *
 * new routines: more unaligned loads, less shuffle
 * old routines: less unaligned loads, more shuffle
 * these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */

/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	/* NOTE(review): data_len is unsigned, so data_len - 4 wraps for
	 * data_len < 4; the conversion to int is relied upon to yield a
	 * negative value, which the "limit < 0" clamp below handles. */
	int limit = data_len - 4;
	__m128 sum0;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();

	/* Main loop: while a full 4-sample window starting at i exists,
	 * broadcast data[i] and multiply by data[i..i+3], so lane k of
	 * sum0 accumulates data[i]*data[i+k]. */
	for(i = 0; i <= limit; i++) {
		__m128 d, d0;
		d0 = _mm_loadu_ps(data+i);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
	}

	{
		/* Tail: handle the last (up to 3) positions whose windows are
		 * truncated by the buffer end. Walk backwards; d0 is a sliding
		 * window of the samples seen so far, rotated one lane per step
		 * (shuffle 2,1,0,3 rotates, move_ss injects data[i] into lane 0). */
		__m128 d0 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_move_ss(d0, d);
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 8;	/* see NOTE in lag_4 about unsigned wrap for short buffers */
	__m128 sum0, sum1;

	(void) lag;
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();

	/* Main loop: sum0 accumulates lags 0..3, sum1 lags 4..7. */
	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
	}

	{
		/* Tail: d1:d0 form an 8-lane window, shifted one lane per step
		 * (lane 0 of d1 receives the lane rotated out of d0). */
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
	_mm_storeu_ps(autoc+4, sum1);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 12;	/* see NOTE in lag_4 about unsigned wrap for short buffers */
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	/* Main loop: sum0 = lags 0..3, sum1 = lags 4..7, sum2 = lags 8..11. */
	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{
		/* Tail: d2:d1:d0 form a 12-lane window shifted one lane per step. */
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 16;	/* see NOTE in lag_4 about unsigned wrap for short buffers */
	__m128 sum0, sum1, sum2, sum3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();
	sum3 = _mm_setzero_ps();

	/* Main loop: sum0..sum3 cover lags 0..3, 4..7, 8..11, 12..15. */
	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2, d3;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d3 = _mm_loadu_ps(data+i+12);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
	}

	{
		/* Tail: d3:d2:d1:d0 form a 16-lane window shifted one lane per step. */
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		__m128 d3 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d3 = _mm_move_ss(d3, d2);
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
	_mm_storeu_ps(autoc+12,sum3);
}

/* old routines: faster on older Intel CPUs (up to Core 2) */

/* The *_old routines process one sample per iteration, keeping the most
 * recent samples in a register window (xmm2, then xmm3, xmm4, ...) that is
 * shifted by one lane each step; the broadcast current sample is multiplied
 * against that window and accumulated into the sum registers. */

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;	/* xmm2: 4-sample history window; xmm5: accumulator for lags 0..3 */

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	/* Peel the first sample: window = {data[0],0,0,0}, accumulate data[0]^2 in lane 0. */
	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift window left by one float and insert the new sample */
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	/* xmm3:xmm2: 8-sample history window; xmm5/xmm6: accumulators for lags 0..3 / 4..7 */
	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();

	/* Peel the first sample (see lag_4_old). */
	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm3:xmm2 left by one float and insert the new sample */
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm6|xmm5 += sample * xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	/* xmm4:xmm3:xmm2: 12-sample history window; xmm5/xmm6/xmm7: accumulators */
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();

	/* Peel the first sample (see lag_4_old). */
	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm4:xmm3:xmm2 left by one float and insert the new sample */
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm7|xmm6|xmm5 += sample * xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm4);
		xmm7 = _mm_add_ps(xmm7, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
	_mm_storeu_ps(autoc+8, xmm7);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	/* xmm5:xmm4:xmm3:xmm2: 16-sample history window; xmm6..xmm9: accumulators.
	 * NOTE(review): xmm8/xmm9 are just variable names; on IA-32 the compiler
	 * will spill, since only 8 XMM registers exist there. */
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();
	xmm8 = _mm_setzero_ps();
	xmm9 = _mm_setzero_ps();

	/* Peel the first sample (see lag_4_old). */
	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();
	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm6 = _mm_add_ps(xmm6, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm5 = _mm_move_ss(xmm5, xmm4);
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm5);
		xmm9 = _mm_add_ps(xmm9, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm4);
		xmm8 = _mm_add_ps(xmm8, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm7 = _mm_add_ps(xmm7, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm6);
	_mm_storeu_ps(autoc+4, xmm7);
	_mm_storeu_ps(autoc+8, xmm8);
	_mm_storeu_ps(autoc+12,xmm9);
}

#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */