/* emmintrin.h revision 9436ed50b0923368d5ae7a97f1b67c56b6837430 */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36typedef int __v4si __attribute__((__vector_size__(16))); 37typedef short __v8hi __attribute__((__vector_size__(16))); 38typedef signed char __v16qi __attribute__((__vector_size__(16))); 39 40static inline __m128d __attribute__((__always_inline__, __nodebug__)) 41_mm_add_sd(__m128d a, __m128d b) 42{ 43 a[0] += b[0]; 44 return a; 45} 46 47static inline __m128d __attribute__((__always_inline__, __nodebug__)) 48_mm_add_pd(__m128d a, __m128d b) 49{ 50 return a + b; 51} 52 53static inline __m128d __attribute__((__always_inline__, __nodebug__)) 54_mm_sub_sd(__m128d a, __m128d b) 55{ 56 a[0] -= b[0]; 57 return a; 58} 59 60static inline __m128d __attribute__((__always_inline__, __nodebug__)) 61_mm_sub_pd(__m128d a, __m128d b) 62{ 63 return a - b; 64} 65 66static inline __m128d __attribute__((__always_inline__, __nodebug__)) 67_mm_mul_sd(__m128d a, __m128d b) 68{ 69 a[0] *= b[0]; 70 return a; 71} 72 73static inline __m128d __attribute__((__always_inline__, __nodebug__)) 74_mm_mul_pd(__m128d a, __m128d b) 75{ 76 return a * b; 77} 78 79static inline __m128d __attribute__((__always_inline__, __nodebug__)) 80_mm_div_sd(__m128d a, __m128d b) 81{ 82 a[0] /= b[0]; 83 return a; 84} 85 86static inline __m128d __attribute__((__always_inline__, __nodebug__)) 87_mm_div_pd(__m128d a, __m128d b) 88{ 89 return a / b; 90} 91 92static inline __m128d __attribute__((__always_inline__, __nodebug__)) 93_mm_sqrt_sd(__m128d a, __m128d b) 94{ 95 __m128d c = __builtin_ia32_sqrtsd(b); 96 return (__m128d) { c[0], a[1] }; 97} 98 99static inline __m128d __attribute__((__always_inline__, __nodebug__)) 100_mm_sqrt_pd(__m128d a) 
101{ 102 return __builtin_ia32_sqrtpd(a); 103} 104 105static inline __m128d __attribute__((__always_inline__, __nodebug__)) 106_mm_min_sd(__m128d a, __m128d b) 107{ 108 return __builtin_ia32_minsd(a, b); 109} 110 111static inline __m128d __attribute__((__always_inline__, __nodebug__)) 112_mm_min_pd(__m128d a, __m128d b) 113{ 114 return __builtin_ia32_minpd(a, b); 115} 116 117static inline __m128d __attribute__((__always_inline__, __nodebug__)) 118_mm_max_sd(__m128d a, __m128d b) 119{ 120 return __builtin_ia32_maxsd(a, b); 121} 122 123static inline __m128d __attribute__((__always_inline__, __nodebug__)) 124_mm_max_pd(__m128d a, __m128d b) 125{ 126 return __builtin_ia32_maxpd(a, b); 127} 128 129static inline __m128d __attribute__((__always_inline__, __nodebug__)) 130_mm_and_pd(__m128d a, __m128d b) 131{ 132 return (__m128d)((__v4si)a & (__v4si)b); 133} 134 135static inline __m128d __attribute__((__always_inline__, __nodebug__)) 136_mm_andnot_pd(__m128d a, __m128d b) 137{ 138 return (__m128d)(~(__v4si)a & (__v4si)b); 139} 140 141static inline __m128d __attribute__((__always_inline__, __nodebug__)) 142_mm_or_pd(__m128d a, __m128d b) 143{ 144 return (__m128d)((__v4si)a | (__v4si)b); 145} 146 147static inline __m128d __attribute__((__always_inline__, __nodebug__)) 148_mm_xor_pd(__m128d a, __m128d b) 149{ 150 return (__m128d)((__v4si)a ^ (__v4si)b); 151} 152 153static inline __m128d __attribute__((__always_inline__, __nodebug__)) 154_mm_cmpeq_pd(__m128d a, __m128d b) 155{ 156 return (__m128d)__builtin_ia32_cmppd(a, b, 0); 157} 158 159static inline __m128d __attribute__((__always_inline__, __nodebug__)) 160_mm_cmplt_pd(__m128d a, __m128d b) 161{ 162 return (__m128d)__builtin_ia32_cmppd(a, b, 1); 163} 164 165static inline __m128d __attribute__((__always_inline__, __nodebug__)) 166_mm_cmple_pd(__m128d a, __m128d b) 167{ 168 return (__m128d)__builtin_ia32_cmppd(a, b, 2); 169} 170 171static inline __m128d __attribute__((__always_inline__, __nodebug__)) 172_mm_cmpgt_pd(__m128d 
a, __m128d b) 173{ 174 return (__m128d)__builtin_ia32_cmppd(b, a, 1); 175} 176 177static inline __m128d __attribute__((__always_inline__, __nodebug__)) 178_mm_cmpge_pd(__m128d a, __m128d b) 179{ 180 return (__m128d)__builtin_ia32_cmppd(b, a, 2); 181} 182 183static inline __m128d __attribute__((__always_inline__, __nodebug__)) 184_mm_cmpord_pd(__m128d a, __m128d b) 185{ 186 return (__m128d)__builtin_ia32_cmppd(a, b, 7); 187} 188 189static inline __m128d __attribute__((__always_inline__, __nodebug__)) 190_mm_cmpunord_pd(__m128d a, __m128d b) 191{ 192 return (__m128d)__builtin_ia32_cmppd(a, b, 3); 193} 194 195static inline __m128d __attribute__((__always_inline__, __nodebug__)) 196_mm_cmpneq_pd(__m128d a, __m128d b) 197{ 198 return (__m128d)__builtin_ia32_cmppd(a, b, 4); 199} 200 201static inline __m128d __attribute__((__always_inline__, __nodebug__)) 202_mm_cmpnlt_pd(__m128d a, __m128d b) 203{ 204 return (__m128d)__builtin_ia32_cmppd(a, b, 5); 205} 206 207static inline __m128d __attribute__((__always_inline__, __nodebug__)) 208_mm_cmpnle_pd(__m128d a, __m128d b) 209{ 210 return (__m128d)__builtin_ia32_cmppd(a, b, 6); 211} 212 213static inline __m128d __attribute__((__always_inline__, __nodebug__)) 214_mm_cmpngt_pd(__m128d a, __m128d b) 215{ 216 return (__m128d)__builtin_ia32_cmppd(b, a, 5); 217} 218 219static inline __m128d __attribute__((__always_inline__, __nodebug__)) 220_mm_cmpnge_pd(__m128d a, __m128d b) 221{ 222 return (__m128d)__builtin_ia32_cmppd(b, a, 6); 223} 224 225static inline __m128d __attribute__((__always_inline__, __nodebug__)) 226_mm_cmpeq_sd(__m128d a, __m128d b) 227{ 228 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); 229} 230 231static inline __m128d __attribute__((__always_inline__, __nodebug__)) 232_mm_cmplt_sd(__m128d a, __m128d b) 233{ 234 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); 235} 236 237static inline __m128d __attribute__((__always_inline__, __nodebug__)) 238_mm_cmple_sd(__m128d a, __m128d b) 239{ 240 return 
(__m128d)__builtin_ia32_cmpsd(a, b, 2); 241} 242 243static inline __m128d __attribute__((__always_inline__, __nodebug__)) 244_mm_cmpgt_sd(__m128d a, __m128d b) 245{ 246 return (__m128d)__builtin_ia32_cmpsd(b, a, 1); 247} 248 249static inline __m128d __attribute__((__always_inline__, __nodebug__)) 250_mm_cmpge_sd(__m128d a, __m128d b) 251{ 252 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); 253} 254 255static inline __m128d __attribute__((__always_inline__, __nodebug__)) 256_mm_cmpord_sd(__m128d a, __m128d b) 257{ 258 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); 259} 260 261static inline __m128d __attribute__((__always_inline__, __nodebug__)) 262_mm_cmpunord_sd(__m128d a, __m128d b) 263{ 264 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); 265} 266 267static inline __m128d __attribute__((__always_inline__, __nodebug__)) 268_mm_cmpneq_sd(__m128d a, __m128d b) 269{ 270 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); 271} 272 273static inline __m128d __attribute__((__always_inline__, __nodebug__)) 274_mm_cmpnlt_sd(__m128d a, __m128d b) 275{ 276 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); 277} 278 279static inline __m128d __attribute__((__always_inline__, __nodebug__)) 280_mm_cmpnle_sd(__m128d a, __m128d b) 281{ 282 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); 283} 284 285static inline __m128d __attribute__((__always_inline__, __nodebug__)) 286_mm_cmpngt_sd(__m128d a, __m128d b) 287{ 288 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); 289} 290 291static inline __m128d __attribute__((__always_inline__, __nodebug__)) 292_mm_cmpnge_sd(__m128d a, __m128d b) 293{ 294 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); 295} 296 297static inline int __attribute__((__always_inline__, __nodebug__)) 298_mm_comieq_sd(__m128d a, __m128d b) 299{ 300 return __builtin_ia32_comisdeq(a, b); 301} 302 303static inline int __attribute__((__always_inline__, __nodebug__)) 304_mm_comilt_sd(__m128d a, __m128d b) 305{ 306 return __builtin_ia32_comisdlt(a, b); 307} 308 309static inline int 
__attribute__((__always_inline__, __nodebug__)) 310_mm_comile_sd(__m128d a, __m128d b) 311{ 312 return __builtin_ia32_comisdle(a, b); 313} 314 315static inline int __attribute__((__always_inline__, __nodebug__)) 316_mm_comigt_sd(__m128d a, __m128d b) 317{ 318 return __builtin_ia32_comisdgt(a, b); 319} 320 321static inline int __attribute__((__always_inline__, __nodebug__)) 322_mm_comineq_sd(__m128d a, __m128d b) 323{ 324 return __builtin_ia32_comisdneq(a, b); 325} 326 327static inline int __attribute__((__always_inline__, __nodebug__)) 328_mm_ucomieq_sd(__m128d a, __m128d b) 329{ 330 return __builtin_ia32_ucomisdeq(a, b); 331} 332 333static inline int __attribute__((__always_inline__, __nodebug__)) 334_mm_ucomilt_sd(__m128d a, __m128d b) 335{ 336 return __builtin_ia32_ucomisdlt(a, b); 337} 338 339static inline int __attribute__((__always_inline__, __nodebug__)) 340_mm_ucomile_sd(__m128d a, __m128d b) 341{ 342 return __builtin_ia32_ucomisdle(a, b); 343} 344 345static inline int __attribute__((__always_inline__, __nodebug__)) 346_mm_ucomigt_sd(__m128d a, __m128d b) 347{ 348 return __builtin_ia32_ucomisdgt(a, b); 349} 350 351static inline int __attribute__((__always_inline__, __nodebug__)) 352_mm_ucomineq_sd(__m128d a, __m128d b) 353{ 354 return __builtin_ia32_ucomisdneq(a, b); 355} 356 357static inline __m128 __attribute__((__always_inline__, __nodebug__)) 358_mm_cvtpd_ps(__m128d a) 359{ 360 return __builtin_ia32_cvtpd2ps(a); 361} 362 363static inline __m128d __attribute__((__always_inline__, __nodebug__)) 364_mm_cvtps_pd(__m128 a) 365{ 366 return __builtin_ia32_cvtps2pd(a); 367} 368 369static inline __m128d __attribute__((__always_inline__, __nodebug__)) 370_mm_cvtepi32_pd(__m128i a) 371{ 372 return __builtin_ia32_cvtdq2pd((__v4si)a); 373} 374 375static inline __m128i __attribute__((__always_inline__, __nodebug__)) 376_mm_cvtpd_epi32(__m128d a) 377{ 378 return __builtin_ia32_cvtpd2dq(a); 379} 380 381static inline int __attribute__((__always_inline__, __nodebug__)) 
382_mm_cvtsd_si32(__m128d a) 383{ 384 return __builtin_ia32_cvtsd2si(a); 385} 386 387static inline __m128 __attribute__((__always_inline__, __nodebug__)) 388_mm_cvtsd_ss(__m128 a, __m128d b) 389{ 390 a[0] = b[0]; 391 return a; 392} 393 394static inline __m128d __attribute__((__always_inline__, __nodebug__)) 395_mm_cvtsi32_sd(__m128d a, int b) 396{ 397 a[0] = b; 398 return a; 399} 400 401static inline __m128d __attribute__((__always_inline__, __nodebug__)) 402_mm_cvtss_sd(__m128d a, __m128 b) 403{ 404 a[0] = b[0]; 405 return a; 406} 407 408static inline __m128i __attribute__((__always_inline__, __nodebug__)) 409_mm_cvttpd_epi32(__m128d a) 410{ 411 return (__m128i)__builtin_ia32_cvttpd2dq(a); 412} 413 414static inline int __attribute__((__always_inline__, __nodebug__)) 415_mm_cvttsd_si32(__m128d a) 416{ 417 return a[0]; 418} 419 420static inline __m64 __attribute__((__always_inline__, __nodebug__)) 421_mm_cvtpd_pi32(__m128d a) 422{ 423 return (__m64)__builtin_ia32_cvtpd2pi(a); 424} 425 426static inline __m64 __attribute__((__always_inline__, __nodebug__)) 427_mm_cvttpd_pi32(__m128d a) 428{ 429 return (__m64)__builtin_ia32_cvttpd2pi(a); 430} 431 432static inline __m128d __attribute__((__always_inline__, __nodebug__)) 433_mm_cvtpi32_pd(__m64 a) 434{ 435 return __builtin_ia32_cvtpi2pd((__v2si)a); 436} 437 438static inline double __attribute__((__always_inline__, __nodebug__)) 439_mm_cvtsd_f64(__m128d a) 440{ 441 return a[0]; 442} 443 444static inline __m128d __attribute__((__always_inline__, __nodebug__)) 445_mm_load_pd(double const *dp) 446{ 447 return *(__m128d*)dp; 448} 449 450static inline __m128d __attribute__((__always_inline__, __nodebug__)) 451_mm_load1_pd(double const *dp) 452{ 453 return (__m128d){ dp[0], dp[0] }; 454} 455 456#define _mm_load_pd1(dp) _mm_load1_pd(dp) 457 458static inline __m128d __attribute__((__always_inline__, __nodebug__)) 459_mm_loadr_pd(double const *dp) 460{ 461 return (__m128d){ dp[1], dp[0] }; 462} 463 464static inline __m128d 
__attribute__((__always_inline__, __nodebug__)) 465_mm_loadu_pd(double const *dp) 466{ 467 return __builtin_ia32_loadupd(dp); 468} 469 470static inline __m128d __attribute__((__always_inline__, __nodebug__)) 471_mm_load_sd(double const *dp) 472{ 473 return (__m128d){ *dp, 0.0 }; 474} 475 476static inline __m128d __attribute__((__always_inline__, __nodebug__)) 477_mm_loadh_pd(__m128d a, double const *dp) 478{ 479 return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2); 480} 481 482static inline __m128d __attribute__((__always_inline__, __nodebug__)) 483_mm_loadl_pd(__m128d a, double const *dp) 484{ 485 return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1); 486} 487 488static inline __m128d __attribute__((__always_inline__, __nodebug__)) 489_mm_set_sd(double w) 490{ 491 return (__m128d){ w, 0 }; 492} 493 494static inline __m128d __attribute__((__always_inline__, __nodebug__)) 495_mm_set1_pd(double w) 496{ 497 return (__m128d){ w, w }; 498} 499 500static inline __m128d __attribute__((__always_inline__, __nodebug__)) 501_mm_set_pd(double w, double x) 502{ 503 return (__m128d){ x, w }; 504} 505 506static inline __m128d __attribute__((__always_inline__, __nodebug__)) 507_mm_setr_pd(double w, double x) 508{ 509 return (__m128d){ w, x }; 510} 511 512static inline __m128d __attribute__((__always_inline__, __nodebug__)) 513_mm_setzero_pd(void) 514{ 515 return (__m128d){ 0, 0 }; 516} 517 518static inline __m128d __attribute__((__always_inline__, __nodebug__)) 519_mm_move_sd(__m128d a, __m128d b) 520{ 521 return (__m128d){ b[0], a[1] }; 522} 523 524static inline void __attribute__((__always_inline__, __nodebug__)) 525_mm_store_sd(double *dp, __m128d a) 526{ 527 dp[0] = a[0]; 528} 529 530static inline void __attribute__((__always_inline__, __nodebug__)) 531_mm_store1_pd(double *dp, __m128d a) 532{ 533 dp[0] = a[0]; 534 dp[1] = a[0]; 535} 536 537static inline void __attribute__((__always_inline__, __nodebug__)) 538_mm_store_pd(double *dp, __m128d a) 539{ 540 *(__m128d *)dp = 
a; 541} 542 543static inline void __attribute__((__always_inline__, __nodebug__)) 544_mm_storeu_pd(double *dp, __m128d a) 545{ 546 __builtin_ia32_storeupd(dp, a); 547} 548 549static inline void __attribute__((__always_inline__, __nodebug__)) 550_mm_storer_pd(double *dp, __m128d a) 551{ 552 dp[0] = a[1]; 553 dp[1] = a[0]; 554} 555 556static inline void __attribute__((__always_inline__, __nodebug__)) 557_mm_storeh_pd(double *dp, __m128d a) 558{ 559 dp[0] = a[1]; 560} 561 562static inline void __attribute__((__always_inline__, __nodebug__)) 563_mm_storel_pd(double *dp, __m128d a) 564{ 565 dp[0] = a[0]; 566} 567 568static inline __m128i __attribute__((__always_inline__, __nodebug__)) 569_mm_add_epi8(__m128i a, __m128i b) 570{ 571 return (__m128i)((__v16qi)a + (__v16qi)b); 572} 573 574static inline __m128i __attribute__((__always_inline__, __nodebug__)) 575_mm_add_epi16(__m128i a, __m128i b) 576{ 577 return (__m128i)((__v8hi)a + (__v8hi)b); 578} 579 580static inline __m128i __attribute__((__always_inline__, __nodebug__)) 581_mm_add_epi32(__m128i a, __m128i b) 582{ 583 return (__m128i)((__v4si)a + (__v4si)b); 584} 585 586static inline __m64 __attribute__((__always_inline__, __nodebug__)) 587_mm_add_si64(__m64 a, __m64 b) 588{ 589 return a + b; 590} 591 592static inline __m128i __attribute__((__always_inline__, __nodebug__)) 593_mm_add_epi64(__m128i a, __m128i b) 594{ 595 return a + b; 596} 597 598static inline __m128i __attribute__((__always_inline__, __nodebug__)) 599_mm_adds_epi8(__m128i a, __m128i b) 600{ 601 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 602} 603 604static inline __m128i __attribute__((__always_inline__, __nodebug__)) 605_mm_adds_epi16(__m128i a, __m128i b) 606{ 607 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 608} 609 610static inline __m128i __attribute__((__always_inline__, __nodebug__)) 611_mm_adds_epu8(__m128i a, __m128i b) 612{ 613 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 614} 615 
616static inline __m128i __attribute__((__always_inline__, __nodebug__)) 617_mm_adds_epu16(__m128i a, __m128i b) 618{ 619 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 620} 621 622static inline __m128i __attribute__((__always_inline__, __nodebug__)) 623_mm_avg_epu8(__m128i a, __m128i b) 624{ 625 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 626} 627 628static inline __m128i __attribute__((__always_inline__, __nodebug__)) 629_mm_avg_epu16(__m128i a, __m128i b) 630{ 631 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 632} 633 634static inline __m128i __attribute__((__always_inline__, __nodebug__)) 635_mm_madd_epi16(__m128i a, __m128i b) 636{ 637 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 638} 639 640static inline __m128i __attribute__((__always_inline__, __nodebug__)) 641_mm_max_epi16(__m128i a, __m128i b) 642{ 643 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 644} 645 646static inline __m128i __attribute__((__always_inline__, __nodebug__)) 647_mm_max_epu8(__m128i a, __m128i b) 648{ 649 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 650} 651 652static inline __m128i __attribute__((__always_inline__, __nodebug__)) 653_mm_min_epi16(__m128i a, __m128i b) 654{ 655 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 656} 657 658static inline __m128i __attribute__((__always_inline__, __nodebug__)) 659_mm_min_epu8(__m128i a, __m128i b) 660{ 661 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 662} 663 664static inline __m128i __attribute__((__always_inline__, __nodebug__)) 665_mm_mulhi_epi16(__m128i a, __m128i b) 666{ 667 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 668} 669 670static inline __m128i __attribute__((__always_inline__, __nodebug__)) 671_mm_mulhi_epu16(__m128i a, __m128i b) 672{ 673 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 674} 675 676static inline __m128i 
__attribute__((__always_inline__, __nodebug__)) 677_mm_mullo_epi16(__m128i a, __m128i b) 678{ 679 return (__m128i)((__v8hi)a * (__v8hi)b); 680} 681 682static inline __m64 __attribute__((__always_inline__, __nodebug__)) 683_mm_mul_su32(__m64 a, __m64 b) 684{ 685 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 686} 687 688static inline __m128i __attribute__((__always_inline__, __nodebug__)) 689_mm_mul_epu32(__m128i a, __m128i b) 690{ 691 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 692} 693 694static inline __m128i __attribute__((__always_inline__, __nodebug__)) 695_mm_sad_epu8(__m128i a, __m128i b) 696{ 697 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 698} 699 700static inline __m128i __attribute__((__always_inline__, __nodebug__)) 701_mm_sub_epi8(__m128i a, __m128i b) 702{ 703 return (__m128i)((__v16qi)a - (__v16qi)b); 704} 705 706static inline __m128i __attribute__((__always_inline__, __nodebug__)) 707_mm_sub_epi16(__m128i a, __m128i b) 708{ 709 return (__m128i)((__v8hi)a - (__v8hi)b); 710} 711 712static inline __m128i __attribute__((__always_inline__, __nodebug__)) 713_mm_sub_epi32(__m128i a, __m128i b) 714{ 715 return (__m128i)((__v4si)a - (__v4si)b); 716} 717 718static inline __m64 __attribute__((__always_inline__, __nodebug__)) 719_mm_sub_si64(__m64 a, __m64 b) 720{ 721 return a - b; 722} 723 724static inline __m128i __attribute__((__always_inline__, __nodebug__)) 725_mm_sub_epi64(__m128i a, __m128i b) 726{ 727 return a - b; 728} 729 730static inline __m128i __attribute__((__always_inline__, __nodebug__)) 731_mm_subs_epi8(__m128i a, __m128i b) 732{ 733 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 734} 735 736static inline __m128i __attribute__((__always_inline__, __nodebug__)) 737_mm_subs_epi16(__m128i a, __m128i b) 738{ 739 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 740} 741 742static inline __m128i __attribute__((__always_inline__, __nodebug__)) 743_mm_subs_epu8(__m128i a, __m128i b) 744{ 
745 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 746} 747 748static inline __m128i __attribute__((__always_inline__, __nodebug__)) 749_mm_subs_epu16(__m128i a, __m128i b) 750{ 751 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 752} 753 754static inline __m128i __attribute__((__always_inline__, __nodebug__)) 755_mm_and_si128(__m128i a, __m128i b) 756{ 757 return a & b; 758} 759 760static inline __m128i __attribute__((__always_inline__, __nodebug__)) 761_mm_andnot_si128(__m128i a, __m128i b) 762{ 763 return ~a & b; 764} 765 766static inline __m128i __attribute__((__always_inline__, __nodebug__)) 767_mm_or_si128(__m128i a, __m128i b) 768{ 769 return a | b; 770} 771 772static inline __m128i __attribute__((__always_inline__, __nodebug__)) 773_mm_xor_si128(__m128i a, __m128i b) 774{ 775 return a ^ b; 776} 777 778static inline __m128i __attribute__((__always_inline__, __nodebug__)) 779_mm_slli_si128(__m128i a, int imm) 780{ 781 return __builtin_ia32_pslldqi128(a, imm * 8); 782} 783 784static inline __m128i __attribute__((__always_inline__, __nodebug__)) 785_mm_slli_epi16(__m128i a, int count) 786{ 787 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 788} 789 790static inline __m128i __attribute__((__always_inline__, __nodebug__)) 791_mm_sll_epi16(__m128i a, __m128i count) 792{ 793 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 794} 795 796static inline __m128i __attribute__((__always_inline__, __nodebug__)) 797_mm_slli_epi32(__m128i a, int count) 798{ 799 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 800} 801 802static inline __m128i __attribute__((__always_inline__, __nodebug__)) 803_mm_sll_epi32(__m128i a, __m128i count) 804{ 805 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 806} 807 808static inline __m128i __attribute__((__always_inline__, __nodebug__)) 809_mm_slli_epi64(__m128i a, int count) 810{ 811 return __builtin_ia32_psllqi128(a, count); 812} 813 814static 
inline __m128i __attribute__((__always_inline__, __nodebug__)) 815_mm_sll_epi64(__m128i a, __m128i count) 816{ 817 return __builtin_ia32_psllq128(a, count); 818} 819 820static inline __m128i __attribute__((__always_inline__, __nodebug__)) 821_mm_srai_epi16(__m128i a, int count) 822{ 823 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 824} 825 826static inline __m128i __attribute__((__always_inline__, __nodebug__)) 827_mm_sra_epi16(__m128i a, __m128i count) 828{ 829 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 830} 831 832static inline __m128i __attribute__((__always_inline__, __nodebug__)) 833_mm_srai_epi32(__m128i a, int count) 834{ 835 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 836} 837 838static inline __m128i __attribute__((__always_inline__, __nodebug__)) 839_mm_sra_epi32(__m128i a, __m128i count) 840{ 841 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 842} 843 844static inline __m128i __attribute__((__always_inline__, __nodebug__)) 845_mm_srli_si128(__m128i a, int imm) 846{ 847 return __builtin_ia32_psrldqi128(a, imm * 8); 848} 849 850static inline __m128i __attribute__((__always_inline__, __nodebug__)) 851_mm_srli_epi16(__m128i a, int count) 852{ 853 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 854} 855 856static inline __m128i __attribute__((__always_inline__, __nodebug__)) 857_mm_srl_epi16(__m128i a, __m128i count) 858{ 859 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 860} 861 862static inline __m128i __attribute__((__always_inline__, __nodebug__)) 863_mm_srli_epi32(__m128i a, int count) 864{ 865 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 866} 867 868static inline __m128i __attribute__((__always_inline__, __nodebug__)) 869_mm_srl_epi32(__m128i a, __m128i count) 870{ 871 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 872} 873 874static inline __m128i __attribute__((__always_inline__, __nodebug__)) 
875_mm_srli_epi64(__m128i a, int count) 876{ 877 return __builtin_ia32_psrlqi128(a, count); 878} 879 880static inline __m128i __attribute__((__always_inline__, __nodebug__)) 881_mm_srl_epi64(__m128i a, __m128i count) 882{ 883 return __builtin_ia32_psrlq128(a, count); 884} 885 886static inline __m128i __attribute__((__always_inline__, __nodebug__)) 887_mm_cmpeq_epi8(__m128i a, __m128i b) 888{ 889 return (__m128i)((__v16qi)a == (__v16qi)b); 890} 891 892static inline __m128i __attribute__((__always_inline__, __nodebug__)) 893_mm_cmpeq_epi16(__m128i a, __m128i b) 894{ 895 return (__m128i)((__v8hi)a == (__v8hi)b); 896} 897 898static inline __m128i __attribute__((__always_inline__, __nodebug__)) 899_mm_cmpeq_epi32(__m128i a, __m128i b) 900{ 901 return (__m128i)((__v4si)a == (__v4si)b); 902} 903 904static inline __m128i __attribute__((__always_inline__, __nodebug__)) 905_mm_cmpgt_epi8(__m128i a, __m128i b) 906{ 907 return (__m128i)((__v16qi)a > (__v16qi)b); 908} 909 910static inline __m128i __attribute__((__always_inline__, __nodebug__)) 911_mm_cmpgt_epi16(__m128i a, __m128i b) 912{ 913 return (__m128i)((__v8hi)a > (__v8hi)b); 914} 915 916static inline __m128i __attribute__((__always_inline__, __nodebug__)) 917_mm_cmpgt_epi32(__m128i a, __m128i b) 918{ 919 return (__m128i)((__v4si)a > (__v4si)b); 920} 921 922static inline __m128i __attribute__((__always_inline__, __nodebug__)) 923_mm_cmplt_epi8(__m128i a, __m128i b) 924{ 925 return _mm_cmpgt_epi8(b,a); 926} 927 928static inline __m128i __attribute__((__always_inline__, __nodebug__)) 929_mm_cmplt_epi16(__m128i a, __m128i b) 930{ 931 return _mm_cmpgt_epi16(b,a); 932} 933 934static inline __m128i __attribute__((__always_inline__, __nodebug__)) 935_mm_cmplt_epi32(__m128i a, __m128i b) 936{ 937 return _mm_cmpgt_epi32(b,a); 938} 939 940#ifdef __x86_64__ 941static inline __m128d __attribute__((__always_inline__, __nodebug__)) 942_mm_cvtsi64_sd(__m128d a, long long b) 943{ 944 a[0] = b; 945 return a; 946} 947 948static inline 
long long __attribute__((__always_inline__, __nodebug__)) 949_mm_cvtsd_si64(__m128d a) 950{ 951 return __builtin_ia32_cvtsd2si64(a); 952} 953 954static inline long long __attribute__((__always_inline__, __nodebug__)) 955_mm_cvttsd_si64(__m128d a) 956{ 957 return a[0]; 958} 959#endif 960 961static inline __m128 __attribute__((__always_inline__, __nodebug__)) 962_mm_cvtepi32_ps(__m128i a) 963{ 964 return __builtin_ia32_cvtdq2ps((__v4si)a); 965} 966 967static inline __m128i __attribute__((__always_inline__, __nodebug__)) 968_mm_cvtps_epi32(__m128 a) 969{ 970 return (__m128i)__builtin_ia32_cvtps2dq(a); 971} 972 973static inline __m128i __attribute__((__always_inline__, __nodebug__)) 974_mm_cvttps_epi32(__m128 a) 975{ 976 return (__m128i)__builtin_ia32_cvttps2dq(a); 977} 978 979static inline __m128i __attribute__((__always_inline__, __nodebug__)) 980_mm_cvtsi32_si128(int a) 981{ 982 return (__m128i)(__v4si){ a, 0, 0, 0 }; 983} 984 985#ifdef __x86_64__ 986static inline __m128i __attribute__((__always_inline__, __nodebug__)) 987_mm_cvtsi64_si128(long long a) 988{ 989 return (__m128i){ a, 0 }; 990} 991#endif 992 993static inline int __attribute__((__always_inline__, __nodebug__)) 994_mm_cvtsi128_si32(__m128i a) 995{ 996 __v4si b = (__v4si)a; 997 return b[0]; 998} 999 1000#ifdef __x86_64__ 1001static inline long long __attribute__((__always_inline__, __nodebug__)) 1002_mm_cvtsi128_si64(__m128i a) 1003{ 1004 return a[0]; 1005} 1006#endif 1007 1008static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1009_mm_load_si128(__m128i const *p) 1010{ 1011 return *p; 1012} 1013 1014static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1015_mm_loadu_si128(__m128i const *p) 1016{ 1017 return (__m128i)__builtin_ia32_loaddqu((char const *)p); 1018} 1019 1020static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1021_mm_loadl_epi64(__m128i const *p) 1022{ 1023 return (__m128i) { *(long long*)p, 0}; 1024} 1025 1026static inline __m128i 
__attribute__((__always_inline__, __nodebug__)) 1027_mm_set_epi64x(long long q1, long long q0) 1028{ 1029 return (__m128i){ q0, q1 }; 1030} 1031 1032static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1033_mm_set_epi64(__m64 q1, __m64 q0) 1034{ 1035 return (__m128i){ (long long)q0, (long long)q1 }; 1036} 1037 1038static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1039_mm_set_epi32(int i3, int i2, int i1, int i0) 1040{ 1041 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1042} 1043 1044static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1045_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1046{ 1047 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1048} 1049 1050static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1051_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1052{ 1053 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1054} 1055 1056static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1057_mm_set1_epi64x(long long q) 1058{ 1059 return (__m128i){ q, q }; 1060} 1061 1062static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1063_mm_set1_epi64(__m64 q) 1064{ 1065 return (__m128i){ (long long)q, (long long)q }; 1066} 1067 1068static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1069_mm_set1_epi32(int i) 1070{ 1071 return (__m128i)(__v4si){ i, i, i, i }; 1072} 1073 1074static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1075_mm_set1_epi16(short w) 1076{ 1077 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; 1078} 1079 1080static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1081_mm_set1_epi8(char b) 1082{ 1083 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 1084} 1085 
1086static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1087_mm_setr_epi64(__m64 q0, __m64 q1) 1088{ 1089 return (__m128i){ (long long)q0, (long long)q1 }; 1090} 1091 1092static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1093_mm_setr_epi32(int i0, int i1, int i2, int i3) 1094{ 1095 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1096} 1097 1098static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1099_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1100{ 1101 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1102} 1103 1104static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1105_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1106{ 1107 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1108} 1109 1110static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1111_mm_setzero_si128(void) 1112{ 1113 return (__m128i){ 0LL, 0LL }; 1114} 1115 1116static inline void __attribute__((__always_inline__, __nodebug__)) 1117_mm_store_si128(__m128i *p, __m128i b) 1118{ 1119 *p = b; 1120} 1121 1122static inline void __attribute__((__always_inline__, __nodebug__)) 1123_mm_storeu_si128(__m128i *p, __m128i b) 1124{ 1125 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 1126} 1127 1128static inline void __attribute__((__always_inline__, __nodebug__)) 1129_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 1130{ 1131 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 1132} 1133 1134static inline void __attribute__((__always_inline__, __nodebug__)) 1135_mm_storel_epi64(__m128i *p, __m128i a) 1136{ 1137 __builtin_ia32_storelv4si((__v2si *)p, a); 1138} 1139 1140static inline void __attribute__((__always_inline__, __nodebug__)) 1141_mm_stream_pd(double *p, __m128d a) 1142{ 1143 
__builtin_ia32_movntpd(p, a); 1144} 1145 1146static inline void __attribute__((__always_inline__, __nodebug__)) 1147_mm_stream_si128(__m128i *p, __m128i a) 1148{ 1149 __builtin_ia32_movntdq(p, a); 1150} 1151 1152static inline void __attribute__((__always_inline__, __nodebug__)) 1153_mm_stream_si32(int *p, int a) 1154{ 1155 __builtin_ia32_movnti(p, a); 1156} 1157 1158static inline void __attribute__((__always_inline__, __nodebug__)) 1159_mm_clflush(void const *p) 1160{ 1161 __builtin_ia32_clflush(p); 1162} 1163 1164static inline void __attribute__((__always_inline__, __nodebug__)) 1165_mm_lfence(void) 1166{ 1167 __builtin_ia32_lfence(); 1168} 1169 1170static inline void __attribute__((__always_inline__, __nodebug__)) 1171_mm_mfence(void) 1172{ 1173 __builtin_ia32_mfence(); 1174} 1175 1176static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1177_mm_packs_epi16(__m128i a, __m128i b) 1178{ 1179 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 1180} 1181 1182static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1183_mm_packs_epi32(__m128i a, __m128i b) 1184{ 1185 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 1186} 1187 1188static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1189_mm_packus_epi16(__m128i a, __m128i b) 1190{ 1191 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 1192} 1193 1194static inline int __attribute__((__always_inline__, __nodebug__)) 1195_mm_extract_epi16(__m128i a, int imm) 1196{ 1197 __v8hi b = (__v8hi)a; 1198 return b[imm]; 1199} 1200 1201static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1202_mm_insert_epi16(__m128i a, int b, int imm) 1203{ 1204 __v8hi c = (__v8hi)a; 1205 c[imm & 7] = b; 1206 return (__m128i)c; 1207} 1208 1209static inline int __attribute__((__always_inline__, __nodebug__)) 1210_mm_movemask_epi8(__m128i a) 1211{ 1212 return __builtin_ia32_pmovmskb128((__v16qi)a); 1213} 1214 1215#define _mm_shuffle_epi32(a, 
imm) \ 1216 ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) {0}, \ 1217 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1218 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6)) 1219#define _mm_shufflelo_epi16(a, imm) \ 1220 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \ 1221 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1222 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1223 4, 5, 6, 7)) 1224#define _mm_shufflehi_epi16(a, imm) \ 1225 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, 0, 1, 2, 3, \ 1226 4 + ((imm) & 0x3), 4 + ((imm) & 0xc) >> 2, \ 1227 4 + ((imm) & 0x30) >> 4, \ 1228 4 + ((imm) & 0xc0) >> 6)) 1229 1230static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1231_mm_unpackhi_epi8(__m128i a, __m128i b) 1232{ 1233 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1234} 1235 1236static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1237_mm_unpackhi_epi16(__m128i a, __m128i b) 1238{ 1239 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1240} 1241 1242static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1243_mm_unpackhi_epi32(__m128i a, __m128i b) 1244{ 1245 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1246} 1247 1248static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1249_mm_unpackhi_epi64(__m128i a, __m128i b) 1250{ 1251 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1252} 1253 1254static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1255_mm_unpacklo_epi8(__m128i a, __m128i b) 1256{ 1257 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1258} 1259 1260static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1261_mm_unpacklo_epi16(__m128i a, __m128i b) 1262{ 1263 return 
(__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1264} 1265 1266static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1267_mm_unpacklo_epi32(__m128i a, __m128i b) 1268{ 1269 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1270} 1271 1272static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1273_mm_unpacklo_epi64(__m128i a, __m128i b) 1274{ 1275 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1276} 1277 1278static inline __m64 __attribute__((__always_inline__, __nodebug__)) 1279_mm_movepi64_pi64(__m128i a) 1280{ 1281 return (__m64)a[0]; 1282} 1283 1284static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1285_mm_movpi64_pi64(__m64 a) 1286{ 1287 return (__m128i){ (long long)a, 0 }; 1288} 1289 1290static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1291_mm_move_epi64(__m128i a) 1292{ 1293 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); 1294} 1295 1296static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1297_mm_unpackhi_pd(__m128d a, __m128d b) 1298{ 1299 return __builtin_shufflevector(a, b, 1, 2+1); 1300} 1301 1302static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1303_mm_unpacklo_pd(__m128d a, __m128d b) 1304{ 1305 return __builtin_shufflevector(a, b, 0, 2+0); 1306} 1307 1308static inline int __attribute__((__always_inline__, __nodebug__)) 1309_mm_movemask_pd(__m128d a) 1310{ 1311 return __builtin_ia32_movmskpd(a); 1312} 1313 1314#define _mm_shuffle_pd(a, b, i) (__builtin_shufflevector((a), (b), (i) & 1, \ 1315 (((i) & 2) >> 1) + 2)) 1316 1317static inline __m128 __attribute__((__always_inline__, __nodebug__)) 1318_mm_castpd_ps(__m128d in) 1319{ 1320 return (__m128)in; 1321} 1322 1323static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1324_mm_castpd_si128(__m128d in) 1325{ 1326 return (__m128i)in; 1327} 1328 1329static inline __m128d 
__attribute__((__always_inline__, __nodebug__)) 1330_mm_castps_pd(__m128 in) 1331{ 1332 return (__m128d)in; 1333} 1334 1335static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1336_mm_castps_si128(__m128 in) 1337{ 1338 return (__m128i)in; 1339} 1340 1341static inline __m128 __attribute__((__always_inline__, __nodebug__)) 1342_mm_castsi128_ps(__m128i in) 1343{ 1344 return (__m128)in; 1345} 1346 1347static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1348_mm_castsi128_pd(__m128i in) 1349{ 1350 return (__m128d)in; 1351} 1352 1353static inline void __attribute__((__always_inline__, __nodebug__)) 1354_mm_pause(void) 1355{ 1356 __asm__ volatile ("pause"); 1357} 1358 1359#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1360 1361#endif /* __SSE2__ */ 1362 1363#endif /* __EMMINTRIN_H */ 1364