emmintrin.h revision ae8ecdd6dbaac2fc3e10f3146ec6bae28428cea3
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36typedef int __v4si __attribute__((__vector_size__(16))); 37typedef short __v8hi __attribute__((__vector_size__(16))); 38typedef char __v16qi __attribute__((__vector_size__(16))); 39 40static inline __m128d __attribute__((__always_inline__, __nodebug__)) 41_mm_add_sd(__m128d a, __m128d b) 42{ 43 return __builtin_ia32_addsd(a, b); 44} 45 46static inline __m128d __attribute__((__always_inline__, __nodebug__)) 47_mm_add_pd(__m128d a, __m128d b) 48{ 49 return a + b; 50} 51 52static inline __m128d __attribute__((__always_inline__, __nodebug__)) 53_mm_sub_sd(__m128d a, __m128d b) 54{ 55 return __builtin_ia32_subsd(a, b); 56} 57 58static inline __m128d __attribute__((__always_inline__, __nodebug__)) 59_mm_sub_pd(__m128d a, __m128d b) 60{ 61 return a - b; 62} 63 64static inline __m128d __attribute__((__always_inline__, __nodebug__)) 65_mm_mul_sd(__m128d a, __m128d b) 66{ 67 return __builtin_ia32_mulsd(a, b); 68} 69 70static inline __m128d __attribute__((__always_inline__, __nodebug__)) 71_mm_mul_pd(__m128d a, __m128d b) 72{ 73 return a * b; 74} 75 76static inline __m128d __attribute__((__always_inline__, __nodebug__)) 77_mm_div_sd(__m128d a, __m128d b) 78{ 79 return __builtin_ia32_divsd(a, b); 80} 81 82static inline __m128d __attribute__((__always_inline__, __nodebug__)) 83_mm_div_pd(__m128d a, __m128d b) 84{ 85 return a / b; 86} 87 88static inline __m128d __attribute__((__always_inline__, __nodebug__)) 89_mm_sqrt_sd(__m128d a, __m128d b) 90{ 91 __m128d c = __builtin_ia32_sqrtsd(b); 92 return (__m128d) { c[0], a[1] }; 93} 94 95static inline __m128d __attribute__((__always_inline__, __nodebug__)) 
96_mm_sqrt_pd(__m128d a) 97{ 98 return __builtin_ia32_sqrtpd(a); 99} 100 101static inline __m128d __attribute__((__always_inline__, __nodebug__)) 102_mm_min_sd(__m128d a, __m128d b) 103{ 104 return __builtin_ia32_minsd(a, b); 105} 106 107static inline __m128d __attribute__((__always_inline__, __nodebug__)) 108_mm_min_pd(__m128d a, __m128d b) 109{ 110 return __builtin_ia32_minpd(a, b); 111} 112 113static inline __m128d __attribute__((__always_inline__, __nodebug__)) 114_mm_max_sd(__m128d a, __m128d b) 115{ 116 return __builtin_ia32_maxsd(a, b); 117} 118 119static inline __m128d __attribute__((__always_inline__, __nodebug__)) 120_mm_max_pd(__m128d a, __m128d b) 121{ 122 return __builtin_ia32_maxpd(a, b); 123} 124 125static inline __m128d __attribute__((__always_inline__, __nodebug__)) 126_mm_and_pd(__m128d a, __m128d b) 127{ 128 return __builtin_ia32_andpd(a, b); 129} 130 131static inline __m128d __attribute__((__always_inline__, __nodebug__)) 132_mm_andnot_pd(__m128d a, __m128d b) 133{ 134 return __builtin_ia32_andnpd(a, b); 135} 136 137static inline __m128d __attribute__((__always_inline__, __nodebug__)) 138_mm_or_pd(__m128d a, __m128d b) 139{ 140 return __builtin_ia32_orpd(a, b); 141} 142 143static inline __m128d __attribute__((__always_inline__, __nodebug__)) 144_mm_xor_pd(__m128d a, __m128d b) 145{ 146 return __builtin_ia32_xorpd(a, b); 147} 148 149static inline __m128d __attribute__((__always_inline__, __nodebug__)) 150_mm_cmpeq_pd(__m128d a, __m128d b) 151{ 152 return (__m128d)__builtin_ia32_cmpeqpd(a, b); 153} 154 155static inline __m128d __attribute__((__always_inline__, __nodebug__)) 156_mm_cmplt_pd(__m128d a, __m128d b) 157{ 158 return (__m128d)__builtin_ia32_cmpltpd(a, b); 159} 160 161static inline __m128d __attribute__((__always_inline__, __nodebug__)) 162_mm_cmple_pd(__m128d a, __m128d b) 163{ 164 return (__m128d)__builtin_ia32_cmplepd(a, b); 165} 166 167static inline __m128d __attribute__((__always_inline__, __nodebug__)) 168_mm_cmpgt_pd(__m128d a, 
__m128d b) 169{ 170 return (__m128d)__builtin_ia32_cmpltpd(b, a); 171} 172 173static inline __m128d __attribute__((__always_inline__, __nodebug__)) 174_mm_cmpge_pd(__m128d a, __m128d b) 175{ 176 return (__m128d)__builtin_ia32_cmplepd(b, a); 177} 178 179static inline __m128d __attribute__((__always_inline__, __nodebug__)) 180_mm_cmpord_pd(__m128d a, __m128d b) 181{ 182 return (__m128d)__builtin_ia32_cmpordpd(a, b); 183} 184 185static inline __m128d __attribute__((__always_inline__, __nodebug__)) 186_mm_cmpunord_pd(__m128d a, __m128d b) 187{ 188 return (__m128d)__builtin_ia32_cmpunordpd(a, b); 189} 190 191static inline __m128d __attribute__((__always_inline__, __nodebug__)) 192_mm_cmpneq_pd(__m128d a, __m128d b) 193{ 194 return (__m128d)__builtin_ia32_cmpneqpd(a, b); 195} 196 197static inline __m128d __attribute__((__always_inline__, __nodebug__)) 198_mm_cmpnlt_pd(__m128d a, __m128d b) 199{ 200 return (__m128d)__builtin_ia32_cmpnltpd(a, b); 201} 202 203static inline __m128d __attribute__((__always_inline__, __nodebug__)) 204_mm_cmpnle_pd(__m128d a, __m128d b) 205{ 206 return (__m128d)__builtin_ia32_cmpnlepd(a, b); 207} 208 209static inline __m128d __attribute__((__always_inline__, __nodebug__)) 210_mm_cmpngt_pd(__m128d a, __m128d b) 211{ 212 return (__m128d)__builtin_ia32_cmpnltpd(b, a); 213} 214 215static inline __m128d __attribute__((__always_inline__, __nodebug__)) 216_mm_cmpnge_pd(__m128d a, __m128d b) 217{ 218 return (__m128d)__builtin_ia32_cmpnlepd(b, a); 219} 220 221static inline __m128d __attribute__((__always_inline__, __nodebug__)) 222_mm_cmpeq_sd(__m128d a, __m128d b) 223{ 224 return (__m128d)__builtin_ia32_cmpeqsd(a, b); 225} 226 227static inline __m128d __attribute__((__always_inline__, __nodebug__)) 228_mm_cmplt_sd(__m128d a, __m128d b) 229{ 230 return (__m128d)__builtin_ia32_cmpltsd(a, b); 231} 232 233static inline __m128d __attribute__((__always_inline__, __nodebug__)) 234_mm_cmple_sd(__m128d a, __m128d b) 235{ 236 return 
(__m128d)__builtin_ia32_cmplesd(a, b); 237} 238 239static inline __m128d __attribute__((__always_inline__, __nodebug__)) 240_mm_cmpgt_sd(__m128d a, __m128d b) 241{ 242 return (__m128d)__builtin_ia32_cmpltsd(b, a); 243} 244 245static inline __m128d __attribute__((__always_inline__, __nodebug__)) 246_mm_cmpge_sd(__m128d a, __m128d b) 247{ 248 return (__m128d)__builtin_ia32_cmplesd(b, a); 249} 250 251static inline __m128d __attribute__((__always_inline__, __nodebug__)) 252_mm_cmpord_sd(__m128d a, __m128d b) 253{ 254 return (__m128d)__builtin_ia32_cmpordsd(a, b); 255} 256 257static inline __m128d __attribute__((__always_inline__, __nodebug__)) 258_mm_cmpunord_sd(__m128d a, __m128d b) 259{ 260 return (__m128d)__builtin_ia32_cmpunordsd(a, b); 261} 262 263static inline __m128d __attribute__((__always_inline__, __nodebug__)) 264_mm_cmpneq_sd(__m128d a, __m128d b) 265{ 266 return (__m128d)__builtin_ia32_cmpneqsd(a, b); 267} 268 269static inline __m128d __attribute__((__always_inline__, __nodebug__)) 270_mm_cmpnlt_sd(__m128d a, __m128d b) 271{ 272 return (__m128d)__builtin_ia32_cmpnltsd(a, b); 273} 274 275static inline __m128d __attribute__((__always_inline__, __nodebug__)) 276_mm_cmpnle_sd(__m128d a, __m128d b) 277{ 278 return (__m128d)__builtin_ia32_cmpnlesd(a, b); 279} 280 281static inline __m128d __attribute__((__always_inline__, __nodebug__)) 282_mm_cmpngt_sd(__m128d a, __m128d b) 283{ 284 return (__m128d)__builtin_ia32_cmpnltsd(b, a); 285} 286 287static inline __m128d __attribute__((__always_inline__, __nodebug__)) 288_mm_cmpnge_sd(__m128d a, __m128d b) 289{ 290 return (__m128d)__builtin_ia32_cmpnlesd(b, a); 291} 292 293static inline int __attribute__((__always_inline__, __nodebug__)) 294_mm_comieq_sd(__m128d a, __m128d b) 295{ 296 return __builtin_ia32_comisdeq(a, b); 297} 298 299static inline int __attribute__((__always_inline__, __nodebug__)) 300_mm_comilt_sd(__m128d a, __m128d b) 301{ 302 return __builtin_ia32_comisdlt(a, b); 303} 304 305static inline int 
__attribute__((__always_inline__, __nodebug__)) 306_mm_comile_sd(__m128d a, __m128d b) 307{ 308 return __builtin_ia32_comisdle(a, b); 309} 310 311static inline int __attribute__((__always_inline__, __nodebug__)) 312_mm_comigt_sd(__m128d a, __m128d b) 313{ 314 return __builtin_ia32_comisdgt(a, b); 315} 316 317static inline int __attribute__((__always_inline__, __nodebug__)) 318_mm_comineq_sd(__m128d a, __m128d b) 319{ 320 return __builtin_ia32_comisdneq(a, b); 321} 322 323static inline int __attribute__((__always_inline__, __nodebug__)) 324_mm_ucomieq_sd(__m128d a, __m128d b) 325{ 326 return __builtin_ia32_ucomisdeq(a, b); 327} 328 329static inline int __attribute__((__always_inline__, __nodebug__)) 330_mm_ucomilt_sd(__m128d a, __m128d b) 331{ 332 return __builtin_ia32_ucomisdlt(a, b); 333} 334 335static inline int __attribute__((__always_inline__, __nodebug__)) 336_mm_ucomile_sd(__m128d a, __m128d b) 337{ 338 return __builtin_ia32_ucomisdle(a, b); 339} 340 341static inline int __attribute__((__always_inline__, __nodebug__)) 342_mm_ucomigt_sd(__m128d a, __m128d b) 343{ 344 return __builtin_ia32_ucomisdgt(a, b); 345} 346 347static inline int __attribute__((__always_inline__, __nodebug__)) 348_mm_ucomineq_sd(__m128d a, __m128d b) 349{ 350 return __builtin_ia32_ucomisdneq(a, b); 351} 352 353static inline __m128 __attribute__((__always_inline__, __nodebug__)) 354_mm_cvtpd_ps(__m128d a) 355{ 356 return __builtin_ia32_cvtpd2ps(a); 357} 358 359static inline __m128d __attribute__((__always_inline__, __nodebug__)) 360_mm_cvtps_pd(__m128 a) 361{ 362 return __builtin_ia32_cvtps2pd(a); 363} 364 365static inline __m128d __attribute__((__always_inline__, __nodebug__)) 366_mm_cvtepi32_pd(__m128i a) 367{ 368 return __builtin_ia32_cvtdq2pd((__v4si)a); 369} 370 371static inline __m128i __attribute__((__always_inline__, __nodebug__)) 372_mm_cvtpd_epi32(__m128d a) 373{ 374 return __builtin_ia32_cvtpd2dq(a); 375} 376 377static inline int __attribute__((__always_inline__, __nodebug__)) 
378_mm_cvtsd_si32(__m128d a) 379{ 380 return __builtin_ia32_cvtsd2si(a); 381} 382 383static inline __m128 __attribute__((__always_inline__, __nodebug__)) 384_mm_cvtsd_ss(__m128 a, __m128d b) 385{ 386 return __builtin_ia32_cvtsd2ss(a, b); 387} 388 389static inline __m128d __attribute__((__always_inline__, __nodebug__)) 390_mm_cvtsi32_sd(__m128d a, int b) 391{ 392 return __builtin_ia32_cvtsi2sd(a, b); 393} 394 395static inline __m128d __attribute__((__always_inline__, __nodebug__)) 396_mm_cvtss_sd(__m128d a, __m128 b) 397{ 398 return __builtin_ia32_cvtss2sd(a, b); 399} 400 401static inline __m128i __attribute__((__always_inline__, __nodebug__)) 402_mm_cvttpd_epi32(__m128d a) 403{ 404 return (__m128i)__builtin_ia32_cvttpd2dq(a); 405} 406 407static inline int __attribute__((__always_inline__, __nodebug__)) 408_mm_cvttsd_si32(__m128d a) 409{ 410 return __builtin_ia32_cvttsd2si(a); 411} 412 413static inline __m64 __attribute__((__always_inline__, __nodebug__)) 414_mm_cvtpd_pi32(__m128d a) 415{ 416 return (__m64)__builtin_ia32_cvtpd2pi(a); 417} 418 419static inline __m64 __attribute__((__always_inline__, __nodebug__)) 420_mm_cvttpd_pi32(__m128d a) 421{ 422 return (__m64)__builtin_ia32_cvttpd2pi(a); 423} 424 425static inline __m128d __attribute__((__always_inline__, __nodebug__)) 426_mm_cvtpi32_pd(__m64 a) 427{ 428 return __builtin_ia32_cvtpi2pd((__v2si)a); 429} 430 431static inline double __attribute__((__always_inline__, __nodebug__)) 432_mm_cvtsd_f64(__m128d a) 433{ 434 return a[0]; 435} 436 437static inline __m128d __attribute__((__always_inline__, __nodebug__)) 438_mm_load_pd(double const *dp) 439{ 440 return *(__m128d*)dp; 441} 442 443static inline __m128d __attribute__((__always_inline__, __nodebug__)) 444_mm_load1_pd(double const *dp) 445{ 446 return (__m128d){ dp[0], dp[0] }; 447} 448 449static inline __m128d __attribute__((__always_inline__, __nodebug__)) 450_mm_loadr_pd(double const *dp) 451{ 452 return (__m128d){ dp[1], dp[0] }; 453} 454 455static inline 
__m128d __attribute__((__always_inline__, __nodebug__)) 456_mm_loadu_pd(double const *dp) 457{ 458 return __builtin_ia32_loadupd(dp); 459} 460 461static inline __m128d __attribute__((__always_inline__, __nodebug__)) 462_mm_load_sd(double const *dp) 463{ 464 return (__m128d){ *dp, 0.0 }; 465} 466 467static inline __m128d __attribute__((__always_inline__, __nodebug__)) 468_mm_loadh_pd(__m128d a, double const *dp) 469{ 470 return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2); 471} 472 473static inline __m128d __attribute__((__always_inline__, __nodebug__)) 474_mm_loadl_pd(__m128d a, double const *dp) 475{ 476 return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1); 477} 478 479static inline __m128d __attribute__((__always_inline__, __nodebug__)) 480_mm_set_sd(double w) 481{ 482 return (__m128d){ w, 0 }; 483} 484 485static inline __m128d __attribute__((__always_inline__, __nodebug__)) 486_mm_set1_pd(double w) 487{ 488 return (__m128d){ w, w }; 489} 490 491static inline __m128d __attribute__((__always_inline__, __nodebug__)) 492_mm_set_pd(double w, double x) 493{ 494 return (__m128d){ w, x }; 495} 496 497static inline __m128d __attribute__((__always_inline__, __nodebug__)) 498_mm_setr_pd(double w, double x) 499{ 500 return (__m128d){ x, w }; 501} 502 503static inline __m128d __attribute__((__always_inline__, __nodebug__)) 504_mm_setzero_pd(void) 505{ 506 return (__m128d){ 0, 0 }; 507} 508 509static inline __m128d __attribute__((__always_inline__, __nodebug__)) 510_mm_move_sd(__m128d a, __m128d b) 511{ 512 return (__m128d){ b[0], a[1] }; 513} 514 515static inline void __attribute__((__always_inline__, __nodebug__)) 516_mm_store_sd(double *dp, __m128d a) 517{ 518 dp[0] = a[0]; 519} 520 521static inline void __attribute__((__always_inline__, __nodebug__)) 522_mm_store1_pd(double *dp, __m128d a) 523{ 524 dp[0] = a[0]; 525 dp[1] = a[0]; 526} 527 528static inline void __attribute__((__always_inline__, __nodebug__)) 529_mm_store_pd(double *dp, __m128d a) 530{ 531 *(__m128d 
*)dp = a; 532} 533 534static inline void __attribute__((__always_inline__, __nodebug__)) 535_mm_storeu_pd(double *dp, __m128d a) 536{ 537 __builtin_ia32_storeupd(dp, a); 538} 539 540static inline void __attribute__((__always_inline__, __nodebug__)) 541_mm_storer_pd(double *dp, __m128d a) 542{ 543 dp[0] = a[1]; 544 dp[1] = a[0]; 545} 546 547static inline void __attribute__((__always_inline__, __nodebug__)) 548_mm_storeh_pd(double *dp, __m128d a) 549{ 550 dp[0] = a[1]; 551} 552 553static inline void __attribute__((__always_inline__, __nodebug__)) 554_mm_storel_pd(double *dp, __m128d a) 555{ 556 dp[0] = a[0]; 557} 558 559static inline __m128i __attribute__((__always_inline__, __nodebug__)) 560_mm_add_epi8(__m128i a, __m128i b) 561{ 562 return (__m128i)((__v16qi)a + (__v16qi)b); 563} 564 565static inline __m128i __attribute__((__always_inline__, __nodebug__)) 566_mm_add_epi16(__m128i a, __m128i b) 567{ 568 return (__m128i)((__v8hi)a + (__v8hi)b); 569} 570 571static inline __m128i __attribute__((__always_inline__, __nodebug__)) 572_mm_add_epi32(__m128i a, __m128i b) 573{ 574 return (__m128i)((__v4si)a + (__v4si)b); 575} 576 577static inline __m64 __attribute__((__always_inline__, __nodebug__)) 578_mm_add_si64(__m64 a, __m64 b) 579{ 580 return a + b; 581} 582 583static inline __m128i __attribute__((__always_inline__, __nodebug__)) 584_mm_add_epi64(__m128i a, __m128i b) 585{ 586 return a + b; 587} 588 589static inline __m128i __attribute__((__always_inline__, __nodebug__)) 590_mm_adds_epi8(__m128i a, __m128i b) 591{ 592 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 593} 594 595static inline __m128i __attribute__((__always_inline__, __nodebug__)) 596_mm_adds_epi16(__m128i a, __m128i b) 597{ 598 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 599} 600 601static inline __m128i __attribute__((__always_inline__, __nodebug__)) 602_mm_adds_epu8(__m128i a, __m128i b) 603{ 604 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 
605} 606 607static inline __m128i __attribute__((__always_inline__, __nodebug__)) 608_mm_adds_epu16(__m128i a, __m128i b) 609{ 610 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 611} 612 613static inline __m128i __attribute__((__always_inline__, __nodebug__)) 614_mm_avg_epu8(__m128i a, __m128i b) 615{ 616 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 617} 618 619static inline __m128i __attribute__((__always_inline__, __nodebug__)) 620_mm_avg_epu16(__m128i a, __m128i b) 621{ 622 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 623} 624 625static inline __m128i __attribute__((__always_inline__, __nodebug__)) 626_mm_madd_epi16(__m128i a, __m128i b) 627{ 628 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 629} 630 631static inline __m128i __attribute__((__always_inline__, __nodebug__)) 632_mm_max_epi16(__m128i a, __m128i b) 633{ 634 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 635} 636 637static inline __m128i __attribute__((__always_inline__, __nodebug__)) 638_mm_max_epu8(__m128i a, __m128i b) 639{ 640 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 641} 642 643static inline __m128i __attribute__((__always_inline__, __nodebug__)) 644_mm_min_epi16(__m128i a, __m128i b) 645{ 646 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 647} 648 649static inline __m128i __attribute__((__always_inline__, __nodebug__)) 650_mm_min_epu8(__m128i a, __m128i b) 651{ 652 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 653} 654 655static inline __m128i __attribute__((__always_inline__, __nodebug__)) 656_mm_mulhi_epi16(__m128i a, __m128i b) 657{ 658 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 659} 660 661static inline __m128i __attribute__((__always_inline__, __nodebug__)) 662_mm_mulhi_epu16(__m128i a, __m128i b) 663{ 664 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 665} 666 667static inline __m128i 
__attribute__((__always_inline__, __nodebug__)) 668_mm_mullo_epi16(__m128i a, __m128i b) 669{ 670 return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b); 671} 672 673static inline __m64 __attribute__((__always_inline__, __nodebug__)) 674_mm_mul_su32(__m64 a, __m64 b) 675{ 676 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 677} 678 679static inline __m128i __attribute__((__always_inline__, __nodebug__)) 680_mm_mul_epu32(__m128i a, __m128i b) 681{ 682 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 683} 684 685static inline __m128i __attribute__((__always_inline__, __nodebug__)) 686_mm_sad_epu8(__m128i a, __m128i b) 687{ 688 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 689} 690 691static inline __m128i __attribute__((__always_inline__, __nodebug__)) 692_mm_sub_epi8(__m128i a, __m128i b) 693{ 694 return (__m128i)((__v16qi)a - (__v16qi)b); 695} 696 697static inline __m128i __attribute__((__always_inline__, __nodebug__)) 698_mm_sub_epi16(__m128i a, __m128i b) 699{ 700 return (__m128i)((__v8hi)a - (__v8hi)b); 701} 702 703static inline __m128i __attribute__((__always_inline__, __nodebug__)) 704_mm_sub_epi32(__m128i a, __m128i b) 705{ 706 return (__m128i)((__v4si)a - (__v4si)b); 707} 708 709static inline __m64 __attribute__((__always_inline__, __nodebug__)) 710_mm_sub_si64(__m64 a, __m64 b) 711{ 712 return a - b; 713} 714 715static inline __m128i __attribute__((__always_inline__, __nodebug__)) 716_mm_sub_epi64(__m128i a, __m128i b) 717{ 718 return a - b; 719} 720 721static inline __m128i __attribute__((__always_inline__, __nodebug__)) 722_mm_subs_epi8(__m128i a, __m128i b) 723{ 724 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 725} 726 727static inline __m128i __attribute__((__always_inline__, __nodebug__)) 728_mm_subs_epi16(__m128i a, __m128i b) 729{ 730 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 731} 732 733static inline __m128i __attribute__((__always_inline__, __nodebug__)) 
734_mm_subs_epu8(__m128i a, __m128i b) 735{ 736 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 737} 738 739static inline __m128i __attribute__((__always_inline__, __nodebug__)) 740_mm_subs_epu16(__m128i a, __m128i b) 741{ 742 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 743} 744 745static inline __m128i __attribute__((__always_inline__, __nodebug__)) 746_mm_and_si128(__m128i a, __m128i b) 747{ 748 return __builtin_ia32_pand128(a, b); 749} 750 751static inline __m128i __attribute__((__always_inline__, __nodebug__)) 752_mm_andnot_si128(__m128i a, __m128i b) 753{ 754 return __builtin_ia32_pandn128(a, b); 755} 756 757static inline __m128i __attribute__((__always_inline__, __nodebug__)) 758_mm_or_si128(__m128i a, __m128i b) 759{ 760 return __builtin_ia32_por128(a, b); 761} 762 763static inline __m128i __attribute__((__always_inline__, __nodebug__)) 764_mm_xor_si128(__m128i a, __m128i b) 765{ 766 return __builtin_ia32_pxor128(a, b); 767} 768 769static inline __m128i __attribute__((__always_inline__, __nodebug__)) 770_mm_slli_si128(__m128i a, int imm) 771{ 772 return __builtin_ia32_pslldqi128(a, imm * 8); 773} 774 775static inline __m128i __attribute__((__always_inline__, __nodebug__)) 776_mm_slli_epi16(__m128i a, int count) 777{ 778 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 779} 780 781static inline __m128i __attribute__((__always_inline__, __nodebug__)) 782_mm_sll_epi16(__m128i a, __m128i count) 783{ 784 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 785} 786 787static inline __m128i __attribute__((__always_inline__, __nodebug__)) 788_mm_slli_epi32(__m128i a, int count) 789{ 790 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 791} 792 793static inline __m128i __attribute__((__always_inline__, __nodebug__)) 794_mm_sll_epi32(__m128i a, __m128i count) 795{ 796 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 797} 798 799static inline __m128i 
__attribute__((__always_inline__, __nodebug__)) 800_mm_slli_epi64(__m128i a, int count) 801{ 802 return __builtin_ia32_psllqi128(a, count); 803} 804 805static inline __m128i __attribute__((__always_inline__, __nodebug__)) 806_mm_sll_epi64(__m128i a, __m128i count) 807{ 808 return __builtin_ia32_psllq128(a, count); 809} 810 811static inline __m128i __attribute__((__always_inline__, __nodebug__)) 812_mm_srai_epi16(__m128i a, int count) 813{ 814 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 815} 816 817static inline __m128i __attribute__((__always_inline__, __nodebug__)) 818_mm_sra_epi16(__m128i a, __m128i count) 819{ 820 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 821} 822 823static inline __m128i __attribute__((__always_inline__, __nodebug__)) 824_mm_srai_epi32(__m128i a, int count) 825{ 826 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 827} 828 829static inline __m128i __attribute__((__always_inline__, __nodebug__)) 830_mm_sra_epi32(__m128i a, __m128i count) 831{ 832 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 833} 834 835static inline __m128i __attribute__((__always_inline__, __nodebug__)) 836_mm_srli_si128(__m128i a, int imm) 837{ 838 return __builtin_ia32_psrldqi128(a, imm * 8); 839} 840 841static inline __m128i __attribute__((__always_inline__, __nodebug__)) 842_mm_srli_epi16(__m128i a, int count) 843{ 844 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 845} 846 847static inline __m128i __attribute__((__always_inline__, __nodebug__)) 848_mm_srl_epi16(__m128i a, __m128i count) 849{ 850 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 851} 852 853static inline __m128i __attribute__((__always_inline__, __nodebug__)) 854_mm_srli_epi32(__m128i a, int count) 855{ 856 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 857} 858 859static inline __m128i __attribute__((__always_inline__, __nodebug__)) 860_mm_srl_epi32(__m128i a, __m128i count) 861{ 862 return 
(__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 863} 864 865static inline __m128i __attribute__((__always_inline__, __nodebug__)) 866_mm_srli_epi64(__m128i a, int count) 867{ 868 return __builtin_ia32_psrlqi128(a, count); 869} 870 871static inline __m128i __attribute__((__always_inline__, __nodebug__)) 872_mm_srl_epi64(__m128i a, __m128i count) 873{ 874 return __builtin_ia32_psrlq128(a, count); 875} 876 877static inline __m128i __attribute__((__always_inline__, __nodebug__)) 878_mm_cmpeq_epi8(__m128i a, __m128i b) 879{ 880 return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b); 881} 882 883static inline __m128i __attribute__((__always_inline__, __nodebug__)) 884_mm_cmpeq_epi16(__m128i a, __m128i b) 885{ 886 return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b); 887} 888 889static inline __m128i __attribute__((__always_inline__, __nodebug__)) 890_mm_cmpeq_epi32(__m128i a, __m128i b) 891{ 892 return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b); 893} 894 895static inline __m128i __attribute__((__always_inline__, __nodebug__)) 896_mm_cmpgt_epi8(__m128i a, __m128i b) 897{ 898 return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b); 899} 900 901static inline __m128i __attribute__((__always_inline__, __nodebug__)) 902_mm_cmpgt_epi16(__m128i a, __m128i b) 903{ 904 return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b); 905} 906 907static inline __m128i __attribute__((__always_inline__, __nodebug__)) 908_mm_cmpgt_epi32(__m128i a, __m128i b) 909{ 910 return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b); 911} 912 913static inline __m128i __attribute__((__always_inline__, __nodebug__)) 914_mm_cmplt_epi8(__m128i a, __m128i b) 915{ 916 return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a); 917} 918 919static inline __m128i __attribute__((__always_inline__, __nodebug__)) 920_mm_cmplt_epi16(__m128i a, __m128i b) 921{ 922 return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a); 923} 924 
925static inline __m128i __attribute__((__always_inline__, __nodebug__)) 926_mm_cmplt_epi32(__m128i a, __m128i b) 927{ 928 return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a); 929} 930 931#ifdef __x86_64__ 932static inline __m128d __attribute__((__always_inline__, __nodebug__)) 933_mm_cvtsi64_sd(__m128d a, long long b) 934{ 935 return __builtin_ia32_cvtsi642sd(a, b); 936} 937 938static inline long long __attribute__((__always_inline__, __nodebug__)) 939_mm_cvtsd_si64(__m128d a) 940{ 941 return __builtin_ia32_cvtsd2si64(a); 942} 943 944static inline long long __attribute__((__always_inline__, __nodebug__)) 945_mm_cvttsd_si64(__m128d a) 946{ 947 return __builtin_ia32_cvttsd2si64(a); 948} 949#endif 950 951static inline __m128 __attribute__((__always_inline__, __nodebug__)) 952_mm_cvtepi32_ps(__m128i a) 953{ 954 return __builtin_ia32_cvtdq2ps((__v4si)a); 955} 956 957static inline __m128i __attribute__((__always_inline__, __nodebug__)) 958_mm_cvtps_epi32(__m128 a) 959{ 960 return (__m128i)__builtin_ia32_cvtps2dq(a); 961} 962 963static inline __m128i __attribute__((__always_inline__, __nodebug__)) 964_mm_cvttps_epi32(__m128 a) 965{ 966 return (__m128i)__builtin_ia32_cvttps2dq(a); 967} 968 969static inline __m128i __attribute__((__always_inline__, __nodebug__)) 970_mm_cvtsi32_si128(int a) 971{ 972 return (__m128i)(__v4si){ a, 0, 0, 0 }; 973} 974 975#ifdef __x86_64__ 976static inline __m128i __attribute__((__always_inline__, __nodebug__)) 977_mm_cvtsi64_si128(long long a) 978{ 979 return (__m128i){ a, 0 }; 980} 981#endif 982 983static inline int __attribute__((__always_inline__, __nodebug__)) 984_mm_cvtsi128_si32(__m128i a) 985{ 986 __v4si b = (__v4si)a; 987 return b[0]; 988} 989 990#ifdef __x86_64__ 991static inline long long __attribute__((__always_inline__, __nodebug__)) 992_mm_cvtsi128_si64(__m128i a) 993{ 994 return a[0]; 995} 996#endif 997 998static inline __m128i __attribute__((__always_inline__, __nodebug__)) 999_mm_load_si128(__m128i const *p) 1000{ 
1001 return *p; 1002} 1003 1004static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1005_mm_loadu_si128(__m128i const *p) 1006{ 1007 return (__m128i)__builtin_ia32_loaddqu((char const *)p); 1008} 1009 1010static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1011_mm_loadl_epi64(__m128i const *p) 1012{ 1013 return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p); 1014} 1015 1016static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1017_mm_set_epi64(__m64 q1, __m64 q0) 1018{ 1019 return (__m128i){ (long long)q0, (long long)q1 }; 1020} 1021 1022static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1023_mm_set_epi32(int i3, int i2, int i1, int i0) 1024{ 1025 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1026} 1027 1028static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1029_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1030{ 1031 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1032} 1033 1034static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1035_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1036{ 1037 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1038} 1039 1040static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1041_mm_set1_epi64(__m64 q) 1042{ 1043 return (__m128i){ (long long)q, (long long)q }; 1044} 1045 1046static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1047_mm_set1_epi32(int i) 1048{ 1049 return (__m128i)(__v4si){ i, i, i, i }; 1050} 1051 1052static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1053_mm_set1_epi16(short w) 1054{ 1055 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; 1056} 1057 1058static inline __m128i __attribute__((__always_inline__, __nodebug__)) 
1059_mm_set1_epi8(char b) 1060{ 1061 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 1062} 1063 1064static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1065_mm_setr_epi64(__m64 q0, __m64 q1) 1066{ 1067 return (__m128i){ (long long)q0, (long long)q1 }; 1068} 1069 1070static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1071_mm_setr_epi32(int i0, int i1, int i2, int i3) 1072{ 1073 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1074} 1075 1076static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1077_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1078{ 1079 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1080} 1081 1082static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1083_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1084{ 1085 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1086} 1087 1088static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1089_mm_setzero_si128(void) 1090{ 1091 return (__m128i){ 0LL, 0LL }; 1092} 1093 1094static inline void __attribute__((__always_inline__, __nodebug__)) 1095_mm_store_si128(__m128i *p, __m128i b) 1096{ 1097 *p = b; 1098} 1099 1100static inline void __attribute__((__always_inline__, __nodebug__)) 1101_mm_storeu_si128(__m128i *p, __m128i b) 1102{ 1103 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 1104} 1105 1106static inline void __attribute__((__always_inline__, __nodebug__)) 1107_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 1108{ 1109 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 1110} 1111 1112static inline void __attribute__((__always_inline__, __nodebug__)) 1113_mm_storel_epi64(__m128i *p, __m128i a) 1114{ 1115 __builtin_ia32_storelv4si((__v2si *)p, a); 1116} 1117 1118static 
inline void __attribute__((__always_inline__, __nodebug__)) 1119_mm_stream_pd(double *p, __m128d a) 1120{ 1121 __builtin_ia32_movntpd(p, a); 1122} 1123 1124static inline void __attribute__((__always_inline__, __nodebug__)) 1125_mm_stream_si128(__m128i *p, __m128i a) 1126{ 1127 __builtin_ia32_movntdq(p, a); 1128} 1129 1130static inline void __attribute__((__always_inline__, __nodebug__)) 1131_mm_stream_si32(int *p, int a) 1132{ 1133 __builtin_ia32_movnti(p, a); 1134} 1135 1136static inline void __attribute__((__always_inline__, __nodebug__)) 1137_mm_clflush(void const *p) 1138{ 1139 __builtin_ia32_clflush(p); 1140} 1141 1142static inline void __attribute__((__always_inline__, __nodebug__)) 1143_mm_lfence(void) 1144{ 1145 __builtin_ia32_lfence(); 1146} 1147 1148static inline void __attribute__((__always_inline__, __nodebug__)) 1149_mm_mfence(void) 1150{ 1151 __builtin_ia32_mfence(); 1152} 1153 1154static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1155_mm_packs_epi16(__m128i a, __m128i b) 1156{ 1157 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 1158} 1159 1160static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1161_mm_packs_epi32(__m128i a, __m128i b) 1162{ 1163 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 1164} 1165 1166static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1167_mm_packus_epi16(__m128i a, __m128i b) 1168{ 1169 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 1170} 1171 1172static inline int __attribute__((__always_inline__, __nodebug__)) 1173_mm_extract_epi16(__m128i a, int imm) 1174{ 1175 __v8hi b = (__v8hi)a; 1176 return b[imm]; 1177} 1178 1179static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1180_mm_insert_epi16(__m128i a, int b, int imm) 1181{ 1182 return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm); 1183} 1184 1185static inline int __attribute__((__always_inline__, __nodebug__)) 
1186_mm_movemask_epi8(__m128i a) 1187{ 1188 return __builtin_ia32_pmovmskb128((__v16qi)a); 1189} 1190 1191#define _mm_shuffle_epi32(a, imm) ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm))) 1192#define _mm_shufflehi_epi16(a, imm) ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm))) 1193#define _mm_shufflelo_epi16(a, imm) ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm))) 1194 1195static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1196_mm_unpackhi_epi8(__m128i a, __m128i b) 1197{ 1198 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1199} 1200 1201static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1202_mm_unpackhi_epi16(__m128i a, __m128i b) 1203{ 1204 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1205} 1206 1207static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1208_mm_unpackhi_epi32(__m128i a, __m128i b) 1209{ 1210 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1211} 1212 1213static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1214_mm_unpackhi_epi64(__m128i a, __m128i b) 1215{ 1216 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1217} 1218 1219static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1220_mm_unpacklo_epi8(__m128i a, __m128i b) 1221{ 1222 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1223} 1224 1225static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1226_mm_unpacklo_epi16(__m128i a, __m128i b) 1227{ 1228 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1229} 1230 1231static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1232_mm_unpacklo_epi32(__m128i a, __m128i b) 1233{ 1234 return 
(__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1235} 1236 1237static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1238_mm_unpacklo_epi64(__m128i a, __m128i b) 1239{ 1240 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1241} 1242 1243static inline __m64 __attribute__((__always_inline__, __nodebug__)) 1244_mm_movepi64_pi64(__m128i a) 1245{ 1246 return (__m64)a[0]; 1247} 1248 1249static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1250_mm_movpi64_pi64(__m64 a) 1251{ 1252 return (__m128i){ (long long)a, 0 }; 1253} 1254 1255static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1256_mm_move_epi64(__m128i a) 1257{ 1258 return (__m128i){ a[0], 0 }; 1259} 1260 1261static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1262_mm_unpackhi_pd(__m128d a, __m128d b) 1263{ 1264 return __builtin_shufflevector(a, b, 1, 2+1); 1265} 1266 1267static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1268_mm_unpacklo_pd(__m128d a, __m128d b) 1269{ 1270 return __builtin_shufflevector(a, b, 0, 2+0); 1271} 1272 1273static inline int __attribute__((__always_inline__, __nodebug__)) 1274_mm_movemask_pd(__m128d a) 1275{ 1276 return __builtin_ia32_movmskpd(a); 1277} 1278 1279#define _mm_shuffle_pd(a, b, i) (__builtin_ia32_shufpd((a), (b), (i))) 1280 1281static inline __m128 __attribute__((__always_inline__, __nodebug__)) 1282_mm_castpd_ps(__m128d in) 1283{ 1284 return (__m128)in; 1285} 1286 1287static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1288_mm_castpd_si128(__m128d in) 1289{ 1290 return (__m128i)in; 1291} 1292 1293static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1294_mm_castps_pd(__m128 in) 1295{ 1296 return (__m128d)in; 1297} 1298 1299static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1300_mm_castps_si128(__m128 in) 1301{ 1302 return (__m128i)in; 1303} 1304 1305static inline __m128 __attribute__((__always_inline__, 
__nodebug__)) 1306_mm_castsi128_ps(__m128i in) 1307{ 1308 return (__m128)in; 1309} 1310 1311static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1312_mm_castsi128_pd(__m128i in) 1313{ 1314 return (__m128d)in; 1315} 1316 1317static inline void __attribute__((__always_inline__, __nodebug__)) 1318_mm_pause(void) 1319{ 1320 __asm__ volatile ("pause"); 1321} 1322 1323#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1324 1325#endif /* __SSE2__ */ 1326 1327#endif /* __EMMINTRIN_H */ 1328