emmintrin.h revision 80c800465865aa15ec4b094407170c149ce344cd
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36typedef int __v4si __attribute__((__vector_size__(16))); 37typedef short __v8hi __attribute__((__vector_size__(16))); 38typedef char __v16qi __attribute__((__vector_size__(16))); 39 40static inline __m128d __attribute__((__always_inline__, __nodebug__)) 41_mm_add_sd(__m128d a, __m128d b) 42{ 43 a[0] += b[0]; 44 return a; 45} 46 47static inline __m128d __attribute__((__always_inline__, __nodebug__)) 48_mm_add_pd(__m128d a, __m128d b) 49{ 50 return a + b; 51} 52 53static inline __m128d __attribute__((__always_inline__, __nodebug__)) 54_mm_sub_sd(__m128d a, __m128d b) 55{ 56 a[0] -= b[0]; 57 return a; 58} 59 60static inline __m128d __attribute__((__always_inline__, __nodebug__)) 61_mm_sub_pd(__m128d a, __m128d b) 62{ 63 return a - b; 64} 65 66static inline __m128d __attribute__((__always_inline__, __nodebug__)) 67_mm_mul_sd(__m128d a, __m128d b) 68{ 69 a[0] *= b[0]; 70 return a; 71} 72 73static inline __m128d __attribute__((__always_inline__, __nodebug__)) 74_mm_mul_pd(__m128d a, __m128d b) 75{ 76 return a * b; 77} 78 79static inline __m128d __attribute__((__always_inline__, __nodebug__)) 80_mm_div_sd(__m128d a, __m128d b) 81{ 82 a[0] /= b[0]; 83 return a; 84} 85 86static inline __m128d __attribute__((__always_inline__, __nodebug__)) 87_mm_div_pd(__m128d a, __m128d b) 88{ 89 return a / b; 90} 91 92static inline __m128d __attribute__((__always_inline__, __nodebug__)) 93_mm_sqrt_sd(__m128d a, __m128d b) 94{ 95 __m128d c = __builtin_ia32_sqrtsd(b); 96 return (__m128d) { c[0], a[1] }; 97} 98 99static inline __m128d __attribute__((__always_inline__, __nodebug__)) 100_mm_sqrt_pd(__m128d a) 101{ 102 return __builtin_ia32_sqrtpd(a); 103} 104 105static inline __m128d __attribute__((__always_inline__, __nodebug__)) 106_mm_min_sd(__m128d a, __m128d b) 107{ 108 return __builtin_ia32_minsd(a, b); 109} 110 111static inline __m128d __attribute__((__always_inline__, __nodebug__)) 112_mm_min_pd(__m128d a, __m128d b) 113{ 114 return __builtin_ia32_minpd(a, b); 115} 116 117static inline __m128d __attribute__((__always_inline__, __nodebug__)) 118_mm_max_sd(__m128d a, __m128d b) 119{ 120 return __builtin_ia32_maxsd(a, b); 121} 122 123static inline __m128d __attribute__((__always_inline__, __nodebug__)) 124_mm_max_pd(__m128d a, __m128d b) 125{ 126 return __builtin_ia32_maxpd(a, b); 127} 128 129static inline __m128d __attribute__((__always_inline__, __nodebug__)) 130_mm_and_pd(__m128d a, __m128d b) 131{ 132 return (__m128)((__v4si)a & (__v4si)b); 133} 134 135static inline __m128d __attribute__((__always_inline__, __nodebug__)) 136_mm_andnot_pd(__m128d a, __m128d b) 137{ 138 return (__m128)(~(__v4si)a & (__v4si)b); 139} 140 141static inline __m128d __attribute__((__always_inline__, __nodebug__)) 142_mm_or_pd(__m128d a, __m128d b) 143{ 144 return (__m128)((__v4si)a | (__v4si)b); 145} 146 147static inline __m128d __attribute__((__always_inline__, __nodebug__)) 148_mm_xor_pd(__m128d a, __m128d b) 149{ 150 return (__m128)((__v4si)a ^ (__v4si)b); 151} 152 153static inline __m128d __attribute__((__always_inline__, __nodebug__)) 154_mm_cmpeq_pd(__m128d a, __m128d b) 155{ 156 return (__m128d)__builtin_ia32_cmppd(a, b, 0); 157} 158 159static inline __m128d __attribute__((__always_inline__, __nodebug__)) 160_mm_cmplt_pd(__m128d a, __m128d b) 161{ 162 return (__m128d)__builtin_ia32_cmppd(a, b, 1); 163} 164 165static inline __m128d __attribute__((__always_inline__, __nodebug__)) 166_mm_cmple_pd(__m128d a, __m128d b) 167{ 168 return (__m128d)__builtin_ia32_cmppd(a, b, 2); 169} 170 171static inline __m128d __attribute__((__always_inline__, __nodebug__)) 172_mm_cmpgt_pd(__m128d a, __m128d b) 173{ 174 return (__m128d)__builtin_ia32_cmppd(b, a, 1); 175} 176 177static inline __m128d __attribute__((__always_inline__, __nodebug__)) 178_mm_cmpge_pd(__m128d a, __m128d b) 179{ 180 return (__m128d)__builtin_ia32_cmppd(b, a, 2); 181} 182 183static inline __m128d __attribute__((__always_inline__, __nodebug__)) 184_mm_cmpord_pd(__m128d a, __m128d b) 185{ 186 return (__m128d)__builtin_ia32_cmppd(a, b, 7); 187} 188 189static inline __m128d __attribute__((__always_inline__, __nodebug__)) 190_mm_cmpunord_pd(__m128d a, __m128d b) 191{ 192 return (__m128d)__builtin_ia32_cmppd(a, b, 3); 193} 194 195static inline __m128d __attribute__((__always_inline__, __nodebug__)) 196_mm_cmpneq_pd(__m128d a, __m128d b) 197{ 198 return (__m128d)__builtin_ia32_cmppd(a, b, 4); 199} 200 201static inline __m128d __attribute__((__always_inline__, __nodebug__)) 202_mm_cmpnlt_pd(__m128d a, __m128d b) 203{ 204 return (__m128d)__builtin_ia32_cmppd(a, b, 5); 205} 206 207static inline __m128d __attribute__((__always_inline__, __nodebug__)) 208_mm_cmpnle_pd(__m128d a, __m128d b) 209{ 210 return (__m128d)__builtin_ia32_cmppd(a, b, 6); 211} 212 213static inline __m128d __attribute__((__always_inline__, __nodebug__)) 214_mm_cmpngt_pd(__m128d a, __m128d b) 215{ 216 return (__m128d)__builtin_ia32_cmppd(b, a, 5); 217} 218 219static inline __m128d __attribute__((__always_inline__, __nodebug__)) 220_mm_cmpnge_pd(__m128d a, __m128d b) 221{ 222 return (__m128d)__builtin_ia32_cmppd(b, a, 6); 223} 224 225static inline __m128d __attribute__((__always_inline__, __nodebug__)) 226_mm_cmpeq_sd(__m128d a, __m128d b) 227{ 228 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); 229} 230 231static inline __m128d __attribute__((__always_inline__, __nodebug__)) 232_mm_cmplt_sd(__m128d a, __m128d b) 233{ 234 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); 235} 236 237static inline __m128d __attribute__((__always_inline__, __nodebug__)) 238_mm_cmple_sd(__m128d a, __m128d b) 239{ 240 return (__m128d)__builtin_ia32_cmpsd(a, b, 2); 241} 242 243static inline __m128d __attribute__((__always_inline__, __nodebug__)) 244_mm_cmpgt_sd(__m128d a, __m128d b) 245{ 246 return (__m128d)__builtin_ia32_cmpsd(b, a, 1); 247} 248 249static inline __m128d __attribute__((__always_inline__, __nodebug__)) 250_mm_cmpge_sd(__m128d a, __m128d b) 251{ 252 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); 253} 254 255static inline __m128d __attribute__((__always_inline__, __nodebug__)) 256_mm_cmpord_sd(__m128d a, __m128d b) 257{ 258 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); 259} 260 261static inline __m128d __attribute__((__always_inline__, __nodebug__)) 262_mm_cmpunord_sd(__m128d a, __m128d b) 263{ 264 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); 265} 266 267static inline __m128d __attribute__((__always_inline__, __nodebug__)) 268_mm_cmpneq_sd(__m128d a, __m128d b) 269{ 270 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); 271} 272 273static inline __m128d __attribute__((__always_inline__, __nodebug__)) 274_mm_cmpnlt_sd(__m128d a, __m128d b) 275{ 276 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); 277} 278 279static inline __m128d __attribute__((__always_inline__, __nodebug__)) 280_mm_cmpnle_sd(__m128d a, __m128d b) 281{ 282 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); 283} 284 285static inline __m128d __attribute__((__always_inline__, __nodebug__)) 286_mm_cmpngt_sd(__m128d a, __m128d b) 287{ 288 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); 289} 290 291static inline __m128d __attribute__((__always_inline__, __nodebug__)) 292_mm_cmpnge_sd(__m128d a, __m128d b) 293{ 294 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); 295} 296 297static inline int __attribute__((__always_inline__, __nodebug__)) 298_mm_comieq_sd(__m128d a, __m128d b) 299{ 300 return __builtin_ia32_comisdeq(a, b); 301} 302 303static inline int __attribute__((__always_inline__, __nodebug__)) 304_mm_comilt_sd(__m128d a, __m128d b) 305{ 306 return __builtin_ia32_comisdlt(a, b); 307} 308 309static inline int __attribute__((__always_inline__, __nodebug__)) 310_mm_comile_sd(__m128d a, __m128d b) 311{ 312 return __builtin_ia32_comisdle(a, b); 313} 314 315static inline int __attribute__((__always_inline__, __nodebug__)) 316_mm_comigt_sd(__m128d a, __m128d b) 317{ 318 return __builtin_ia32_comisdgt(a, b); 319} 320 321static inline int __attribute__((__always_inline__, __nodebug__)) 322_mm_comineq_sd(__m128d a, __m128d b) 323{ 324 return __builtin_ia32_comisdneq(a, b); 325} 326 327static inline int __attribute__((__always_inline__, __nodebug__)) 328_mm_ucomieq_sd(__m128d a, __m128d b) 329{ 330 return __builtin_ia32_ucomisdeq(a, b); 331} 332 333static inline int __attribute__((__always_inline__, __nodebug__)) 334_mm_ucomilt_sd(__m128d a, __m128d b) 335{ 336 return __builtin_ia32_ucomisdlt(a, b); 337} 338 339static inline int __attribute__((__always_inline__, __nodebug__)) 340_mm_ucomile_sd(__m128d a, __m128d b) 341{ 342 return __builtin_ia32_ucomisdle(a, b); 343} 344 345static inline int __attribute__((__always_inline__, __nodebug__)) 346_mm_ucomigt_sd(__m128d a, __m128d b) 347{ 348 return __builtin_ia32_ucomisdgt(a, b); 349} 350 351static inline int __attribute__((__always_inline__, __nodebug__)) 352_mm_ucomineq_sd(__m128d a, __m128d b) 353{ 354 return __builtin_ia32_ucomisdneq(a, b); 355} 356 357static inline __m128 __attribute__((__always_inline__, __nodebug__)) 358_mm_cvtpd_ps(__m128d a) 359{ 360 return __builtin_ia32_cvtpd2ps(a); 361} 362 363static inline __m128d __attribute__((__always_inline__, __nodebug__)) 364_mm_cvtps_pd(__m128 a) 365{ 366 return __builtin_ia32_cvtps2pd(a); 367} 368 369static inline __m128d __attribute__((__always_inline__, __nodebug__)) 370_mm_cvtepi32_pd(__m128i a) 371{ 372 return __builtin_ia32_cvtdq2pd((__v4si)a); 373} 374 375static inline __m128i __attribute__((__always_inline__, __nodebug__)) 376_mm_cvtpd_epi32(__m128d a) 377{ 378 return __builtin_ia32_cvtpd2dq(a); 379} 380 381static inline int __attribute__((__always_inline__, __nodebug__)) 382_mm_cvtsd_si32(__m128d a) 383{ 384 return __builtin_ia32_cvtsd2si(a); 385} 386 387static inline __m128 __attribute__((__always_inline__, __nodebug__)) 388_mm_cvtsd_ss(__m128 a, __m128d b) 389{ 390 a[0] = b[0]; 391 return a; 392} 393 394static inline __m128d __attribute__((__always_inline__, __nodebug__)) 395_mm_cvtsi32_sd(__m128d a, int b) 396{ 397 return __builtin_ia32_cvtsi2sd(a, b); 398} 399 400static inline __m128d __attribute__((__always_inline__, __nodebug__)) 401_mm_cvtss_sd(__m128d a, __m128 b) 402{ 403 a[0] = b[0]; 404 return a; 405} 406 407static inline __m128i __attribute__((__always_inline__, __nodebug__)) 408_mm_cvttpd_epi32(__m128d a) 409{ 410 return (__m128i)__builtin_ia32_cvttpd2dq(a); 411} 412 413static inline int __attribute__((__always_inline__, __nodebug__)) 414_mm_cvttsd_si32(__m128d a) 415{ 416 return a[0]; 417} 418 419static inline __m64 __attribute__((__always_inline__, __nodebug__)) 420_mm_cvtpd_pi32(__m128d a) 421{ 422 return (__m64)__builtin_ia32_cvtpd2pi(a); 423} 424 425static inline __m64 __attribute__((__always_inline__, __nodebug__)) 426_mm_cvttpd_pi32(__m128d a) 427{ 428 return (__m64)__builtin_ia32_cvttpd2pi(a); 429} 430 431static inline __m128d __attribute__((__always_inline__, __nodebug__)) 432_mm_cvtpi32_pd(__m64 a) 433{ 434 return __builtin_ia32_cvtpi2pd((__v2si)a); 435} 436 437static inline double __attribute__((__always_inline__, __nodebug__)) 438_mm_cvtsd_f64(__m128d a) 439{ 440 return a[0]; 441} 442 443static inline __m128d __attribute__((__always_inline__, __nodebug__)) 444_mm_load_pd(double const *dp) 445{ 446 return *(__m128d*)dp; 447} 448 449static inline __m128d __attribute__((__always_inline__, __nodebug__)) 450_mm_load1_pd(double const *dp) 451{ 452 return (__m128d){ dp[0], dp[0] }; 453} 454 455#define _mm_load_pd1(dp) _mm_load1_pd(dp) 456 457static inline __m128d __attribute__((__always_inline__, __nodebug__)) 458_mm_loadr_pd(double const *dp) 459{ 460 return (__m128d){ dp[1], dp[0] }; 461} 462 463static inline __m128d __attribute__((__always_inline__, __nodebug__)) 464_mm_loadu_pd(double const *dp) 465{ 466 return __builtin_ia32_loadupd(dp); 467} 468 469static inline __m128d __attribute__((__always_inline__, __nodebug__)) 470_mm_load_sd(double const *dp) 471{ 472 return (__m128d){ *dp, 0.0 }; 473} 474 475static inline __m128d __attribute__((__always_inline__, __nodebug__)) 476_mm_loadh_pd(__m128d a, double const *dp) 477{ 478 return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2); 479} 480 481static inline __m128d __attribute__((__always_inline__, __nodebug__)) 482_mm_loadl_pd(__m128d a, double const *dp) 483{ 484 return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1); 485} 486 487static inline __m128d __attribute__((__always_inline__, __nodebug__)) 488_mm_set_sd(double w) 489{ 490 return (__m128d){ w, 0 }; 491} 492 493static inline __m128d __attribute__((__always_inline__, __nodebug__)) 494_mm_set1_pd(double w) 495{ 496 return (__m128d){ w, w }; 497} 498 499static inline __m128d __attribute__((__always_inline__, __nodebug__)) 500_mm_set_pd(double w, double x) 501{ 502 return (__m128d){ w, x }; 503} 504 505static inline __m128d __attribute__((__always_inline__, __nodebug__)) 506_mm_setr_pd(double w, double x) 507{ 508 return (__m128d){ x, w }; 509} 510 511static inline __m128d __attribute__((__always_inline__, __nodebug__)) 512_mm_setzero_pd(void) 513{ 514 return (__m128d){ 0, 0 }; 515} 516 517static inline __m128d __attribute__((__always_inline__, __nodebug__)) 518_mm_move_sd(__m128d a, __m128d b) 519{ 520 return (__m128d){ b[0], a[1] }; 521} 522 523static inline void __attribute__((__always_inline__, __nodebug__)) 524_mm_store_sd(double *dp, __m128d a) 525{ 526 dp[0] = a[0]; 527} 528 529static inline void __attribute__((__always_inline__, __nodebug__)) 530_mm_store1_pd(double *dp, __m128d a) 531{ 532 dp[0] = a[0]; 533 dp[1] = a[0]; 534} 535 536static inline void __attribute__((__always_inline__, __nodebug__)) 537_mm_store_pd(double *dp, __m128d a) 538{ 539 *(__m128d *)dp = a; 540} 541 542static inline void __attribute__((__always_inline__, __nodebug__)) 543_mm_storeu_pd(double *dp, __m128d a) 544{ 545 __builtin_ia32_storeupd(dp, a); 546} 547 548static inline void __attribute__((__always_inline__, __nodebug__)) 549_mm_storer_pd(double *dp, __m128d a) 550{ 551 dp[0] = a[1]; 552 dp[1] = a[0]; 553} 554 555static inline void __attribute__((__always_inline__, __nodebug__)) 556_mm_storeh_pd(double *dp, __m128d a) 557{ 558 dp[0] = a[1]; 559} 560 561static inline void __attribute__((__always_inline__, __nodebug__)) 562_mm_storel_pd(double *dp, __m128d a) 563{ 564 dp[0] = a[0]; 565} 566 567static inline __m128i __attribute__((__always_inline__, __nodebug__)) 568_mm_add_epi8(__m128i a, __m128i b) 569{ 570 return (__m128i)((__v16qi)a + (__v16qi)b); 571} 572 573static inline __m128i __attribute__((__always_inline__, __nodebug__)) 574_mm_add_epi16(__m128i a, __m128i b) 575{ 576 return (__m128i)((__v8hi)a + (__v8hi)b); 577} 578 579static inline __m128i __attribute__((__always_inline__, __nodebug__)) 580_mm_add_epi32(__m128i a, __m128i b) 581{ 582 return (__m128i)((__v4si)a + (__v4si)b); 583} 584 585static inline __m64 __attribute__((__always_inline__, __nodebug__)) 586_mm_add_si64(__m64 a, __m64 b) 587{ 588 return a + b; 589} 590 591static inline __m128i __attribute__((__always_inline__, __nodebug__)) 592_mm_add_epi64(__m128i a, __m128i b) 593{ 594 return a + b; 595} 596 597static inline __m128i __attribute__((__always_inline__, __nodebug__)) 598_mm_adds_epi8(__m128i a, __m128i b) 599{ 600 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 601} 602 603static inline __m128i __attribute__((__always_inline__, __nodebug__)) 604_mm_adds_epi16(__m128i a, __m128i b) 605{ 606 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 607} 608 609static inline __m128i __attribute__((__always_inline__, __nodebug__)) 610_mm_adds_epu8(__m128i a, __m128i b) 611{ 612 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 613} 614 615static inline __m128i __attribute__((__always_inline__, __nodebug__)) 616_mm_adds_epu16(__m128i a, __m128i b) 617{ 618 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 619} 620 621static inline __m128i __attribute__((__always_inline__, __nodebug__)) 622_mm_avg_epu8(__m128i a, __m128i b) 623{ 624 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 625} 626 627static inline __m128i __attribute__((__always_inline__, __nodebug__)) 628_mm_avg_epu16(__m128i a, __m128i b) 629{ 630 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 631} 632 633static inline __m128i __attribute__((__always_inline__, __nodebug__)) 634_mm_madd_epi16(__m128i a, __m128i b) 635{ 636 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 637} 638 639static inline __m128i __attribute__((__always_inline__, __nodebug__)) 640_mm_max_epi16(__m128i a, __m128i b) 641{ 642 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 643} 644 645static inline __m128i __attribute__((__always_inline__, __nodebug__)) 646_mm_max_epu8(__m128i a, __m128i b) 647{ 648 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 649} 650 651static inline __m128i __attribute__((__always_inline__, __nodebug__)) 652_mm_min_epi16(__m128i a, __m128i b) 653{ 654 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 655} 656 657static inline __m128i __attribute__((__always_inline__, __nodebug__)) 658_mm_min_epu8(__m128i a, __m128i b) 659{ 660 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 661} 662 663static inline __m128i __attribute__((__always_inline__, __nodebug__)) 664_mm_mulhi_epi16(__m128i a, __m128i b) 665{ 666 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 667} 668 669static inline __m128i __attribute__((__always_inline__, __nodebug__)) 670_mm_mulhi_epu16(__m128i a, __m128i b) 671{ 672 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 673} 674 675static inline __m128i __attribute__((__always_inline__, __nodebug__)) 676_mm_mullo_epi16(__m128i a, __m128i b) 677{ 678 return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b); 679} 680 681static inline __m64 __attribute__((__always_inline__, __nodebug__)) 682_mm_mul_su32(__m64 a, __m64 b) 683{ 684 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 685} 686 687static inline __m128i __attribute__((__always_inline__, __nodebug__)) 688_mm_mul_epu32(__m128i a, __m128i b) 689{ 690 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 691} 692 693static inline __m128i __attribute__((__always_inline__, __nodebug__)) 694_mm_sad_epu8(__m128i a, __m128i b) 695{ 696 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 697} 698 699static inline __m128i __attribute__((__always_inline__, __nodebug__)) 700_mm_sub_epi8(__m128i a, __m128i b) 701{ 702 return (__m128i)((__v16qi)a - (__v16qi)b); 703} 704 705static inline __m128i __attribute__((__always_inline__, __nodebug__)) 706_mm_sub_epi16(__m128i a, __m128i b) 707{ 708 return (__m128i)((__v8hi)a - (__v8hi)b); 709} 710 711static inline __m128i __attribute__((__always_inline__, __nodebug__)) 712_mm_sub_epi32(__m128i a, __m128i b) 713{ 714 return (__m128i)((__v4si)a - (__v4si)b); 715} 716 717static inline __m64 __attribute__((__always_inline__, __nodebug__)) 718_mm_sub_si64(__m64 a, __m64 b) 719{ 720 return a - b; 721} 722 723static inline __m128i __attribute__((__always_inline__, __nodebug__)) 724_mm_sub_epi64(__m128i a, __m128i b) 725{ 726 return a - b; 727} 728 729static inline __m128i __attribute__((__always_inline__, __nodebug__)) 730_mm_subs_epi8(__m128i a, __m128i b) 731{ 732 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 733} 734 735static inline __m128i __attribute__((__always_inline__, __nodebug__)) 736_mm_subs_epi16(__m128i a, __m128i b) 737{ 738 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 739} 740 741static inline __m128i __attribute__((__always_inline__, __nodebug__)) 742_mm_subs_epu8(__m128i a, __m128i b) 743{ 744 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 745} 746 747static inline __m128i __attribute__((__always_inline__, __nodebug__)) 748_mm_subs_epu16(__m128i a, __m128i b) 749{ 750 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 751} 752 753static inline __m128i __attribute__((__always_inline__, __nodebug__)) 754_mm_and_si128(__m128i a, __m128i b) 755{ 756 return a & b; 757} 758 759static inline __m128i __attribute__((__always_inline__, __nodebug__)) 760_mm_andnot_si128(__m128i a, __m128i b) 761{ 762 return ~a & b; 763} 764 765static inline __m128i __attribute__((__always_inline__, __nodebug__)) 766_mm_or_si128(__m128i a, __m128i b) 767{ 768 return a | b; 769} 770 771static inline __m128i __attribute__((__always_inline__, __nodebug__)) 772_mm_xor_si128(__m128i a, __m128i b) 773{ 774 return a ^ b; 775} 776 777static inline __m128i __attribute__((__always_inline__, __nodebug__)) 778_mm_slli_si128(__m128i a, int imm) 779{ 780 return __builtin_ia32_pslldqi128(a, imm * 8); 781} 782 783static inline __m128i __attribute__((__always_inline__, __nodebug__)) 784_mm_slli_epi16(__m128i a, int count) 785{ 786 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 787} 788 789static inline __m128i __attribute__((__always_inline__, __nodebug__)) 790_mm_sll_epi16(__m128i a, __m128i count) 791{ 792 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 793} 794 795static inline __m128i __attribute__((__always_inline__, __nodebug__)) 796_mm_slli_epi32(__m128i a, int count) 797{ 798 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 799} 800 801static inline __m128i __attribute__((__always_inline__, __nodebug__)) 802_mm_sll_epi32(__m128i a, __m128i count) 803{ 804 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 805} 806 807static inline __m128i __attribute__((__always_inline__, __nodebug__)) 808_mm_slli_epi64(__m128i a, int count) 809{ 810 return __builtin_ia32_psllqi128(a, count); 811} 812 813static inline __m128i __attribute__((__always_inline__, __nodebug__)) 814_mm_sll_epi64(__m128i a, __m128i count) 815{ 816 return __builtin_ia32_psllq128(a, count); 817} 818 819static inline __m128i __attribute__((__always_inline__, __nodebug__)) 820_mm_srai_epi16(__m128i a, int count) 821{ 822 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 823} 824 825static inline __m128i __attribute__((__always_inline__, __nodebug__)) 826_mm_sra_epi16(__m128i a, __m128i count) 827{ 828 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 829} 830 831static inline __m128i __attribute__((__always_inline__, __nodebug__)) 832_mm_srai_epi32(__m128i a, int count) 833{ 834 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 835} 836 837static inline __m128i __attribute__((__always_inline__, __nodebug__)) 838_mm_sra_epi32(__m128i a, __m128i count) 839{ 840 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 841} 842 843static inline __m128i __attribute__((__always_inline__, __nodebug__)) 844_mm_srli_si128(__m128i a, int imm) 845{ 846 return __builtin_ia32_psrldqi128(a, imm * 8); 847} 848 849static inline __m128i __attribute__((__always_inline__, __nodebug__)) 850_mm_srli_epi16(__m128i a, int count) 851{ 852 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 853} 854 855static inline __m128i __attribute__((__always_inline__, __nodebug__)) 856_mm_srl_epi16(__m128i a, __m128i count) 857{ 858 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 859} 860 861static inline __m128i __attribute__((__always_inline__, __nodebug__)) 862_mm_srli_epi32(__m128i a, int count) 863{ 864 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 865} 866 867static inline __m128i __attribute__((__always_inline__, __nodebug__)) 868_mm_srl_epi32(__m128i a, __m128i count) 869{ 870 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 871} 872 873static inline __m128i __attribute__((__always_inline__, __nodebug__)) 874_mm_srli_epi64(__m128i a, int count) 875{ 876 return __builtin_ia32_psrlqi128(a, count); 877} 878 879static inline __m128i __attribute__((__always_inline__, __nodebug__)) 880_mm_srl_epi64(__m128i a, __m128i count) 881{ 882 return __builtin_ia32_psrlq128(a, count); 883} 884 885static inline __m128i __attribute__((__always_inline__, __nodebug__)) 886_mm_cmpeq_epi8(__m128i a, __m128i b) 887{ 888 return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b); 889} 890 891static inline __m128i __attribute__((__always_inline__, __nodebug__)) 892_mm_cmpeq_epi16(__m128i a, __m128i b) 893{ 894 return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b); 895} 896 897static inline __m128i __attribute__((__always_inline__, __nodebug__)) 898_mm_cmpeq_epi32(__m128i a, __m128i b) 899{ 900 return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b); 901} 902 903static inline __m128i __attribute__((__always_inline__, __nodebug__)) 904_mm_cmpgt_epi8(__m128i a, __m128i b) 905{ 906 return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b); 907} 908 909static inline __m128i __attribute__((__always_inline__, __nodebug__)) 910_mm_cmpgt_epi16(__m128i a, __m128i b) 911{ 912 return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b); 913} 914 915static inline __m128i __attribute__((__always_inline__, __nodebug__)) 916_mm_cmpgt_epi32(__m128i a, __m128i b) 917{ 918 return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b); 919} 920 921static inline __m128i __attribute__((__always_inline__, __nodebug__)) 922_mm_cmplt_epi8(__m128i a, __m128i b) 923{ 924 return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a); 925} 926 927static inline __m128i __attribute__((__always_inline__, __nodebug__)) 928_mm_cmplt_epi16(__m128i a, __m128i b) 929{ 930 return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a); 931} 932 933static inline __m128i __attribute__((__always_inline__, __nodebug__)) 934_mm_cmplt_epi32(__m128i a, __m128i b) 935{ 936 return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a); 937} 938 939#ifdef __x86_64__ 940static inline __m128d __attribute__((__always_inline__, __nodebug__)) 941_mm_cvtsi64_sd(__m128d a, long long b) 942{ 943 a[0] = b; 944 return a; 945} 946 947static inline long long __attribute__((__always_inline__, __nodebug__)) 948_mm_cvtsd_si64(__m128d a) 949{ 950 return __builtin_ia32_cvtsd2si64(a); 951} 952 953static inline long long __attribute__((__always_inline__, __nodebug__)) 954_mm_cvttsd_si64(__m128d a) 955{ 956 return a[0]; 957} 958#endif 959 960static inline __m128 __attribute__((__always_inline__, __nodebug__)) 961_mm_cvtepi32_ps(__m128i a) 962{ 963 return __builtin_ia32_cvtdq2ps((__v4si)a); 964} 965 966static inline __m128i __attribute__((__always_inline__, __nodebug__)) 967_mm_cvtps_epi32(__m128 a) 968{ 969 return (__m128i)__builtin_ia32_cvtps2dq(a); 970} 971 972static inline __m128i __attribute__((__always_inline__, __nodebug__)) 973_mm_cvttps_epi32(__m128 a) 974{ 975 return (__m128i)__builtin_ia32_cvttps2dq(a); 976} 977 978static inline __m128i __attribute__((__always_inline__, __nodebug__)) 979_mm_cvtsi32_si128(int a) 980{ 981 return (__m128i)(__v4si){ a, 0, 0, 0 }; 982} 983 984#ifdef __x86_64__ 985static inline __m128i __attribute__((__always_inline__, __nodebug__)) 986_mm_cvtsi64_si128(long long a) 987{ 988 return (__m128i){ a, 0 }; 989} 990#endif 991 992static inline int __attribute__((__always_inline__, __nodebug__)) 993_mm_cvtsi128_si32(__m128i a) 994{ 995 __v4si b = (__v4si)a; 996 return b[0]; 997} 998 999#ifdef __x86_64__ 1000static inline long long __attribute__((__always_inline__, __nodebug__)) 1001_mm_cvtsi128_si64(__m128i a) 1002{ 1003 return a[0]; 1004} 1005#endif 1006 1007static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1008_mm_load_si128(__m128i const *p) 1009{ 1010 return *p; 1011} 1012 1013static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1014_mm_loadu_si128(__m128i const *p) 1015{ 1016 return (__m128i)__builtin_ia32_loaddqu((char const *)p); 1017} 1018 1019static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1020_mm_loadl_epi64(__m128i const *p) 1021{ 1022 return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p); 1023} 1024 1025static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1026_mm_set_epi64(__m64 q1, __m64 q0) 1027{ 1028 return (__m128i){ (long long)q0, (long long)q1 }; 1029} 1030 1031static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1032_mm_set_epi32(int i3, int i2, int i1, int i0) 1033{ 1034 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1035} 1036 1037static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1038_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1039{ 1040 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1041} 1042 1043static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1044_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1045{ 1046 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1047} 1048 1049static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1050_mm_set1_epi64(__m64 q) 1051{ 1052 return (__m128i){ (long long)q, (long long)q }; 1053} 1054 1055static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1056_mm_set1_epi32(int i) 1057{ 1058 return (__m128i)(__v4si){ i, i, i, i }; 1059} 1060 1061static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1062_mm_set1_epi16(short w) 1063{ 1064 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; 1065} 1066 1067static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1068_mm_set1_epi8(char b) 1069{ 1070 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 1071} 1072 1073static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1074_mm_setr_epi64(__m64 q0, __m64 q1) 1075{ 1076 return (__m128i){ (long long)q0, (long long)q1 }; 1077} 1078 1079static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1080_mm_setr_epi32(int i0, int i1, int i2, int i3) 1081{ 1082 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1083} 1084 1085static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1086_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1087{ 1088 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1089} 1090 1091static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1092_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1093{ 1094 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1095} 1096 1097static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1098_mm_setzero_si128(void) 1099{ 1100 return (__m128i){ 0LL, 0LL }; 1101} 1102 1103static inline void __attribute__((__always_inline__, __nodebug__)) 1104_mm_store_si128(__m128i *p, __m128i b) 1105{ 1106 *p = b; 1107} 1108 1109static inline void __attribute__((__always_inline__, __nodebug__)) 1110_mm_storeu_si128(__m128i *p, __m128i b) 1111{ 1112 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 1113} 1114 1115static inline void __attribute__((__always_inline__, __nodebug__)) 1116_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 1117{ 1118 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 1119} 1120 1121static inline void __attribute__((__always_inline__, __nodebug__)) 1122_mm_storel_epi64(__m128i *p, __m128i a) 1123{ 1124 __builtin_ia32_storelv4si((__v2si *)p, a); 1125} 1126 1127static inline void __attribute__((__always_inline__, __nodebug__)) 1128_mm_stream_pd(double *p, __m128d a) 1129{ 1130 __builtin_ia32_movntpd(p, a); 1131} 1132 1133static inline void __attribute__((__always_inline__, __nodebug__)) 1134_mm_stream_si128(__m128i *p, __m128i a) 1135{ 1136 __builtin_ia32_movntdq(p, a); 1137} 1138 1139static inline void __attribute__((__always_inline__, __nodebug__)) 1140_mm_stream_si32(int *p, int a) 1141{ 1142 __builtin_ia32_movnti(p, a); 1143} 1144 1145static inline void __attribute__((__always_inline__, __nodebug__)) 1146_mm_clflush(void const *p) 1147{ 1148 __builtin_ia32_clflush(p); 1149} 1150 1151static inline void __attribute__((__always_inline__, __nodebug__)) 1152_mm_lfence(void) 1153{ 1154 __builtin_ia32_lfence(); 1155} 1156 1157static inline void __attribute__((__always_inline__, __nodebug__)) 1158_mm_mfence(void) 1159{ 1160 __builtin_ia32_mfence(); 1161} 1162 1163static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1164_mm_packs_epi16(__m128i a, __m128i b) 1165{ 1166 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 1167} 1168 1169static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1170_mm_packs_epi32(__m128i a, __m128i b) 1171{ 1172 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 1173} 1174 1175static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1176_mm_packus_epi16(__m128i a, __m128i b) 1177{ 1178 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 1179} 1180 1181static inline int __attribute__((__always_inline__, __nodebug__)) 1182_mm_extract_epi16(__m128i a, int imm) 1183{ 1184 __v8hi b = (__v8hi)a; 1185 return b[imm]; 1186} 1187 1188static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1189_mm_insert_epi16(__m128i a, int b, int imm) 1190{ 1191 __v8hi c = (__v8hi)a; 1192 c[imm & 7] = b; 1193 return c; 1194} 1195 1196static inline int __attribute__((__always_inline__, __nodebug__)) 1197_mm_movemask_epi8(__m128i a) 1198{ 1199 return __builtin_ia32_pmovmskb128((__v16qi)a); 1200} 1201 1202#define _mm_shuffle_epi32(a, imm) ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm))) 1203#define _mm_shufflehi_epi16(a, imm) ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm))) 1204#define _mm_shufflelo_epi16(a, imm) ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm))) 1205 1206static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1207_mm_unpackhi_epi8(__m128i a, __m128i b) 1208{ 1209 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1210} 1211 1212static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1213_mm_unpackhi_epi16(__m128i a, __m128i b) 1214{ 1215 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1216} 1217 1218static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1219_mm_unpackhi_epi32(__m128i a, __m128i b) 1220{ 1221 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1222} 1223 1224static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1225_mm_unpackhi_epi64(__m128i a, __m128i b) 1226{ 1227 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1228} 1229 1230static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1231_mm_unpacklo_epi8(__m128i a, __m128i b) 1232{ 1233 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1234} 1235 1236static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1237_mm_unpacklo_epi16(__m128i a, __m128i b) 1238{ 1239 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1240} 1241 1242static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1243_mm_unpacklo_epi32(__m128i a, __m128i b) 1244{ 1245 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1246} 1247 1248static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1249_mm_unpacklo_epi64(__m128i a, __m128i b) 1250{ 1251 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1252} 1253 1254static inline __m64 __attribute__((__always_inline__, __nodebug__)) 1255_mm_movepi64_pi64(__m128i a) 1256{ 1257 return (__m64)a[0]; 1258} 1259 1260static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1261_mm_movpi64_pi64(__m64 a) 1262{ 1263 return (__m128i){ (long long)a, 0 }; 1264} 1265 1266static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1267_mm_move_epi64(__m128i a) 1268{ 1269 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); 1270} 1271 1272static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1273_mm_unpackhi_pd(__m128d a, __m128d b) 1274{ 1275 return __builtin_shufflevector(a, b, 1, 2+1); 1276} 1277 1278static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1279_mm_unpacklo_pd(__m128d a, __m128d b) 1280{ 1281 return __builtin_shufflevector(a, b, 0, 2+0); 1282} 1283 1284static inline int __attribute__((__always_inline__, __nodebug__)) 1285_mm_movemask_pd(__m128d a) 1286{ 1287 return __builtin_ia32_movmskpd(a); 1288} 1289 1290#define _mm_shuffle_pd(a, b, i) (__builtin_ia32_shufpd((a), (b), (i))) 1291 1292static inline __m128 __attribute__((__always_inline__, __nodebug__)) 1293_mm_castpd_ps(__m128d in) 1294{ 1295 return (__m128)in; 1296} 1297 1298static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1299_mm_castpd_si128(__m128d in) 1300{ 1301 return (__m128i)in; 1302} 1303 1304static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1305_mm_castps_pd(__m128 in) 1306{ 1307 return (__m128d)in; 1308} 1309 1310static inline __m128i __attribute__((__always_inline__, __nodebug__)) 1311_mm_castps_si128(__m128 in) 1312{ 1313 return (__m128i)in; 1314} 1315 1316static inline __m128 __attribute__((__always_inline__, __nodebug__)) 1317_mm_castsi128_ps(__m128i in) 1318{ 1319 return (__m128)in; 1320} 1321 1322static inline __m128d __attribute__((__always_inline__, __nodebug__)) 1323_mm_castsi128_pd(__m128i in) 1324{ 1325 return (__m128d)in; 1326} 1327 1328static inline void __attribute__((__always_inline__, __nodebug__)) 1329_mm_pause(void) 1330{ 1331 __asm__ volatile ("pause"); 1332} 1333 1334#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1335 1336#endif /* __SSE2__ */ 1337 1338#endif /* __EMMINTRIN_H */ 1339