1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#include <xmmintrin.h> 28 29typedef double __m128d __attribute__((__vector_size__(16))); 30typedef long long __m128i __attribute__((__vector_size__(16))); 31 32/* Type defines. */ 33typedef double __v2df __attribute__ ((__vector_size__ (16))); 34typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35typedef short __v8hi __attribute__((__vector_size__(16))); 36typedef char __v16qi __attribute__((__vector_size__(16))); 37 38/* We need an explicitly signed variant for char. Note that this shouldn't 39 * appear in the interface though. */ 40typedef signed char __v16qs __attribute__((__vector_size__(16))); 41 42#include <f16cintrin.h> 43 44/* Define the default attributes for the functions in this file. */ 45#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 46 47static __inline__ __m128d __DEFAULT_FN_ATTRS 48_mm_add_sd(__m128d __a, __m128d __b) 49{ 50 __a[0] += __b[0]; 51 return __a; 52} 53 54static __inline__ __m128d __DEFAULT_FN_ATTRS 55_mm_add_pd(__m128d __a, __m128d __b) 56{ 57 return __a + __b; 58} 59 60static __inline__ __m128d __DEFAULT_FN_ATTRS 61_mm_sub_sd(__m128d __a, __m128d __b) 62{ 63 __a[0] -= __b[0]; 64 return __a; 65} 66 67static __inline__ __m128d __DEFAULT_FN_ATTRS 68_mm_sub_pd(__m128d __a, __m128d __b) 69{ 70 return __a - __b; 71} 72 73static __inline__ __m128d __DEFAULT_FN_ATTRS 74_mm_mul_sd(__m128d __a, __m128d __b) 75{ 76 __a[0] *= __b[0]; 77 return __a; 78} 79 80static __inline__ __m128d __DEFAULT_FN_ATTRS 81_mm_mul_pd(__m128d __a, __m128d __b) 82{ 83 return __a * __b; 84} 85 86static __inline__ __m128d __DEFAULT_FN_ATTRS 87_mm_div_sd(__m128d __a, __m128d __b) 88{ 89 __a[0] /= __b[0]; 90 return __a; 91} 92 93static __inline__ __m128d __DEFAULT_FN_ATTRS 94_mm_div_pd(__m128d __a, __m128d __b) 95{ 96 return __a / __b; 97} 98 99static __inline__ __m128d __DEFAULT_FN_ATTRS 100_mm_sqrt_sd(__m128d __a, __m128d __b) 101{ 102 __m128d __c = __builtin_ia32_sqrtsd(__b); 103 return (__m128d) { __c[0], __a[1] }; 104} 105 106static __inline__ __m128d __DEFAULT_FN_ATTRS 107_mm_sqrt_pd(__m128d __a) 108{ 109 return __builtin_ia32_sqrtpd(__a); 110} 111 112static __inline__ __m128d __DEFAULT_FN_ATTRS 113_mm_min_sd(__m128d __a, __m128d __b) 114{ 115 return __builtin_ia32_minsd(__a, __b); 116} 117 118static __inline__ __m128d __DEFAULT_FN_ATTRS 119_mm_min_pd(__m128d __a, __m128d __b) 120{ 121 return __builtin_ia32_minpd(__a, __b); 122} 123 124static __inline__ __m128d __DEFAULT_FN_ATTRS 125_mm_max_sd(__m128d __a, __m128d __b) 126{ 127 return __builtin_ia32_maxsd(__a, __b); 128} 129 130static __inline__ __m128d __DEFAULT_FN_ATTRS 131_mm_max_pd(__m128d __a, __m128d __b) 132{ 133 return __builtin_ia32_maxpd(__a, __b); 134} 135 136static __inline__ __m128d __DEFAULT_FN_ATTRS 137_mm_and_pd(__m128d __a, __m128d __b) 138{ 139 return (__m128d)((__v4si)__a & (__v4si)__b); 140} 141 142static __inline__ __m128d __DEFAULT_FN_ATTRS 143_mm_andnot_pd(__m128d __a, __m128d __b) 144{ 145 return (__m128d)(~(__v4si)__a & (__v4si)__b); 146} 147 148static __inline__ __m128d __DEFAULT_FN_ATTRS 149_mm_or_pd(__m128d __a, __m128d __b) 150{ 151 return (__m128d)((__v4si)__a | (__v4si)__b); 152} 153 154static __inline__ __m128d __DEFAULT_FN_ATTRS 155_mm_xor_pd(__m128d __a, __m128d __b) 156{ 157 return (__m128d)((__v4si)__a ^ (__v4si)__b); 158} 159 160static __inline__ __m128d __DEFAULT_FN_ATTRS 161_mm_cmpeq_pd(__m128d __a, __m128d __b) 162{ 163 return (__m128d)__builtin_ia32_cmpeqpd(__a, __b); 164} 165 166static __inline__ __m128d __DEFAULT_FN_ATTRS 167_mm_cmplt_pd(__m128d __a, __m128d __b) 168{ 169 return (__m128d)__builtin_ia32_cmpltpd(__a, __b); 170} 171 172static __inline__ __m128d __DEFAULT_FN_ATTRS 173_mm_cmple_pd(__m128d __a, __m128d __b) 174{ 175 return (__m128d)__builtin_ia32_cmplepd(__a, __b); 176} 177 178static __inline__ __m128d __DEFAULT_FN_ATTRS 179_mm_cmpgt_pd(__m128d __a, __m128d __b) 180{ 181 return (__m128d)__builtin_ia32_cmpltpd(__b, __a); 182} 183 184static __inline__ __m128d __DEFAULT_FN_ATTRS 185_mm_cmpge_pd(__m128d __a, __m128d __b) 186{ 187 return (__m128d)__builtin_ia32_cmplepd(__b, __a); 188} 189 190static __inline__ __m128d __DEFAULT_FN_ATTRS 191_mm_cmpord_pd(__m128d __a, __m128d __b) 192{ 193 return (__m128d)__builtin_ia32_cmpordpd(__a, __b); 194} 195 196static __inline__ __m128d __DEFAULT_FN_ATTRS 197_mm_cmpunord_pd(__m128d __a, __m128d __b) 198{ 199 return (__m128d)__builtin_ia32_cmpunordpd(__a, __b); 200} 201 202static __inline__ __m128d __DEFAULT_FN_ATTRS 203_mm_cmpneq_pd(__m128d __a, __m128d __b) 204{ 205 return (__m128d)__builtin_ia32_cmpneqpd(__a, __b); 206} 207 208static __inline__ __m128d __DEFAULT_FN_ATTRS 209_mm_cmpnlt_pd(__m128d __a, __m128d __b) 210{ 211 return (__m128d)__builtin_ia32_cmpnltpd(__a, __b); 212} 213 214static __inline__ __m128d __DEFAULT_FN_ATTRS 215_mm_cmpnle_pd(__m128d __a, __m128d __b) 216{ 217 return (__m128d)__builtin_ia32_cmpnlepd(__a, __b); 218} 219 220static __inline__ __m128d __DEFAULT_FN_ATTRS 221_mm_cmpngt_pd(__m128d __a, __m128d __b) 222{ 223 return (__m128d)__builtin_ia32_cmpnltpd(__b, __a); 224} 225 226static __inline__ __m128d __DEFAULT_FN_ATTRS 227_mm_cmpnge_pd(__m128d __a, __m128d __b) 228{ 229 return (__m128d)__builtin_ia32_cmpnlepd(__b, __a); 230} 231 232static __inline__ __m128d __DEFAULT_FN_ATTRS 233_mm_cmpeq_sd(__m128d __a, __m128d __b) 234{ 235 return (__m128d)__builtin_ia32_cmpeqsd(__a, __b); 236} 237 238static __inline__ __m128d __DEFAULT_FN_ATTRS 239_mm_cmplt_sd(__m128d __a, __m128d __b) 240{ 241 return (__m128d)__builtin_ia32_cmpltsd(__a, __b); 242} 243 244static __inline__ __m128d __DEFAULT_FN_ATTRS 245_mm_cmple_sd(__m128d __a, __m128d __b) 246{ 247 return (__m128d)__builtin_ia32_cmplesd(__a, __b); 248} 249 250static __inline__ __m128d __DEFAULT_FN_ATTRS 251_mm_cmpgt_sd(__m128d __a, __m128d __b) 252{ 253 __m128d __c = __builtin_ia32_cmpltsd(__b, __a); 254 return (__m128d) { __c[0], __a[1] }; 255} 256 257static __inline__ __m128d __DEFAULT_FN_ATTRS 258_mm_cmpge_sd(__m128d __a, __m128d __b) 259{ 260 __m128d __c = __builtin_ia32_cmplesd(__b, __a); 261 return (__m128d) { __c[0], __a[1] }; 262} 263 264static __inline__ __m128d __DEFAULT_FN_ATTRS 265_mm_cmpord_sd(__m128d __a, __m128d __b) 266{ 267 return (__m128d)__builtin_ia32_cmpordsd(__a, __b); 268} 269 270static __inline__ __m128d __DEFAULT_FN_ATTRS 271_mm_cmpunord_sd(__m128d __a, __m128d __b) 272{ 273 return (__m128d)__builtin_ia32_cmpunordsd(__a, __b); 274} 275 276static __inline__ __m128d __DEFAULT_FN_ATTRS 277_mm_cmpneq_sd(__m128d __a, __m128d __b) 278{ 279 return (__m128d)__builtin_ia32_cmpneqsd(__a, __b); 280} 281 282static __inline__ __m128d __DEFAULT_FN_ATTRS 283_mm_cmpnlt_sd(__m128d __a, __m128d __b) 284{ 285 return (__m128d)__builtin_ia32_cmpnltsd(__a, __b); 286} 287 288static __inline__ __m128d __DEFAULT_FN_ATTRS 289_mm_cmpnle_sd(__m128d __a, __m128d __b) 290{ 291 return (__m128d)__builtin_ia32_cmpnlesd(__a, __b); 292} 293 294static __inline__ __m128d __DEFAULT_FN_ATTRS 295_mm_cmpngt_sd(__m128d __a, __m128d __b) 296{ 297 __m128d __c = __builtin_ia32_cmpnltsd(__b, __a); 298 return (__m128d) { __c[0], __a[1] }; 299} 300 301static __inline__ __m128d __DEFAULT_FN_ATTRS 302_mm_cmpnge_sd(__m128d __a, __m128d __b) 303{ 304 __m128d __c = __builtin_ia32_cmpnlesd(__b, __a); 305 return (__m128d) { __c[0], __a[1] }; 306} 307 308static __inline__ int __DEFAULT_FN_ATTRS 309_mm_comieq_sd(__m128d __a, __m128d __b) 310{ 311 return __builtin_ia32_comisdeq(__a, __b); 312} 313 314static __inline__ int __DEFAULT_FN_ATTRS 315_mm_comilt_sd(__m128d __a, __m128d __b) 316{ 317 return __builtin_ia32_comisdlt(__a, __b); 318} 319 320static __inline__ int __DEFAULT_FN_ATTRS 321_mm_comile_sd(__m128d __a, __m128d __b) 322{ 323 return __builtin_ia32_comisdle(__a, __b); 324} 325 326static __inline__ int __DEFAULT_FN_ATTRS 327_mm_comigt_sd(__m128d __a, __m128d __b) 328{ 329 return __builtin_ia32_comisdgt(__a, __b); 330} 331 332static __inline__ int __DEFAULT_FN_ATTRS 333_mm_comige_sd(__m128d __a, __m128d __b) 334{ 335 return __builtin_ia32_comisdge(__a, __b); 336} 337 338static __inline__ int __DEFAULT_FN_ATTRS 339_mm_comineq_sd(__m128d __a, __m128d __b) 340{ 341 return __builtin_ia32_comisdneq(__a, __b); 342} 343 344static __inline__ int __DEFAULT_FN_ATTRS 345_mm_ucomieq_sd(__m128d __a, __m128d __b) 346{ 347 return __builtin_ia32_ucomisdeq(__a, __b); 348} 349 350static __inline__ int __DEFAULT_FN_ATTRS 351_mm_ucomilt_sd(__m128d __a, __m128d __b) 352{ 353 return __builtin_ia32_ucomisdlt(__a, __b); 354} 355 356static __inline__ int __DEFAULT_FN_ATTRS 357_mm_ucomile_sd(__m128d __a, __m128d __b) 358{ 359 return __builtin_ia32_ucomisdle(__a, __b); 360} 361 362static __inline__ int __DEFAULT_FN_ATTRS 363_mm_ucomigt_sd(__m128d __a, __m128d __b) 364{ 365 return __builtin_ia32_ucomisdgt(__a, __b); 366} 367 368static __inline__ int __DEFAULT_FN_ATTRS 369_mm_ucomige_sd(__m128d __a, __m128d __b) 370{ 371 return __builtin_ia32_ucomisdge(__a, __b); 372} 373 374static __inline__ int __DEFAULT_FN_ATTRS 375_mm_ucomineq_sd(__m128d __a, __m128d __b) 376{ 377 return __builtin_ia32_ucomisdneq(__a, __b); 378} 379 380static __inline__ __m128 __DEFAULT_FN_ATTRS 381_mm_cvtpd_ps(__m128d __a) 382{ 383 return __builtin_ia32_cvtpd2ps(__a); 384} 385 386static __inline__ __m128d __DEFAULT_FN_ATTRS 387_mm_cvtps_pd(__m128 __a) 388{ 389 return __builtin_ia32_cvtps2pd(__a); 390} 391 392static __inline__ __m128d __DEFAULT_FN_ATTRS 393_mm_cvtepi32_pd(__m128i __a) 394{ 395 return __builtin_ia32_cvtdq2pd((__v4si)__a); 396} 397 398static __inline__ __m128i __DEFAULT_FN_ATTRS 399_mm_cvtpd_epi32(__m128d __a) 400{ 401 return __builtin_ia32_cvtpd2dq(__a); 402} 403 404static __inline__ int __DEFAULT_FN_ATTRS 405_mm_cvtsd_si32(__m128d __a) 406{ 407 return __builtin_ia32_cvtsd2si(__a); 408} 409 410static __inline__ __m128 __DEFAULT_FN_ATTRS 411_mm_cvtsd_ss(__m128 __a, __m128d __b) 412{ 413 __a[0] = __b[0]; 414 return __a; 415} 416 417static __inline__ __m128d __DEFAULT_FN_ATTRS 418_mm_cvtsi32_sd(__m128d __a, int __b) 419{ 420 __a[0] = __b; 421 return __a; 422} 423 424static __inline__ __m128d __DEFAULT_FN_ATTRS 425_mm_cvtss_sd(__m128d __a, __m128 __b) 426{ 427 __a[0] = __b[0]; 428 return __a; 429} 430 431static __inline__ __m128i __DEFAULT_FN_ATTRS 432_mm_cvttpd_epi32(__m128d __a) 433{ 434 return (__m128i)__builtin_ia32_cvttpd2dq(__a); 435} 436 437static __inline__ int __DEFAULT_FN_ATTRS 438_mm_cvttsd_si32(__m128d __a) 439{ 440 return __a[0]; 441} 442 443static __inline__ __m64 __DEFAULT_FN_ATTRS 444_mm_cvtpd_pi32(__m128d __a) 445{ 446 return (__m64)__builtin_ia32_cvtpd2pi(__a); 447} 448 449static __inline__ __m64 __DEFAULT_FN_ATTRS 450_mm_cvttpd_pi32(__m128d __a) 451{ 452 return (__m64)__builtin_ia32_cvttpd2pi(__a); 453} 454 455static __inline__ __m128d __DEFAULT_FN_ATTRS 456_mm_cvtpi32_pd(__m64 __a) 457{ 458 return __builtin_ia32_cvtpi2pd((__v2si)__a); 459} 460 461static __inline__ double __DEFAULT_FN_ATTRS 462_mm_cvtsd_f64(__m128d __a) 463{ 464 return __a[0]; 465} 466 467static __inline__ __m128d __DEFAULT_FN_ATTRS 468_mm_load_pd(double const *__dp) 469{ 470 return *(__m128d*)__dp; 471} 472 473static __inline__ __m128d __DEFAULT_FN_ATTRS 474_mm_load1_pd(double const *__dp) 475{ 476 struct __mm_load1_pd_struct { 477 double __u; 478 } __attribute__((__packed__, __may_alias__)); 479 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 480 return (__m128d){ __u, __u }; 481} 482 483#define _mm_load_pd1(dp) _mm_load1_pd(dp) 484 485static __inline__ __m128d __DEFAULT_FN_ATTRS 486_mm_loadr_pd(double const *__dp) 487{ 488 __m128d __u = *(__m128d*)__dp; 489 return __builtin_shufflevector(__u, __u, 1, 0); 490} 491 492static __inline__ __m128d __DEFAULT_FN_ATTRS 493_mm_loadu_pd(double const *__dp) 494{ 495 struct __loadu_pd { 496 __m128d __v; 497 } __attribute__((__packed__, __may_alias__)); 498 return ((struct __loadu_pd*)__dp)->__v; 499} 500 501static __inline__ __m128d __DEFAULT_FN_ATTRS 502_mm_load_sd(double const *__dp) 503{ 504 struct __mm_load_sd_struct { 505 double __u; 506 } __attribute__((__packed__, __may_alias__)); 507 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 508 return (__m128d){ __u, 0 }; 509} 510 511static __inline__ __m128d __DEFAULT_FN_ATTRS 512_mm_loadh_pd(__m128d __a, double const *__dp) 513{ 514 struct __mm_loadh_pd_struct { 515 double __u; 516 } __attribute__((__packed__, __may_alias__)); 517 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 518 return (__m128d){ __a[0], __u }; 519} 520 521static __inline__ __m128d __DEFAULT_FN_ATTRS 522_mm_loadl_pd(__m128d __a, double const *__dp) 523{ 524 struct __mm_loadl_pd_struct { 525 double __u; 526 } __attribute__((__packed__, __may_alias__)); 527 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 528 return (__m128d){ __u, __a[1] }; 529} 530 531static __inline__ __m128d __DEFAULT_FN_ATTRS 532_mm_undefined_pd() 533{ 534 return (__m128d)__builtin_ia32_undef128(); 535} 536 537static __inline__ __m128d __DEFAULT_FN_ATTRS 538_mm_set_sd(double __w) 539{ 540 return (__m128d){ __w, 0 }; 541} 542 543static __inline__ __m128d __DEFAULT_FN_ATTRS 544_mm_set1_pd(double __w) 545{ 546 return (__m128d){ __w, __w }; 547} 548 549static __inline__ __m128d __DEFAULT_FN_ATTRS 550_mm_set_pd(double __w, double __x) 551{ 552 return (__m128d){ __x, __w }; 553} 554 555static __inline__ __m128d __DEFAULT_FN_ATTRS 556_mm_setr_pd(double __w, double __x) 557{ 558 return (__m128d){ __w, __x }; 559} 560 561static __inline__ __m128d __DEFAULT_FN_ATTRS 562_mm_setzero_pd(void) 563{ 564 return (__m128d){ 0, 0 }; 565} 566 567static __inline__ __m128d __DEFAULT_FN_ATTRS 568_mm_move_sd(__m128d __a, __m128d __b) 569{ 570 return (__m128d){ __b[0], __a[1] }; 571} 572 573static __inline__ void __DEFAULT_FN_ATTRS 574_mm_store_sd(double *__dp, __m128d __a) 575{ 576 struct __mm_store_sd_struct { 577 double __u; 578 } __attribute__((__packed__, __may_alias__)); 579 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 580} 581 582static __inline__ void __DEFAULT_FN_ATTRS 583_mm_store1_pd(double *__dp, __m128d __a) 584{ 585 struct __mm_store1_pd_struct { 586 double __u[2]; 587 } __attribute__((__packed__, __may_alias__)); 588 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0]; 589 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0]; 590} 591 592static __inline__ void __DEFAULT_FN_ATTRS 593_mm_store_pd(double *__dp, __m128d __a) 594{ 595 *(__m128d *)__dp = __a; 596} 597 598static __inline__ void __DEFAULT_FN_ATTRS 599_mm_storeu_pd(double *__dp, __m128d __a) 600{ 601 __builtin_ia32_storeupd(__dp, __a); 602} 603 604static __inline__ void __DEFAULT_FN_ATTRS 605_mm_storer_pd(double *__dp, __m128d __a) 606{ 607 __a = __builtin_shufflevector(__a, __a, 1, 0); 608 *(__m128d *)__dp = __a; 609} 610 611static __inline__ void __DEFAULT_FN_ATTRS 612_mm_storeh_pd(double *__dp, __m128d __a) 613{ 614 struct __mm_storeh_pd_struct { 615 double __u; 616 } __attribute__((__packed__, __may_alias__)); 617 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 618} 619 620static __inline__ void __DEFAULT_FN_ATTRS 621_mm_storel_pd(double *__dp, __m128d __a) 622{ 623 struct __mm_storeh_pd_struct { 624 double __u; 625 } __attribute__((__packed__, __may_alias__)); 626 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 627} 628 629static __inline__ __m128i __DEFAULT_FN_ATTRS 630_mm_add_epi8(__m128i __a, __m128i __b) 631{ 632 return (__m128i)((__v16qi)__a + (__v16qi)__b); 633} 634 635static __inline__ __m128i __DEFAULT_FN_ATTRS 636_mm_add_epi16(__m128i __a, __m128i __b) 637{ 638 return (__m128i)((__v8hi)__a + (__v8hi)__b); 639} 640 641static __inline__ __m128i __DEFAULT_FN_ATTRS 642_mm_add_epi32(__m128i __a, __m128i __b) 643{ 644 return (__m128i)((__v4si)__a + (__v4si)__b); 645} 646 647static __inline__ __m64 __DEFAULT_FN_ATTRS 648_mm_add_si64(__m64 __a, __m64 __b) 649{ 650 return (__m64)__builtin_ia32_paddq(__a, __b); 651} 652 653static __inline__ __m128i __DEFAULT_FN_ATTRS 654_mm_add_epi64(__m128i __a, __m128i __b) 655{ 656 return __a + __b; 657} 658 659static __inline__ __m128i __DEFAULT_FN_ATTRS 660_mm_adds_epi8(__m128i __a, __m128i __b) 661{ 662 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 663} 664 665static __inline__ __m128i __DEFAULT_FN_ATTRS 666_mm_adds_epi16(__m128i __a, __m128i __b) 667{ 668 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 669} 670 671static __inline__ __m128i __DEFAULT_FN_ATTRS 672_mm_adds_epu8(__m128i __a, __m128i __b) 673{ 674 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 675} 676 677static __inline__ __m128i __DEFAULT_FN_ATTRS 678_mm_adds_epu16(__m128i __a, __m128i __b) 679{ 680 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 681} 682 683static __inline__ __m128i __DEFAULT_FN_ATTRS 684_mm_avg_epu8(__m128i __a, __m128i __b) 685{ 686 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 687} 688 689static __inline__ __m128i __DEFAULT_FN_ATTRS 690_mm_avg_epu16(__m128i __a, __m128i __b) 691{ 692 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 693} 694 695static __inline__ __m128i __DEFAULT_FN_ATTRS 696_mm_madd_epi16(__m128i __a, __m128i __b) 697{ 698 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 699} 700 701static __inline__ __m128i __DEFAULT_FN_ATTRS 702_mm_max_epi16(__m128i __a, __m128i __b) 703{ 704 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 705} 706 707static __inline__ __m128i __DEFAULT_FN_ATTRS 708_mm_max_epu8(__m128i __a, __m128i __b) 709{ 710 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 711} 712 713static __inline__ __m128i __DEFAULT_FN_ATTRS 714_mm_min_epi16(__m128i __a, __m128i __b) 715{ 716 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 717} 718 719static __inline__ __m128i __DEFAULT_FN_ATTRS 720_mm_min_epu8(__m128i __a, __m128i __b) 721{ 722 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 723} 724 725static __inline__ __m128i __DEFAULT_FN_ATTRS 726_mm_mulhi_epi16(__m128i __a, __m128i __b) 727{ 728 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 729} 730 731static __inline__ __m128i __DEFAULT_FN_ATTRS 732_mm_mulhi_epu16(__m128i __a, __m128i __b) 733{ 734 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 735} 736 737/// \brief Multiplies the corresponding elements of two [8 x short] vectors and 738/// returns a vector containing the low-order 16 bits of each 32-bit product 739/// in the corresponding element. 740/// 741/// \headerfile <x86intrin.h> 742/// 743/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction. 744/// 745/// \param __a 746/// A 128-bit integer vector containing one of the source operands. 747/// \param __b 748/// A 128-bit integer vector containing one of the source operands. 749/// \returns A 128-bit integer vector containing the products of both operands. 750static __inline__ __m128i __DEFAULT_FN_ATTRS 751_mm_mullo_epi16(__m128i __a, __m128i __b) 752{ 753 return (__m128i)((__v8hi)__a * (__v8hi)__b); 754} 755 756/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits 757/// of the two 64-bit integer vectors and returns the 64-bit unsigned 758/// product. 759/// 760/// \headerfile <x86intrin.h> 761/// 762/// This intrinsic corresponds to the \c PMULUDQ instruction. 763/// 764/// \param __a 765/// A 64-bit integer containing one of the source operands. 766/// \param __b 767/// A 64-bit integer containing one of the source operands. 768/// \returns A 64-bit integer vector containing the product of both operands. 769static __inline__ __m64 __DEFAULT_FN_ATTRS 770_mm_mul_su32(__m64 __a, __m64 __b) 771{ 772 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 773} 774 775/// \brief Multiplies 32-bit unsigned integer values contained in the lower 776/// bits of the corresponding elements of two [2 x i64] vectors, and returns 777/// the 64-bit products in the corresponding elements of a [2 x i64] vector. 778/// 779/// \headerfile <x86intrin.h> 780/// 781/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction. 782/// 783/// \param __a 784/// A [2 x i64] vector containing one of the source operands. 785/// \param __b 786/// A [2 x i64] vector containing one of the source operands. 787/// \returns A [2 x i64] vector containing the product of both operands. 788static __inline__ __m128i __DEFAULT_FN_ATTRS 789_mm_mul_epu32(__m128i __a, __m128i __b) 790{ 791 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 792} 793 794/// \brief Computes the absolute differences of corresponding 8-bit integer 795/// values in two 128-bit vectors. Sums the first 8 absolute differences, and 796/// separately sums the second 8 absolute differences. Packss these two 797/// unsigned 16-bit integer sums into the upper and lower elements of a 798/// [2 x i64] vector. 799/// 800/// \headerfile <x86intrin.h> 801/// 802/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction. 803/// 804/// \param __a 805/// A 128-bit integer vector containing one of the source operands. 806/// \param __b 807/// A 128-bit integer vector containing one of the source operands. 808/// \returns A [2 x i64] vector containing the sums of the sets of absolute 809/// differences between both operands. 810static __inline__ __m128i __DEFAULT_FN_ATTRS 811_mm_sad_epu8(__m128i __a, __m128i __b) 812{ 813 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 814} 815 816/// \brief Subtracts the corresponding 8-bit integer values in the operands. 817/// 818/// \headerfile <x86intrin.h> 819/// 820/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction. 821/// 822/// \param __a 823/// A 128-bit integer vector containing the minuends. 824/// \param __b 825/// A 128-bit integer vector containing the subtrahends. 826/// \returns A 128-bit integer vector containing the differences of the values 827/// in the operands. 828static __inline__ __m128i __DEFAULT_FN_ATTRS 829_mm_sub_epi8(__m128i __a, __m128i __b) 830{ 831 return (__m128i)((__v16qi)__a - (__v16qi)__b); 832} 833 834/// \brief Subtracts the corresponding 16-bit integer values in the operands. 835/// 836/// \headerfile <x86intrin.h> 837/// 838/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction. 839/// 840/// \param __a 841/// A 128-bit integer vector containing the minuends. 842/// \param __b 843/// A 128-bit integer vector containing the subtrahends. 844/// \returns A 128-bit integer vector containing the differences of the values 845/// in the operands. 846static __inline__ __m128i __DEFAULT_FN_ATTRS 847_mm_sub_epi16(__m128i __a, __m128i __b) 848{ 849 return (__m128i)((__v8hi)__a - (__v8hi)__b); 850} 851 852/// \brief Subtracts the corresponding 32-bit integer values in the operands. 853/// 854/// \headerfile <x86intrin.h> 855/// 856/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction. 857/// 858/// \param __a 859/// A 128-bit integer vector containing the minuends. 860/// \param __b 861/// A 128-bit integer vector containing the subtrahends. 862/// \returns A 128-bit integer vector containing the differences of the values 863/// in the operands. 864static __inline__ __m128i __DEFAULT_FN_ATTRS 865_mm_sub_epi32(__m128i __a, __m128i __b) 866{ 867 return (__m128i)((__v4si)__a - (__v4si)__b); 868} 869 870/// \brief Subtracts signed or unsigned 64-bit integer values and writes the 871/// difference to the corresponding bits in the destination. 872/// 873/// \headerfile <x86intrin.h> 874/// 875/// This intrinsic corresponds to the \c PSUBQ instruction. 876/// 877/// \param __a 878/// A 64-bit integer vector containing the minuend. 879/// \param __b 880/// A 64-bit integer vector containing the subtrahend. 881/// \returns A 64-bit integer vector containing the difference of the values in 882/// the operands. 883static __inline__ __m64 __DEFAULT_FN_ATTRS 884_mm_sub_si64(__m64 __a, __m64 __b) 885{ 886 return (__m64)__builtin_ia32_psubq(__a, __b); 887} 888 889/// \brief Subtracts the corresponding elements of two [2 x i64] vectors. 890/// 891/// \headerfile <x86intrin.h> 892/// 893/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction. 894/// 895/// \param __a 896/// A 128-bit integer vector containing the minuends. 897/// \param __b 898/// A 128-bit integer vector containing the subtrahends. 899/// \returns A 128-bit integer vector containing the differences of the values 900/// in the operands. 901static __inline__ __m128i __DEFAULT_FN_ATTRS 902_mm_sub_epi64(__m128i __a, __m128i __b) 903{ 904 return __a - __b; 905} 906 907/// \brief Subtracts corresponding 8-bit signed integer values in the input and 908/// returns the differences in the corresponding bytes in the destination. 909/// Differences greater than 7Fh are saturated to 7Fh, and differences less 910/// than 80h are saturated to 80h. 911/// 912/// \headerfile <x86intrin.h> 913/// 914/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction. 915/// 916/// \param __a 917/// A 128-bit integer vector containing the minuends. 918/// \param __b 919/// A 128-bit integer vector containing the subtrahends. 920/// \returns A 128-bit integer vector containing the differences of the values 921/// in the operands. 922static __inline__ __m128i __DEFAULT_FN_ATTRS 923_mm_subs_epi8(__m128i __a, __m128i __b) 924{ 925 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 926} 927 928/// \brief Subtracts corresponding 16-bit signed integer values in the input and 929/// returns the differences in the corresponding bytes in the destination. 930/// Differences greater than 7FFFh are saturated to 7FFFh, and values less 931/// than 8000h are saturated to 8000h. 932/// 933/// \headerfile <x86intrin.h> 934/// 935/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction. 936/// 937/// \param __a 938/// A 128-bit integer vector containing the minuends. 939/// \param __b 940/// A 128-bit integer vector containing the subtrahends. 941/// \returns A 128-bit integer vector containing the differences of the values 942/// in the operands. 943static __inline__ __m128i __DEFAULT_FN_ATTRS 944_mm_subs_epi16(__m128i __a, __m128i __b) 945{ 946 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 947} 948 949/// \brief Subtracts corresponding 8-bit unsigned integer values in the input 950/// and returns the differences in the corresponding bytes in the 951/// destination. Differences less than 00h are saturated to 00h. 952/// 953/// \headerfile <x86intrin.h> 954/// 955/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction. 956/// 957/// \param __a 958/// A 128-bit integer vector containing the minuends. 959/// \param __b 960/// A 128-bit integer vector containing the subtrahends. 961/// \returns A 128-bit integer vector containing the unsigned integer 962/// differences of the values in the operands. 963static __inline__ __m128i __DEFAULT_FN_ATTRS 964_mm_subs_epu8(__m128i __a, __m128i __b) 965{ 966 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 967} 968 969/// \brief Subtracts corresponding 16-bit unsigned integer values in the input 970/// and returns the differences in the corresponding bytes in the 971/// destination. Differences less than 0000h are saturated to 0000h. 972/// 973/// \headerfile <x86intrin.h> 974/// 975/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction. 976/// 977/// \param __a 978/// A 128-bit integer vector containing the minuends. 979/// \param __b 980/// A 128-bit integer vector containing the subtrahends. 981/// \returns A 128-bit integer vector containing the unsigned integer 982/// differences of the values in the operands. 983static __inline__ __m128i __DEFAULT_FN_ATTRS 984_mm_subs_epu16(__m128i __a, __m128i __b) 985{ 986 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 987} 988 989/// \brief Performs a bitwise AND of two 128-bit integer vectors. 990/// 991/// \headerfile <x86intrin.h> 992/// 993/// This intrinsic corresponds to the \c VPAND / PAND instruction. 994/// 995/// \param __a 996/// A 128-bit integer vector containing one of the source operands. 997/// \param __b 998/// A 128-bit integer vector containing one of the source operands. 999/// \returns A 128-bit integer vector containing the bitwise AND of the values 1000/// in both operands. 1001static __inline__ __m128i __DEFAULT_FN_ATTRS 1002_mm_and_si128(__m128i __a, __m128i __b) 1003{ 1004 return __a & __b; 1005} 1006 1007/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the 1008/// one's complement of the values contained in the first source operand. 1009/// 1010/// \headerfile <x86intrin.h> 1011/// 1012/// This intrinsic corresponds to the \c VPANDN / PANDN instruction. 1013/// 1014/// \param __a 1015/// A 128-bit vector containing the left source operand. The one's complement 1016/// of this value is used in the bitwise AND. 1017/// \param __b 1018/// A 128-bit vector containing the right source operand. 1019/// \returns A 128-bit integer vector containing the bitwise AND of the one's 1020/// complement of the first operand and the values in the second operand. 1021static __inline__ __m128i __DEFAULT_FN_ATTRS 1022_mm_andnot_si128(__m128i __a, __m128i __b) 1023{ 1024 return ~__a & __b; 1025} 1026/// \brief Performs a bitwise OR of two 128-bit integer vectors. 1027/// 1028/// \headerfile <x86intrin.h> 1029/// 1030/// This intrinsic corresponds to the \c VPOR / POR instruction. 1031/// 1032/// \param __a 1033/// A 128-bit integer vector containing one of the source operands. 1034/// \param __b 1035/// A 128-bit integer vector containing one of the source operands. 1036/// \returns A 128-bit integer vector containing the bitwise OR of the values 1037/// in both operands. 1038static __inline__ __m128i __DEFAULT_FN_ATTRS 1039_mm_or_si128(__m128i __a, __m128i __b) 1040{ 1041 return __a | __b; 1042} 1043 1044/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. 1045/// 1046/// \headerfile <x86intrin.h> 1047/// 1048/// This intrinsic corresponds to the \c VPXOR / PXOR instruction. 1049/// 1050/// \param __a 1051/// A 128-bit integer vector containing one of the source operands. 1052/// \param __b 1053/// A 128-bit integer vector containing one of the source operands. 1054/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 1055/// values in both operands. 1056static __inline__ __m128i __DEFAULT_FN_ATTRS 1057_mm_xor_si128(__m128i __a, __m128i __b) 1058{ 1059 return __a ^ __b; 1060} 1061 1062/// \brief Left-shifts the 128-bit integer vector operand by the specified 1063/// number of bytes. Low-order bits are cleared. 1064/// 1065/// \headerfile <x86intrin.h> 1066/// 1067/// \code 1068/// __m128i _mm_slli_si128(__m128i a, const int imm); 1069/// \endcode 1070/// 1071/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction. 1072/// 1073/// \param a 1074/// A 128-bit integer vector containing the source operand. 1075/// \param imm 1076/// An immediate value specifying the number of bytes to left-shift 1077/// operand a. 1078/// \returns A 128-bit integer vector containing the left-shifted value. 1079#define _mm_slli_si128(a, imm) __extension__ ({ \ 1080 (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \ 1081 (__v16qi)(__m128i)(a), \ 1082 ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \ 1083 ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \ 1084 ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \ 1085 ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \ 1086 ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \ 1087 ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \ 1088 ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \ 1089 ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \ 1090 ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \ 1091 ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \ 1092 ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \ 1093 ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \ 1094 ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \ 1095 ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \ 1096 ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \ 1097 ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); }) 1098 1099#define _mm_bslli_si128(a, imm) \ 1100 _mm_slli_si128((a), (imm)) 1101 1102/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1103/// by the specified number of bits. Low-order bits are cleared. 1104/// 1105/// \headerfile <x86intrin.h> 1106/// 1107/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1108/// 1109/// \param __a 1110/// A 128-bit integer vector containing the source operand. 1111/// \param __count 1112/// An integer value specifying the number of bits to left-shift each value 1113/// in operand __a. 1114/// \returns A 128-bit integer vector containing the left-shifted values. 1115static __inline__ __m128i __DEFAULT_FN_ATTRS 1116_mm_slli_epi16(__m128i __a, int __count) 1117{ 1118 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 1119} 1120 1121/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1122/// by the specified number of bits. Low-order bits are cleared. 1123/// 1124/// \headerfile <x86intrin.h> 1125/// 1126/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1127/// 1128/// \param __a 1129/// A 128-bit integer vector containing the source operand. 1130/// \param __count 1131/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1132/// to left-shift each value in operand __a. 1133/// \returns A 128-bit integer vector containing the left-shifted values. 1134static __inline__ __m128i __DEFAULT_FN_ATTRS 1135_mm_sll_epi16(__m128i __a, __m128i __count) 1136{ 1137 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 1138} 1139 1140/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1141/// by the specified number of bits. Low-order bits are cleared. 1142/// 1143/// \headerfile <x86intrin.h> 1144/// 1145/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1146/// 1147/// \param __a 1148/// A 128-bit integer vector containing the source operand. 1149/// \param __count 1150/// An integer value specifying the number of bits to left-shift each value 1151/// in operand __a. 1152/// \returns A 128-bit integer vector containing the left-shifted values. 1153static __inline__ __m128i __DEFAULT_FN_ATTRS 1154_mm_slli_epi32(__m128i __a, int __count) 1155{ 1156 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 1157} 1158 1159/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1160/// by the specified number of bits. Low-order bits are cleared. 1161/// 1162/// \headerfile <x86intrin.h> 1163/// 1164/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1165/// 1166/// \param __a 1167/// A 128-bit integer vector containing the source operand. 1168/// \param __count 1169/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1170/// to left-shift each value in operand __a. 1171/// \returns A 128-bit integer vector containing the left-shifted values. 1172static __inline__ __m128i __DEFAULT_FN_ATTRS 1173_mm_sll_epi32(__m128i __a, __m128i __count) 1174{ 1175 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 1176} 1177 1178/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1179/// by the specified number of bits. Low-order bits are cleared. 1180/// 1181/// \headerfile <x86intrin.h> 1182/// 1183/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1184/// 1185/// \param __a 1186/// A 128-bit integer vector containing the source operand. 1187/// \param __count 1188/// An integer value specifying the number of bits to left-shift each value 1189/// in operand __a. 1190/// \returns A 128-bit integer vector containing the left-shifted values. 1191static __inline__ __m128i __DEFAULT_FN_ATTRS 1192_mm_slli_epi64(__m128i __a, int __count) 1193{ 1194 return __builtin_ia32_psllqi128(__a, __count); 1195} 1196 1197/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1198/// by the specified number of bits. Low-order bits are cleared. 1199/// 1200/// \headerfile <x86intrin.h> 1201/// 1202/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1203/// 1204/// \param __a 1205/// A 128-bit integer vector containing the source operand. 1206/// \param __count 1207/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1208/// to left-shift each value in operand __a. 1209/// \returns A 128-bit integer vector containing the left-shifted values. 1210static __inline__ __m128i __DEFAULT_FN_ATTRS 1211_mm_sll_epi64(__m128i __a, __m128i __count) 1212{ 1213 return __builtin_ia32_psllq128(__a, __count); 1214} 1215 1216/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1217/// by the specified number of bits. High-order bits are filled with the sign 1218/// bit of the initial value. 1219/// 1220/// \headerfile <x86intrin.h> 1221/// 1222/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1223/// 1224/// \param __a 1225/// A 128-bit integer vector containing the source operand. 1226/// \param __count 1227/// An integer value specifying the number of bits to right-shift each value 1228/// in operand __a. 1229/// \returns A 128-bit integer vector containing the right-shifted values. 1230static __inline__ __m128i __DEFAULT_FN_ATTRS 1231_mm_srai_epi16(__m128i __a, int __count) 1232{ 1233 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 1234} 1235 1236/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1237/// by the specified number of bits. High-order bits are filled with the sign 1238/// bit of the initial value. 1239/// 1240/// \headerfile <x86intrin.h> 1241/// 1242/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1243/// 1244/// \param __a 1245/// A 128-bit integer vector containing the source operand. 1246/// \param __count 1247/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1248/// to right-shift each value in operand __a. 1249/// \returns A 128-bit integer vector containing the right-shifted values. 1250static __inline__ __m128i __DEFAULT_FN_ATTRS 1251_mm_sra_epi16(__m128i __a, __m128i __count) 1252{ 1253 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 1254} 1255 1256/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1257/// by the specified number of bits. High-order bits are filled with the sign 1258/// bit of the initial value. 1259/// 1260/// \headerfile <x86intrin.h> 1261/// 1262/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1263/// 1264/// \param __a 1265/// A 128-bit integer vector containing the source operand. 1266/// \param __count 1267/// An integer value specifying the number of bits to right-shift each value 1268/// in operand __a. 1269/// \returns A 128-bit integer vector containing the right-shifted values. 1270static __inline__ __m128i __DEFAULT_FN_ATTRS 1271_mm_srai_epi32(__m128i __a, int __count) 1272{ 1273 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 1274} 1275 1276/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1277/// by the specified number of bits. High-order bits are filled with the sign 1278/// bit of the initial value. 1279/// 1280/// \headerfile <x86intrin.h> 1281/// 1282/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1283/// 1284/// \param __a 1285/// A 128-bit integer vector containing the source operand. 1286/// \param __count 1287/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1288/// to right-shift each value in operand __a. 1289/// \returns A 128-bit integer vector containing the right-shifted values. 1290static __inline__ __m128i __DEFAULT_FN_ATTRS 1291_mm_sra_epi32(__m128i __a, __m128i __count) 1292{ 1293 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 1294} 1295 1296/// \brief Right-shifts the 128-bit integer vector operand by the specified 1297/// number of bytes. High-order bits are cleared. 1298/// 1299/// \headerfile <x86intrin.h> 1300/// 1301/// \code 1302/// __m128i _mm_srli_si128(__m128i a, const int imm); 1303/// \endcode 1304/// 1305/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction. 1306/// 1307/// \param a 1308/// A 128-bit integer vector containing the source operand. 1309/// \param imm 1310/// An immediate value specifying the number of bytes to right-shift operand 1311/// a. 1312/// \returns A 128-bit integer vector containing the right-shifted value. 1313#define _mm_srli_si128(a, imm) __extension__ ({ \ 1314 (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \ 1315 (__v16qi)_mm_setzero_si128(), \ 1316 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \ 1317 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \ 1318 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \ 1319 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \ 1320 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \ 1321 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \ 1322 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \ 1323 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \ 1324 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \ 1325 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \ 1326 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \ 1327 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \ 1328 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \ 1329 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \ 1330 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \ 1331 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); }) 1332 1333#define _mm_bsrli_si128(a, imm) \ 1334 _mm_srli_si128((a), (imm)) 1335 1336/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1337/// operand by the specified number of bits. High-order bits are cleared. 1338/// 1339/// \headerfile <x86intrin.h> 1340/// 1341/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1342/// 1343/// \param __a 1344/// A 128-bit integer vector containing the source operand. 1345/// \param __count 1346/// An integer value specifying the number of bits to right-shift each value 1347/// in operand __a. 1348/// \returns A 128-bit integer vector containing the right-shifted values. 1349static __inline__ __m128i __DEFAULT_FN_ATTRS 1350_mm_srli_epi16(__m128i __a, int __count) 1351{ 1352 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 1353} 1354 1355/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1356/// operand by the specified number of bits. High-order bits are cleared. 1357/// 1358/// \headerfile <x86intrin.h> 1359/// 1360/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1361/// 1362/// \param __a 1363/// A 128-bit integer vector containing the source operand. 1364/// \param __count 1365/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1366/// to right-shift each value in operand __a. 1367/// \returns A 128-bit integer vector containing the right-shifted values. 1368static __inline__ __m128i __DEFAULT_FN_ATTRS 1369_mm_srl_epi16(__m128i __a, __m128i __count) 1370{ 1371 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 1372} 1373 1374/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1375/// operand by the specified number of bits. High-order bits are cleared. 1376/// 1377/// \headerfile <x86intrin.h> 1378/// 1379/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 1380/// 1381/// \param __a 1382/// A 128-bit integer vector containing the source operand. 1383/// \param __count 1384/// An integer value specifying the number of bits to right-shift each value 1385/// in operand __a. 1386/// \returns A 128-bit integer vector containing the right-shifted values. 1387static __inline__ __m128i __DEFAULT_FN_ATTRS 1388_mm_srli_epi32(__m128i __a, int __count) 1389{ 1390 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 1391} 1392 1393/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1394/// operand by the specified number of bits. High-order bits are cleared. 1395/// 1396/// \headerfile <x86intrin.h> 1397/// 1398/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 1399/// 1400/// \param __a 1401/// A 128-bit integer vector containing the source operand. 1402/// \param __count 1403/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1404/// to right-shift each value in operand __a. 1405/// \returns A 128-bit integer vector containing the right-shifted values. 1406static __inline__ __m128i __DEFAULT_FN_ATTRS 1407_mm_srl_epi32(__m128i __a, __m128i __count) 1408{ 1409 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 1410} 1411 1412/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1413/// operand by the specified number of bits. High-order bits are cleared. 1414/// 1415/// \headerfile <x86intrin.h> 1416/// 1417/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1418/// 1419/// \param __a 1420/// A 128-bit integer vector containing the source operand. 1421/// \param __count 1422/// An integer value specifying the number of bits to right-shift each value 1423/// in operand __a. 1424/// \returns A 128-bit integer vector containing the right-shifted values. 1425static __inline__ __m128i __DEFAULT_FN_ATTRS 1426_mm_srli_epi64(__m128i __a, int __count) 1427{ 1428 return __builtin_ia32_psrlqi128(__a, __count); 1429} 1430 1431/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1432/// operand by the specified number of bits. High-order bits are cleared. 1433/// 1434/// \headerfile <x86intrin.h> 1435/// 1436/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1437/// 1438/// \param __a 1439/// A 128-bit integer vector containing the source operand. 1440/// \param __count 1441/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1442/// to right-shift each value in operand __a. 1443/// \returns A 128-bit integer vector containing the right-shifted values. 1444static __inline__ __m128i __DEFAULT_FN_ATTRS 1445_mm_srl_epi64(__m128i __a, __m128i __count) 1446{ 1447 return __builtin_ia32_psrlq128(__a, __count); 1448} 1449 1450/// \brief Compares each of the corresponding 8-bit values of the 128-bit 1451/// integer vectors for equality. Each comparison yields 0h for false, FFh 1452/// for true. 1453/// 1454/// \headerfile <x86intrin.h> 1455/// 1456/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction. 1457/// 1458/// \param __a 1459/// A 128-bit integer vector. 1460/// \param __b 1461/// A 128-bit integer vector. 1462/// \returns A 128-bit integer vector containing the comparison results. 1463static __inline__ __m128i __DEFAULT_FN_ATTRS 1464_mm_cmpeq_epi8(__m128i __a, __m128i __b) 1465{ 1466 return (__m128i)((__v16qi)__a == (__v16qi)__b); 1467} 1468 1469/// \brief Compares each of the corresponding 16-bit values of the 128-bit 1470/// integer vectors for equality. Each comparison yields 0h for false, FFFFh 1471/// for true. 1472/// 1473/// \headerfile <x86intrin.h> 1474/// 1475/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction. 1476/// 1477/// \param __a 1478/// A 128-bit integer vector. 1479/// \param __b 1480/// A 128-bit integer vector. 1481/// \returns A 128-bit integer vector containing the comparison results. 1482static __inline__ __m128i __DEFAULT_FN_ATTRS 1483_mm_cmpeq_epi16(__m128i __a, __m128i __b) 1484{ 1485 return (__m128i)((__v8hi)__a == (__v8hi)__b); 1486} 1487 1488/// \brief Compares each of the corresponding 32-bit values of the 128-bit 1489/// integer vectors for equality. Each comparison yields 0h for false, 1490/// FFFFFFFFh for true. 1491/// 1492/// \headerfile <x86intrin.h> 1493/// 1494/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction. 1495/// 1496/// \param __a 1497/// A 128-bit integer vector. 1498/// \param __b 1499/// A 128-bit integer vector. 1500/// \returns A 128-bit integer vector containing the comparison results. 1501static __inline__ __m128i __DEFAULT_FN_ATTRS 1502_mm_cmpeq_epi32(__m128i __a, __m128i __b) 1503{ 1504 return (__m128i)((__v4si)__a == (__v4si)__b); 1505} 1506 1507/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1508/// integer vectors to determine if the values in the first operand are 1509/// greater than those in the second operand. Each comparison yields 0h for 1510/// false, FFh for true. 1511/// 1512/// \headerfile <x86intrin.h> 1513/// 1514/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1515/// 1516/// \param __a 1517/// A 128-bit integer vector. 1518/// \param __b 1519/// A 128-bit integer vector. 1520/// \returns A 128-bit integer vector containing the comparison results. 1521static __inline__ __m128i __DEFAULT_FN_ATTRS 1522_mm_cmpgt_epi8(__m128i __a, __m128i __b) 1523{ 1524 /* This function always performs a signed comparison, but __v16qi is a char 1525 which may be signed or unsigned, so use __v16qs. */ 1526 return (__m128i)((__v16qs)__a > (__v16qs)__b); 1527} 1528 1529/// \brief Compares each of the corresponding signed 16-bit values of the 1530/// 128-bit integer vectors to determine if the values in the first operand 1531/// are greater than those in the second operand. Each comparison yields 0h 1532/// for false, FFFFh for true. 1533/// 1534/// \headerfile <x86intrin.h> 1535/// 1536/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1537/// 1538/// \param __a 1539/// A 128-bit integer vector. 1540/// \param __b 1541/// A 128-bit integer vector. 1542/// \returns A 128-bit integer vector containing the comparison results. 1543static __inline__ __m128i __DEFAULT_FN_ATTRS 1544_mm_cmpgt_epi16(__m128i __a, __m128i __b) 1545{ 1546 return (__m128i)((__v8hi)__a > (__v8hi)__b); 1547} 1548 1549/// \brief Compares each of the corresponding signed 32-bit values of the 1550/// 128-bit integer vectors to determine if the values in the first operand 1551/// are greater than those in the second operand. Each comparison yields 0h 1552/// for false, FFFFFFFFh for true. 1553/// 1554/// \headerfile <x86intrin.h> 1555/// 1556/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1557/// 1558/// \param __a 1559/// A 128-bit integer vector. 1560/// \param __b 1561/// A 128-bit integer vector. 1562/// \returns A 128-bit integer vector containing the comparison results. 1563static __inline__ __m128i __DEFAULT_FN_ATTRS 1564_mm_cmpgt_epi32(__m128i __a, __m128i __b) 1565{ 1566 return (__m128i)((__v4si)__a > (__v4si)__b); 1567} 1568 1569/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1570/// integer vectors to determine if the values in the first operand are less 1571/// than those in the second operand. Each comparison yields 0h for false, 1572/// FFh for true. 1573/// 1574/// \headerfile <x86intrin.h> 1575/// 1576/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1577/// 1578/// \param __a 1579/// A 128-bit integer vector. 1580/// \param __b 1581/// A 128-bit integer vector. 1582/// \returns A 128-bit integer vector containing the comparison results. 1583static __inline__ __m128i __DEFAULT_FN_ATTRS 1584_mm_cmplt_epi8(__m128i __a, __m128i __b) 1585{ 1586 return _mm_cmpgt_epi8(__b, __a); 1587} 1588 1589/// \brief Compares each of the corresponding signed 16-bit values of the 1590/// 128-bit integer vectors to determine if the values in the first operand 1591/// are less than those in the second operand. Each comparison yields 0h for 1592/// false, FFFFh for true. 1593/// 1594/// \headerfile <x86intrin.h> 1595/// 1596/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1597/// 1598/// \param __a 1599/// A 128-bit integer vector. 1600/// \param __b 1601/// A 128-bit integer vector. 1602/// \returns A 128-bit integer vector containing the comparison results. 1603static __inline__ __m128i __DEFAULT_FN_ATTRS 1604_mm_cmplt_epi16(__m128i __a, __m128i __b) 1605{ 1606 return _mm_cmpgt_epi16(__b, __a); 1607} 1608 1609/// \brief Compares each of the corresponding signed 32-bit values of the 1610/// 128-bit integer vectors to determine if the values in the first operand 1611/// are less than those in the second operand. Each comparison yields 0h for 1612/// false, FFFFFFFFh for true. 1613/// 1614/// \headerfile <x86intrin.h> 1615/// 1616/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1617/// 1618/// \param __a 1619/// A 128-bit integer vector. 1620/// \param __b 1621/// A 128-bit integer vector. 1622/// \returns A 128-bit integer vector containing the comparison results. 1623static __inline__ __m128i __DEFAULT_FN_ATTRS 1624_mm_cmplt_epi32(__m128i __a, __m128i __b) 1625{ 1626 return _mm_cmpgt_epi32(__b, __a); 1627} 1628 1629#ifdef __x86_64__ 1630/// \brief Converts a 64-bit signed integer value from the second operand into a 1631/// double-precision value and returns it in the lower element of a [2 x 1632/// double] vector; the upper element of the returned vector is copied from 1633/// the upper element of the first operand. 1634/// 1635/// \headerfile <x86intrin.h> 1636/// 1637/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction. 1638/// 1639/// \param __a 1640/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 1641/// copied to the upper 64 bits of the destination. 1642/// \param __b 1643/// A 64-bit signed integer operand containing the value to be converted. 1644/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 1645/// converted value of the second operand. The upper 64 bits are copied from 1646/// the upper 64 bits of the first operand. 1647static __inline__ __m128d __DEFAULT_FN_ATTRS 1648_mm_cvtsi64_sd(__m128d __a, long long __b) 1649{ 1650 __a[0] = __b; 1651 return __a; 1652} 1653 1654/// \brief Converts the first (lower) element of a vector of [2 x double] into a 1655/// 64-bit signed integer value, according to the current rounding mode. 1656/// 1657/// \headerfile <x86intrin.h> 1658/// 1659/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction. 1660/// 1661/// \param __a 1662/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1663/// conversion. 1664/// \returns A 64-bit signed integer containing the converted value. 1665static __inline__ long long __DEFAULT_FN_ATTRS 1666_mm_cvtsd_si64(__m128d __a) 1667{ 1668 return __builtin_ia32_cvtsd2si64(__a); 1669} 1670 1671/// \brief Converts the first (lower) element of a vector of [2 x double] into a 1672/// 64-bit signed integer value, truncating the result when it is inexact. 1673/// 1674/// \headerfile <x86intrin.h> 1675/// 1676/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction. 1677/// 1678/// \param __a 1679/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1680/// conversion. 1681/// \returns A 64-bit signed integer containing the converted value. 1682static __inline__ long long __DEFAULT_FN_ATTRS 1683_mm_cvttsd_si64(__m128d __a) 1684{ 1685 return __a[0]; 1686} 1687#endif 1688 1689/// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. 1690/// 1691/// \headerfile <x86intrin.h> 1692/// 1693/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction. 1694/// 1695/// \param __a 1696/// A 128-bit integer vector. 1697/// \returns A 128-bit vector of [4 x float] containing the converted values. 1698static __inline__ __m128 __DEFAULT_FN_ATTRS 1699_mm_cvtepi32_ps(__m128i __a) 1700{ 1701 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1702} 1703 1704/// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. 1705/// 1706/// \headerfile <x86intrin.h> 1707/// 1708/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction. 1709/// 1710/// \param __a 1711/// A 128-bit vector of [4 x float]. 1712/// \returns A 128-bit integer vector of [4 x i32] containing the converted 1713/// values. 1714static __inline__ __m128i __DEFAULT_FN_ATTRS 1715_mm_cvtps_epi32(__m128 __a) 1716{ 1717 return (__m128i)__builtin_ia32_cvtps2dq(__a); 1718} 1719 1720/// \brief Converts a vector of [4 x float] into a vector of [4 x i32], 1721/// truncating the result when it is inexact. 1722/// 1723/// \headerfile <x86intrin.h> 1724/// 1725/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction. 1726/// 1727/// \param __a 1728/// A 128-bit vector of [4 x float]. 1729/// \returns A 128-bit vector of [4 x i32] containing the converted values. 1730static __inline__ __m128i __DEFAULT_FN_ATTRS 1731_mm_cvttps_epi32(__m128 __a) 1732{ 1733 return (__m128i)__builtin_ia32_cvttps2dq(__a); 1734} 1735 1736/// \brief Returns a vector of [4 x i32] where the lowest element is the input 1737/// operand and the remaining elements are zero. 1738/// 1739/// \headerfile <x86intrin.h> 1740/// 1741/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1742/// 1743/// \param __a 1744/// A 32-bit signed integer operand. 1745/// \returns A 128-bit vector of [4 x i32]. 1746static __inline__ __m128i __DEFAULT_FN_ATTRS 1747_mm_cvtsi32_si128(int __a) 1748{ 1749 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1750} 1751 1752#ifdef __x86_64__ 1753/// \brief Returns a vector of [2 x i64] where the lower element is the input 1754/// operand and the upper element is zero. 1755/// 1756/// \headerfile <x86intrin.h> 1757/// 1758/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1759/// 1760/// \param __a 1761/// A 64-bit signed integer operand containing the value to be converted. 1762/// \returns A 128-bit vector of [2 x i64] containing the converted value. 1763static __inline__ __m128i __DEFAULT_FN_ATTRS 1764_mm_cvtsi64_si128(long long __a) 1765{ 1766 return (__m128i){ __a, 0 }; 1767} 1768#endif 1769 1770/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a 1771/// 32-bit signed integer value. 1772/// 1773/// \headerfile <x86intrin.h> 1774/// 1775/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1776/// 1777/// \param __a 1778/// A vector of [4 x i32]. The least significant 32 bits are moved to the 1779/// destination. 1780/// \returns A 32-bit signed integer containing the moved value. 1781static __inline__ int __DEFAULT_FN_ATTRS 1782_mm_cvtsi128_si32(__m128i __a) 1783{ 1784 __v4si __b = (__v4si)__a; 1785 return __b[0]; 1786} 1787 1788#ifdef __x86_64__ 1789/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a 1790/// 64-bit signed integer value. 1791/// 1792/// \headerfile <x86intrin.h> 1793/// 1794/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1795/// 1796/// \param __a 1797/// A vector of [2 x i64]. The least significant 64 bits are moved to the 1798/// destination. 1799/// \returns A 64-bit signed integer containing the moved value. 1800static __inline__ long long __DEFAULT_FN_ATTRS 1801_mm_cvtsi128_si64(__m128i __a) 1802{ 1803 return __a[0]; 1804} 1805#endif 1806 1807/// \brief Moves packed integer values from an aligned 128-bit memory location 1808/// to elements in a 128-bit integer vector. 1809/// 1810/// \headerfile <x86intrin.h> 1811/// 1812/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction. 1813/// 1814/// \param __p 1815/// An aligned pointer to a memory location containing integer values. 1816/// \returns A 128-bit integer vector containing the moved values. 1817static __inline__ __m128i __DEFAULT_FN_ATTRS 1818_mm_load_si128(__m128i const *__p) 1819{ 1820 return *__p; 1821} 1822 1823/// \brief Moves packed integer values from an unaligned 128-bit memory location 1824/// to elements in a 128-bit integer vector. 1825/// 1826/// \headerfile <x86intrin.h> 1827/// 1828/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction. 1829/// 1830/// \param __p 1831/// A pointer to a memory location containing integer values. 1832/// \returns A 128-bit integer vector containing the moved values. 1833static __inline__ __m128i __DEFAULT_FN_ATTRS 1834_mm_loadu_si128(__m128i const *__p) 1835{ 1836 struct __loadu_si128 { 1837 __m128i __v; 1838 } __attribute__((__packed__, __may_alias__)); 1839 return ((struct __loadu_si128*)__p)->__v; 1840} 1841 1842/// \brief Returns a vector of [2 x i64] where the lower element is taken from 1843/// the lower element of the operand, and the upper element is zero. 1844/// 1845/// \headerfile <x86intrin.h> 1846/// 1847/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1848/// 1849/// \param __p 1850/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 1851/// the destination. 1852/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 1853/// moved value. The higher order bits are cleared. 1854static __inline__ __m128i __DEFAULT_FN_ATTRS 1855_mm_loadl_epi64(__m128i const *__p) 1856{ 1857 struct __mm_loadl_epi64_struct { 1858 long long __u; 1859 } __attribute__((__packed__, __may_alias__)); 1860 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1861} 1862 1863/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. 1864/// This could be used as an argument to another intrinsic function where the 1865/// argument is required but the value is not actually used. 1866/// 1867/// \headerfile <x86intrin.h> 1868/// 1869/// This intrinsic has no corresponding instruction. 1870/// 1871/// \returns A 128-bit vector of [4 x i32] with unspecified content. 1872static __inline__ __m128i __DEFAULT_FN_ATTRS 1873_mm_undefined_si128() 1874{ 1875 return (__m128i)__builtin_ia32_undef128(); 1876} 1877 1878/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1879/// the specified 64-bit integer values. 1880/// 1881/// \headerfile <x86intrin.h> 1882/// 1883/// This intrinsic is a utility function and does not correspond to a specific 1884/// instruction. 1885/// 1886/// \param __q1 1887/// A 64-bit integer value used to initialize the upper 64 bits of the 1888/// destination vector of [2 x i64]. 1889/// \param __q0 1890/// A 64-bit integer value used to initialize the lower 64 bits of the 1891/// destination vector of [2 x i64]. 1892/// \returns An initialized 128-bit vector of [2 x i64] containing the values 1893/// provided in the operands. 1894static __inline__ __m128i __DEFAULT_FN_ATTRS 1895_mm_set_epi64x(long long __q1, long long __q0) 1896{ 1897 return (__m128i){ __q0, __q1 }; 1898} 1899 1900/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1901/// the specified 64-bit integer values. 1902/// 1903/// \headerfile <x86intrin.h> 1904/// 1905/// This intrinsic is a utility function and does not correspond to a specific 1906/// instruction. 1907/// 1908/// \param __q1 1909/// A 64-bit integer value used to initialize the upper 64 bits of the 1910/// destination vector of [2 x i64]. 1911/// \param __q0 1912/// A 64-bit integer value used to initialize the lower 64 bits of the 1913/// destination vector of [2 x i64]. 1914/// \returns An initialized 128-bit vector of [2 x i64] containing the values 1915/// provided in the operands. 1916static __inline__ __m128i __DEFAULT_FN_ATTRS 1917_mm_set_epi64(__m64 __q1, __m64 __q0) 1918{ 1919 return (__m128i){ (long long)__q0, (long long)__q1 }; 1920} 1921 1922/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 1923/// the specified 32-bit integer values. 1924/// 1925/// \headerfile <x86intrin.h> 1926/// 1927/// This intrinsic is a utility function and does not correspond to a specific 1928/// instruction. 1929/// 1930/// \param __i3 1931/// A 32-bit integer value used to initialize bits [127:96] of the 1932/// destination vector. 1933/// \param __i2 1934/// A 32-bit integer value used to initialize bits [95:64] of the destination 1935/// vector. 1936/// \param __i1 1937/// A 32-bit integer value used to initialize bits [63:32] of the destination 1938/// vector. 1939/// \param __i0 1940/// A 32-bit integer value used to initialize bits [31:0] of the destination 1941/// vector. 1942/// \returns An initialized 128-bit vector of [4 x i32] containing the values 1943/// provided in the operands. 1944static __inline__ __m128i __DEFAULT_FN_ATTRS 1945_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 1946{ 1947 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 1948} 1949 1950/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 1951/// the specified 16-bit integer values. 1952/// 1953/// \headerfile <x86intrin.h> 1954/// 1955/// This intrinsic is a utility function and does not correspond to a specific 1956/// instruction. 1957/// 1958/// \param __w7 1959/// A 16-bit integer value used to initialize bits [127:112] of the 1960/// destination vector. 1961/// \param __w6 1962/// A 16-bit integer value used to initialize bits [111:96] of the 1963/// destination vector. 1964/// \param __w5 1965/// A 16-bit integer value used to initialize bits [95:80] of the destination 1966/// vector. 1967/// \param __w4 1968/// A 16-bit integer value used to initialize bits [79:64] of the destination 1969/// vector. 1970/// \param __w3 1971/// A 16-bit integer value used to initialize bits [63:48] of the destination 1972/// vector. 1973/// \param __w2 1974/// A 16-bit integer value used to initialize bits [47:32] of the destination 1975/// vector. 1976/// \param __w1 1977/// A 16-bit integer value used to initialize bits [31:16] of the destination 1978/// vector. 1979/// \param __w0 1980/// A 16-bit integer value used to initialize bits [15:0] of the destination 1981/// vector. 1982/// \returns An initialized 128-bit vector of [8 x i16] containing the values 1983/// provided in the operands. 1984static __inline__ __m128i __DEFAULT_FN_ATTRS 1985_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 1986{ 1987 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 1988} 1989 1990/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 1991/// the specified 8-bit integer values. 1992/// 1993/// \headerfile <x86intrin.h> 1994/// 1995/// This intrinsic is a utility function and does not correspond to a specific 1996/// instruction. 1997/// 1998/// \param __b15 1999/// Initializes bits [127:120] of the destination vector. 2000/// \param __b14 2001/// Initializes bits [119:112] of the destination vector. 2002/// \param __b13 2003/// Initializes bits [111:104] of the destination vector. 2004/// \param __b12 2005/// Initializes bits [103:96] of the destination vector. 2006/// \param __b11 2007/// Initializes bits [95:88] of the destination vector. 2008/// \param __b10 2009/// Initializes bits [87:80] of the destination vector. 2010/// \param __b9 2011/// Initializes bits [79:72] of the destination vector. 2012/// \param __b8 2013/// Initializes bits [71:64] of the destination vector. 2014/// \param __b7 2015/// Initializes bits [63:56] of the destination vector. 2016/// \param __b6 2017/// Initializes bits [55:48] of the destination vector. 2018/// \param __b5 2019/// Initializes bits [47:40] of the destination vector. 2020/// \param __b4 2021/// Initializes bits [39:32] of the destination vector. 2022/// \param __b3 2023/// Initializes bits [31:24] of the destination vector. 2024/// \param __b2 2025/// Initializes bits [23:16] of the destination vector. 2026/// \param __b1 2027/// Initializes bits [15:8] of the destination vector. 2028/// \param __b0 2029/// Initializes bits [7:0] of the destination vector. 2030/// \returns An initialized 128-bit vector of [16 x i8] containing the values 2031/// provided in the operands. 2032static __inline__ __m128i __DEFAULT_FN_ATTRS 2033_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 2034{ 2035 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 2036} 2037 2038/// \brief Initializes both values in a 128-bit integer vector with the 2039/// specified 64-bit integer value. 2040/// 2041/// \headerfile <x86intrin.h> 2042/// 2043/// This intrinsic is a utility function and does not correspond to a specific 2044/// instruction. 2045/// 2046/// \param __q 2047/// Integer value used to initialize the elements of the destination integer 2048/// vector. 2049/// \returns An initialized 128-bit integer vector of [2 x i64] with both 2050/// elements containing the value provided in the operand. 2051static __inline__ __m128i __DEFAULT_FN_ATTRS 2052_mm_set1_epi64x(long long __q) 2053{ 2054 return (__m128i){ __q, __q }; 2055} 2056 2057/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the 2058/// specified 64-bit value. 2059/// 2060/// \headerfile <x86intrin.h> 2061/// 2062/// This intrinsic is a utility function and does not correspond to a specific 2063/// instruction. 2064/// 2065/// \param __q 2066/// A 64-bit value used to initialize the elements of the destination integer 2067/// vector. 2068/// \returns An initialized 128-bit vector of [2 x i64] with all elements 2069/// containing the value provided in the operand. 2070static __inline__ __m128i __DEFAULT_FN_ATTRS 2071_mm_set1_epi64(__m64 __q) 2072{ 2073 return (__m128i){ (long long)__q, (long long)__q }; 2074} 2075 2076/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the 2077/// specified 32-bit value. 2078/// 2079/// \headerfile <x86intrin.h> 2080/// 2081/// This intrinsic is a utility function and does not correspond to a specific 2082/// instruction. 2083/// 2084/// \param __i 2085/// A 32-bit value used to initialize the elements of the destination integer 2086/// vector. 2087/// \returns An initialized 128-bit vector of [4 x i32] with all elements 2088/// containing the value provided in the operand. 2089static __inline__ __m128i __DEFAULT_FN_ATTRS 2090_mm_set1_epi32(int __i) 2091{ 2092 return (__m128i)(__v4si){ __i, __i, __i, __i }; 2093} 2094 2095/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the 2096/// specified 16-bit value. 2097/// 2098/// \headerfile <x86intrin.h> 2099/// 2100/// This intrinsic is a utility function and does not correspond to a specific 2101/// instruction. 2102/// 2103/// \param __w 2104/// A 16-bit value used to initialize the elements of the destination integer 2105/// vector. 2106/// \returns An initialized 128-bit vector of [8 x i16] with all elements 2107/// containing the value provided in the operand. 2108static __inline__ __m128i __DEFAULT_FN_ATTRS 2109_mm_set1_epi16(short __w) 2110{ 2111 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 2112} 2113 2114/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the 2115/// specified 8-bit value. 2116/// 2117/// \headerfile <x86intrin.h> 2118/// 2119/// This intrinsic is a utility function and does not correspond to a specific 2120/// instruction. 2121/// 2122/// \param __b 2123/// An 8-bit value used to initialize the elements of the destination integer 2124/// vector. 2125/// \returns An initialized 128-bit vector of [16 x i8] with all elements 2126/// containing the value provided in the operand. 2127static __inline__ __m128i __DEFAULT_FN_ATTRS 2128_mm_set1_epi8(char __b) 2129{ 2130 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 2131} 2132 2133static __inline__ __m128i __DEFAULT_FN_ATTRS 2134_mm_setr_epi64(__m64 __q0, __m64 __q1) 2135{ 2136 return (__m128i){ (long long)__q0, (long long)__q1 }; 2137} 2138 2139static __inline__ __m128i __DEFAULT_FN_ATTRS 2140_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 2141{ 2142 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 2143} 2144 2145static __inline__ __m128i __DEFAULT_FN_ATTRS 2146_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 2147{ 2148 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 2149} 2150 2151static __inline__ __m128i __DEFAULT_FN_ATTRS 2152_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 2153{ 2154 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 2155} 2156 2157static __inline__ __m128i __DEFAULT_FN_ATTRS 2158_mm_setzero_si128(void) 2159{ 2160 return (__m128i){ 0LL, 0LL }; 2161} 2162 2163static __inline__ void __DEFAULT_FN_ATTRS 2164_mm_store_si128(__m128i *__p, __m128i __b) 2165{ 2166 *__p = __b; 2167} 2168 2169static __inline__ void __DEFAULT_FN_ATTRS 2170_mm_storeu_si128(__m128i *__p, __m128i __b) 2171{ 2172 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); 2173} 2174 2175static __inline__ void __DEFAULT_FN_ATTRS 2176_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 2177{ 2178 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 2179} 2180 2181static __inline__ void __DEFAULT_FN_ATTRS 2182_mm_storel_epi64(__m128i *__p, __m128i __a) 2183{ 2184 struct __mm_storel_epi64_struct { 2185 long long __u; 2186 } __attribute__((__packed__, __may_alias__)); 2187 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 2188} 2189 2190static __inline__ void __DEFAULT_FN_ATTRS 2191_mm_stream_pd(double *__p, __m128d __a) 2192{ 2193 __builtin_ia32_movntpd(__p, __a); 2194} 2195 2196static __inline__ void __DEFAULT_FN_ATTRS 2197_mm_stream_si128(__m128i *__p, __m128i __a) 2198{ 2199 __builtin_ia32_movntdq(__p, __a); 2200} 2201 2202static __inline__ void __DEFAULT_FN_ATTRS 2203_mm_stream_si32(int *__p, int __a) 2204{ 2205 __builtin_ia32_movnti(__p, __a); 2206} 2207 2208#ifdef __x86_64__ 2209static __inline__ void __DEFAULT_FN_ATTRS 2210_mm_stream_si64(long long *__p, long long __a) 2211{ 2212 __builtin_ia32_movnti64(__p, __a); 2213} 2214#endif 2215 2216static __inline__ void __DEFAULT_FN_ATTRS 2217_mm_clflush(void const *__p) 2218{ 2219 __builtin_ia32_clflush(__p); 2220} 2221 2222static __inline__ void __DEFAULT_FN_ATTRS 2223_mm_lfence(void) 2224{ 2225 __builtin_ia32_lfence(); 2226} 2227 2228static __inline__ void __DEFAULT_FN_ATTRS 2229_mm_mfence(void) 2230{ 2231 __builtin_ia32_mfence(); 2232} 2233 2234static __inline__ __m128i __DEFAULT_FN_ATTRS 2235_mm_packs_epi16(__m128i __a, __m128i __b) 2236{ 2237 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 2238} 2239 2240static __inline__ __m128i __DEFAULT_FN_ATTRS 2241_mm_packs_epi32(__m128i __a, __m128i __b) 2242{ 2243 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 2244} 2245 2246static __inline__ __m128i __DEFAULT_FN_ATTRS 2247_mm_packus_epi16(__m128i __a, __m128i __b) 2248{ 2249 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 2250} 2251 2252static __inline__ int __DEFAULT_FN_ATTRS 2253_mm_extract_epi16(__m128i __a, int __imm) 2254{ 2255 __v8hi __b = (__v8hi)__a; 2256 return (unsigned short)__b[__imm & 7]; 2257} 2258 2259static __inline__ __m128i __DEFAULT_FN_ATTRS 2260_mm_insert_epi16(__m128i __a, int __b, int __imm) 2261{ 2262 __v8hi __c = (__v8hi)__a; 2263 __c[__imm & 7] = __b; 2264 return (__m128i)__c; 2265} 2266 2267static __inline__ int __DEFAULT_FN_ATTRS 2268_mm_movemask_epi8(__m128i __a) 2269{ 2270 return __builtin_ia32_pmovmskb128((__v16qi)__a); 2271} 2272 2273#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 2274 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ 2275 (__v4si)_mm_setzero_si128(), \ 2276 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 2277 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 2278 2279#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 2280 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 2281 (__v8hi)_mm_setzero_si128(), \ 2282 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 2283 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 2284 4, 5, 6, 7); }) 2285 2286#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 2287 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 2288 (__v8hi)_mm_setzero_si128(), \ 2289 0, 1, 2, 3, \ 2290 4 + (((imm) & 0x03) >> 0), \ 2291 4 + (((imm) & 0x0c) >> 2), \ 2292 4 + (((imm) & 0x30) >> 4), \ 2293 4 + (((imm) & 0xc0) >> 6)); }) 2294 2295static __inline__ __m128i __DEFAULT_FN_ATTRS 2296_mm_unpackhi_epi8(__m128i __a, __m128i __b) 2297{ 2298 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 2299} 2300 2301static __inline__ __m128i __DEFAULT_FN_ATTRS 2302_mm_unpackhi_epi16(__m128i __a, __m128i __b) 2303{ 2304 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 2305} 2306 2307static __inline__ __m128i __DEFAULT_FN_ATTRS 2308_mm_unpackhi_epi32(__m128i __a, __m128i __b) 2309{ 2310 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 2311} 2312 2313static __inline__ __m128i __DEFAULT_FN_ATTRS 2314_mm_unpackhi_epi64(__m128i __a, __m128i __b) 2315{ 2316 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); 2317} 2318 2319static __inline__ __m128i __DEFAULT_FN_ATTRS 2320_mm_unpacklo_epi8(__m128i __a, __m128i __b) 2321{ 2322 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 2323} 2324 2325static __inline__ __m128i __DEFAULT_FN_ATTRS 2326_mm_unpacklo_epi16(__m128i __a, __m128i __b) 2327{ 2328 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 2329} 2330 2331static __inline__ __m128i __DEFAULT_FN_ATTRS 2332_mm_unpacklo_epi32(__m128i __a, __m128i __b) 2333{ 2334 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 2335} 2336 2337static __inline__ __m128i __DEFAULT_FN_ATTRS 2338_mm_unpacklo_epi64(__m128i __a, __m128i __b) 2339{ 2340 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); 2341} 2342 2343static __inline__ __m64 __DEFAULT_FN_ATTRS 2344_mm_movepi64_pi64(__m128i __a) 2345{ 2346 return (__m64)__a[0]; 2347} 2348 2349static __inline__ __m128i __DEFAULT_FN_ATTRS 2350_mm_movpi64_epi64(__m64 __a) 2351{ 2352 return (__m128i){ (long long)__a, 0 }; 2353} 2354 2355static __inline__ __m128i __DEFAULT_FN_ATTRS 2356_mm_move_epi64(__m128i __a) 2357{ 2358 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); 2359} 2360 2361static __inline__ __m128d __DEFAULT_FN_ATTRS 2362_mm_unpackhi_pd(__m128d __a, __m128d __b) 2363{ 2364 return __builtin_shufflevector(__a, __b, 1, 2+1); 2365} 2366 2367static __inline__ __m128d __DEFAULT_FN_ATTRS 2368_mm_unpacklo_pd(__m128d __a, __m128d __b) 2369{ 2370 return __builtin_shufflevector(__a, __b, 0, 2+0); 2371} 2372 2373static __inline__ int __DEFAULT_FN_ATTRS 2374_mm_movemask_pd(__m128d __a) 2375{ 2376 return __builtin_ia32_movmskpd(__a); 2377} 2378 2379#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 2380 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 2381 (i) & 1, (((i) & 2) >> 1) + 2); }) 2382 2383static __inline__ __m128 __DEFAULT_FN_ATTRS 2384_mm_castpd_ps(__m128d __a) 2385{ 2386 return (__m128)__a; 2387} 2388 2389static __inline__ __m128i __DEFAULT_FN_ATTRS 2390_mm_castpd_si128(__m128d __a) 2391{ 2392 return (__m128i)__a; 2393} 2394 2395static __inline__ __m128d __DEFAULT_FN_ATTRS 2396_mm_castps_pd(__m128 __a) 2397{ 2398 return (__m128d)__a; 2399} 2400 2401static __inline__ __m128i __DEFAULT_FN_ATTRS 2402_mm_castps_si128(__m128 __a) 2403{ 2404 return (__m128i)__a; 2405} 2406 2407static __inline__ __m128 __DEFAULT_FN_ATTRS 2408_mm_castsi128_ps(__m128i __a) 2409{ 2410 return (__m128)__a; 2411} 2412 2413static __inline__ __m128d __DEFAULT_FN_ATTRS 2414_mm_castsi128_pd(__m128i __a) 2415{ 2416 return (__m128d)__a; 2417} 2418 2419static __inline__ void __DEFAULT_FN_ATTRS 2420_mm_pause(void) 2421{ 2422 __builtin_ia32_pause(); 2423} 2424 2425#undef __DEFAULT_FN_ATTRS 2426 2427#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 2428 2429#endif /* __EMMINTRIN_H */ 2430