1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#include <xmmintrin.h> 28 29typedef double __m128d __attribute__((__vector_size__(16))); 30typedef long long __m128i __attribute__((__vector_size__(16))); 31 32/* Type defines. */ 33typedef double __v2df __attribute__ ((__vector_size__ (16))); 34typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35typedef short __v8hi __attribute__((__vector_size__(16))); 36typedef char __v16qi __attribute__((__vector_size__(16))); 37 38/* Unsigned types */ 39typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); 40typedef unsigned int __v4su __attribute__((__vector_size__(16))); 41typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 42typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 43 44/* We need an explicitly signed variant for char. Note that this shouldn't 45 * appear in the interface though. */ 46typedef signed char __v16qs __attribute__((__vector_size__(16))); 47 48#include <f16cintrin.h> 49 50/* Define the default attributes for the functions in this file. */ 51#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 52 53static __inline__ __m128d __DEFAULT_FN_ATTRS 54_mm_add_sd(__m128d __a, __m128d __b) 55{ 56 __a[0] += __b[0]; 57 return __a; 58} 59 60static __inline__ __m128d __DEFAULT_FN_ATTRS 61_mm_add_pd(__m128d __a, __m128d __b) 62{ 63 return (__m128d)((__v2df)__a + (__v2df)__b); 64} 65 66static __inline__ __m128d __DEFAULT_FN_ATTRS 67_mm_sub_sd(__m128d __a, __m128d __b) 68{ 69 __a[0] -= __b[0]; 70 return __a; 71} 72 73static __inline__ __m128d __DEFAULT_FN_ATTRS 74_mm_sub_pd(__m128d __a, __m128d __b) 75{ 76 return (__m128d)((__v2df)__a - (__v2df)__b); 77} 78 79static __inline__ __m128d __DEFAULT_FN_ATTRS 80_mm_mul_sd(__m128d __a, __m128d __b) 81{ 82 __a[0] *= __b[0]; 83 return __a; 84} 85 86static __inline__ __m128d __DEFAULT_FN_ATTRS 87_mm_mul_pd(__m128d __a, __m128d __b) 88{ 89 return (__m128d)((__v2df)__a * (__v2df)__b); 90} 91 92static __inline__ __m128d __DEFAULT_FN_ATTRS 93_mm_div_sd(__m128d __a, __m128d __b) 94{ 95 __a[0] /= __b[0]; 96 return __a; 97} 98 99static __inline__ __m128d __DEFAULT_FN_ATTRS 100_mm_div_pd(__m128d __a, __m128d __b) 101{ 102 return (__m128d)((__v2df)__a / (__v2df)__b); 103} 104 105static __inline__ __m128d __DEFAULT_FN_ATTRS 106_mm_sqrt_sd(__m128d __a, __m128d __b) 107{ 108 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 109 return (__m128d) { __c[0], __a[1] }; 110} 111 112static __inline__ __m128d __DEFAULT_FN_ATTRS 113_mm_sqrt_pd(__m128d __a) 114{ 115 return __builtin_ia32_sqrtpd((__v2df)__a); 116} 117 118static __inline__ __m128d __DEFAULT_FN_ATTRS 119_mm_min_sd(__m128d __a, __m128d __b) 120{ 121 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 122} 123 124static __inline__ __m128d __DEFAULT_FN_ATTRS 125_mm_min_pd(__m128d __a, __m128d __b) 126{ 127 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 128} 129 130static __inline__ __m128d __DEFAULT_FN_ATTRS 131_mm_max_sd(__m128d __a, __m128d __b) 132{ 133 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 134} 135 136static __inline__ __m128d __DEFAULT_FN_ATTRS 137_mm_max_pd(__m128d __a, __m128d __b) 138{ 139 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 140} 141 142static __inline__ __m128d __DEFAULT_FN_ATTRS 143_mm_and_pd(__m128d __a, __m128d __b) 144{ 145 return (__m128d)((__v4si)__a & (__v4si)__b); 146} 147 148static __inline__ __m128d __DEFAULT_FN_ATTRS 149_mm_andnot_pd(__m128d __a, __m128d __b) 150{ 151 return (__m128d)(~(__v4si)__a & (__v4si)__b); 152} 153 154static __inline__ __m128d __DEFAULT_FN_ATTRS 155_mm_or_pd(__m128d __a, __m128d __b) 156{ 157 return (__m128d)((__v4si)__a | (__v4si)__b); 158} 159 160static __inline__ __m128d __DEFAULT_FN_ATTRS 161_mm_xor_pd(__m128d __a, __m128d __b) 162{ 163 return (__m128d)((__v4si)__a ^ (__v4si)__b); 164} 165 166static __inline__ __m128d __DEFAULT_FN_ATTRS 167_mm_cmpeq_pd(__m128d __a, __m128d __b) 168{ 169 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 170} 171 172static __inline__ __m128d __DEFAULT_FN_ATTRS 173_mm_cmplt_pd(__m128d __a, __m128d __b) 174{ 175 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 176} 177 178static __inline__ __m128d __DEFAULT_FN_ATTRS 179_mm_cmple_pd(__m128d __a, __m128d __b) 180{ 181 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 182} 183 184static __inline__ __m128d __DEFAULT_FN_ATTRS 185_mm_cmpgt_pd(__m128d __a, __m128d __b) 186{ 187 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 188} 189 190static __inline__ __m128d __DEFAULT_FN_ATTRS 191_mm_cmpge_pd(__m128d __a, __m128d __b) 192{ 193 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 194} 195 196static __inline__ __m128d __DEFAULT_FN_ATTRS 197_mm_cmpord_pd(__m128d __a, __m128d __b) 198{ 199 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 200} 201 202static __inline__ __m128d __DEFAULT_FN_ATTRS 203_mm_cmpunord_pd(__m128d __a, __m128d __b) 204{ 205 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 206} 207 208static __inline__ __m128d __DEFAULT_FN_ATTRS 209_mm_cmpneq_pd(__m128d __a, __m128d __b) 210{ 211 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 212} 213 214static __inline__ __m128d __DEFAULT_FN_ATTRS 215_mm_cmpnlt_pd(__m128d __a, __m128d __b) 216{ 217 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 218} 219 220static __inline__ __m128d __DEFAULT_FN_ATTRS 221_mm_cmpnle_pd(__m128d __a, __m128d __b) 222{ 223 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 224} 225 226static __inline__ __m128d __DEFAULT_FN_ATTRS 227_mm_cmpngt_pd(__m128d __a, __m128d __b) 228{ 229 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 230} 231 232static __inline__ __m128d __DEFAULT_FN_ATTRS 233_mm_cmpnge_pd(__m128d __a, __m128d __b) 234{ 235 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 236} 237 238static __inline__ __m128d __DEFAULT_FN_ATTRS 239_mm_cmpeq_sd(__m128d __a, __m128d __b) 240{ 241 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 242} 243 244static __inline__ __m128d __DEFAULT_FN_ATTRS 245_mm_cmplt_sd(__m128d __a, __m128d __b) 246{ 247 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 248} 249 250static __inline__ __m128d __DEFAULT_FN_ATTRS 251_mm_cmple_sd(__m128d __a, __m128d __b) 252{ 253 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 254} 255 256static __inline__ __m128d __DEFAULT_FN_ATTRS 257_mm_cmpgt_sd(__m128d __a, __m128d __b) 258{ 259 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 260 return (__m128d) { __c[0], __a[1] }; 261} 262 263static __inline__ __m128d __DEFAULT_FN_ATTRS 264_mm_cmpge_sd(__m128d __a, __m128d __b) 265{ 266 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 267 return (__m128d) { __c[0], __a[1] }; 268} 269 270static __inline__ __m128d __DEFAULT_FN_ATTRS 271_mm_cmpord_sd(__m128d __a, __m128d __b) 272{ 273 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 274} 275 276static __inline__ __m128d __DEFAULT_FN_ATTRS 277_mm_cmpunord_sd(__m128d __a, __m128d __b) 278{ 279 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 280} 281 282static __inline__ __m128d __DEFAULT_FN_ATTRS 283_mm_cmpneq_sd(__m128d __a, __m128d __b) 284{ 285 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 286} 287 288static __inline__ __m128d __DEFAULT_FN_ATTRS 289_mm_cmpnlt_sd(__m128d __a, __m128d __b) 290{ 291 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 292} 293 294static __inline__ __m128d __DEFAULT_FN_ATTRS 295_mm_cmpnle_sd(__m128d __a, __m128d __b) 296{ 297 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 298} 299 300static __inline__ __m128d __DEFAULT_FN_ATTRS 301_mm_cmpngt_sd(__m128d __a, __m128d __b) 302{ 303 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 304 return (__m128d) { __c[0], __a[1] }; 305} 306 307static __inline__ __m128d __DEFAULT_FN_ATTRS 308_mm_cmpnge_sd(__m128d __a, __m128d __b) 309{ 310 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 311 return (__m128d) { __c[0], __a[1] }; 312} 313 314static __inline__ int __DEFAULT_FN_ATTRS 315_mm_comieq_sd(__m128d __a, __m128d __b) 316{ 317 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 318} 319 320static __inline__ int __DEFAULT_FN_ATTRS 321_mm_comilt_sd(__m128d __a, __m128d __b) 322{ 323 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 324} 325 326static __inline__ int __DEFAULT_FN_ATTRS 327_mm_comile_sd(__m128d __a, __m128d __b) 328{ 329 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 330} 331 332static __inline__ int __DEFAULT_FN_ATTRS 333_mm_comigt_sd(__m128d __a, __m128d __b) 334{ 335 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 336} 337 338static __inline__ int __DEFAULT_FN_ATTRS 339_mm_comige_sd(__m128d __a, __m128d __b) 340{ 341 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 342} 343 344static __inline__ int __DEFAULT_FN_ATTRS 345_mm_comineq_sd(__m128d __a, __m128d __b) 346{ 347 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 348} 349 350static __inline__ int __DEFAULT_FN_ATTRS 351_mm_ucomieq_sd(__m128d __a, __m128d __b) 352{ 353 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 354} 355 356static __inline__ int __DEFAULT_FN_ATTRS 357_mm_ucomilt_sd(__m128d __a, __m128d __b) 358{ 359 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 360} 361 362static __inline__ int __DEFAULT_FN_ATTRS 363_mm_ucomile_sd(__m128d __a, __m128d __b) 364{ 365 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 366} 367 368static __inline__ int __DEFAULT_FN_ATTRS 369_mm_ucomigt_sd(__m128d __a, __m128d __b) 370{ 371 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 372} 373 374static __inline__ int __DEFAULT_FN_ATTRS 375_mm_ucomige_sd(__m128d __a, __m128d __b) 376{ 377 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 378} 379 380static __inline__ int __DEFAULT_FN_ATTRS 381_mm_ucomineq_sd(__m128d __a, __m128d __b) 382{ 383 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 384} 385 386static __inline__ __m128 __DEFAULT_FN_ATTRS 387_mm_cvtpd_ps(__m128d __a) 388{ 389 return __builtin_ia32_cvtpd2ps((__v2df)__a); 390} 391 392static __inline__ __m128d __DEFAULT_FN_ATTRS 393_mm_cvtps_pd(__m128 __a) 394{ 395 return (__m128d) __builtin_convertvector( 396 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 397} 398 399static __inline__ __m128d __DEFAULT_FN_ATTRS 400_mm_cvtepi32_pd(__m128i __a) 401{ 402 return (__m128d) __builtin_convertvector( 403 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 404} 405 406static __inline__ __m128i __DEFAULT_FN_ATTRS 407_mm_cvtpd_epi32(__m128d __a) 408{ 409 return __builtin_ia32_cvtpd2dq((__v2df)__a); 410} 411 412static __inline__ int __DEFAULT_FN_ATTRS 413_mm_cvtsd_si32(__m128d __a) 414{ 415 return __builtin_ia32_cvtsd2si((__v2df)__a); 416} 417 418static __inline__ __m128 __DEFAULT_FN_ATTRS 419_mm_cvtsd_ss(__m128 __a, __m128d __b) 420{ 421 __a[0] = __b[0]; 422 return __a; 423} 424 425static __inline__ __m128d __DEFAULT_FN_ATTRS 426_mm_cvtsi32_sd(__m128d __a, int __b) 427{ 428 __a[0] = __b; 429 return __a; 430} 431 432static __inline__ __m128d __DEFAULT_FN_ATTRS 433_mm_cvtss_sd(__m128d __a, __m128 __b) 434{ 435 __a[0] = __b[0]; 436 return __a; 437} 438 439static __inline__ __m128i __DEFAULT_FN_ATTRS 440_mm_cvttpd_epi32(__m128d __a) 441{ 442 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 443} 444 445static __inline__ int __DEFAULT_FN_ATTRS 446_mm_cvttsd_si32(__m128d __a) 447{ 448 return __a[0]; 449} 450 451static __inline__ __m64 __DEFAULT_FN_ATTRS 452_mm_cvtpd_pi32(__m128d __a) 453{ 454 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 455} 456 457static __inline__ __m64 __DEFAULT_FN_ATTRS 458_mm_cvttpd_pi32(__m128d __a) 459{ 460 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 461} 462 463static __inline__ __m128d __DEFAULT_FN_ATTRS 464_mm_cvtpi32_pd(__m64 __a) 465{ 466 return __builtin_ia32_cvtpi2pd((__v2si)__a); 467} 468 469static __inline__ double __DEFAULT_FN_ATTRS 470_mm_cvtsd_f64(__m128d __a) 471{ 472 return __a[0]; 473} 474 475static __inline__ __m128d __DEFAULT_FN_ATTRS 476_mm_load_pd(double const *__dp) 477{ 478 return *(__m128d*)__dp; 479} 480 481static __inline__ __m128d __DEFAULT_FN_ATTRS 482_mm_load1_pd(double const *__dp) 483{ 484 struct __mm_load1_pd_struct { 485 double __u; 486 } __attribute__((__packed__, __may_alias__)); 487 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 488 return (__m128d){ __u, __u }; 489} 490 491#define _mm_load_pd1(dp) _mm_load1_pd(dp) 492 493static __inline__ __m128d __DEFAULT_FN_ATTRS 494_mm_loadr_pd(double const *__dp) 495{ 496 __m128d __u = *(__m128d*)__dp; 497 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 498} 499 500static __inline__ __m128d __DEFAULT_FN_ATTRS 501_mm_loadu_pd(double const *__dp) 502{ 503 struct __loadu_pd { 504 __m128d __v; 505 } __attribute__((__packed__, __may_alias__)); 506 return ((struct __loadu_pd*)__dp)->__v; 507} 508 509static __inline__ __m128d __DEFAULT_FN_ATTRS 510_mm_load_sd(double const *__dp) 511{ 512 struct __mm_load_sd_struct { 513 double __u; 514 } __attribute__((__packed__, __may_alias__)); 515 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 516 return (__m128d){ __u, 0 }; 517} 518 519static __inline__ __m128d __DEFAULT_FN_ATTRS 520_mm_loadh_pd(__m128d __a, double const *__dp) 521{ 522 struct __mm_loadh_pd_struct { 523 double __u; 524 } __attribute__((__packed__, __may_alias__)); 525 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 526 return (__m128d){ __a[0], __u }; 527} 528 529static __inline__ __m128d __DEFAULT_FN_ATTRS 530_mm_loadl_pd(__m128d __a, double const *__dp) 531{ 532 struct __mm_loadl_pd_struct { 533 double __u; 534 } __attribute__((__packed__, __may_alias__)); 535 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 536 return (__m128d){ __u, __a[1] }; 537} 538 539static __inline__ __m128d __DEFAULT_FN_ATTRS 540_mm_undefined_pd() 541{ 542 return (__m128d)__builtin_ia32_undef128(); 543} 544 545static __inline__ __m128d __DEFAULT_FN_ATTRS 546_mm_set_sd(double __w) 547{ 548 return (__m128d){ __w, 0 }; 549} 550 551static __inline__ __m128d __DEFAULT_FN_ATTRS 552_mm_set1_pd(double __w) 553{ 554 return (__m128d){ __w, __w }; 555} 556 557static __inline__ __m128d __DEFAULT_FN_ATTRS 558_mm_set_pd(double __w, double __x) 559{ 560 return (__m128d){ __x, __w }; 561} 562 563static __inline__ __m128d __DEFAULT_FN_ATTRS 564_mm_setr_pd(double __w, double __x) 565{ 566 return (__m128d){ __w, __x }; 567} 568 569static __inline__ __m128d __DEFAULT_FN_ATTRS 570_mm_setzero_pd(void) 571{ 572 return (__m128d){ 0, 0 }; 573} 574 575static __inline__ __m128d __DEFAULT_FN_ATTRS 576_mm_move_sd(__m128d __a, __m128d __b) 577{ 578 return (__m128d){ __b[0], __a[1] }; 579} 580 581static __inline__ void __DEFAULT_FN_ATTRS 582_mm_store_sd(double *__dp, __m128d __a) 583{ 584 struct __mm_store_sd_struct { 585 double __u; 586 } __attribute__((__packed__, __may_alias__)); 587 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 588} 589 590static __inline__ void __DEFAULT_FN_ATTRS 591_mm_store_pd(double *__dp, __m128d __a) 592{ 593 *(__m128d*)__dp = __a; 594} 595 596static __inline__ void __DEFAULT_FN_ATTRS 597_mm_store1_pd(double *__dp, __m128d __a) 598{ 599 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 600 _mm_store_pd(__dp, __a); 601} 602 603static __inline__ void __DEFAULT_FN_ATTRS 604_mm_store_pd1(double *__dp, __m128d __a) 605{ 606 return _mm_store1_pd(__dp, __a); 607} 608 609static __inline__ void __DEFAULT_FN_ATTRS 610_mm_storeu_pd(double *__dp, __m128d __a) 611{ 612 struct __storeu_pd { 613 __m128d __v; 614 } __attribute__((__packed__, __may_alias__)); 615 ((struct __storeu_pd*)__dp)->__v = __a; 616} 617 618static __inline__ void __DEFAULT_FN_ATTRS 619_mm_storer_pd(double *__dp, __m128d __a) 620{ 621 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 622 *(__m128d *)__dp = __a; 623} 624 625static __inline__ void __DEFAULT_FN_ATTRS 626_mm_storeh_pd(double *__dp, __m128d __a) 627{ 628 struct __mm_storeh_pd_struct { 629 double __u; 630 } __attribute__((__packed__, __may_alias__)); 631 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 632} 633 634static __inline__ void __DEFAULT_FN_ATTRS 635_mm_storel_pd(double *__dp, __m128d __a) 636{ 637 struct __mm_storeh_pd_struct { 638 double __u; 639 } __attribute__((__packed__, __may_alias__)); 640 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 641} 642 643static __inline__ __m128i __DEFAULT_FN_ATTRS 644_mm_add_epi8(__m128i __a, __m128i __b) 645{ 646 return (__m128i)((__v16qi)__a + (__v16qi)__b); 647} 648 649static __inline__ __m128i __DEFAULT_FN_ATTRS 650_mm_add_epi16(__m128i __a, __m128i __b) 651{ 652 return (__m128i)((__v8hi)__a + (__v8hi)__b); 653} 654 655static __inline__ __m128i __DEFAULT_FN_ATTRS 656_mm_add_epi32(__m128i __a, __m128i __b) 657{ 658 return (__m128i)((__v4si)__a + (__v4si)__b); 659} 660 661static __inline__ __m64 __DEFAULT_FN_ATTRS 662_mm_add_si64(__m64 __a, __m64 __b) 663{ 664 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 665} 666 667static __inline__ __m128i __DEFAULT_FN_ATTRS 668_mm_add_epi64(__m128i __a, __m128i __b) 669{ 670 return (__m128i)((__v2di)__a + (__v2di)__b); 671} 672 673static __inline__ __m128i __DEFAULT_FN_ATTRS 674_mm_adds_epi8(__m128i __a, __m128i __b) 675{ 676 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 677} 678 679static __inline__ __m128i __DEFAULT_FN_ATTRS 680_mm_adds_epi16(__m128i __a, __m128i __b) 681{ 682 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 683} 684 685static __inline__ __m128i __DEFAULT_FN_ATTRS 686_mm_adds_epu8(__m128i __a, __m128i __b) 687{ 688 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 689} 690 691static __inline__ __m128i __DEFAULT_FN_ATTRS 692_mm_adds_epu16(__m128i __a, __m128i __b) 693{ 694 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 695} 696 697static __inline__ __m128i __DEFAULT_FN_ATTRS 698_mm_avg_epu8(__m128i __a, __m128i __b) 699{ 700 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 701} 702 703static __inline__ __m128i __DEFAULT_FN_ATTRS 704_mm_avg_epu16(__m128i __a, __m128i __b) 705{ 706 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 707} 708 709static __inline__ __m128i __DEFAULT_FN_ATTRS 710_mm_madd_epi16(__m128i __a, __m128i __b) 711{ 712 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 713} 714 715static __inline__ __m128i __DEFAULT_FN_ATTRS 716_mm_max_epi16(__m128i __a, __m128i __b) 717{ 718 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 719} 720 721static __inline__ __m128i __DEFAULT_FN_ATTRS 722_mm_max_epu8(__m128i __a, __m128i __b) 723{ 724 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 725} 726 727static __inline__ __m128i __DEFAULT_FN_ATTRS 728_mm_min_epi16(__m128i __a, __m128i __b) 729{ 730 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 731} 732 733static __inline__ __m128i __DEFAULT_FN_ATTRS 734_mm_min_epu8(__m128i __a, __m128i __b) 735{ 736 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 737} 738 739static __inline__ __m128i __DEFAULT_FN_ATTRS 740_mm_mulhi_epi16(__m128i __a, __m128i __b) 741{ 742 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 743} 744 745static __inline__ __m128i __DEFAULT_FN_ATTRS 746_mm_mulhi_epu16(__m128i __a, __m128i __b) 747{ 748 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 749} 750 751/// \brief Multiplies the corresponding elements of two [8 x short] vectors and 752/// returns a vector containing the low-order 16 bits of each 32-bit product 753/// in the corresponding element. 754/// 755/// \headerfile <x86intrin.h> 756/// 757/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction. 758/// 759/// \param __a 760/// A 128-bit integer vector containing one of the source operands. 761/// \param __b 762/// A 128-bit integer vector containing one of the source operands. 763/// \returns A 128-bit integer vector containing the products of both operands. 764static __inline__ __m128i __DEFAULT_FN_ATTRS 765_mm_mullo_epi16(__m128i __a, __m128i __b) 766{ 767 return (__m128i)((__v8hi)__a * (__v8hi)__b); 768} 769 770/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits 771/// of the two 64-bit integer vectors and returns the 64-bit unsigned 772/// product. 773/// 774/// \headerfile <x86intrin.h> 775/// 776/// This intrinsic corresponds to the \c PMULUDQ instruction. 777/// 778/// \param __a 779/// A 64-bit integer containing one of the source operands. 780/// \param __b 781/// A 64-bit integer containing one of the source operands. 782/// \returns A 64-bit integer vector containing the product of both operands. 783static __inline__ __m64 __DEFAULT_FN_ATTRS 784_mm_mul_su32(__m64 __a, __m64 __b) 785{ 786 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 787} 788 789/// \brief Multiplies 32-bit unsigned integer values contained in the lower 790/// bits of the corresponding elements of two [2 x i64] vectors, and returns 791/// the 64-bit products in the corresponding elements of a [2 x i64] vector. 792/// 793/// \headerfile <x86intrin.h> 794/// 795/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction. 796/// 797/// \param __a 798/// A [2 x i64] vector containing one of the source operands. 799/// \param __b 800/// A [2 x i64] vector containing one of the source operands. 801/// \returns A [2 x i64] vector containing the product of both operands. 802static __inline__ __m128i __DEFAULT_FN_ATTRS 803_mm_mul_epu32(__m128i __a, __m128i __b) 804{ 805 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 806} 807 808/// \brief Computes the absolute differences of corresponding 8-bit integer 809/// values in two 128-bit vectors. Sums the first 8 absolute differences, and 810/// separately sums the second 8 absolute differences. Packss these two 811/// unsigned 16-bit integer sums into the upper and lower elements of a 812/// [2 x i64] vector. 813/// 814/// \headerfile <x86intrin.h> 815/// 816/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction. 817/// 818/// \param __a 819/// A 128-bit integer vector containing one of the source operands. 820/// \param __b 821/// A 128-bit integer vector containing one of the source operands. 822/// \returns A [2 x i64] vector containing the sums of the sets of absolute 823/// differences between both operands. 824static __inline__ __m128i __DEFAULT_FN_ATTRS 825_mm_sad_epu8(__m128i __a, __m128i __b) 826{ 827 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 828} 829 830/// \brief Subtracts the corresponding 8-bit integer values in the operands. 831/// 832/// \headerfile <x86intrin.h> 833/// 834/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction. 835/// 836/// \param __a 837/// A 128-bit integer vector containing the minuends. 838/// \param __b 839/// A 128-bit integer vector containing the subtrahends. 840/// \returns A 128-bit integer vector containing the differences of the values 841/// in the operands. 842static __inline__ __m128i __DEFAULT_FN_ATTRS 843_mm_sub_epi8(__m128i __a, __m128i __b) 844{ 845 return (__m128i)((__v16qi)__a - (__v16qi)__b); 846} 847 848/// \brief Subtracts the corresponding 16-bit integer values in the operands. 849/// 850/// \headerfile <x86intrin.h> 851/// 852/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction. 853/// 854/// \param __a 855/// A 128-bit integer vector containing the minuends. 856/// \param __b 857/// A 128-bit integer vector containing the subtrahends. 858/// \returns A 128-bit integer vector containing the differences of the values 859/// in the operands. 860static __inline__ __m128i __DEFAULT_FN_ATTRS 861_mm_sub_epi16(__m128i __a, __m128i __b) 862{ 863 return (__m128i)((__v8hi)__a - (__v8hi)__b); 864} 865 866/// \brief Subtracts the corresponding 32-bit integer values in the operands. 867/// 868/// \headerfile <x86intrin.h> 869/// 870/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction. 871/// 872/// \param __a 873/// A 128-bit integer vector containing the minuends. 874/// \param __b 875/// A 128-bit integer vector containing the subtrahends. 876/// \returns A 128-bit integer vector containing the differences of the values 877/// in the operands. 878static __inline__ __m128i __DEFAULT_FN_ATTRS 879_mm_sub_epi32(__m128i __a, __m128i __b) 880{ 881 return (__m128i)((__v4si)__a - (__v4si)__b); 882} 883 884/// \brief Subtracts signed or unsigned 64-bit integer values and writes the 885/// difference to the corresponding bits in the destination. 886/// 887/// \headerfile <x86intrin.h> 888/// 889/// This intrinsic corresponds to the \c PSUBQ instruction. 890/// 891/// \param __a 892/// A 64-bit integer vector containing the minuend. 893/// \param __b 894/// A 64-bit integer vector containing the subtrahend. 895/// \returns A 64-bit integer vector containing the difference of the values in 896/// the operands. 897static __inline__ __m64 __DEFAULT_FN_ATTRS 898_mm_sub_si64(__m64 __a, __m64 __b) 899{ 900 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 901} 902 903/// \brief Subtracts the corresponding elements of two [2 x i64] vectors. 904/// 905/// \headerfile <x86intrin.h> 906/// 907/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction. 908/// 909/// \param __a 910/// A 128-bit integer vector containing the minuends. 911/// \param __b 912/// A 128-bit integer vector containing the subtrahends. 913/// \returns A 128-bit integer vector containing the differences of the values 914/// in the operands. 915static __inline__ __m128i __DEFAULT_FN_ATTRS 916_mm_sub_epi64(__m128i __a, __m128i __b) 917{ 918 return (__m128i)((__v2di)__a - (__v2di)__b); 919} 920 921/// \brief Subtracts corresponding 8-bit signed integer values in the input and 922/// returns the differences in the corresponding bytes in the destination. 923/// Differences greater than 7Fh are saturated to 7Fh, and differences less 924/// than 80h are saturated to 80h. 925/// 926/// \headerfile <x86intrin.h> 927/// 928/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction. 929/// 930/// \param __a 931/// A 128-bit integer vector containing the minuends. 932/// \param __b 933/// A 128-bit integer vector containing the subtrahends. 934/// \returns A 128-bit integer vector containing the differences of the values 935/// in the operands. 936static __inline__ __m128i __DEFAULT_FN_ATTRS 937_mm_subs_epi8(__m128i __a, __m128i __b) 938{ 939 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 940} 941 942/// \brief Subtracts corresponding 16-bit signed integer values in the input and 943/// returns the differences in the corresponding bytes in the destination. 944/// Differences greater than 7FFFh are saturated to 7FFFh, and values less 945/// than 8000h are saturated to 8000h. 946/// 947/// \headerfile <x86intrin.h> 948/// 949/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction. 950/// 951/// \param __a 952/// A 128-bit integer vector containing the minuends. 953/// \param __b 954/// A 128-bit integer vector containing the subtrahends. 955/// \returns A 128-bit integer vector containing the differences of the values 956/// in the operands. 957static __inline__ __m128i __DEFAULT_FN_ATTRS 958_mm_subs_epi16(__m128i __a, __m128i __b) 959{ 960 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 961} 962 963/// \brief Subtracts corresponding 8-bit unsigned integer values in the input 964/// and returns the differences in the corresponding bytes in the 965/// destination. Differences less than 00h are saturated to 00h. 966/// 967/// \headerfile <x86intrin.h> 968/// 969/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction. 970/// 971/// \param __a 972/// A 128-bit integer vector containing the minuends. 973/// \param __b 974/// A 128-bit integer vector containing the subtrahends. 975/// \returns A 128-bit integer vector containing the unsigned integer 976/// differences of the values in the operands. 977static __inline__ __m128i __DEFAULT_FN_ATTRS 978_mm_subs_epu8(__m128i __a, __m128i __b) 979{ 980 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 981} 982 983/// \brief Subtracts corresponding 16-bit unsigned integer values in the input 984/// and returns the differences in the corresponding bytes in the 985/// destination. Differences less than 0000h are saturated to 0000h. 986/// 987/// \headerfile <x86intrin.h> 988/// 989/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction. 990/// 991/// \param __a 992/// A 128-bit integer vector containing the minuends. 993/// \param __b 994/// A 128-bit integer vector containing the subtrahends. 995/// \returns A 128-bit integer vector containing the unsigned integer 996/// differences of the values in the operands. 997static __inline__ __m128i __DEFAULT_FN_ATTRS 998_mm_subs_epu16(__m128i __a, __m128i __b) 999{ 1000 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 1001} 1002 1003/// \brief Performs a bitwise AND of two 128-bit integer vectors. 1004/// 1005/// \headerfile <x86intrin.h> 1006/// 1007/// This intrinsic corresponds to the \c VPAND / PAND instruction. 1008/// 1009/// \param __a 1010/// A 128-bit integer vector containing one of the source operands. 1011/// \param __b 1012/// A 128-bit integer vector containing one of the source operands. 1013/// \returns A 128-bit integer vector containing the bitwise AND of the values 1014/// in both operands. 1015static __inline__ __m128i __DEFAULT_FN_ATTRS 1016_mm_and_si128(__m128i __a, __m128i __b) 1017{ 1018 return (__m128i)((__v2di)__a & (__v2di)__b); 1019} 1020 1021/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the 1022/// one's complement of the values contained in the first source operand. 1023/// 1024/// \headerfile <x86intrin.h> 1025/// 1026/// This intrinsic corresponds to the \c VPANDN / PANDN instruction. 1027/// 1028/// \param __a 1029/// A 128-bit vector containing the left source operand. The one's complement 1030/// of this value is used in the bitwise AND. 1031/// \param __b 1032/// A 128-bit vector containing the right source operand. 1033/// \returns A 128-bit integer vector containing the bitwise AND of the one's 1034/// complement of the first operand and the values in the second operand. 1035static __inline__ __m128i __DEFAULT_FN_ATTRS 1036_mm_andnot_si128(__m128i __a, __m128i __b) 1037{ 1038 return (__m128i)(~(__v2di)__a & (__v2di)__b); 1039} 1040/// \brief Performs a bitwise OR of two 128-bit integer vectors. 1041/// 1042/// \headerfile <x86intrin.h> 1043/// 1044/// This intrinsic corresponds to the \c VPOR / POR instruction. 1045/// 1046/// \param __a 1047/// A 128-bit integer vector containing one of the source operands. 1048/// \param __b 1049/// A 128-bit integer vector containing one of the source operands. 1050/// \returns A 128-bit integer vector containing the bitwise OR of the values 1051/// in both operands. 1052static __inline__ __m128i __DEFAULT_FN_ATTRS 1053_mm_or_si128(__m128i __a, __m128i __b) 1054{ 1055 return (__m128i)((__v2di)__a | (__v2di)__b); 1056} 1057 1058/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. 1059/// 1060/// \headerfile <x86intrin.h> 1061/// 1062/// This intrinsic corresponds to the \c VPXOR / PXOR instruction. 1063/// 1064/// \param __a 1065/// A 128-bit integer vector containing one of the source operands. 1066/// \param __b 1067/// A 128-bit integer vector containing one of the source operands. 1068/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 1069/// values in both operands. 1070static __inline__ __m128i __DEFAULT_FN_ATTRS 1071_mm_xor_si128(__m128i __a, __m128i __b) 1072{ 1073 return (__m128i)((__v2di)__a ^ (__v2di)__b); 1074} 1075 1076/// \brief Left-shifts the 128-bit integer vector operand by the specified 1077/// number of bytes. Low-order bits are cleared. 1078/// 1079/// \headerfile <x86intrin.h> 1080/// 1081/// \code 1082/// __m128i _mm_slli_si128(__m128i a, const int imm); 1083/// \endcode 1084/// 1085/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction. 1086/// 1087/// \param a 1088/// A 128-bit integer vector containing the source operand. 1089/// \param imm 1090/// An immediate value specifying the number of bytes to left-shift 1091/// operand a. 1092/// \returns A 128-bit integer vector containing the left-shifted value. 1093#define _mm_slli_si128(a, imm) __extension__ ({ \ 1094 (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \ 1095 (__v16qi)(__m128i)(a), \ 1096 ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \ 1097 ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \ 1098 ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \ 1099 ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \ 1100 ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \ 1101 ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \ 1102 ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \ 1103 ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \ 1104 ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \ 1105 ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \ 1106 ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \ 1107 ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \ 1108 ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \ 1109 ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \ 1110 ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \ 1111 ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); }) 1112 1113#define _mm_bslli_si128(a, imm) \ 1114 _mm_slli_si128((a), (imm)) 1115 1116/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1117/// by the specified number of bits. Low-order bits are cleared. 1118/// 1119/// \headerfile <x86intrin.h> 1120/// 1121/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1122/// 1123/// \param __a 1124/// A 128-bit integer vector containing the source operand. 1125/// \param __count 1126/// An integer value specifying the number of bits to left-shift each value 1127/// in operand __a. 1128/// \returns A 128-bit integer vector containing the left-shifted values. 1129static __inline__ __m128i __DEFAULT_FN_ATTRS 1130_mm_slli_epi16(__m128i __a, int __count) 1131{ 1132 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 1133} 1134 1135/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1136/// by the specified number of bits. Low-order bits are cleared. 1137/// 1138/// \headerfile <x86intrin.h> 1139/// 1140/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1141/// 1142/// \param __a 1143/// A 128-bit integer vector containing the source operand. 1144/// \param __count 1145/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1146/// to left-shift each value in operand __a. 1147/// \returns A 128-bit integer vector containing the left-shifted values. 1148static __inline__ __m128i __DEFAULT_FN_ATTRS 1149_mm_sll_epi16(__m128i __a, __m128i __count) 1150{ 1151 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 1152} 1153 1154/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1155/// by the specified number of bits. Low-order bits are cleared. 1156/// 1157/// \headerfile <x86intrin.h> 1158/// 1159/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1160/// 1161/// \param __a 1162/// A 128-bit integer vector containing the source operand. 1163/// \param __count 1164/// An integer value specifying the number of bits to left-shift each value 1165/// in operand __a. 1166/// \returns A 128-bit integer vector containing the left-shifted values. 1167static __inline__ __m128i __DEFAULT_FN_ATTRS 1168_mm_slli_epi32(__m128i __a, int __count) 1169{ 1170 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 1171} 1172 1173/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1174/// by the specified number of bits. Low-order bits are cleared. 1175/// 1176/// \headerfile <x86intrin.h> 1177/// 1178/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1179/// 1180/// \param __a 1181/// A 128-bit integer vector containing the source operand. 1182/// \param __count 1183/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1184/// to left-shift each value in operand __a. 1185/// \returns A 128-bit integer vector containing the left-shifted values. 1186static __inline__ __m128i __DEFAULT_FN_ATTRS 1187_mm_sll_epi32(__m128i __a, __m128i __count) 1188{ 1189 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 1190} 1191 1192/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1193/// by the specified number of bits. Low-order bits are cleared. 1194/// 1195/// \headerfile <x86intrin.h> 1196/// 1197/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1198/// 1199/// \param __a 1200/// A 128-bit integer vector containing the source operand. 1201/// \param __count 1202/// An integer value specifying the number of bits to left-shift each value 1203/// in operand __a. 1204/// \returns A 128-bit integer vector containing the left-shifted values. 1205static __inline__ __m128i __DEFAULT_FN_ATTRS 1206_mm_slli_epi64(__m128i __a, int __count) 1207{ 1208 return __builtin_ia32_psllqi128((__v2di)__a, __count); 1209} 1210 1211/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1212/// by the specified number of bits. Low-order bits are cleared. 1213/// 1214/// \headerfile <x86intrin.h> 1215/// 1216/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1217/// 1218/// \param __a 1219/// A 128-bit integer vector containing the source operand. 1220/// \param __count 1221/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1222/// to left-shift each value in operand __a. 1223/// \returns A 128-bit integer vector containing the left-shifted values. 1224static __inline__ __m128i __DEFAULT_FN_ATTRS 1225_mm_sll_epi64(__m128i __a, __m128i __count) 1226{ 1227 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 1228} 1229 1230/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1231/// by the specified number of bits. High-order bits are filled with the sign 1232/// bit of the initial value. 1233/// 1234/// \headerfile <x86intrin.h> 1235/// 1236/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1237/// 1238/// \param __a 1239/// A 128-bit integer vector containing the source operand. 1240/// \param __count 1241/// An integer value specifying the number of bits to right-shift each value 1242/// in operand __a. 1243/// \returns A 128-bit integer vector containing the right-shifted values. 1244static __inline__ __m128i __DEFAULT_FN_ATTRS 1245_mm_srai_epi16(__m128i __a, int __count) 1246{ 1247 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 1248} 1249 1250/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1251/// by the specified number of bits. High-order bits are filled with the sign 1252/// bit of the initial value. 1253/// 1254/// \headerfile <x86intrin.h> 1255/// 1256/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1257/// 1258/// \param __a 1259/// A 128-bit integer vector containing the source operand. 1260/// \param __count 1261/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1262/// to right-shift each value in operand __a. 1263/// \returns A 128-bit integer vector containing the right-shifted values. 1264static __inline__ __m128i __DEFAULT_FN_ATTRS 1265_mm_sra_epi16(__m128i __a, __m128i __count) 1266{ 1267 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 1268} 1269 1270/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1271/// by the specified number of bits. High-order bits are filled with the sign 1272/// bit of the initial value. 1273/// 1274/// \headerfile <x86intrin.h> 1275/// 1276/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1277/// 1278/// \param __a 1279/// A 128-bit integer vector containing the source operand. 1280/// \param __count 1281/// An integer value specifying the number of bits to right-shift each value 1282/// in operand __a. 1283/// \returns A 128-bit integer vector containing the right-shifted values. 1284static __inline__ __m128i __DEFAULT_FN_ATTRS 1285_mm_srai_epi32(__m128i __a, int __count) 1286{ 1287 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 1288} 1289 1290/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1291/// by the specified number of bits. High-order bits are filled with the sign 1292/// bit of the initial value. 1293/// 1294/// \headerfile <x86intrin.h> 1295/// 1296/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1297/// 1298/// \param __a 1299/// A 128-bit integer vector containing the source operand. 1300/// \param __count 1301/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1302/// to right-shift each value in operand __a. 1303/// \returns A 128-bit integer vector containing the right-shifted values. 1304static __inline__ __m128i __DEFAULT_FN_ATTRS 1305_mm_sra_epi32(__m128i __a, __m128i __count) 1306{ 1307 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 1308} 1309 1310/// \brief Right-shifts the 128-bit integer vector operand by the specified 1311/// number of bytes. High-order bits are cleared. 1312/// 1313/// \headerfile <x86intrin.h> 1314/// 1315/// \code 1316/// __m128i _mm_srli_si128(__m128i a, const int imm); 1317/// \endcode 1318/// 1319/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction. 1320/// 1321/// \param a 1322/// A 128-bit integer vector containing the source operand. 1323/// \param imm 1324/// An immediate value specifying the number of bytes to right-shift operand 1325/// a. 1326/// \returns A 128-bit integer vector containing the right-shifted value. 1327#define _mm_srli_si128(a, imm) __extension__ ({ \ 1328 (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \ 1329 (__v16qi)_mm_setzero_si128(), \ 1330 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \ 1331 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \ 1332 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \ 1333 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \ 1334 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \ 1335 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \ 1336 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \ 1337 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \ 1338 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \ 1339 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \ 1340 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \ 1341 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \ 1342 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \ 1343 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \ 1344 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \ 1345 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); }) 1346 1347#define _mm_bsrli_si128(a, imm) \ 1348 _mm_srli_si128((a), (imm)) 1349 1350/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1351/// operand by the specified number of bits. High-order bits are cleared. 1352/// 1353/// \headerfile <x86intrin.h> 1354/// 1355/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1356/// 1357/// \param __a 1358/// A 128-bit integer vector containing the source operand. 1359/// \param __count 1360/// An integer value specifying the number of bits to right-shift each value 1361/// in operand __a. 1362/// \returns A 128-bit integer vector containing the right-shifted values. 1363static __inline__ __m128i __DEFAULT_FN_ATTRS 1364_mm_srli_epi16(__m128i __a, int __count) 1365{ 1366 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 1367} 1368 1369/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1370/// operand by the specified number of bits. High-order bits are cleared. 1371/// 1372/// \headerfile <x86intrin.h> 1373/// 1374/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1375/// 1376/// \param __a 1377/// A 128-bit integer vector containing the source operand. 1378/// \param __count 1379/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1380/// to right-shift each value in operand __a. 1381/// \returns A 128-bit integer vector containing the right-shifted values. 1382static __inline__ __m128i __DEFAULT_FN_ATTRS 1383_mm_srl_epi16(__m128i __a, __m128i __count) 1384{ 1385 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 1386} 1387 1388/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1389/// operand by the specified number of bits. High-order bits are cleared. 1390/// 1391/// \headerfile <x86intrin.h> 1392/// 1393/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 1394/// 1395/// \param __a 1396/// A 128-bit integer vector containing the source operand. 1397/// \param __count 1398/// An integer value specifying the number of bits to right-shift each value 1399/// in operand __a. 1400/// \returns A 128-bit integer vector containing the right-shifted values. 1401static __inline__ __m128i __DEFAULT_FN_ATTRS 1402_mm_srli_epi32(__m128i __a, int __count) 1403{ 1404 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 1405} 1406 1407/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1408/// operand by the specified number of bits. High-order bits are cleared. 1409/// 1410/// \headerfile <x86intrin.h> 1411/// 1412/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 1413/// 1414/// \param __a 1415/// A 128-bit integer vector containing the source operand. 1416/// \param __count 1417/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1418/// to right-shift each value in operand __a. 1419/// \returns A 128-bit integer vector containing the right-shifted values. 1420static __inline__ __m128i __DEFAULT_FN_ATTRS 1421_mm_srl_epi32(__m128i __a, __m128i __count) 1422{ 1423 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 1424} 1425 1426/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1427/// operand by the specified number of bits. High-order bits are cleared. 1428/// 1429/// \headerfile <x86intrin.h> 1430/// 1431/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1432/// 1433/// \param __a 1434/// A 128-bit integer vector containing the source operand. 1435/// \param __count 1436/// An integer value specifying the number of bits to right-shift each value 1437/// in operand __a. 1438/// \returns A 128-bit integer vector containing the right-shifted values. 1439static __inline__ __m128i __DEFAULT_FN_ATTRS 1440_mm_srli_epi64(__m128i __a, int __count) 1441{ 1442 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 1443} 1444 1445/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1446/// operand by the specified number of bits. High-order bits are cleared. 1447/// 1448/// \headerfile <x86intrin.h> 1449/// 1450/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1451/// 1452/// \param __a 1453/// A 128-bit integer vector containing the source operand. 1454/// \param __count 1455/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1456/// to right-shift each value in operand __a. 1457/// \returns A 128-bit integer vector containing the right-shifted values. 1458static __inline__ __m128i __DEFAULT_FN_ATTRS 1459_mm_srl_epi64(__m128i __a, __m128i __count) 1460{ 1461 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 1462} 1463 1464/// \brief Compares each of the corresponding 8-bit values of the 128-bit 1465/// integer vectors for equality. Each comparison yields 0h for false, FFh 1466/// for true. 1467/// 1468/// \headerfile <x86intrin.h> 1469/// 1470/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction. 1471/// 1472/// \param __a 1473/// A 128-bit integer vector. 1474/// \param __b 1475/// A 128-bit integer vector. 1476/// \returns A 128-bit integer vector containing the comparison results. 1477static __inline__ __m128i __DEFAULT_FN_ATTRS 1478_mm_cmpeq_epi8(__m128i __a, __m128i __b) 1479{ 1480 return (__m128i)((__v16qi)__a == (__v16qi)__b); 1481} 1482 1483/// \brief Compares each of the corresponding 16-bit values of the 128-bit 1484/// integer vectors for equality. Each comparison yields 0h for false, FFFFh 1485/// for true. 1486/// 1487/// \headerfile <x86intrin.h> 1488/// 1489/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction. 1490/// 1491/// \param __a 1492/// A 128-bit integer vector. 1493/// \param __b 1494/// A 128-bit integer vector. 1495/// \returns A 128-bit integer vector containing the comparison results. 1496static __inline__ __m128i __DEFAULT_FN_ATTRS 1497_mm_cmpeq_epi16(__m128i __a, __m128i __b) 1498{ 1499 return (__m128i)((__v8hi)__a == (__v8hi)__b); 1500} 1501 1502/// \brief Compares each of the corresponding 32-bit values of the 128-bit 1503/// integer vectors for equality. Each comparison yields 0h for false, 1504/// FFFFFFFFh for true. 1505/// 1506/// \headerfile <x86intrin.h> 1507/// 1508/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction. 1509/// 1510/// \param __a 1511/// A 128-bit integer vector. 1512/// \param __b 1513/// A 128-bit integer vector. 1514/// \returns A 128-bit integer vector containing the comparison results. 1515static __inline__ __m128i __DEFAULT_FN_ATTRS 1516_mm_cmpeq_epi32(__m128i __a, __m128i __b) 1517{ 1518 return (__m128i)((__v4si)__a == (__v4si)__b); 1519} 1520 1521/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1522/// integer vectors to determine if the values in the first operand are 1523/// greater than those in the second operand. Each comparison yields 0h for 1524/// false, FFh for true. 1525/// 1526/// \headerfile <x86intrin.h> 1527/// 1528/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1529/// 1530/// \param __a 1531/// A 128-bit integer vector. 1532/// \param __b 1533/// A 128-bit integer vector. 1534/// \returns A 128-bit integer vector containing the comparison results. 1535static __inline__ __m128i __DEFAULT_FN_ATTRS 1536_mm_cmpgt_epi8(__m128i __a, __m128i __b) 1537{ 1538 /* This function always performs a signed comparison, but __v16qi is a char 1539 which may be signed or unsigned, so use __v16qs. */ 1540 return (__m128i)((__v16qs)__a > (__v16qs)__b); 1541} 1542 1543/// \brief Compares each of the corresponding signed 16-bit values of the 1544/// 128-bit integer vectors to determine if the values in the first operand 1545/// are greater than those in the second operand. Each comparison yields 0h 1546/// for false, FFFFh for true. 1547/// 1548/// \headerfile <x86intrin.h> 1549/// 1550/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1551/// 1552/// \param __a 1553/// A 128-bit integer vector. 1554/// \param __b 1555/// A 128-bit integer vector. 1556/// \returns A 128-bit integer vector containing the comparison results. 1557static __inline__ __m128i __DEFAULT_FN_ATTRS 1558_mm_cmpgt_epi16(__m128i __a, __m128i __b) 1559{ 1560 return (__m128i)((__v8hi)__a > (__v8hi)__b); 1561} 1562 1563/// \brief Compares each of the corresponding signed 32-bit values of the 1564/// 128-bit integer vectors to determine if the values in the first operand 1565/// are greater than those in the second operand. Each comparison yields 0h 1566/// for false, FFFFFFFFh for true. 1567/// 1568/// \headerfile <x86intrin.h> 1569/// 1570/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1571/// 1572/// \param __a 1573/// A 128-bit integer vector. 1574/// \param __b 1575/// A 128-bit integer vector. 1576/// \returns A 128-bit integer vector containing the comparison results. 1577static __inline__ __m128i __DEFAULT_FN_ATTRS 1578_mm_cmpgt_epi32(__m128i __a, __m128i __b) 1579{ 1580 return (__m128i)((__v4si)__a > (__v4si)__b); 1581} 1582 1583/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1584/// integer vectors to determine if the values in the first operand are less 1585/// than those in the second operand. Each comparison yields 0h for false, 1586/// FFh for true. 1587/// 1588/// \headerfile <x86intrin.h> 1589/// 1590/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1591/// 1592/// \param __a 1593/// A 128-bit integer vector. 1594/// \param __b 1595/// A 128-bit integer vector. 1596/// \returns A 128-bit integer vector containing the comparison results. 1597static __inline__ __m128i __DEFAULT_FN_ATTRS 1598_mm_cmplt_epi8(__m128i __a, __m128i __b) 1599{ 1600 return _mm_cmpgt_epi8(__b, __a); 1601} 1602 1603/// \brief Compares each of the corresponding signed 16-bit values of the 1604/// 128-bit integer vectors to determine if the values in the first operand 1605/// are less than those in the second operand. Each comparison yields 0h for 1606/// false, FFFFh for true. 1607/// 1608/// \headerfile <x86intrin.h> 1609/// 1610/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1611/// 1612/// \param __a 1613/// A 128-bit integer vector. 1614/// \param __b 1615/// A 128-bit integer vector. 1616/// \returns A 128-bit integer vector containing the comparison results. 1617static __inline__ __m128i __DEFAULT_FN_ATTRS 1618_mm_cmplt_epi16(__m128i __a, __m128i __b) 1619{ 1620 return _mm_cmpgt_epi16(__b, __a); 1621} 1622 1623/// \brief Compares each of the corresponding signed 32-bit values of the 1624/// 128-bit integer vectors to determine if the values in the first operand 1625/// are less than those in the second operand. Each comparison yields 0h for 1626/// false, FFFFFFFFh for true. 1627/// 1628/// \headerfile <x86intrin.h> 1629/// 1630/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1631/// 1632/// \param __a 1633/// A 128-bit integer vector. 1634/// \param __b 1635/// A 128-bit integer vector. 1636/// \returns A 128-bit integer vector containing the comparison results. 1637static __inline__ __m128i __DEFAULT_FN_ATTRS 1638_mm_cmplt_epi32(__m128i __a, __m128i __b) 1639{ 1640 return _mm_cmpgt_epi32(__b, __a); 1641} 1642 1643#ifdef __x86_64__ 1644/// \brief Converts a 64-bit signed integer value from the second operand into a 1645/// double-precision value and returns it in the lower element of a [2 x 1646/// double] vector; the upper element of the returned vector is copied from 1647/// the upper element of the first operand. 1648/// 1649/// \headerfile <x86intrin.h> 1650/// 1651/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction. 1652/// 1653/// \param __a 1654/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 1655/// copied to the upper 64 bits of the destination. 1656/// \param __b 1657/// A 64-bit signed integer operand containing the value to be converted. 1658/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 1659/// converted value of the second operand. The upper 64 bits are copied from 1660/// the upper 64 bits of the first operand. 1661static __inline__ __m128d __DEFAULT_FN_ATTRS 1662_mm_cvtsi64_sd(__m128d __a, long long __b) 1663{ 1664 __a[0] = __b; 1665 return __a; 1666} 1667 1668/// \brief Converts the first (lower) element of a vector of [2 x double] into a 1669/// 64-bit signed integer value, according to the current rounding mode. 1670/// 1671/// \headerfile <x86intrin.h> 1672/// 1673/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction. 1674/// 1675/// \param __a 1676/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1677/// conversion. 1678/// \returns A 64-bit signed integer containing the converted value. 1679static __inline__ long long __DEFAULT_FN_ATTRS 1680_mm_cvtsd_si64(__m128d __a) 1681{ 1682 return __builtin_ia32_cvtsd2si64((__v2df)__a); 1683} 1684 1685/// \brief Converts the first (lower) element of a vector of [2 x double] into a 1686/// 64-bit signed integer value, truncating the result when it is inexact. 1687/// 1688/// \headerfile <x86intrin.h> 1689/// 1690/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction. 1691/// 1692/// \param __a 1693/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1694/// conversion. 1695/// \returns A 64-bit signed integer containing the converted value. 1696static __inline__ long long __DEFAULT_FN_ATTRS 1697_mm_cvttsd_si64(__m128d __a) 1698{ 1699 return __a[0]; 1700} 1701#endif 1702 1703/// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. 1704/// 1705/// \headerfile <x86intrin.h> 1706/// 1707/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction. 1708/// 1709/// \param __a 1710/// A 128-bit integer vector. 1711/// \returns A 128-bit vector of [4 x float] containing the converted values. 1712static __inline__ __m128 __DEFAULT_FN_ATTRS 1713_mm_cvtepi32_ps(__m128i __a) 1714{ 1715 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1716} 1717 1718/// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. 1719/// 1720/// \headerfile <x86intrin.h> 1721/// 1722/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction. 1723/// 1724/// \param __a 1725/// A 128-bit vector of [4 x float]. 1726/// \returns A 128-bit integer vector of [4 x i32] containing the converted 1727/// values. 1728static __inline__ __m128i __DEFAULT_FN_ATTRS 1729_mm_cvtps_epi32(__m128 __a) 1730{ 1731 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 1732} 1733 1734/// \brief Converts a vector of [4 x float] into a vector of [4 x i32], 1735/// truncating the result when it is inexact. 1736/// 1737/// \headerfile <x86intrin.h> 1738/// 1739/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction. 1740/// 1741/// \param __a 1742/// A 128-bit vector of [4 x float]. 1743/// \returns A 128-bit vector of [4 x i32] containing the converted values. 1744static __inline__ __m128i __DEFAULT_FN_ATTRS 1745_mm_cvttps_epi32(__m128 __a) 1746{ 1747 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 1748} 1749 1750/// \brief Returns a vector of [4 x i32] where the lowest element is the input 1751/// operand and the remaining elements are zero. 1752/// 1753/// \headerfile <x86intrin.h> 1754/// 1755/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1756/// 1757/// \param __a 1758/// A 32-bit signed integer operand. 1759/// \returns A 128-bit vector of [4 x i32]. 1760static __inline__ __m128i __DEFAULT_FN_ATTRS 1761_mm_cvtsi32_si128(int __a) 1762{ 1763 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1764} 1765 1766#ifdef __x86_64__ 1767/// \brief Returns a vector of [2 x i64] where the lower element is the input 1768/// operand and the upper element is zero. 1769/// 1770/// \headerfile <x86intrin.h> 1771/// 1772/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1773/// 1774/// \param __a 1775/// A 64-bit signed integer operand containing the value to be converted. 1776/// \returns A 128-bit vector of [2 x i64] containing the converted value. 1777static __inline__ __m128i __DEFAULT_FN_ATTRS 1778_mm_cvtsi64_si128(long long __a) 1779{ 1780 return (__m128i){ __a, 0 }; 1781} 1782#endif 1783 1784/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a 1785/// 32-bit signed integer value. 1786/// 1787/// \headerfile <x86intrin.h> 1788/// 1789/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1790/// 1791/// \param __a 1792/// A vector of [4 x i32]. The least significant 32 bits are moved to the 1793/// destination. 1794/// \returns A 32-bit signed integer containing the moved value. 1795static __inline__ int __DEFAULT_FN_ATTRS 1796_mm_cvtsi128_si32(__m128i __a) 1797{ 1798 __v4si __b = (__v4si)__a; 1799 return __b[0]; 1800} 1801 1802#ifdef __x86_64__ 1803/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a 1804/// 64-bit signed integer value. 1805/// 1806/// \headerfile <x86intrin.h> 1807/// 1808/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1809/// 1810/// \param __a 1811/// A vector of [2 x i64]. The least significant 64 bits are moved to the 1812/// destination. 1813/// \returns A 64-bit signed integer containing the moved value. 1814static __inline__ long long __DEFAULT_FN_ATTRS 1815_mm_cvtsi128_si64(__m128i __a) 1816{ 1817 return __a[0]; 1818} 1819#endif 1820 1821/// \brief Moves packed integer values from an aligned 128-bit memory location 1822/// to elements in a 128-bit integer vector. 1823/// 1824/// \headerfile <x86intrin.h> 1825/// 1826/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction. 1827/// 1828/// \param __p 1829/// An aligned pointer to a memory location containing integer values. 1830/// \returns A 128-bit integer vector containing the moved values. 1831static __inline__ __m128i __DEFAULT_FN_ATTRS 1832_mm_load_si128(__m128i const *__p) 1833{ 1834 return *__p; 1835} 1836 1837/// \brief Moves packed integer values from an unaligned 128-bit memory location 1838/// to elements in a 128-bit integer vector. 1839/// 1840/// \headerfile <x86intrin.h> 1841/// 1842/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction. 1843/// 1844/// \param __p 1845/// A pointer to a memory location containing integer values. 1846/// \returns A 128-bit integer vector containing the moved values. 1847static __inline__ __m128i __DEFAULT_FN_ATTRS 1848_mm_loadu_si128(__m128i const *__p) 1849{ 1850 struct __loadu_si128 { 1851 __m128i __v; 1852 } __attribute__((__packed__, __may_alias__)); 1853 return ((struct __loadu_si128*)__p)->__v; 1854} 1855 1856/// \brief Returns a vector of [2 x i64] where the lower element is taken from 1857/// the lower element of the operand, and the upper element is zero. 1858/// 1859/// \headerfile <x86intrin.h> 1860/// 1861/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1862/// 1863/// \param __p 1864/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 1865/// the destination. 1866/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 1867/// moved value. The higher order bits are cleared. 1868static __inline__ __m128i __DEFAULT_FN_ATTRS 1869_mm_loadl_epi64(__m128i const *__p) 1870{ 1871 struct __mm_loadl_epi64_struct { 1872 long long __u; 1873 } __attribute__((__packed__, __may_alias__)); 1874 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1875} 1876 1877/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. 1878/// This could be used as an argument to another intrinsic function where the 1879/// argument is required but the value is not actually used. 1880/// 1881/// \headerfile <x86intrin.h> 1882/// 1883/// This intrinsic has no corresponding instruction. 1884/// 1885/// \returns A 128-bit vector of [4 x i32] with unspecified content. 1886static __inline__ __m128i __DEFAULT_FN_ATTRS 1887_mm_undefined_si128() 1888{ 1889 return (__m128i)__builtin_ia32_undef128(); 1890} 1891 1892/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1893/// the specified 64-bit integer values. 1894/// 1895/// \headerfile <x86intrin.h> 1896/// 1897/// This intrinsic is a utility function and does not correspond to a specific 1898/// instruction. 1899/// 1900/// \param __q1 1901/// A 64-bit integer value used to initialize the upper 64 bits of the 1902/// destination vector of [2 x i64]. 1903/// \param __q0 1904/// A 64-bit integer value used to initialize the lower 64 bits of the 1905/// destination vector of [2 x i64]. 1906/// \returns An initialized 128-bit vector of [2 x i64] containing the values 1907/// provided in the operands. 1908static __inline__ __m128i __DEFAULT_FN_ATTRS 1909_mm_set_epi64x(long long __q1, long long __q0) 1910{ 1911 return (__m128i){ __q0, __q1 }; 1912} 1913 1914/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1915/// the specified 64-bit integer values. 1916/// 1917/// \headerfile <x86intrin.h> 1918/// 1919/// This intrinsic is a utility function and does not correspond to a specific 1920/// instruction. 1921/// 1922/// \param __q1 1923/// A 64-bit integer value used to initialize the upper 64 bits of the 1924/// destination vector of [2 x i64]. 1925/// \param __q0 1926/// A 64-bit integer value used to initialize the lower 64 bits of the 1927/// destination vector of [2 x i64]. 1928/// \returns An initialized 128-bit vector of [2 x i64] containing the values 1929/// provided in the operands. 1930static __inline__ __m128i __DEFAULT_FN_ATTRS 1931_mm_set_epi64(__m64 __q1, __m64 __q0) 1932{ 1933 return (__m128i){ (long long)__q0, (long long)__q1 }; 1934} 1935 1936/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 1937/// the specified 32-bit integer values. 1938/// 1939/// \headerfile <x86intrin.h> 1940/// 1941/// This intrinsic is a utility function and does not correspond to a specific 1942/// instruction. 1943/// 1944/// \param __i3 1945/// A 32-bit integer value used to initialize bits [127:96] of the 1946/// destination vector. 1947/// \param __i2 1948/// A 32-bit integer value used to initialize bits [95:64] of the destination 1949/// vector. 1950/// \param __i1 1951/// A 32-bit integer value used to initialize bits [63:32] of the destination 1952/// vector. 1953/// \param __i0 1954/// A 32-bit integer value used to initialize bits [31:0] of the destination 1955/// vector. 1956/// \returns An initialized 128-bit vector of [4 x i32] containing the values 1957/// provided in the operands. 1958static __inline__ __m128i __DEFAULT_FN_ATTRS 1959_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 1960{ 1961 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 1962} 1963 1964/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 1965/// the specified 16-bit integer values. 1966/// 1967/// \headerfile <x86intrin.h> 1968/// 1969/// This intrinsic is a utility function and does not correspond to a specific 1970/// instruction. 1971/// 1972/// \param __w7 1973/// A 16-bit integer value used to initialize bits [127:112] of the 1974/// destination vector. 1975/// \param __w6 1976/// A 16-bit integer value used to initialize bits [111:96] of the 1977/// destination vector. 1978/// \param __w5 1979/// A 16-bit integer value used to initialize bits [95:80] of the destination 1980/// vector. 1981/// \param __w4 1982/// A 16-bit integer value used to initialize bits [79:64] of the destination 1983/// vector. 1984/// \param __w3 1985/// A 16-bit integer value used to initialize bits [63:48] of the destination 1986/// vector. 1987/// \param __w2 1988/// A 16-bit integer value used to initialize bits [47:32] of the destination 1989/// vector. 1990/// \param __w1 1991/// A 16-bit integer value used to initialize bits [31:16] of the destination 1992/// vector. 1993/// \param __w0 1994/// A 16-bit integer value used to initialize bits [15:0] of the destination 1995/// vector. 1996/// \returns An initialized 128-bit vector of [8 x i16] containing the values 1997/// provided in the operands. 1998static __inline__ __m128i __DEFAULT_FN_ATTRS 1999_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 2000{ 2001 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 2002} 2003 2004/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 2005/// the specified 8-bit integer values. 2006/// 2007/// \headerfile <x86intrin.h> 2008/// 2009/// This intrinsic is a utility function and does not correspond to a specific 2010/// instruction. 2011/// 2012/// \param __b15 2013/// Initializes bits [127:120] of the destination vector. 2014/// \param __b14 2015/// Initializes bits [119:112] of the destination vector. 2016/// \param __b13 2017/// Initializes bits [111:104] of the destination vector. 2018/// \param __b12 2019/// Initializes bits [103:96] of the destination vector. 2020/// \param __b11 2021/// Initializes bits [95:88] of the destination vector. 2022/// \param __b10 2023/// Initializes bits [87:80] of the destination vector. 2024/// \param __b9 2025/// Initializes bits [79:72] of the destination vector. 2026/// \param __b8 2027/// Initializes bits [71:64] of the destination vector. 2028/// \param __b7 2029/// Initializes bits [63:56] of the destination vector. 2030/// \param __b6 2031/// Initializes bits [55:48] of the destination vector. 2032/// \param __b5 2033/// Initializes bits [47:40] of the destination vector. 2034/// \param __b4 2035/// Initializes bits [39:32] of the destination vector. 2036/// \param __b3 2037/// Initializes bits [31:24] of the destination vector. 2038/// \param __b2 2039/// Initializes bits [23:16] of the destination vector. 2040/// \param __b1 2041/// Initializes bits [15:8] of the destination vector. 2042/// \param __b0 2043/// Initializes bits [7:0] of the destination vector. 2044/// \returns An initialized 128-bit vector of [16 x i8] containing the values 2045/// provided in the operands. 2046static __inline__ __m128i __DEFAULT_FN_ATTRS 2047_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 2048{ 2049 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 2050} 2051 2052/// \brief Initializes both values in a 128-bit integer vector with the 2053/// specified 64-bit integer value. 2054/// 2055/// \headerfile <x86intrin.h> 2056/// 2057/// This intrinsic is a utility function and does not correspond to a specific 2058/// instruction. 2059/// 2060/// \param __q 2061/// Integer value used to initialize the elements of the destination integer 2062/// vector. 2063/// \returns An initialized 128-bit integer vector of [2 x i64] with both 2064/// elements containing the value provided in the operand. 2065static __inline__ __m128i __DEFAULT_FN_ATTRS 2066_mm_set1_epi64x(long long __q) 2067{ 2068 return (__m128i){ __q, __q }; 2069} 2070 2071/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the 2072/// specified 64-bit value. 2073/// 2074/// \headerfile <x86intrin.h> 2075/// 2076/// This intrinsic is a utility function and does not correspond to a specific 2077/// instruction. 2078/// 2079/// \param __q 2080/// A 64-bit value used to initialize the elements of the destination integer 2081/// vector. 2082/// \returns An initialized 128-bit vector of [2 x i64] with all elements 2083/// containing the value provided in the operand. 2084static __inline__ __m128i __DEFAULT_FN_ATTRS 2085_mm_set1_epi64(__m64 __q) 2086{ 2087 return (__m128i){ (long long)__q, (long long)__q }; 2088} 2089 2090/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the 2091/// specified 32-bit value. 2092/// 2093/// \headerfile <x86intrin.h> 2094/// 2095/// This intrinsic is a utility function and does not correspond to a specific 2096/// instruction. 2097/// 2098/// \param __i 2099/// A 32-bit value used to initialize the elements of the destination integer 2100/// vector. 2101/// \returns An initialized 128-bit vector of [4 x i32] with all elements 2102/// containing the value provided in the operand. 2103static __inline__ __m128i __DEFAULT_FN_ATTRS 2104_mm_set1_epi32(int __i) 2105{ 2106 return (__m128i)(__v4si){ __i, __i, __i, __i }; 2107} 2108 2109/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the 2110/// specified 16-bit value. 2111/// 2112/// \headerfile <x86intrin.h> 2113/// 2114/// This intrinsic is a utility function and does not correspond to a specific 2115/// instruction. 2116/// 2117/// \param __w 2118/// A 16-bit value used to initialize the elements of the destination integer 2119/// vector. 2120/// \returns An initialized 128-bit vector of [8 x i16] with all elements 2121/// containing the value provided in the operand. 2122static __inline__ __m128i __DEFAULT_FN_ATTRS 2123_mm_set1_epi16(short __w) 2124{ 2125 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 2126} 2127 2128/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the 2129/// specified 8-bit value. 2130/// 2131/// \headerfile <x86intrin.h> 2132/// 2133/// This intrinsic is a utility function and does not correspond to a specific 2134/// instruction. 2135/// 2136/// \param __b 2137/// An 8-bit value used to initialize the elements of the destination integer 2138/// vector. 2139/// \returns An initialized 128-bit vector of [16 x i8] with all elements 2140/// containing the value provided in the operand. 2141static __inline__ __m128i __DEFAULT_FN_ATTRS 2142_mm_set1_epi8(char __b) 2143{ 2144 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 2145} 2146 2147static __inline__ __m128i __DEFAULT_FN_ATTRS 2148_mm_setr_epi64(__m64 __q0, __m64 __q1) 2149{ 2150 return (__m128i){ (long long)__q0, (long long)__q1 }; 2151} 2152 2153static __inline__ __m128i __DEFAULT_FN_ATTRS 2154_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 2155{ 2156 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 2157} 2158 2159static __inline__ __m128i __DEFAULT_FN_ATTRS 2160_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 2161{ 2162 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 2163} 2164 2165static __inline__ __m128i __DEFAULT_FN_ATTRS 2166_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 2167{ 2168 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 2169} 2170 2171static __inline__ __m128i __DEFAULT_FN_ATTRS 2172_mm_setzero_si128(void) 2173{ 2174 return (__m128i){ 0LL, 0LL }; 2175} 2176 2177static __inline__ void __DEFAULT_FN_ATTRS 2178_mm_store_si128(__m128i *__p, __m128i __b) 2179{ 2180 *__p = __b; 2181} 2182 2183static __inline__ void __DEFAULT_FN_ATTRS 2184_mm_storeu_si128(__m128i *__p, __m128i __b) 2185{ 2186 struct __storeu_si128 { 2187 __m128i __v; 2188 } __attribute__((__packed__, __may_alias__)); 2189 ((struct __storeu_si128*)__p)->__v = __b; 2190} 2191 2192static __inline__ void __DEFAULT_FN_ATTRS 2193_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 2194{ 2195 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 2196} 2197 2198static __inline__ void __DEFAULT_FN_ATTRS 2199_mm_storel_epi64(__m128i *__p, __m128i __a) 2200{ 2201 struct __mm_storel_epi64_struct { 2202 long long __u; 2203 } __attribute__((__packed__, __may_alias__)); 2204 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 2205} 2206 2207static __inline__ void __DEFAULT_FN_ATTRS 2208_mm_stream_pd(double *__p, __m128d __a) 2209{ 2210 __builtin_ia32_movntpd(__p, (__v2df)__a); 2211} 2212 2213static __inline__ void __DEFAULT_FN_ATTRS 2214_mm_stream_si128(__m128i *__p, __m128i __a) 2215{ 2216 __builtin_ia32_movntdq(__p, (__v2di)__a); 2217} 2218 2219static __inline__ void __DEFAULT_FN_ATTRS 2220_mm_stream_si32(int *__p, int __a) 2221{ 2222 __builtin_ia32_movnti(__p, __a); 2223} 2224 2225#ifdef __x86_64__ 2226static __inline__ void __DEFAULT_FN_ATTRS 2227_mm_stream_si64(long long *__p, long long __a) 2228{ 2229 __builtin_ia32_movnti64(__p, __a); 2230} 2231#endif 2232 2233static __inline__ void __DEFAULT_FN_ATTRS 2234_mm_clflush(void const *__p) 2235{ 2236 __builtin_ia32_clflush(__p); 2237} 2238 2239static __inline__ void __DEFAULT_FN_ATTRS 2240_mm_lfence(void) 2241{ 2242 __builtin_ia32_lfence(); 2243} 2244 2245static __inline__ void __DEFAULT_FN_ATTRS 2246_mm_mfence(void) 2247{ 2248 __builtin_ia32_mfence(); 2249} 2250 2251static __inline__ __m128i __DEFAULT_FN_ATTRS 2252_mm_packs_epi16(__m128i __a, __m128i __b) 2253{ 2254 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 2255} 2256 2257static __inline__ __m128i __DEFAULT_FN_ATTRS 2258_mm_packs_epi32(__m128i __a, __m128i __b) 2259{ 2260 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 2261} 2262 2263static __inline__ __m128i __DEFAULT_FN_ATTRS 2264_mm_packus_epi16(__m128i __a, __m128i __b) 2265{ 2266 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 2267} 2268 2269static __inline__ int __DEFAULT_FN_ATTRS 2270_mm_extract_epi16(__m128i __a, int __imm) 2271{ 2272 __v8hi __b = (__v8hi)__a; 2273 return (unsigned short)__b[__imm & 7]; 2274} 2275 2276static __inline__ __m128i __DEFAULT_FN_ATTRS 2277_mm_insert_epi16(__m128i __a, int __b, int __imm) 2278{ 2279 __v8hi __c = (__v8hi)__a; 2280 __c[__imm & 7] = __b; 2281 return (__m128i)__c; 2282} 2283 2284static __inline__ int __DEFAULT_FN_ATTRS 2285_mm_movemask_epi8(__m128i __a) 2286{ 2287 return __builtin_ia32_pmovmskb128((__v16qi)__a); 2288} 2289 2290#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 2291 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ 2292 (__v4si)_mm_setzero_si128(), \ 2293 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 2294 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 2295 2296#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 2297 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 2298 (__v8hi)_mm_setzero_si128(), \ 2299 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 2300 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 2301 4, 5, 6, 7); }) 2302 2303#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 2304 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 2305 (__v8hi)_mm_setzero_si128(), \ 2306 0, 1, 2, 3, \ 2307 4 + (((imm) & 0x03) >> 0), \ 2308 4 + (((imm) & 0x0c) >> 2), \ 2309 4 + (((imm) & 0x30) >> 4), \ 2310 4 + (((imm) & 0xc0) >> 6)); }) 2311 2312static __inline__ __m128i __DEFAULT_FN_ATTRS 2313_mm_unpackhi_epi8(__m128i __a, __m128i __b) 2314{ 2315 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 2316} 2317 2318static __inline__ __m128i __DEFAULT_FN_ATTRS 2319_mm_unpackhi_epi16(__m128i __a, __m128i __b) 2320{ 2321 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 2322} 2323 2324static __inline__ __m128i __DEFAULT_FN_ATTRS 2325_mm_unpackhi_epi32(__m128i __a, __m128i __b) 2326{ 2327 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 2328} 2329 2330static __inline__ __m128i __DEFAULT_FN_ATTRS 2331_mm_unpackhi_epi64(__m128i __a, __m128i __b) 2332{ 2333 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 2334} 2335 2336static __inline__ __m128i __DEFAULT_FN_ATTRS 2337_mm_unpacklo_epi8(__m128i __a, __m128i __b) 2338{ 2339 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 2340} 2341 2342static __inline__ __m128i __DEFAULT_FN_ATTRS 2343_mm_unpacklo_epi16(__m128i __a, __m128i __b) 2344{ 2345 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 2346} 2347 2348static __inline__ __m128i __DEFAULT_FN_ATTRS 2349_mm_unpacklo_epi32(__m128i __a, __m128i __b) 2350{ 2351 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 2352} 2353 2354static __inline__ __m128i __DEFAULT_FN_ATTRS 2355_mm_unpacklo_epi64(__m128i __a, __m128i __b) 2356{ 2357 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); 2358} 2359 2360static __inline__ __m64 __DEFAULT_FN_ATTRS 2361_mm_movepi64_pi64(__m128i __a) 2362{ 2363 return (__m64)__a[0]; 2364} 2365 2366static __inline__ __m128i __DEFAULT_FN_ATTRS 2367_mm_movpi64_epi64(__m64 __a) 2368{ 2369 return (__m128i){ (long long)__a, 0 }; 2370} 2371 2372static __inline__ __m128i __DEFAULT_FN_ATTRS 2373_mm_move_epi64(__m128i __a) 2374{ 2375 return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); 2376} 2377 2378static __inline__ __m128d __DEFAULT_FN_ATTRS 2379_mm_unpackhi_pd(__m128d __a, __m128d __b) 2380{ 2381 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); 2382} 2383 2384static __inline__ __m128d __DEFAULT_FN_ATTRS 2385_mm_unpacklo_pd(__m128d __a, __m128d __b) 2386{ 2387 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); 2388} 2389 2390static __inline__ int __DEFAULT_FN_ATTRS 2391_mm_movemask_pd(__m128d __a) 2392{ 2393 return __builtin_ia32_movmskpd((__v2df)__a); 2394} 2395 2396#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 2397 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 2398 (i) & 1, (((i) & 2) >> 1) + 2); }) 2399 2400static __inline__ __m128 __DEFAULT_FN_ATTRS 2401_mm_castpd_ps(__m128d __a) 2402{ 2403 return (__m128)__a; 2404} 2405 2406static __inline__ __m128i __DEFAULT_FN_ATTRS 2407_mm_castpd_si128(__m128d __a) 2408{ 2409 return (__m128i)__a; 2410} 2411 2412static __inline__ __m128d __DEFAULT_FN_ATTRS 2413_mm_castps_pd(__m128 __a) 2414{ 2415 return (__m128d)__a; 2416} 2417 2418static __inline__ __m128i __DEFAULT_FN_ATTRS 2419_mm_castps_si128(__m128 __a) 2420{ 2421 return (__m128i)__a; 2422} 2423 2424static __inline__ __m128 __DEFAULT_FN_ATTRS 2425_mm_castsi128_ps(__m128i __a) 2426{ 2427 return (__m128)__a; 2428} 2429 2430static __inline__ __m128d __DEFAULT_FN_ATTRS 2431_mm_castsi128_pd(__m128i __a) 2432{ 2433 return (__m128d)__a; 2434} 2435 2436static __inline__ void __DEFAULT_FN_ATTRS 2437_mm_pause(void) 2438{ 2439 __builtin_ia32_pause(); 2440} 2441 2442#undef __DEFAULT_FN_ATTRS 2443 2444#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 2445 2446#endif /* __EMMINTRIN_H */ 2447