1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#include <xmmintrin.h> 28 29typedef double __m128d __attribute__((__vector_size__(16))); 30typedef long long __m128i __attribute__((__vector_size__(16))); 31 32/* Type defines. */ 33typedef double __v2df __attribute__ ((__vector_size__ (16))); 34typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35typedef short __v8hi __attribute__((__vector_size__(16))); 36typedef char __v16qi __attribute__((__vector_size__(16))); 37 38/* Unsigned types */ 39typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); 40typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 41typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 42 43/* We need an explicitly signed variant for char. Note that this shouldn't 44 * appear in the interface though. */ 45typedef signed char __v16qs __attribute__((__vector_size__(16))); 46 47#include <f16cintrin.h> 48 49/* Define the default attributes for the functions in this file. */ 50#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 51 52static __inline__ __m128d __DEFAULT_FN_ATTRS 53_mm_add_sd(__m128d __a, __m128d __b) 54{ 55 __a[0] += __b[0]; 56 return __a; 57} 58 59static __inline__ __m128d __DEFAULT_FN_ATTRS 60_mm_add_pd(__m128d __a, __m128d __b) 61{ 62 return (__m128d)((__v2df)__a + (__v2df)__b); 63} 64 65static __inline__ __m128d __DEFAULT_FN_ATTRS 66_mm_sub_sd(__m128d __a, __m128d __b) 67{ 68 __a[0] -= __b[0]; 69 return __a; 70} 71 72static __inline__ __m128d __DEFAULT_FN_ATTRS 73_mm_sub_pd(__m128d __a, __m128d __b) 74{ 75 return (__m128d)((__v2df)__a - (__v2df)__b); 76} 77 78static __inline__ __m128d __DEFAULT_FN_ATTRS 79_mm_mul_sd(__m128d __a, __m128d __b) 80{ 81 __a[0] *= __b[0]; 82 return __a; 83} 84 85static __inline__ __m128d __DEFAULT_FN_ATTRS 86_mm_mul_pd(__m128d __a, __m128d __b) 87{ 88 return (__m128d)((__v2df)__a * (__v2df)__b); 89} 90 91static __inline__ __m128d __DEFAULT_FN_ATTRS 92_mm_div_sd(__m128d __a, __m128d __b) 93{ 94 __a[0] /= __b[0]; 95 return __a; 96} 97 98static __inline__ __m128d __DEFAULT_FN_ATTRS 99_mm_div_pd(__m128d __a, __m128d __b) 100{ 101 return (__m128d)((__v2df)__a / (__v2df)__b); 102} 103 104static __inline__ __m128d __DEFAULT_FN_ATTRS 105_mm_sqrt_sd(__m128d __a, __m128d __b) 106{ 107 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 108 return (__m128d) { __c[0], __a[1] }; 109} 110 111static __inline__ __m128d __DEFAULT_FN_ATTRS 112_mm_sqrt_pd(__m128d __a) 113{ 114 return __builtin_ia32_sqrtpd((__v2df)__a); 115} 116 117static __inline__ __m128d __DEFAULT_FN_ATTRS 118_mm_min_sd(__m128d __a, __m128d __b) 119{ 120 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 121} 122 123static __inline__ __m128d __DEFAULT_FN_ATTRS 124_mm_min_pd(__m128d __a, __m128d __b) 125{ 126 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 127} 128 129static __inline__ __m128d __DEFAULT_FN_ATTRS 130_mm_max_sd(__m128d __a, __m128d __b) 131{ 132 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 133} 134 135static __inline__ __m128d __DEFAULT_FN_ATTRS 136_mm_max_pd(__m128d __a, __m128d __b) 137{ 138 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 139} 140 141static __inline__ __m128d __DEFAULT_FN_ATTRS 142_mm_and_pd(__m128d __a, __m128d __b) 143{ 144 return (__m128d)((__v4su)__a & (__v4su)__b); 145} 146 147static __inline__ __m128d __DEFAULT_FN_ATTRS 148_mm_andnot_pd(__m128d __a, __m128d __b) 149{ 150 return (__m128d)(~(__v4su)__a & (__v4su)__b); 151} 152 153static __inline__ __m128d __DEFAULT_FN_ATTRS 154_mm_or_pd(__m128d __a, __m128d __b) 155{ 156 return (__m128d)((__v4su)__a | (__v4su)__b); 157} 158 159static __inline__ __m128d __DEFAULT_FN_ATTRS 160_mm_xor_pd(__m128d __a, __m128d __b) 161{ 162 return (__m128d)((__v4su)__a ^ (__v4su)__b); 163} 164 165static __inline__ __m128d __DEFAULT_FN_ATTRS 166_mm_cmpeq_pd(__m128d __a, __m128d __b) 167{ 168 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 169} 170 171static __inline__ __m128d __DEFAULT_FN_ATTRS 172_mm_cmplt_pd(__m128d __a, __m128d __b) 173{ 174 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 175} 176 177static __inline__ __m128d __DEFAULT_FN_ATTRS 178_mm_cmple_pd(__m128d __a, __m128d __b) 179{ 180 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 181} 182 183static __inline__ __m128d __DEFAULT_FN_ATTRS 184_mm_cmpgt_pd(__m128d __a, __m128d __b) 185{ 186 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 187} 188 189static __inline__ __m128d __DEFAULT_FN_ATTRS 190_mm_cmpge_pd(__m128d __a, __m128d __b) 191{ 192 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 193} 194 195static __inline__ __m128d __DEFAULT_FN_ATTRS 196_mm_cmpord_pd(__m128d __a, __m128d __b) 197{ 198 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 199} 200 201static __inline__ __m128d __DEFAULT_FN_ATTRS 202_mm_cmpunord_pd(__m128d __a, __m128d __b) 203{ 204 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 205} 206 207static __inline__ __m128d __DEFAULT_FN_ATTRS 208_mm_cmpneq_pd(__m128d __a, __m128d __b) 209{ 210 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 211} 212 213static __inline__ __m128d __DEFAULT_FN_ATTRS 214_mm_cmpnlt_pd(__m128d __a, __m128d __b) 215{ 216 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 217} 218 219static __inline__ __m128d __DEFAULT_FN_ATTRS 220_mm_cmpnle_pd(__m128d __a, __m128d __b) 221{ 222 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 223} 224 225static __inline__ __m128d __DEFAULT_FN_ATTRS 226_mm_cmpngt_pd(__m128d __a, __m128d __b) 227{ 228 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 229} 230 231static __inline__ __m128d __DEFAULT_FN_ATTRS 232_mm_cmpnge_pd(__m128d __a, __m128d __b) 233{ 234 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 235} 236 237static __inline__ __m128d __DEFAULT_FN_ATTRS 238_mm_cmpeq_sd(__m128d __a, __m128d __b) 239{ 240 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 241} 242 243static __inline__ __m128d __DEFAULT_FN_ATTRS 244_mm_cmplt_sd(__m128d __a, __m128d __b) 245{ 246 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 247} 248 249static __inline__ __m128d __DEFAULT_FN_ATTRS 250_mm_cmple_sd(__m128d __a, __m128d __b) 251{ 252 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 253} 254 255static __inline__ __m128d __DEFAULT_FN_ATTRS 256_mm_cmpgt_sd(__m128d __a, __m128d __b) 257{ 258 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 259 return (__m128d) { __c[0], __a[1] }; 260} 261 262static __inline__ __m128d __DEFAULT_FN_ATTRS 263_mm_cmpge_sd(__m128d __a, __m128d __b) 264{ 265 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 266 return (__m128d) { __c[0], __a[1] }; 267} 268 269static __inline__ __m128d __DEFAULT_FN_ATTRS 270_mm_cmpord_sd(__m128d __a, __m128d __b) 271{ 272 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 273} 274 275static __inline__ __m128d __DEFAULT_FN_ATTRS 276_mm_cmpunord_sd(__m128d __a, __m128d __b) 277{ 278 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 279} 280 281static __inline__ __m128d __DEFAULT_FN_ATTRS 282_mm_cmpneq_sd(__m128d __a, __m128d __b) 283{ 284 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 285} 286 287static __inline__ __m128d __DEFAULT_FN_ATTRS 288_mm_cmpnlt_sd(__m128d __a, __m128d __b) 289{ 290 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 291} 292 293static __inline__ __m128d __DEFAULT_FN_ATTRS 294_mm_cmpnle_sd(__m128d __a, __m128d __b) 295{ 296 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 297} 298 299static __inline__ __m128d __DEFAULT_FN_ATTRS 300_mm_cmpngt_sd(__m128d __a, __m128d __b) 301{ 302 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 303 return (__m128d) { __c[0], __a[1] }; 304} 305 306static __inline__ __m128d __DEFAULT_FN_ATTRS 307_mm_cmpnge_sd(__m128d __a, __m128d __b) 308{ 309 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 310 return (__m128d) { __c[0], __a[1] }; 311} 312 313static __inline__ int __DEFAULT_FN_ATTRS 314_mm_comieq_sd(__m128d __a, __m128d __b) 315{ 316 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 317} 318 319static __inline__ int __DEFAULT_FN_ATTRS 320_mm_comilt_sd(__m128d __a, __m128d __b) 321{ 322 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 323} 324 325static __inline__ int __DEFAULT_FN_ATTRS 326_mm_comile_sd(__m128d __a, __m128d __b) 327{ 328 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 329} 330 331static __inline__ int __DEFAULT_FN_ATTRS 332_mm_comigt_sd(__m128d __a, __m128d __b) 333{ 334 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 335} 336 337static __inline__ int __DEFAULT_FN_ATTRS 338_mm_comige_sd(__m128d __a, __m128d __b) 339{ 340 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 341} 342 343static __inline__ int __DEFAULT_FN_ATTRS 344_mm_comineq_sd(__m128d __a, __m128d __b) 345{ 346 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 347} 348 349static __inline__ int __DEFAULT_FN_ATTRS 350_mm_ucomieq_sd(__m128d __a, __m128d __b) 351{ 352 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 353} 354 355static __inline__ int __DEFAULT_FN_ATTRS 356_mm_ucomilt_sd(__m128d __a, __m128d __b) 357{ 358 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 359} 360 361static __inline__ int __DEFAULT_FN_ATTRS 362_mm_ucomile_sd(__m128d __a, __m128d __b) 363{ 364 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 365} 366 367static __inline__ int __DEFAULT_FN_ATTRS 368_mm_ucomigt_sd(__m128d __a, __m128d __b) 369{ 370 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 371} 372 373static __inline__ int __DEFAULT_FN_ATTRS 374_mm_ucomige_sd(__m128d __a, __m128d __b) 375{ 376 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 377} 378 379static __inline__ int __DEFAULT_FN_ATTRS 380_mm_ucomineq_sd(__m128d __a, __m128d __b) 381{ 382 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 383} 384 385static __inline__ __m128 __DEFAULT_FN_ATTRS 386_mm_cvtpd_ps(__m128d __a) 387{ 388 return __builtin_ia32_cvtpd2ps((__v2df)__a); 389} 390 391static __inline__ __m128d __DEFAULT_FN_ATTRS 392_mm_cvtps_pd(__m128 __a) 393{ 394 return (__m128d) __builtin_convertvector( 395 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 396} 397 398static __inline__ __m128d __DEFAULT_FN_ATTRS 399_mm_cvtepi32_pd(__m128i __a) 400{ 401 return (__m128d) __builtin_convertvector( 402 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 403} 404 405static __inline__ __m128i __DEFAULT_FN_ATTRS 406_mm_cvtpd_epi32(__m128d __a) 407{ 408 return __builtin_ia32_cvtpd2dq((__v2df)__a); 409} 410 411static __inline__ int __DEFAULT_FN_ATTRS 412_mm_cvtsd_si32(__m128d __a) 413{ 414 return __builtin_ia32_cvtsd2si((__v2df)__a); 415} 416 417static __inline__ __m128 __DEFAULT_FN_ATTRS 418_mm_cvtsd_ss(__m128 __a, __m128d __b) 419{ 420 __a[0] = __b[0]; 421 return __a; 422} 423 424static __inline__ __m128d __DEFAULT_FN_ATTRS 425_mm_cvtsi32_sd(__m128d __a, int __b) 426{ 427 __a[0] = __b; 428 return __a; 429} 430 431static __inline__ __m128d __DEFAULT_FN_ATTRS 432_mm_cvtss_sd(__m128d __a, __m128 __b) 433{ 434 __a[0] = __b[0]; 435 return __a; 436} 437 438static __inline__ __m128i __DEFAULT_FN_ATTRS 439_mm_cvttpd_epi32(__m128d __a) 440{ 441 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 442} 443 444static __inline__ int __DEFAULT_FN_ATTRS 445_mm_cvttsd_si32(__m128d __a) 446{ 447 return __a[0]; 448} 449 450static __inline__ __m64 __DEFAULT_FN_ATTRS 451_mm_cvtpd_pi32(__m128d __a) 452{ 453 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 454} 455 456static __inline__ __m64 __DEFAULT_FN_ATTRS 457_mm_cvttpd_pi32(__m128d __a) 458{ 459 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 460} 461 462static __inline__ __m128d __DEFAULT_FN_ATTRS 463_mm_cvtpi32_pd(__m64 __a) 464{ 465 return __builtin_ia32_cvtpi2pd((__v2si)__a); 466} 467 468static __inline__ double __DEFAULT_FN_ATTRS 469_mm_cvtsd_f64(__m128d __a) 470{ 471 return __a[0]; 472} 473 474static __inline__ __m128d __DEFAULT_FN_ATTRS 475_mm_load_pd(double const *__dp) 476{ 477 return *(__m128d*)__dp; 478} 479 480static __inline__ __m128d __DEFAULT_FN_ATTRS 481_mm_load1_pd(double const *__dp) 482{ 483 struct __mm_load1_pd_struct { 484 double __u; 485 } __attribute__((__packed__, __may_alias__)); 486 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 487 return (__m128d){ __u, __u }; 488} 489 490#define _mm_load_pd1(dp) _mm_load1_pd(dp) 491 492static __inline__ __m128d __DEFAULT_FN_ATTRS 493_mm_loadr_pd(double const *__dp) 494{ 495 __m128d __u = *(__m128d*)__dp; 496 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 497} 498 499static __inline__ __m128d __DEFAULT_FN_ATTRS 500_mm_loadu_pd(double const *__dp) 501{ 502 struct __loadu_pd { 503 __m128d __v; 504 } __attribute__((__packed__, __may_alias__)); 505 return ((struct __loadu_pd*)__dp)->__v; 506} 507 508static __inline__ __m128i __DEFAULT_FN_ATTRS 509_mm_loadu_si64(void const *__a) 510{ 511 struct __loadu_si64 { 512 long long __v; 513 } __attribute__((__packed__, __may_alias__)); 514 long long __u = ((struct __loadu_si64*)__a)->__v; 515 return (__m128i){__u, 0L}; 516} 517 518static __inline__ __m128d __DEFAULT_FN_ATTRS 519_mm_load_sd(double const *__dp) 520{ 521 struct __mm_load_sd_struct { 522 double __u; 523 } __attribute__((__packed__, __may_alias__)); 524 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 525 return (__m128d){ __u, 0 }; 526} 527 528static __inline__ __m128d __DEFAULT_FN_ATTRS 529_mm_loadh_pd(__m128d __a, double const *__dp) 530{ 531 struct __mm_loadh_pd_struct { 532 double __u; 533 } __attribute__((__packed__, __may_alias__)); 534 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 535 return (__m128d){ __a[0], __u }; 536} 537 538static __inline__ __m128d __DEFAULT_FN_ATTRS 539_mm_loadl_pd(__m128d __a, double const *__dp) 540{ 541 struct __mm_loadl_pd_struct { 542 double __u; 543 } __attribute__((__packed__, __may_alias__)); 544 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 545 return (__m128d){ __u, __a[1] }; 546} 547 548static __inline__ __m128d __DEFAULT_FN_ATTRS 549_mm_undefined_pd(void) 550{ 551 return (__m128d)__builtin_ia32_undef128(); 552} 553 554static __inline__ __m128d __DEFAULT_FN_ATTRS 555_mm_set_sd(double __w) 556{ 557 return (__m128d){ __w, 0 }; 558} 559 560static __inline__ __m128d __DEFAULT_FN_ATTRS 561_mm_set1_pd(double __w) 562{ 563 return (__m128d){ __w, __w }; 564} 565 566static __inline__ __m128d __DEFAULT_FN_ATTRS 567_mm_set_pd(double __w, double __x) 568{ 569 return (__m128d){ __x, __w }; 570} 571 572static __inline__ __m128d __DEFAULT_FN_ATTRS 573_mm_setr_pd(double __w, double __x) 574{ 575 return (__m128d){ __w, __x }; 576} 577 578static __inline__ __m128d __DEFAULT_FN_ATTRS 579_mm_setzero_pd(void) 580{ 581 return (__m128d){ 0, 0 }; 582} 583 584static __inline__ __m128d __DEFAULT_FN_ATTRS 585_mm_move_sd(__m128d __a, __m128d __b) 586{ 587 return (__m128d){ __b[0], __a[1] }; 588} 589 590static __inline__ void __DEFAULT_FN_ATTRS 591_mm_store_sd(double *__dp, __m128d __a) 592{ 593 struct __mm_store_sd_struct { 594 double __u; 595 } __attribute__((__packed__, __may_alias__)); 596 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 597} 598 599static __inline__ void __DEFAULT_FN_ATTRS 600_mm_store_pd(double *__dp, __m128d __a) 601{ 602 *(__m128d*)__dp = __a; 603} 604 605static __inline__ void __DEFAULT_FN_ATTRS 606_mm_store1_pd(double *__dp, __m128d __a) 607{ 608 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 609 _mm_store_pd(__dp, __a); 610} 611 612static __inline__ void __DEFAULT_FN_ATTRS 613_mm_store_pd1(double *__dp, __m128d __a) 614{ 615 return _mm_store1_pd(__dp, __a); 616} 617 618static __inline__ void __DEFAULT_FN_ATTRS 619_mm_storeu_pd(double *__dp, __m128d __a) 620{ 621 struct __storeu_pd { 622 __m128d __v; 623 } __attribute__((__packed__, __may_alias__)); 624 ((struct __storeu_pd*)__dp)->__v = __a; 625} 626 627static __inline__ void __DEFAULT_FN_ATTRS 628_mm_storer_pd(double *__dp, __m128d __a) 629{ 630 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 631 *(__m128d *)__dp = __a; 632} 633 634static __inline__ void __DEFAULT_FN_ATTRS 635_mm_storeh_pd(double *__dp, __m128d __a) 636{ 637 struct __mm_storeh_pd_struct { 638 double __u; 639 } __attribute__((__packed__, __may_alias__)); 640 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 641} 642 643static __inline__ void __DEFAULT_FN_ATTRS 644_mm_storel_pd(double *__dp, __m128d __a) 645{ 646 struct __mm_storeh_pd_struct { 647 double __u; 648 } __attribute__((__packed__, __may_alias__)); 649 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 650} 651 652static __inline__ __m128i __DEFAULT_FN_ATTRS 653_mm_add_epi8(__m128i __a, __m128i __b) 654{ 655 return (__m128i)((__v16qu)__a + (__v16qu)__b); 656} 657 658static __inline__ __m128i __DEFAULT_FN_ATTRS 659_mm_add_epi16(__m128i __a, __m128i __b) 660{ 661 return (__m128i)((__v8hu)__a + (__v8hu)__b); 662} 663 664static __inline__ __m128i __DEFAULT_FN_ATTRS 665_mm_add_epi32(__m128i __a, __m128i __b) 666{ 667 return (__m128i)((__v4su)__a + (__v4su)__b); 668} 669 670static __inline__ __m64 __DEFAULT_FN_ATTRS 671_mm_add_si64(__m64 __a, __m64 __b) 672{ 673 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 674} 675 676static __inline__ __m128i __DEFAULT_FN_ATTRS 677_mm_add_epi64(__m128i __a, __m128i __b) 678{ 679 return (__m128i)((__v2du)__a + (__v2du)__b); 680} 681 682static __inline__ __m128i __DEFAULT_FN_ATTRS 683_mm_adds_epi8(__m128i __a, __m128i __b) 684{ 685 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 686} 687 688static __inline__ __m128i __DEFAULT_FN_ATTRS 689_mm_adds_epi16(__m128i __a, __m128i __b) 690{ 691 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 692} 693 694static __inline__ __m128i __DEFAULT_FN_ATTRS 695_mm_adds_epu8(__m128i __a, __m128i __b) 696{ 697 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 698} 699 700static __inline__ __m128i __DEFAULT_FN_ATTRS 701_mm_adds_epu16(__m128i __a, __m128i __b) 702{ 703 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 704} 705 706static __inline__ __m128i __DEFAULT_FN_ATTRS 707_mm_avg_epu8(__m128i __a, __m128i __b) 708{ 709 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 710} 711 712static __inline__ __m128i __DEFAULT_FN_ATTRS 713_mm_avg_epu16(__m128i __a, __m128i __b) 714{ 715 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 716} 717 718static __inline__ __m128i __DEFAULT_FN_ATTRS 719_mm_madd_epi16(__m128i __a, __m128i __b) 720{ 721 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 722} 723 724static __inline__ __m128i __DEFAULT_FN_ATTRS 725_mm_max_epi16(__m128i __a, __m128i __b) 726{ 727 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 728} 729 730static __inline__ __m128i __DEFAULT_FN_ATTRS 731_mm_max_epu8(__m128i __a, __m128i __b) 732{ 733 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 734} 735 736static __inline__ __m128i __DEFAULT_FN_ATTRS 737_mm_min_epi16(__m128i __a, __m128i __b) 738{ 739 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 740} 741 742static __inline__ __m128i __DEFAULT_FN_ATTRS 743_mm_min_epu8(__m128i __a, __m128i __b) 744{ 745 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 746} 747 748static __inline__ __m128i __DEFAULT_FN_ATTRS 749_mm_mulhi_epi16(__m128i __a, __m128i __b) 750{ 751 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 752} 753 754static __inline__ __m128i __DEFAULT_FN_ATTRS 755_mm_mulhi_epu16(__m128i __a, __m128i __b) 756{ 757 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 758} 759 760/// \brief Multiplies the corresponding elements of two [8 x short] vectors and 761/// returns a vector containing the low-order 16 bits of each 32-bit product 762/// in the corresponding element. 763/// 764/// \headerfile <x86intrin.h> 765/// 766/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction. 767/// 768/// \param __a 769/// A 128-bit integer vector containing one of the source operands. 770/// \param __b 771/// A 128-bit integer vector containing one of the source operands. 772/// \returns A 128-bit integer vector containing the products of both operands. 773static __inline__ __m128i __DEFAULT_FN_ATTRS 774_mm_mullo_epi16(__m128i __a, __m128i __b) 775{ 776 return (__m128i)((__v8hu)__a * (__v8hu)__b); 777} 778 779/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits 780/// of the two 64-bit integer vectors and returns the 64-bit unsigned 781/// product. 782/// 783/// \headerfile <x86intrin.h> 784/// 785/// This intrinsic corresponds to the \c PMULUDQ instruction. 786/// 787/// \param __a 788/// A 64-bit integer containing one of the source operands. 789/// \param __b 790/// A 64-bit integer containing one of the source operands. 791/// \returns A 64-bit integer vector containing the product of both operands. 792static __inline__ __m64 __DEFAULT_FN_ATTRS 793_mm_mul_su32(__m64 __a, __m64 __b) 794{ 795 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 796} 797 798/// \brief Multiplies 32-bit unsigned integer values contained in the lower 799/// bits of the corresponding elements of two [2 x i64] vectors, and returns 800/// the 64-bit products in the corresponding elements of a [2 x i64] vector. 801/// 802/// \headerfile <x86intrin.h> 803/// 804/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction. 805/// 806/// \param __a 807/// A [2 x i64] vector containing one of the source operands. 808/// \param __b 809/// A [2 x i64] vector containing one of the source operands. 810/// \returns A [2 x i64] vector containing the product of both operands. 811static __inline__ __m128i __DEFAULT_FN_ATTRS 812_mm_mul_epu32(__m128i __a, __m128i __b) 813{ 814 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 815} 816 817/// \brief Computes the absolute differences of corresponding 8-bit integer 818/// values in two 128-bit vectors. Sums the first 8 absolute differences, and 819/// separately sums the second 8 absolute differences. Packss these two 820/// unsigned 16-bit integer sums into the upper and lower elements of a 821/// [2 x i64] vector. 822/// 823/// \headerfile <x86intrin.h> 824/// 825/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction. 826/// 827/// \param __a 828/// A 128-bit integer vector containing one of the source operands. 829/// \param __b 830/// A 128-bit integer vector containing one of the source operands. 831/// \returns A [2 x i64] vector containing the sums of the sets of absolute 832/// differences between both operands. 833static __inline__ __m128i __DEFAULT_FN_ATTRS 834_mm_sad_epu8(__m128i __a, __m128i __b) 835{ 836 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 837} 838 839/// \brief Subtracts the corresponding 8-bit integer values in the operands. 840/// 841/// \headerfile <x86intrin.h> 842/// 843/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction. 844/// 845/// \param __a 846/// A 128-bit integer vector containing the minuends. 847/// \param __b 848/// A 128-bit integer vector containing the subtrahends. 849/// \returns A 128-bit integer vector containing the differences of the values 850/// in the operands. 851static __inline__ __m128i __DEFAULT_FN_ATTRS 852_mm_sub_epi8(__m128i __a, __m128i __b) 853{ 854 return (__m128i)((__v16qu)__a - (__v16qu)__b); 855} 856 857/// \brief Subtracts the corresponding 16-bit integer values in the operands. 858/// 859/// \headerfile <x86intrin.h> 860/// 861/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction. 862/// 863/// \param __a 864/// A 128-bit integer vector containing the minuends. 865/// \param __b 866/// A 128-bit integer vector containing the subtrahends. 867/// \returns A 128-bit integer vector containing the differences of the values 868/// in the operands. 869static __inline__ __m128i __DEFAULT_FN_ATTRS 870_mm_sub_epi16(__m128i __a, __m128i __b) 871{ 872 return (__m128i)((__v8hu)__a - (__v8hu)__b); 873} 874 875/// \brief Subtracts the corresponding 32-bit integer values in the operands. 876/// 877/// \headerfile <x86intrin.h> 878/// 879/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction. 880/// 881/// \param __a 882/// A 128-bit integer vector containing the minuends. 883/// \param __b 884/// A 128-bit integer vector containing the subtrahends. 885/// \returns A 128-bit integer vector containing the differences of the values 886/// in the operands. 887static __inline__ __m128i __DEFAULT_FN_ATTRS 888_mm_sub_epi32(__m128i __a, __m128i __b) 889{ 890 return (__m128i)((__v4su)__a - (__v4su)__b); 891} 892 893/// \brief Subtracts signed or unsigned 64-bit integer values and writes the 894/// difference to the corresponding bits in the destination. 895/// 896/// \headerfile <x86intrin.h> 897/// 898/// This intrinsic corresponds to the \c PSUBQ instruction. 899/// 900/// \param __a 901/// A 64-bit integer vector containing the minuend. 902/// \param __b 903/// A 64-bit integer vector containing the subtrahend. 904/// \returns A 64-bit integer vector containing the difference of the values in 905/// the operands. 906static __inline__ __m64 __DEFAULT_FN_ATTRS 907_mm_sub_si64(__m64 __a, __m64 __b) 908{ 909 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 910} 911 912/// \brief Subtracts the corresponding elements of two [2 x i64] vectors. 913/// 914/// \headerfile <x86intrin.h> 915/// 916/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction. 917/// 918/// \param __a 919/// A 128-bit integer vector containing the minuends. 920/// \param __b 921/// A 128-bit integer vector containing the subtrahends. 922/// \returns A 128-bit integer vector containing the differences of the values 923/// in the operands. 924static __inline__ __m128i __DEFAULT_FN_ATTRS 925_mm_sub_epi64(__m128i __a, __m128i __b) 926{ 927 return (__m128i)((__v2du)__a - (__v2du)__b); 928} 929 930/// \brief Subtracts corresponding 8-bit signed integer values in the input and 931/// returns the differences in the corresponding bytes in the destination. 932/// Differences greater than 7Fh are saturated to 7Fh, and differences less 933/// than 80h are saturated to 80h. 934/// 935/// \headerfile <x86intrin.h> 936/// 937/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction. 938/// 939/// \param __a 940/// A 128-bit integer vector containing the minuends. 941/// \param __b 942/// A 128-bit integer vector containing the subtrahends. 943/// \returns A 128-bit integer vector containing the differences of the values 944/// in the operands. 945static __inline__ __m128i __DEFAULT_FN_ATTRS 946_mm_subs_epi8(__m128i __a, __m128i __b) 947{ 948 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 949} 950 951/// \brief Subtracts corresponding 16-bit signed integer values in the input and 952/// returns the differences in the corresponding bytes in the destination. 953/// Differences greater than 7FFFh are saturated to 7FFFh, and values less 954/// than 8000h are saturated to 8000h. 955/// 956/// \headerfile <x86intrin.h> 957/// 958/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction. 959/// 960/// \param __a 961/// A 128-bit integer vector containing the minuends. 962/// \param __b 963/// A 128-bit integer vector containing the subtrahends. 964/// \returns A 128-bit integer vector containing the differences of the values 965/// in the operands. 966static __inline__ __m128i __DEFAULT_FN_ATTRS 967_mm_subs_epi16(__m128i __a, __m128i __b) 968{ 969 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 970} 971 972/// \brief Subtracts corresponding 8-bit unsigned integer values in the input 973/// and returns the differences in the corresponding bytes in the 974/// destination. Differences less than 00h are saturated to 00h. 975/// 976/// \headerfile <x86intrin.h> 977/// 978/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction. 979/// 980/// \param __a 981/// A 128-bit integer vector containing the minuends. 982/// \param __b 983/// A 128-bit integer vector containing the subtrahends. 984/// \returns A 128-bit integer vector containing the unsigned integer 985/// differences of the values in the operands. 986static __inline__ __m128i __DEFAULT_FN_ATTRS 987_mm_subs_epu8(__m128i __a, __m128i __b) 988{ 989 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 990} 991 992/// \brief Subtracts corresponding 16-bit unsigned integer values in the input 993/// and returns the differences in the corresponding bytes in the 994/// destination. Differences less than 0000h are saturated to 0000h. 995/// 996/// \headerfile <x86intrin.h> 997/// 998/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction. 999/// 1000/// \param __a 1001/// A 128-bit integer vector containing the minuends. 1002/// \param __b 1003/// A 128-bit integer vector containing the subtrahends. 1004/// \returns A 128-bit integer vector containing the unsigned integer 1005/// differences of the values in the operands. 1006static __inline__ __m128i __DEFAULT_FN_ATTRS 1007_mm_subs_epu16(__m128i __a, __m128i __b) 1008{ 1009 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 1010} 1011 1012/// \brief Performs a bitwise AND of two 128-bit integer vectors. 1013/// 1014/// \headerfile <x86intrin.h> 1015/// 1016/// This intrinsic corresponds to the \c VPAND / PAND instruction. 1017/// 1018/// \param __a 1019/// A 128-bit integer vector containing one of the source operands. 1020/// \param __b 1021/// A 128-bit integer vector containing one of the source operands. 1022/// \returns A 128-bit integer vector containing the bitwise AND of the values 1023/// in both operands. 1024static __inline__ __m128i __DEFAULT_FN_ATTRS 1025_mm_and_si128(__m128i __a, __m128i __b) 1026{ 1027 return (__m128i)((__v2du)__a & (__v2du)__b); 1028} 1029 1030/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the 1031/// one's complement of the values contained in the first source operand. 1032/// 1033/// \headerfile <x86intrin.h> 1034/// 1035/// This intrinsic corresponds to the \c VPANDN / PANDN instruction. 1036/// 1037/// \param __a 1038/// A 128-bit vector containing the left source operand. The one's complement 1039/// of this value is used in the bitwise AND. 1040/// \param __b 1041/// A 128-bit vector containing the right source operand. 1042/// \returns A 128-bit integer vector containing the bitwise AND of the one's 1043/// complement of the first operand and the values in the second operand. 1044static __inline__ __m128i __DEFAULT_FN_ATTRS 1045_mm_andnot_si128(__m128i __a, __m128i __b) 1046{ 1047 return (__m128i)(~(__v2du)__a & (__v2du)__b); 1048} 1049/// \brief Performs a bitwise OR of two 128-bit integer vectors. 1050/// 1051/// \headerfile <x86intrin.h> 1052/// 1053/// This intrinsic corresponds to the \c VPOR / POR instruction. 1054/// 1055/// \param __a 1056/// A 128-bit integer vector containing one of the source operands. 1057/// \param __b 1058/// A 128-bit integer vector containing one of the source operands. 1059/// \returns A 128-bit integer vector containing the bitwise OR of the values 1060/// in both operands. 1061static __inline__ __m128i __DEFAULT_FN_ATTRS 1062_mm_or_si128(__m128i __a, __m128i __b) 1063{ 1064 return (__m128i)((__v2du)__a | (__v2du)__b); 1065} 1066 1067/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. 1068/// 1069/// \headerfile <x86intrin.h> 1070/// 1071/// This intrinsic corresponds to the \c VPXOR / PXOR instruction. 1072/// 1073/// \param __a 1074/// A 128-bit integer vector containing one of the source operands. 1075/// \param __b 1076/// A 128-bit integer vector containing one of the source operands. 1077/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 1078/// values in both operands. 1079static __inline__ __m128i __DEFAULT_FN_ATTRS 1080_mm_xor_si128(__m128i __a, __m128i __b) 1081{ 1082 return (__m128i)((__v2du)__a ^ (__v2du)__b); 1083} 1084 1085/// \brief Left-shifts the 128-bit integer vector operand by the specified 1086/// number of bytes. Low-order bits are cleared. 1087/// 1088/// \headerfile <x86intrin.h> 1089/// 1090/// \code 1091/// __m128i _mm_slli_si128(__m128i a, const int imm); 1092/// \endcode 1093/// 1094/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction. 1095/// 1096/// \param a 1097/// A 128-bit integer vector containing the source operand. 1098/// \param imm 1099/// An immediate value specifying the number of bytes to left-shift 1100/// operand a. 1101/// \returns A 128-bit integer vector containing the left-shifted value. 1102#define _mm_slli_si128(a, imm) __extension__ ({ \ 1103 (__m128i)__builtin_shufflevector( \ 1104 (__v16qi)_mm_setzero_si128(), \ 1105 (__v16qi)(__m128i)(a), \ 1106 ((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \ 1107 ((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \ 1108 ((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \ 1109 ((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \ 1110 ((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \ 1111 ((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \ 1112 ((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \ 1113 ((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \ 1114 ((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \ 1115 ((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \ 1116 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \ 1117 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \ 1118 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \ 1119 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \ 1120 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \ 1121 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); }) 1122 1123#define _mm_bslli_si128(a, imm) \ 1124 _mm_slli_si128((a), (imm)) 1125 1126/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1127/// by the specified number of bits. Low-order bits are cleared. 1128/// 1129/// \headerfile <x86intrin.h> 1130/// 1131/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1132/// 1133/// \param __a 1134/// A 128-bit integer vector containing the source operand. 1135/// \param __count 1136/// An integer value specifying the number of bits to left-shift each value 1137/// in operand __a. 1138/// \returns A 128-bit integer vector containing the left-shifted values. 1139static __inline__ __m128i __DEFAULT_FN_ATTRS 1140_mm_slli_epi16(__m128i __a, int __count) 1141{ 1142 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 1143} 1144 1145/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 1146/// by the specified number of bits. Low-order bits are cleared. 1147/// 1148/// \headerfile <x86intrin.h> 1149/// 1150/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 1151/// 1152/// \param __a 1153/// A 128-bit integer vector containing the source operand. 1154/// \param __count 1155/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1156/// to left-shift each value in operand __a. 1157/// \returns A 128-bit integer vector containing the left-shifted values. 1158static __inline__ __m128i __DEFAULT_FN_ATTRS 1159_mm_sll_epi16(__m128i __a, __m128i __count) 1160{ 1161 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 1162} 1163 1164/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1165/// by the specified number of bits. Low-order bits are cleared. 1166/// 1167/// \headerfile <x86intrin.h> 1168/// 1169/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1170/// 1171/// \param __a 1172/// A 128-bit integer vector containing the source operand. 1173/// \param __count 1174/// An integer value specifying the number of bits to left-shift each value 1175/// in operand __a. 1176/// \returns A 128-bit integer vector containing the left-shifted values. 1177static __inline__ __m128i __DEFAULT_FN_ATTRS 1178_mm_slli_epi32(__m128i __a, int __count) 1179{ 1180 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 1181} 1182 1183/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 1184/// by the specified number of bits. Low-order bits are cleared. 1185/// 1186/// \headerfile <x86intrin.h> 1187/// 1188/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 1189/// 1190/// \param __a 1191/// A 128-bit integer vector containing the source operand. 1192/// \param __count 1193/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1194/// to left-shift each value in operand __a. 1195/// \returns A 128-bit integer vector containing the left-shifted values. 1196static __inline__ __m128i __DEFAULT_FN_ATTRS 1197_mm_sll_epi32(__m128i __a, __m128i __count) 1198{ 1199 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 1200} 1201 1202/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1203/// by the specified number of bits. Low-order bits are cleared. 1204/// 1205/// \headerfile <x86intrin.h> 1206/// 1207/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1208/// 1209/// \param __a 1210/// A 128-bit integer vector containing the source operand. 1211/// \param __count 1212/// An integer value specifying the number of bits to left-shift each value 1213/// in operand __a. 1214/// \returns A 128-bit integer vector containing the left-shifted values. 1215static __inline__ __m128i __DEFAULT_FN_ATTRS 1216_mm_slli_epi64(__m128i __a, int __count) 1217{ 1218 return __builtin_ia32_psllqi128((__v2di)__a, __count); 1219} 1220 1221/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 1222/// by the specified number of bits. Low-order bits are cleared. 1223/// 1224/// \headerfile <x86intrin.h> 1225/// 1226/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 1227/// 1228/// \param __a 1229/// A 128-bit integer vector containing the source operand. 1230/// \param __count 1231/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1232/// to left-shift each value in operand __a. 1233/// \returns A 128-bit integer vector containing the left-shifted values. 1234static __inline__ __m128i __DEFAULT_FN_ATTRS 1235_mm_sll_epi64(__m128i __a, __m128i __count) 1236{ 1237 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 1238} 1239 1240/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1241/// by the specified number of bits. High-order bits are filled with the sign 1242/// bit of the initial value. 1243/// 1244/// \headerfile <x86intrin.h> 1245/// 1246/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1247/// 1248/// \param __a 1249/// A 128-bit integer vector containing the source operand. 1250/// \param __count 1251/// An integer value specifying the number of bits to right-shift each value 1252/// in operand __a. 1253/// \returns A 128-bit integer vector containing the right-shifted values. 1254static __inline__ __m128i __DEFAULT_FN_ATTRS 1255_mm_srai_epi16(__m128i __a, int __count) 1256{ 1257 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 1258} 1259 1260/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 1261/// by the specified number of bits. High-order bits are filled with the sign 1262/// bit of the initial value. 1263/// 1264/// \headerfile <x86intrin.h> 1265/// 1266/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 1267/// 1268/// \param __a 1269/// A 128-bit integer vector containing the source operand. 1270/// \param __count 1271/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1272/// to right-shift each value in operand __a. 1273/// \returns A 128-bit integer vector containing the right-shifted values. 1274static __inline__ __m128i __DEFAULT_FN_ATTRS 1275_mm_sra_epi16(__m128i __a, __m128i __count) 1276{ 1277 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 1278} 1279 1280/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1281/// by the specified number of bits. High-order bits are filled with the sign 1282/// bit of the initial value. 1283/// 1284/// \headerfile <x86intrin.h> 1285/// 1286/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1287/// 1288/// \param __a 1289/// A 128-bit integer vector containing the source operand. 1290/// \param __count 1291/// An integer value specifying the number of bits to right-shift each value 1292/// in operand __a. 1293/// \returns A 128-bit integer vector containing the right-shifted values. 1294static __inline__ __m128i __DEFAULT_FN_ATTRS 1295_mm_srai_epi32(__m128i __a, int __count) 1296{ 1297 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 1298} 1299 1300/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 1301/// by the specified number of bits. High-order bits are filled with the sign 1302/// bit of the initial value. 1303/// 1304/// \headerfile <x86intrin.h> 1305/// 1306/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 1307/// 1308/// \param __a 1309/// A 128-bit integer vector containing the source operand. 1310/// \param __count 1311/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1312/// to right-shift each value in operand __a. 1313/// \returns A 128-bit integer vector containing the right-shifted values. 1314static __inline__ __m128i __DEFAULT_FN_ATTRS 1315_mm_sra_epi32(__m128i __a, __m128i __count) 1316{ 1317 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 1318} 1319 1320/// \brief Right-shifts the 128-bit integer vector operand by the specified 1321/// number of bytes. High-order bits are cleared. 1322/// 1323/// \headerfile <x86intrin.h> 1324/// 1325/// \code 1326/// __m128i _mm_srli_si128(__m128i a, const int imm); 1327/// \endcode 1328/// 1329/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction. 1330/// 1331/// \param a 1332/// A 128-bit integer vector containing the source operand. 1333/// \param imm 1334/// An immediate value specifying the number of bytes to right-shift operand 1335/// a. 1336/// \returns A 128-bit integer vector containing the right-shifted value. 1337#define _mm_srli_si128(a, imm) __extension__ ({ \ 1338 (__m128i)__builtin_shufflevector( \ 1339 (__v16qi)(__m128i)(a), \ 1340 (__v16qi)_mm_setzero_si128(), \ 1341 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \ 1342 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \ 1343 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \ 1344 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \ 1345 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \ 1346 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \ 1347 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \ 1348 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \ 1349 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \ 1350 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \ 1351 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \ 1352 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \ 1353 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \ 1354 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \ 1355 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \ 1356 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); }) 1357 1358#define _mm_bsrli_si128(a, imm) \ 1359 _mm_srli_si128((a), (imm)) 1360 1361/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1362/// operand by the specified number of bits. High-order bits are cleared. 1363/// 1364/// \headerfile <x86intrin.h> 1365/// 1366/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1367/// 1368/// \param __a 1369/// A 128-bit integer vector containing the source operand. 1370/// \param __count 1371/// An integer value specifying the number of bits to right-shift each value 1372/// in operand __a. 1373/// \returns A 128-bit integer vector containing the right-shifted values. 1374static __inline__ __m128i __DEFAULT_FN_ATTRS 1375_mm_srli_epi16(__m128i __a, int __count) 1376{ 1377 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 1378} 1379 1380/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 1381/// operand by the specified number of bits. High-order bits are cleared. 1382/// 1383/// \headerfile <x86intrin.h> 1384/// 1385/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 1386/// 1387/// \param __a 1388/// A 128-bit integer vector containing the source operand. 1389/// \param __count 1390/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1391/// to right-shift each value in operand __a. 1392/// \returns A 128-bit integer vector containing the right-shifted values. 1393static __inline__ __m128i __DEFAULT_FN_ATTRS 1394_mm_srl_epi16(__m128i __a, __m128i __count) 1395{ 1396 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 1397} 1398 1399/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1400/// operand by the specified number of bits. High-order bits are cleared. 1401/// 1402/// \headerfile <x86intrin.h> 1403/// 1404/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 1405/// 1406/// \param __a 1407/// A 128-bit integer vector containing the source operand. 1408/// \param __count 1409/// An integer value specifying the number of bits to right-shift each value 1410/// in operand __a. 1411/// \returns A 128-bit integer vector containing the right-shifted values. 1412static __inline__ __m128i __DEFAULT_FN_ATTRS 1413_mm_srli_epi32(__m128i __a, int __count) 1414{ 1415 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 1416} 1417 1418/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 1419/// operand by the specified number of bits. High-order bits are cleared. 1420/// 1421/// \headerfile <x86intrin.h> 1422/// 1423/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 1424/// 1425/// \param __a 1426/// A 128-bit integer vector containing the source operand. 1427/// \param __count 1428/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1429/// to right-shift each value in operand __a. 1430/// \returns A 128-bit integer vector containing the right-shifted values. 1431static __inline__ __m128i __DEFAULT_FN_ATTRS 1432_mm_srl_epi32(__m128i __a, __m128i __count) 1433{ 1434 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 1435} 1436 1437/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1438/// operand by the specified number of bits. High-order bits are cleared. 1439/// 1440/// \headerfile <x86intrin.h> 1441/// 1442/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1443/// 1444/// \param __a 1445/// A 128-bit integer vector containing the source operand. 1446/// \param __count 1447/// An integer value specifying the number of bits to right-shift each value 1448/// in operand __a. 1449/// \returns A 128-bit integer vector containing the right-shifted values. 1450static __inline__ __m128i __DEFAULT_FN_ATTRS 1451_mm_srli_epi64(__m128i __a, int __count) 1452{ 1453 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 1454} 1455 1456/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 1457/// operand by the specified number of bits. High-order bits are cleared. 1458/// 1459/// \headerfile <x86intrin.h> 1460/// 1461/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 1462/// 1463/// \param __a 1464/// A 128-bit integer vector containing the source operand. 1465/// \param __count 1466/// A 128-bit integer vector in which bits [63:0] specify the number of bits 1467/// to right-shift each value in operand __a. 1468/// \returns A 128-bit integer vector containing the right-shifted values. 1469static __inline__ __m128i __DEFAULT_FN_ATTRS 1470_mm_srl_epi64(__m128i __a, __m128i __count) 1471{ 1472 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 1473} 1474 1475/// \brief Compares each of the corresponding 8-bit values of the 128-bit 1476/// integer vectors for equality. Each comparison yields 0h for false, FFh 1477/// for true. 1478/// 1479/// \headerfile <x86intrin.h> 1480/// 1481/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction. 1482/// 1483/// \param __a 1484/// A 128-bit integer vector. 1485/// \param __b 1486/// A 128-bit integer vector. 1487/// \returns A 128-bit integer vector containing the comparison results. 1488static __inline__ __m128i __DEFAULT_FN_ATTRS 1489_mm_cmpeq_epi8(__m128i __a, __m128i __b) 1490{ 1491 return (__m128i)((__v16qi)__a == (__v16qi)__b); 1492} 1493 1494/// \brief Compares each of the corresponding 16-bit values of the 128-bit 1495/// integer vectors for equality. Each comparison yields 0h for false, FFFFh 1496/// for true. 1497/// 1498/// \headerfile <x86intrin.h> 1499/// 1500/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction. 1501/// 1502/// \param __a 1503/// A 128-bit integer vector. 1504/// \param __b 1505/// A 128-bit integer vector. 1506/// \returns A 128-bit integer vector containing the comparison results. 1507static __inline__ __m128i __DEFAULT_FN_ATTRS 1508_mm_cmpeq_epi16(__m128i __a, __m128i __b) 1509{ 1510 return (__m128i)((__v8hi)__a == (__v8hi)__b); 1511} 1512 1513/// \brief Compares each of the corresponding 32-bit values of the 128-bit 1514/// integer vectors for equality. Each comparison yields 0h for false, 1515/// FFFFFFFFh for true. 1516/// 1517/// \headerfile <x86intrin.h> 1518/// 1519/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction. 1520/// 1521/// \param __a 1522/// A 128-bit integer vector. 1523/// \param __b 1524/// A 128-bit integer vector. 1525/// \returns A 128-bit integer vector containing the comparison results. 1526static __inline__ __m128i __DEFAULT_FN_ATTRS 1527_mm_cmpeq_epi32(__m128i __a, __m128i __b) 1528{ 1529 return (__m128i)((__v4si)__a == (__v4si)__b); 1530} 1531 1532/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1533/// integer vectors to determine if the values in the first operand are 1534/// greater than those in the second operand. Each comparison yields 0h for 1535/// false, FFh for true. 1536/// 1537/// \headerfile <x86intrin.h> 1538/// 1539/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1540/// 1541/// \param __a 1542/// A 128-bit integer vector. 1543/// \param __b 1544/// A 128-bit integer vector. 1545/// \returns A 128-bit integer vector containing the comparison results. 1546static __inline__ __m128i __DEFAULT_FN_ATTRS 1547_mm_cmpgt_epi8(__m128i __a, __m128i __b) 1548{ 1549 /* This function always performs a signed comparison, but __v16qi is a char 1550 which may be signed or unsigned, so use __v16qs. */ 1551 return (__m128i)((__v16qs)__a > (__v16qs)__b); 1552} 1553 1554/// \brief Compares each of the corresponding signed 16-bit values of the 1555/// 128-bit integer vectors to determine if the values in the first operand 1556/// are greater than those in the second operand. Each comparison yields 0h 1557/// for false, FFFFh for true. 1558/// 1559/// \headerfile <x86intrin.h> 1560/// 1561/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1562/// 1563/// \param __a 1564/// A 128-bit integer vector. 1565/// \param __b 1566/// A 128-bit integer vector. 1567/// \returns A 128-bit integer vector containing the comparison results. 1568static __inline__ __m128i __DEFAULT_FN_ATTRS 1569_mm_cmpgt_epi16(__m128i __a, __m128i __b) 1570{ 1571 return (__m128i)((__v8hi)__a > (__v8hi)__b); 1572} 1573 1574/// \brief Compares each of the corresponding signed 32-bit values of the 1575/// 128-bit integer vectors to determine if the values in the first operand 1576/// are greater than those in the second operand. Each comparison yields 0h 1577/// for false, FFFFFFFFh for true. 1578/// 1579/// \headerfile <x86intrin.h> 1580/// 1581/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1582/// 1583/// \param __a 1584/// A 128-bit integer vector. 1585/// \param __b 1586/// A 128-bit integer vector. 1587/// \returns A 128-bit integer vector containing the comparison results. 1588static __inline__ __m128i __DEFAULT_FN_ATTRS 1589_mm_cmpgt_epi32(__m128i __a, __m128i __b) 1590{ 1591 return (__m128i)((__v4si)__a > (__v4si)__b); 1592} 1593 1594/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 1595/// integer vectors to determine if the values in the first operand are less 1596/// than those in the second operand. Each comparison yields 0h for false, 1597/// FFh for true. 1598/// 1599/// \headerfile <x86intrin.h> 1600/// 1601/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 1602/// 1603/// \param __a 1604/// A 128-bit integer vector. 1605/// \param __b 1606/// A 128-bit integer vector. 1607/// \returns A 128-bit integer vector containing the comparison results. 1608static __inline__ __m128i __DEFAULT_FN_ATTRS 1609_mm_cmplt_epi8(__m128i __a, __m128i __b) 1610{ 1611 return _mm_cmpgt_epi8(__b, __a); 1612} 1613 1614/// \brief Compares each of the corresponding signed 16-bit values of the 1615/// 128-bit integer vectors to determine if the values in the first operand 1616/// are less than those in the second operand. Each comparison yields 0h for 1617/// false, FFFFh for true. 1618/// 1619/// \headerfile <x86intrin.h> 1620/// 1621/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 1622/// 1623/// \param __a 1624/// A 128-bit integer vector. 1625/// \param __b 1626/// A 128-bit integer vector. 1627/// \returns A 128-bit integer vector containing the comparison results. 1628static __inline__ __m128i __DEFAULT_FN_ATTRS 1629_mm_cmplt_epi16(__m128i __a, __m128i __b) 1630{ 1631 return _mm_cmpgt_epi16(__b, __a); 1632} 1633 1634/// \brief Compares each of the corresponding signed 32-bit values of the 1635/// 128-bit integer vectors to determine if the values in the first operand 1636/// are less than those in the second operand. Each comparison yields 0h for 1637/// false, FFFFFFFFh for true. 1638/// 1639/// \headerfile <x86intrin.h> 1640/// 1641/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 1642/// 1643/// \param __a 1644/// A 128-bit integer vector. 1645/// \param __b 1646/// A 128-bit integer vector. 1647/// \returns A 128-bit integer vector containing the comparison results. 1648static __inline__ __m128i __DEFAULT_FN_ATTRS 1649_mm_cmplt_epi32(__m128i __a, __m128i __b) 1650{ 1651 return _mm_cmpgt_epi32(__b, __a); 1652} 1653 1654#ifdef __x86_64__ 1655/// \brief Converts a 64-bit signed integer value from the second operand into a 1656/// double-precision value and returns it in the lower element of a [2 x 1657/// double] vector; the upper element of the returned vector is copied from 1658/// the upper element of the first operand. 1659/// 1660/// \headerfile <x86intrin.h> 1661/// 1662/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction. 1663/// 1664/// \param __a 1665/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 1666/// copied to the upper 64 bits of the destination. 1667/// \param __b 1668/// A 64-bit signed integer operand containing the value to be converted. 1669/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 1670/// converted value of the second operand. The upper 64 bits are copied from 1671/// the upper 64 bits of the first operand. 1672static __inline__ __m128d __DEFAULT_FN_ATTRS 1673_mm_cvtsi64_sd(__m128d __a, long long __b) 1674{ 1675 __a[0] = __b; 1676 return __a; 1677} 1678 1679/// \brief Converts the first (lower) element of a vector of [2 x double] into a 1680/// 64-bit signed integer value, according to the current rounding mode. 1681/// 1682/// \headerfile <x86intrin.h> 1683/// 1684/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction. 1685/// 1686/// \param __a 1687/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1688/// conversion. 1689/// \returns A 64-bit signed integer containing the converted value. 1690static __inline__ long long __DEFAULT_FN_ATTRS 1691_mm_cvtsd_si64(__m128d __a) 1692{ 1693 return __builtin_ia32_cvtsd2si64((__v2df)__a); 1694} 1695 1696/// \brief Converts the first (lower) element of a vector of [2 x double] into a 1697/// 64-bit signed integer value, truncating the result when it is inexact. 1698/// 1699/// \headerfile <x86intrin.h> 1700/// 1701/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction. 1702/// 1703/// \param __a 1704/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1705/// conversion. 1706/// \returns A 64-bit signed integer containing the converted value. 1707static __inline__ long long __DEFAULT_FN_ATTRS 1708_mm_cvttsd_si64(__m128d __a) 1709{ 1710 return __a[0]; 1711} 1712#endif 1713 1714/// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. 1715/// 1716/// \headerfile <x86intrin.h> 1717/// 1718/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction. 1719/// 1720/// \param __a 1721/// A 128-bit integer vector. 1722/// \returns A 128-bit vector of [4 x float] containing the converted values. 1723static __inline__ __m128 __DEFAULT_FN_ATTRS 1724_mm_cvtepi32_ps(__m128i __a) 1725{ 1726 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1727} 1728 1729/// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. 1730/// 1731/// \headerfile <x86intrin.h> 1732/// 1733/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction. 1734/// 1735/// \param __a 1736/// A 128-bit vector of [4 x float]. 1737/// \returns A 128-bit integer vector of [4 x i32] containing the converted 1738/// values. 1739static __inline__ __m128i __DEFAULT_FN_ATTRS 1740_mm_cvtps_epi32(__m128 __a) 1741{ 1742 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 1743} 1744 1745/// \brief Converts a vector of [4 x float] into a vector of [4 x i32], 1746/// truncating the result when it is inexact. 1747/// 1748/// \headerfile <x86intrin.h> 1749/// 1750/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction. 1751/// 1752/// \param __a 1753/// A 128-bit vector of [4 x float]. 1754/// \returns A 128-bit vector of [4 x i32] containing the converted values. 1755static __inline__ __m128i __DEFAULT_FN_ATTRS 1756_mm_cvttps_epi32(__m128 __a) 1757{ 1758 return (__m128i)__builtin_convertvector((__v4sf)__a, __v4si); 1759} 1760 1761/// \brief Returns a vector of [4 x i32] where the lowest element is the input 1762/// operand and the remaining elements are zero. 1763/// 1764/// \headerfile <x86intrin.h> 1765/// 1766/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1767/// 1768/// \param __a 1769/// A 32-bit signed integer operand. 1770/// \returns A 128-bit vector of [4 x i32]. 1771static __inline__ __m128i __DEFAULT_FN_ATTRS 1772_mm_cvtsi32_si128(int __a) 1773{ 1774 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1775} 1776 1777#ifdef __x86_64__ 1778/// \brief Returns a vector of [2 x i64] where the lower element is the input 1779/// operand and the upper element is zero. 1780/// 1781/// \headerfile <x86intrin.h> 1782/// 1783/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1784/// 1785/// \param __a 1786/// A 64-bit signed integer operand containing the value to be converted. 1787/// \returns A 128-bit vector of [2 x i64] containing the converted value. 1788static __inline__ __m128i __DEFAULT_FN_ATTRS 1789_mm_cvtsi64_si128(long long __a) 1790{ 1791 return (__m128i){ __a, 0 }; 1792} 1793#endif 1794 1795/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a 1796/// 32-bit signed integer value. 1797/// 1798/// \headerfile <x86intrin.h> 1799/// 1800/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 1801/// 1802/// \param __a 1803/// A vector of [4 x i32]. The least significant 32 bits are moved to the 1804/// destination. 1805/// \returns A 32-bit signed integer containing the moved value. 1806static __inline__ int __DEFAULT_FN_ATTRS 1807_mm_cvtsi128_si32(__m128i __a) 1808{ 1809 __v4si __b = (__v4si)__a; 1810 return __b[0]; 1811} 1812 1813#ifdef __x86_64__ 1814/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a 1815/// 64-bit signed integer value. 1816/// 1817/// \headerfile <x86intrin.h> 1818/// 1819/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1820/// 1821/// \param __a 1822/// A vector of [2 x i64]. The least significant 64 bits are moved to the 1823/// destination. 1824/// \returns A 64-bit signed integer containing the moved value. 1825static __inline__ long long __DEFAULT_FN_ATTRS 1826_mm_cvtsi128_si64(__m128i __a) 1827{ 1828 return __a[0]; 1829} 1830#endif 1831 1832/// \brief Moves packed integer values from an aligned 128-bit memory location 1833/// to elements in a 128-bit integer vector. 1834/// 1835/// \headerfile <x86intrin.h> 1836/// 1837/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction. 1838/// 1839/// \param __p 1840/// An aligned pointer to a memory location containing integer values. 1841/// \returns A 128-bit integer vector containing the moved values. 1842static __inline__ __m128i __DEFAULT_FN_ATTRS 1843_mm_load_si128(__m128i const *__p) 1844{ 1845 return *__p; 1846} 1847 1848/// \brief Moves packed integer values from an unaligned 128-bit memory location 1849/// to elements in a 128-bit integer vector. 1850/// 1851/// \headerfile <x86intrin.h> 1852/// 1853/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction. 1854/// 1855/// \param __p 1856/// A pointer to a memory location containing integer values. 1857/// \returns A 128-bit integer vector containing the moved values. 1858static __inline__ __m128i __DEFAULT_FN_ATTRS 1859_mm_loadu_si128(__m128i const *__p) 1860{ 1861 struct __loadu_si128 { 1862 __m128i __v; 1863 } __attribute__((__packed__, __may_alias__)); 1864 return ((struct __loadu_si128*)__p)->__v; 1865} 1866 1867/// \brief Returns a vector of [2 x i64] where the lower element is taken from 1868/// the lower element of the operand, and the upper element is zero. 1869/// 1870/// \headerfile <x86intrin.h> 1871/// 1872/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1873/// 1874/// \param __p 1875/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 1876/// the destination. 1877/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 1878/// moved value. The higher order bits are cleared. 1879static __inline__ __m128i __DEFAULT_FN_ATTRS 1880_mm_loadl_epi64(__m128i const *__p) 1881{ 1882 struct __mm_loadl_epi64_struct { 1883 long long __u; 1884 } __attribute__((__packed__, __may_alias__)); 1885 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1886} 1887 1888/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. 1889/// This could be used as an argument to another intrinsic function where the 1890/// argument is required but the value is not actually used. 1891/// 1892/// \headerfile <x86intrin.h> 1893/// 1894/// This intrinsic has no corresponding instruction. 1895/// 1896/// \returns A 128-bit vector of [4 x i32] with unspecified content. 1897static __inline__ __m128i __DEFAULT_FN_ATTRS 1898_mm_undefined_si128(void) 1899{ 1900 return (__m128i)__builtin_ia32_undef128(); 1901} 1902 1903/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1904/// the specified 64-bit integer values. 1905/// 1906/// \headerfile <x86intrin.h> 1907/// 1908/// This intrinsic is a utility function and does not correspond to a specific 1909/// instruction. 1910/// 1911/// \param __q1 1912/// A 64-bit integer value used to initialize the upper 64 bits of the 1913/// destination vector of [2 x i64]. 1914/// \param __q0 1915/// A 64-bit integer value used to initialize the lower 64 bits of the 1916/// destination vector of [2 x i64]. 1917/// \returns An initialized 128-bit vector of [2 x i64] containing the values 1918/// provided in the operands. 1919static __inline__ __m128i __DEFAULT_FN_ATTRS 1920_mm_set_epi64x(long long __q1, long long __q0) 1921{ 1922 return (__m128i){ __q0, __q1 }; 1923} 1924 1925/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 1926/// the specified 64-bit integer values. 1927/// 1928/// \headerfile <x86intrin.h> 1929/// 1930/// This intrinsic is a utility function and does not correspond to a specific 1931/// instruction. 1932/// 1933/// \param __q1 1934/// A 64-bit integer value used to initialize the upper 64 bits of the 1935/// destination vector of [2 x i64]. 1936/// \param __q0 1937/// A 64-bit integer value used to initialize the lower 64 bits of the 1938/// destination vector of [2 x i64]. 1939/// \returns An initialized 128-bit vector of [2 x i64] containing the values 1940/// provided in the operands. 1941static __inline__ __m128i __DEFAULT_FN_ATTRS 1942_mm_set_epi64(__m64 __q1, __m64 __q0) 1943{ 1944 return (__m128i){ (long long)__q0, (long long)__q1 }; 1945} 1946 1947/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 1948/// the specified 32-bit integer values. 1949/// 1950/// \headerfile <x86intrin.h> 1951/// 1952/// This intrinsic is a utility function and does not correspond to a specific 1953/// instruction. 1954/// 1955/// \param __i3 1956/// A 32-bit integer value used to initialize bits [127:96] of the 1957/// destination vector. 1958/// \param __i2 1959/// A 32-bit integer value used to initialize bits [95:64] of the destination 1960/// vector. 1961/// \param __i1 1962/// A 32-bit integer value used to initialize bits [63:32] of the destination 1963/// vector. 1964/// \param __i0 1965/// A 32-bit integer value used to initialize bits [31:0] of the destination 1966/// vector. 1967/// \returns An initialized 128-bit vector of [4 x i32] containing the values 1968/// provided in the operands. 1969static __inline__ __m128i __DEFAULT_FN_ATTRS 1970_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 1971{ 1972 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 1973} 1974 1975/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 1976/// the specified 16-bit integer values. 1977/// 1978/// \headerfile <x86intrin.h> 1979/// 1980/// This intrinsic is a utility function and does not correspond to a specific 1981/// instruction. 1982/// 1983/// \param __w7 1984/// A 16-bit integer value used to initialize bits [127:112] of the 1985/// destination vector. 1986/// \param __w6 1987/// A 16-bit integer value used to initialize bits [111:96] of the 1988/// destination vector. 1989/// \param __w5 1990/// A 16-bit integer value used to initialize bits [95:80] of the destination 1991/// vector. 1992/// \param __w4 1993/// A 16-bit integer value used to initialize bits [79:64] of the destination 1994/// vector. 1995/// \param __w3 1996/// A 16-bit integer value used to initialize bits [63:48] of the destination 1997/// vector. 1998/// \param __w2 1999/// A 16-bit integer value used to initialize bits [47:32] of the destination 2000/// vector. 2001/// \param __w1 2002/// A 16-bit integer value used to initialize bits [31:16] of the destination 2003/// vector. 2004/// \param __w0 2005/// A 16-bit integer value used to initialize bits [15:0] of the destination 2006/// vector. 2007/// \returns An initialized 128-bit vector of [8 x i16] containing the values 2008/// provided in the operands. 2009static __inline__ __m128i __DEFAULT_FN_ATTRS 2010_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 2011{ 2012 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 2013} 2014 2015/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 2016/// the specified 8-bit integer values. 2017/// 2018/// \headerfile <x86intrin.h> 2019/// 2020/// This intrinsic is a utility function and does not correspond to a specific 2021/// instruction. 2022/// 2023/// \param __b15 2024/// Initializes bits [127:120] of the destination vector. 2025/// \param __b14 2026/// Initializes bits [119:112] of the destination vector. 2027/// \param __b13 2028/// Initializes bits [111:104] of the destination vector. 2029/// \param __b12 2030/// Initializes bits [103:96] of the destination vector. 2031/// \param __b11 2032/// Initializes bits [95:88] of the destination vector. 2033/// \param __b10 2034/// Initializes bits [87:80] of the destination vector. 2035/// \param __b9 2036/// Initializes bits [79:72] of the destination vector. 2037/// \param __b8 2038/// Initializes bits [71:64] of the destination vector. 2039/// \param __b7 2040/// Initializes bits [63:56] of the destination vector. 2041/// \param __b6 2042/// Initializes bits [55:48] of the destination vector. 2043/// \param __b5 2044/// Initializes bits [47:40] of the destination vector. 2045/// \param __b4 2046/// Initializes bits [39:32] of the destination vector. 2047/// \param __b3 2048/// Initializes bits [31:24] of the destination vector. 2049/// \param __b2 2050/// Initializes bits [23:16] of the destination vector. 2051/// \param __b1 2052/// Initializes bits [15:8] of the destination vector. 2053/// \param __b0 2054/// Initializes bits [7:0] of the destination vector. 2055/// \returns An initialized 128-bit vector of [16 x i8] containing the values 2056/// provided in the operands. 2057static __inline__ __m128i __DEFAULT_FN_ATTRS 2058_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 2059{ 2060 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 2061} 2062 2063/// \brief Initializes both values in a 128-bit integer vector with the 2064/// specified 64-bit integer value. 2065/// 2066/// \headerfile <x86intrin.h> 2067/// 2068/// This intrinsic is a utility function and does not correspond to a specific 2069/// instruction. 2070/// 2071/// \param __q 2072/// Integer value used to initialize the elements of the destination integer 2073/// vector. 2074/// \returns An initialized 128-bit integer vector of [2 x i64] with both 2075/// elements containing the value provided in the operand. 2076static __inline__ __m128i __DEFAULT_FN_ATTRS 2077_mm_set1_epi64x(long long __q) 2078{ 2079 return (__m128i){ __q, __q }; 2080} 2081 2082/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the 2083/// specified 64-bit value. 2084/// 2085/// \headerfile <x86intrin.h> 2086/// 2087/// This intrinsic is a utility function and does not correspond to a specific 2088/// instruction. 2089/// 2090/// \param __q 2091/// A 64-bit value used to initialize the elements of the destination integer 2092/// vector. 2093/// \returns An initialized 128-bit vector of [2 x i64] with all elements 2094/// containing the value provided in the operand. 2095static __inline__ __m128i __DEFAULT_FN_ATTRS 2096_mm_set1_epi64(__m64 __q) 2097{ 2098 return (__m128i){ (long long)__q, (long long)__q }; 2099} 2100 2101/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the 2102/// specified 32-bit value. 2103/// 2104/// \headerfile <x86intrin.h> 2105/// 2106/// This intrinsic is a utility function and does not correspond to a specific 2107/// instruction. 2108/// 2109/// \param __i 2110/// A 32-bit value used to initialize the elements of the destination integer 2111/// vector. 2112/// \returns An initialized 128-bit vector of [4 x i32] with all elements 2113/// containing the value provided in the operand. 2114static __inline__ __m128i __DEFAULT_FN_ATTRS 2115_mm_set1_epi32(int __i) 2116{ 2117 return (__m128i)(__v4si){ __i, __i, __i, __i }; 2118} 2119 2120/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the 2121/// specified 16-bit value. 2122/// 2123/// \headerfile <x86intrin.h> 2124/// 2125/// This intrinsic is a utility function and does not correspond to a specific 2126/// instruction. 2127/// 2128/// \param __w 2129/// A 16-bit value used to initialize the elements of the destination integer 2130/// vector. 2131/// \returns An initialized 128-bit vector of [8 x i16] with all elements 2132/// containing the value provided in the operand. 2133static __inline__ __m128i __DEFAULT_FN_ATTRS 2134_mm_set1_epi16(short __w) 2135{ 2136 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 2137} 2138 2139/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the 2140/// specified 8-bit value. 2141/// 2142/// \headerfile <x86intrin.h> 2143/// 2144/// This intrinsic is a utility function and does not correspond to a specific 2145/// instruction. 2146/// 2147/// \param __b 2148/// An 8-bit value used to initialize the elements of the destination integer 2149/// vector. 2150/// \returns An initialized 128-bit vector of [16 x i8] with all elements 2151/// containing the value provided in the operand. 2152static __inline__ __m128i __DEFAULT_FN_ATTRS 2153_mm_set1_epi8(char __b) 2154{ 2155 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 2156} 2157 2158static __inline__ __m128i __DEFAULT_FN_ATTRS 2159_mm_setr_epi64(__m64 __q0, __m64 __q1) 2160{ 2161 return (__m128i){ (long long)__q0, (long long)__q1 }; 2162} 2163 2164static __inline__ __m128i __DEFAULT_FN_ATTRS 2165_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 2166{ 2167 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 2168} 2169 2170static __inline__ __m128i __DEFAULT_FN_ATTRS 2171_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 2172{ 2173 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 2174} 2175 2176static __inline__ __m128i __DEFAULT_FN_ATTRS 2177_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 2178{ 2179 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 2180} 2181 2182static __inline__ __m128i __DEFAULT_FN_ATTRS 2183_mm_setzero_si128(void) 2184{ 2185 return (__m128i){ 0LL, 0LL }; 2186} 2187 2188static __inline__ void __DEFAULT_FN_ATTRS 2189_mm_store_si128(__m128i *__p, __m128i __b) 2190{ 2191 *__p = __b; 2192} 2193 2194static __inline__ void __DEFAULT_FN_ATTRS 2195_mm_storeu_si128(__m128i *__p, __m128i __b) 2196{ 2197 struct __storeu_si128 { 2198 __m128i __v; 2199 } __attribute__((__packed__, __may_alias__)); 2200 ((struct __storeu_si128*)__p)->__v = __b; 2201} 2202 2203static __inline__ void __DEFAULT_FN_ATTRS 2204_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 2205{ 2206 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 2207} 2208 2209static __inline__ void __DEFAULT_FN_ATTRS 2210_mm_storel_epi64(__m128i *__p, __m128i __a) 2211{ 2212 struct __mm_storel_epi64_struct { 2213 long long __u; 2214 } __attribute__((__packed__, __may_alias__)); 2215 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 2216} 2217 2218static __inline__ void __DEFAULT_FN_ATTRS 2219_mm_stream_pd(double *__p, __m128d __a) 2220{ 2221 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); 2222} 2223 2224static __inline__ void __DEFAULT_FN_ATTRS 2225_mm_stream_si128(__m128i *__p, __m128i __a) 2226{ 2227 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); 2228} 2229 2230static __inline__ void __DEFAULT_FN_ATTRS 2231_mm_stream_si32(int *__p, int __a) 2232{ 2233 __builtin_ia32_movnti(__p, __a); 2234} 2235 2236#ifdef __x86_64__ 2237static __inline__ void __DEFAULT_FN_ATTRS 2238_mm_stream_si64(long long *__p, long long __a) 2239{ 2240 __builtin_ia32_movnti64(__p, __a); 2241} 2242#endif 2243 2244static __inline__ void __DEFAULT_FN_ATTRS 2245_mm_clflush(void const *__p) 2246{ 2247 __builtin_ia32_clflush(__p); 2248} 2249 2250static __inline__ void __DEFAULT_FN_ATTRS 2251_mm_lfence(void) 2252{ 2253 __builtin_ia32_lfence(); 2254} 2255 2256static __inline__ void __DEFAULT_FN_ATTRS 2257_mm_mfence(void) 2258{ 2259 __builtin_ia32_mfence(); 2260} 2261 2262static __inline__ __m128i __DEFAULT_FN_ATTRS 2263_mm_packs_epi16(__m128i __a, __m128i __b) 2264{ 2265 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 2266} 2267 2268static __inline__ __m128i __DEFAULT_FN_ATTRS 2269_mm_packs_epi32(__m128i __a, __m128i __b) 2270{ 2271 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 2272} 2273 2274static __inline__ __m128i __DEFAULT_FN_ATTRS 2275_mm_packus_epi16(__m128i __a, __m128i __b) 2276{ 2277 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 2278} 2279 2280static __inline__ int __DEFAULT_FN_ATTRS 2281_mm_extract_epi16(__m128i __a, int __imm) 2282{ 2283 __v8hi __b = (__v8hi)__a; 2284 return (unsigned short)__b[__imm & 7]; 2285} 2286 2287static __inline__ __m128i __DEFAULT_FN_ATTRS 2288_mm_insert_epi16(__m128i __a, int __b, int __imm) 2289{ 2290 __v8hi __c = (__v8hi)__a; 2291 __c[__imm & 7] = __b; 2292 return (__m128i)__c; 2293} 2294 2295static __inline__ int __DEFAULT_FN_ATTRS 2296_mm_movemask_epi8(__m128i __a) 2297{ 2298 return __builtin_ia32_pmovmskb128((__v16qi)__a); 2299} 2300 2301#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 2302 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ 2303 (__v4si)_mm_undefined_si128(), \ 2304 ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ 2305 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); }) 2306 2307#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 2308 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 2309 (__v8hi)_mm_undefined_si128(), \ 2310 ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ 2311 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \ 2312 4, 5, 6, 7); }) 2313 2314#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 2315 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 2316 (__v8hi)_mm_undefined_si128(), \ 2317 0, 1, 2, 3, \ 2318 4 + (((imm) >> 0) & 0x3), \ 2319 4 + (((imm) >> 2) & 0x3), \ 2320 4 + (((imm) >> 4) & 0x3), \ 2321 4 + (((imm) >> 6) & 0x3)); }) 2322 2323static __inline__ __m128i __DEFAULT_FN_ATTRS 2324_mm_unpackhi_epi8(__m128i __a, __m128i __b) 2325{ 2326 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 2327} 2328 2329static __inline__ __m128i __DEFAULT_FN_ATTRS 2330_mm_unpackhi_epi16(__m128i __a, __m128i __b) 2331{ 2332 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 2333} 2334 2335static __inline__ __m128i __DEFAULT_FN_ATTRS 2336_mm_unpackhi_epi32(__m128i __a, __m128i __b) 2337{ 2338 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 2339} 2340 2341static __inline__ __m128i __DEFAULT_FN_ATTRS 2342_mm_unpackhi_epi64(__m128i __a, __m128i __b) 2343{ 2344 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 2345} 2346 2347static __inline__ __m128i __DEFAULT_FN_ATTRS 2348_mm_unpacklo_epi8(__m128i __a, __m128i __b) 2349{ 2350 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 2351} 2352 2353static __inline__ __m128i __DEFAULT_FN_ATTRS 2354_mm_unpacklo_epi16(__m128i __a, __m128i __b) 2355{ 2356 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 2357} 2358 2359static __inline__ __m128i __DEFAULT_FN_ATTRS 2360_mm_unpacklo_epi32(__m128i __a, __m128i __b) 2361{ 2362 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 2363} 2364 2365static __inline__ __m128i __DEFAULT_FN_ATTRS 2366_mm_unpacklo_epi64(__m128i __a, __m128i __b) 2367{ 2368 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); 2369} 2370 2371static __inline__ __m64 __DEFAULT_FN_ATTRS 2372_mm_movepi64_pi64(__m128i __a) 2373{ 2374 return (__m64)__a[0]; 2375} 2376 2377static __inline__ __m128i __DEFAULT_FN_ATTRS 2378_mm_movpi64_epi64(__m64 __a) 2379{ 2380 return (__m128i){ (long long)__a, 0 }; 2381} 2382 2383static __inline__ __m128i __DEFAULT_FN_ATTRS 2384_mm_move_epi64(__m128i __a) 2385{ 2386 return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); 2387} 2388 2389static __inline__ __m128d __DEFAULT_FN_ATTRS 2390_mm_unpackhi_pd(__m128d __a, __m128d __b) 2391{ 2392 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); 2393} 2394 2395static __inline__ __m128d __DEFAULT_FN_ATTRS 2396_mm_unpacklo_pd(__m128d __a, __m128d __b) 2397{ 2398 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); 2399} 2400 2401static __inline__ int __DEFAULT_FN_ATTRS 2402_mm_movemask_pd(__m128d __a) 2403{ 2404 return __builtin_ia32_movmskpd((__v2df)__a); 2405} 2406 2407#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 2408 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 2409 0 + (((i) >> 0) & 0x1), \ 2410 2 + (((i) >> 1) & 0x1)); }) 2411 2412static __inline__ __m128 __DEFAULT_FN_ATTRS 2413_mm_castpd_ps(__m128d __a) 2414{ 2415 return (__m128)__a; 2416} 2417 2418static __inline__ __m128i __DEFAULT_FN_ATTRS 2419_mm_castpd_si128(__m128d __a) 2420{ 2421 return (__m128i)__a; 2422} 2423 2424static __inline__ __m128d __DEFAULT_FN_ATTRS 2425_mm_castps_pd(__m128 __a) 2426{ 2427 return (__m128d)__a; 2428} 2429 2430static __inline__ __m128i __DEFAULT_FN_ATTRS 2431_mm_castps_si128(__m128 __a) 2432{ 2433 return (__m128i)__a; 2434} 2435 2436static __inline__ __m128 __DEFAULT_FN_ATTRS 2437_mm_castsi128_ps(__m128i __a) 2438{ 2439 return (__m128)__a; 2440} 2441 2442static __inline__ __m128d __DEFAULT_FN_ATTRS 2443_mm_castsi128_pd(__m128i __a) 2444{ 2445 return (__m128d)__a; 2446} 2447 2448static __inline__ void __DEFAULT_FN_ATTRS 2449_mm_pause(void) 2450{ 2451 __builtin_ia32_pause(); 2452} 2453 2454#undef __DEFAULT_FN_ATTRS 2455 2456#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 2457 2458#endif /* __EMMINTRIN_H */ 2459