1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36/* Type defines. */ 37typedef double __v2df __attribute__ ((__vector_size__ (16))); 38typedef long long __v2di __attribute__ ((__vector_size__ (16))); 39typedef short __v8hi __attribute__((__vector_size__(16))); 40typedef char __v16qi __attribute__((__vector_size__(16))); 41 42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 43_mm_add_sd(__m128d __a, __m128d __b) 44{ 45 __a[0] += __b[0]; 46 return __a; 47} 48 49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 50_mm_add_pd(__m128d __a, __m128d __b) 51{ 52 return __a + __b; 53} 54 55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 56_mm_sub_sd(__m128d __a, __m128d __b) 57{ 58 __a[0] -= __b[0]; 59 return __a; 60} 61 62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 63_mm_sub_pd(__m128d __a, __m128d __b) 64{ 65 return __a - __b; 66} 67 68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 69_mm_mul_sd(__m128d __a, __m128d __b) 70{ 71 __a[0] *= __b[0]; 72 return __a; 73} 74 75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 76_mm_mul_pd(__m128d __a, __m128d __b) 77{ 78 return __a * __b; 79} 80 81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 82_mm_div_sd(__m128d __a, __m128d __b) 83{ 84 __a[0] /= __b[0]; 85 return __a; 86} 87 88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 89_mm_div_pd(__m128d __a, __m128d __b) 90{ 91 return __a / __b; 92} 93 94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 95_mm_sqrt_sd(__m128d __a, __m128d __b) 96{ 97 __m128d __c = __builtin_ia32_sqrtsd(__b); 98 return (__m128d) { __c[0], __a[1] }; 99} 100 101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 102_mm_sqrt_pd(__m128d __a) 103{ 104 return __builtin_ia32_sqrtpd(__a); 105} 106 107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 108_mm_min_sd(__m128d __a, __m128d __b) 109{ 110 return __builtin_ia32_minsd(__a, __b); 111} 112 113static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 114_mm_min_pd(__m128d __a, __m128d __b) 115{ 116 return __builtin_ia32_minpd(__a, __b); 117} 118 119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 120_mm_max_sd(__m128d __a, __m128d __b) 121{ 122 return __builtin_ia32_maxsd(__a, __b); 123} 124 125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 126_mm_max_pd(__m128d __a, __m128d __b) 127{ 128 return __builtin_ia32_maxpd(__a, __b); 129} 130 131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 132_mm_and_pd(__m128d __a, __m128d __b) 133{ 134 return (__m128d)((__v4si)__a & (__v4si)__b); 135} 136 137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 138_mm_andnot_pd(__m128d __a, __m128d __b) 139{ 140 return (__m128d)(~(__v4si)__a & (__v4si)__b); 141} 142 143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 144_mm_or_pd(__m128d __a, __m128d __b) 145{ 146 return (__m128d)((__v4si)__a | (__v4si)__b); 147} 148 149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 150_mm_xor_pd(__m128d __a, __m128d __b) 151{ 152 return (__m128d)((__v4si)__a ^ (__v4si)__b); 153} 154 155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 156_mm_cmpeq_pd(__m128d __a, __m128d __b) 157{ 158 return (__m128d)__builtin_ia32_cmpeqpd(__a, __b); 159} 160 161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 162_mm_cmplt_pd(__m128d __a, __m128d __b) 163{ 164 return (__m128d)__builtin_ia32_cmpltpd(__a, __b); 165} 166 167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 168_mm_cmple_pd(__m128d __a, __m128d __b) 169{ 170 return (__m128d)__builtin_ia32_cmplepd(__a, __b); 171} 172 173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 174_mm_cmpgt_pd(__m128d __a, __m128d __b) 175{ 176 return (__m128d)__builtin_ia32_cmpltpd(__b, __a); 177} 178 179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 180_mm_cmpge_pd(__m128d __a, __m128d __b) 181{ 182 return (__m128d)__builtin_ia32_cmplepd(__b, __a); 183} 184 185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 186_mm_cmpord_pd(__m128d __a, __m128d __b) 187{ 188 return (__m128d)__builtin_ia32_cmpordpd(__a, __b); 189} 190 191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 192_mm_cmpunord_pd(__m128d __a, __m128d __b) 193{ 194 return (__m128d)__builtin_ia32_cmpunordpd(__a, __b); 195} 196 197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 198_mm_cmpneq_pd(__m128d __a, __m128d __b) 199{ 200 return (__m128d)__builtin_ia32_cmpneqpd(__a, __b); 201} 202 203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 204_mm_cmpnlt_pd(__m128d __a, __m128d __b) 205{ 206 return (__m128d)__builtin_ia32_cmpnltpd(__a, __b); 207} 208 209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 210_mm_cmpnle_pd(__m128d __a, __m128d __b) 211{ 212 return (__m128d)__builtin_ia32_cmpnlepd(__a, __b); 213} 214 215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 216_mm_cmpngt_pd(__m128d __a, __m128d __b) 217{ 218 return (__m128d)__builtin_ia32_cmpnltpd(__b, __a); 219} 220 221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 222_mm_cmpnge_pd(__m128d __a, __m128d __b) 223{ 224 return (__m128d)__builtin_ia32_cmpnlepd(__b, __a); 225} 226 227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 228_mm_cmpeq_sd(__m128d __a, __m128d __b) 229{ 230 return (__m128d)__builtin_ia32_cmpeqsd(__a, __b); 231} 232 233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 234_mm_cmplt_sd(__m128d __a, __m128d __b) 235{ 236 return (__m128d)__builtin_ia32_cmpltsd(__a, __b); 237} 238 239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 240_mm_cmple_sd(__m128d __a, __m128d __b) 241{ 242 return (__m128d)__builtin_ia32_cmplesd(__a, __b); 243} 244 245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 246_mm_cmpgt_sd(__m128d __a, __m128d __b) 247{ 248 __m128d __c = __builtin_ia32_cmpltsd(__b, __a); 249 return (__m128d) { __c[0], __a[1] }; 250} 251 252static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 253_mm_cmpge_sd(__m128d __a, __m128d __b) 254{ 255 __m128d __c = __builtin_ia32_cmplesd(__b, __a); 256 return (__m128d) { __c[0], __a[1] }; 257} 258 259static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 260_mm_cmpord_sd(__m128d __a, __m128d __b) 261{ 262 return (__m128d)__builtin_ia32_cmpordsd(__a, __b); 263} 264 265static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 266_mm_cmpunord_sd(__m128d __a, __m128d __b) 267{ 268 return (__m128d)__builtin_ia32_cmpunordsd(__a, __b); 269} 270 271static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 272_mm_cmpneq_sd(__m128d __a, __m128d __b) 273{ 274 return (__m128d)__builtin_ia32_cmpneqsd(__a, __b); 275} 276 277static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 278_mm_cmpnlt_sd(__m128d __a, __m128d __b) 279{ 280 return (__m128d)__builtin_ia32_cmpnltsd(__a, __b); 281} 282 283static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 284_mm_cmpnle_sd(__m128d __a, __m128d __b) 285{ 286 return (__m128d)__builtin_ia32_cmpnlesd(__a, __b); 287} 288 289static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 290_mm_cmpngt_sd(__m128d __a, __m128d __b) 291{ 292 __m128d __c = __builtin_ia32_cmpnltsd(__b, __a); 293 return (__m128d) { __c[0], __a[1] }; 294} 295 296static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 297_mm_cmpnge_sd(__m128d __a, __m128d __b) 298{ 299 __m128d __c = __builtin_ia32_cmpnlesd(__b, __a); 300 return (__m128d) { __c[0], __a[1] }; 301} 302 303static __inline__ int __attribute__((__always_inline__, __nodebug__)) 304_mm_comieq_sd(__m128d __a, __m128d __b) 305{ 306 return __builtin_ia32_comisdeq(__a, __b); 307} 308 309static __inline__ int __attribute__((__always_inline__, __nodebug__)) 310_mm_comilt_sd(__m128d __a, __m128d __b) 311{ 312 return __builtin_ia32_comisdlt(__a, __b); 313} 314 315static __inline__ int __attribute__((__always_inline__, __nodebug__)) 316_mm_comile_sd(__m128d __a, __m128d __b) 317{ 318 return __builtin_ia32_comisdle(__a, __b); 319} 320 321static __inline__ int __attribute__((__always_inline__, __nodebug__)) 322_mm_comigt_sd(__m128d __a, __m128d __b) 323{ 324 return __builtin_ia32_comisdgt(__a, __b); 325} 326 327static __inline__ int __attribute__((__always_inline__, __nodebug__)) 328_mm_comige_sd(__m128d __a, __m128d __b) 329{ 330 return __builtin_ia32_comisdge(__a, __b); 331} 332 333static __inline__ int __attribute__((__always_inline__, __nodebug__)) 334_mm_comineq_sd(__m128d __a, __m128d __b) 335{ 336 return __builtin_ia32_comisdneq(__a, __b); 337} 338 339static __inline__ int __attribute__((__always_inline__, __nodebug__)) 340_mm_ucomieq_sd(__m128d __a, __m128d __b) 341{ 342 return __builtin_ia32_ucomisdeq(__a, __b); 343} 344 345static __inline__ int __attribute__((__always_inline__, __nodebug__)) 346_mm_ucomilt_sd(__m128d __a, __m128d __b) 347{ 348 return __builtin_ia32_ucomisdlt(__a, __b); 349} 350 351static __inline__ int __attribute__((__always_inline__, __nodebug__)) 352_mm_ucomile_sd(__m128d __a, __m128d __b) 353{ 354 return __builtin_ia32_ucomisdle(__a, __b); 355} 356 357static __inline__ int __attribute__((__always_inline__, __nodebug__)) 358_mm_ucomigt_sd(__m128d __a, __m128d __b) 359{ 360 return __builtin_ia32_ucomisdgt(__a, __b); 361} 362 363static __inline__ int __attribute__((__always_inline__, __nodebug__)) 364_mm_ucomige_sd(__m128d __a, __m128d __b) 365{ 366 return __builtin_ia32_ucomisdge(__a, __b); 367} 368 369static __inline__ int __attribute__((__always_inline__, __nodebug__)) 370_mm_ucomineq_sd(__m128d __a, __m128d __b) 371{ 372 return __builtin_ia32_ucomisdneq(__a, __b); 373} 374 375static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 376_mm_cvtpd_ps(__m128d __a) 377{ 378 return __builtin_ia32_cvtpd2ps(__a); 379} 380 381static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 382_mm_cvtps_pd(__m128 __a) 383{ 384 return __builtin_ia32_cvtps2pd(__a); 385} 386 387static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 388_mm_cvtepi32_pd(__m128i __a) 389{ 390 return __builtin_ia32_cvtdq2pd((__v4si)__a); 391} 392 393static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 394_mm_cvtpd_epi32(__m128d __a) 395{ 396 return __builtin_ia32_cvtpd2dq(__a); 397} 398 399static __inline__ int __attribute__((__always_inline__, __nodebug__)) 400_mm_cvtsd_si32(__m128d __a) 401{ 402 return __builtin_ia32_cvtsd2si(__a); 403} 404 405static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 406_mm_cvtsd_ss(__m128 __a, __m128d __b) 407{ 408 __a[0] = __b[0]; 409 return __a; 410} 411 412static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 413_mm_cvtsi32_sd(__m128d __a, int __b) 414{ 415 __a[0] = __b; 416 return __a; 417} 418 419static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 420_mm_cvtss_sd(__m128d __a, __m128 __b) 421{ 422 __a[0] = __b[0]; 423 return __a; 424} 425 426static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 427_mm_cvttpd_epi32(__m128d __a) 428{ 429 return (__m128i)__builtin_ia32_cvttpd2dq(__a); 430} 431 432static __inline__ int __attribute__((__always_inline__, __nodebug__)) 433_mm_cvttsd_si32(__m128d __a) 434{ 435 return __a[0]; 436} 437 438static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 439_mm_cvtpd_pi32(__m128d __a) 440{ 441 return (__m64)__builtin_ia32_cvtpd2pi(__a); 442} 443 444static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 445_mm_cvttpd_pi32(__m128d __a) 446{ 447 return (__m64)__builtin_ia32_cvttpd2pi(__a); 448} 449 450static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 451_mm_cvtpi32_pd(__m64 __a) 452{ 453 return __builtin_ia32_cvtpi2pd((__v2si)__a); 454} 455 456static __inline__ double __attribute__((__always_inline__, __nodebug__)) 457_mm_cvtsd_f64(__m128d __a) 458{ 459 return __a[0]; 460} 461 462static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 463_mm_load_pd(double const *__dp) 464{ 465 return *(__m128d*)__dp; 466} 467 468static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 469_mm_load1_pd(double const *__dp) 470{ 471 struct __mm_load1_pd_struct { 472 double __u; 473 } __attribute__((__packed__, __may_alias__)); 474 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 475 return (__m128d){ __u, __u }; 476} 477 478#define _mm_load_pd1(dp) _mm_load1_pd(dp) 479 480static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 481_mm_loadr_pd(double const *__dp) 482{ 483 __m128d __u = *(__m128d*)__dp; 484 return __builtin_shufflevector(__u, __u, 1, 0); 485} 486 487static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 488_mm_loadu_pd(double const *__dp) 489{ 490 struct __loadu_pd { 491 __m128d __v; 492 } __attribute__((__packed__, __may_alias__)); 493 return ((struct __loadu_pd*)__dp)->__v; 494} 495 496static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 497_mm_load_sd(double const *__dp) 498{ 499 struct __mm_load_sd_struct { 500 double __u; 501 } __attribute__((__packed__, __may_alias__)); 502 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 503 return (__m128d){ __u, 0 }; 504} 505 506static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 507_mm_loadh_pd(__m128d __a, double const *__dp) 508{ 509 struct __mm_loadh_pd_struct { 510 double __u; 511 } __attribute__((__packed__, __may_alias__)); 512 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 513 return (__m128d){ __a[0], __u }; 514} 515 516static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 517_mm_loadl_pd(__m128d __a, double const *__dp) 518{ 519 struct __mm_loadl_pd_struct { 520 double __u; 521 } __attribute__((__packed__, __may_alias__)); 522 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 523 return (__m128d){ __u, __a[1] }; 524} 525 526static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 527_mm_set_sd(double __w) 528{ 529 return (__m128d){ __w, 0 }; 530} 531 532static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 533_mm_set1_pd(double __w) 534{ 535 return (__m128d){ __w, __w }; 536} 537 538static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 539_mm_set_pd(double __w, double __x) 540{ 541 return (__m128d){ __x, __w }; 542} 543 544static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 545_mm_setr_pd(double __w, double __x) 546{ 547 return (__m128d){ __w, __x }; 548} 549 550static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 551_mm_setzero_pd(void) 552{ 553 return (__m128d){ 0, 0 }; 554} 555 556static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 557_mm_move_sd(__m128d __a, __m128d __b) 558{ 559 return (__m128d){ __b[0], __a[1] }; 560} 561 562static __inline__ void __attribute__((__always_inline__, __nodebug__)) 563_mm_store_sd(double *__dp, __m128d __a) 564{ 565 struct __mm_store_sd_struct { 566 double __u; 567 } __attribute__((__packed__, __may_alias__)); 568 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 569} 570 571static __inline__ void __attribute__((__always_inline__, __nodebug__)) 572_mm_store1_pd(double *__dp, __m128d __a) 573{ 574 struct __mm_store1_pd_struct { 575 double __u[2]; 576 } __attribute__((__packed__, __may_alias__)); 577 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0]; 578 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0]; 579} 580 581static __inline__ void __attribute__((__always_inline__, __nodebug__)) 582_mm_store_pd(double *__dp, __m128d __a) 583{ 584 *(__m128d *)__dp = __a; 585} 586 587static __inline__ void __attribute__((__always_inline__, __nodebug__)) 588_mm_storeu_pd(double *__dp, __m128d __a) 589{ 590 __builtin_ia32_storeupd(__dp, __a); 591} 592 593static __inline__ void __attribute__((__always_inline__, __nodebug__)) 594_mm_storer_pd(double *__dp, __m128d __a) 595{ 596 __a = __builtin_shufflevector(__a, __a, 1, 0); 597 *(__m128d *)__dp = __a; 598} 599 600static __inline__ void __attribute__((__always_inline__, __nodebug__)) 601_mm_storeh_pd(double *__dp, __m128d __a) 602{ 603 struct __mm_storeh_pd_struct { 604 double __u; 605 } __attribute__((__packed__, __may_alias__)); 606 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 607} 608 609static __inline__ void __attribute__((__always_inline__, __nodebug__)) 610_mm_storel_pd(double *__dp, __m128d __a) 611{ 612 struct __mm_storeh_pd_struct { 613 double __u; 614 } __attribute__((__packed__, __may_alias__)); 615 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 616} 617 618static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 619_mm_add_epi8(__m128i __a, __m128i __b) 620{ 621 return (__m128i)((__v16qi)__a + (__v16qi)__b); 622} 623 624static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 625_mm_add_epi16(__m128i __a, __m128i __b) 626{ 627 return (__m128i)((__v8hi)__a + (__v8hi)__b); 628} 629 630static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 631_mm_add_epi32(__m128i __a, __m128i __b) 632{ 633 return (__m128i)((__v4si)__a + (__v4si)__b); 634} 635 636static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 637_mm_add_si64(__m64 __a, __m64 __b) 638{ 639 return __a + __b; 640} 641 642static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 643_mm_add_epi64(__m128i __a, __m128i __b) 644{ 645 return __a + __b; 646} 647 648static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 649_mm_adds_epi8(__m128i __a, __m128i __b) 650{ 651 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 652} 653 654static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 655_mm_adds_epi16(__m128i __a, __m128i __b) 656{ 657 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 658} 659 660static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 661_mm_adds_epu8(__m128i __a, __m128i __b) 662{ 663 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 664} 665 666static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 667_mm_adds_epu16(__m128i __a, __m128i __b) 668{ 669 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 670} 671 672static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 673_mm_avg_epu8(__m128i __a, __m128i __b) 674{ 675 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 676} 677 678static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 679_mm_avg_epu16(__m128i __a, __m128i __b) 680{ 681 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 682} 683 684static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 685_mm_madd_epi16(__m128i __a, __m128i __b) 686{ 687 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 688} 689 690static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 691_mm_max_epi16(__m128i __a, __m128i __b) 692{ 693 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 694} 695 696static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 697_mm_max_epu8(__m128i __a, __m128i __b) 698{ 699 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 700} 701 702static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 703_mm_min_epi16(__m128i __a, __m128i __b) 704{ 705 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 706} 707 708static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 709_mm_min_epu8(__m128i __a, __m128i __b) 710{ 711 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 712} 713 714static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 715_mm_mulhi_epi16(__m128i __a, __m128i __b) 716{ 717 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 718} 719 720static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 721_mm_mulhi_epu16(__m128i __a, __m128i __b) 722{ 723 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 724} 725 726static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 727_mm_mullo_epi16(__m128i __a, __m128i __b) 728{ 729 return (__m128i)((__v8hi)__a * (__v8hi)__b); 730} 731 732static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 733_mm_mul_su32(__m64 __a, __m64 __b) 734{ 735 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 736} 737 738static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 739_mm_mul_epu32(__m128i __a, __m128i __b) 740{ 741 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 742} 743 744static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 745_mm_sad_epu8(__m128i __a, __m128i __b) 746{ 747 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 748} 749 750static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 751_mm_sub_epi8(__m128i __a, __m128i __b) 752{ 753 return (__m128i)((__v16qi)__a - (__v16qi)__b); 754} 755 756static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 757_mm_sub_epi16(__m128i __a, __m128i __b) 758{ 759 return (__m128i)((__v8hi)__a - (__v8hi)__b); 760} 761 762static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 763_mm_sub_epi32(__m128i __a, __m128i __b) 764{ 765 return (__m128i)((__v4si)__a - (__v4si)__b); 766} 767 768static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 769_mm_sub_si64(__m64 __a, __m64 __b) 770{ 771 return __a - __b; 772} 773 774static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 775_mm_sub_epi64(__m128i __a, __m128i __b) 776{ 777 return __a - __b; 778} 779 780static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 781_mm_subs_epi8(__m128i __a, __m128i __b) 782{ 783 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 784} 785 786static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 787_mm_subs_epi16(__m128i __a, __m128i __b) 788{ 789 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 790} 791 792static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 793_mm_subs_epu8(__m128i __a, __m128i __b) 794{ 795 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 796} 797 798static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 799_mm_subs_epu16(__m128i __a, __m128i __b) 800{ 801 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 802} 803 804static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 805_mm_and_si128(__m128i __a, __m128i __b) 806{ 807 return __a & __b; 808} 809 810static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 811_mm_andnot_si128(__m128i __a, __m128i __b) 812{ 813 return ~__a & __b; 814} 815 816static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 817_mm_or_si128(__m128i __a, __m128i __b) 818{ 819 return __a | __b; 820} 821 822static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 823_mm_xor_si128(__m128i __a, __m128i __b) 824{ 825 return __a ^ __b; 826} 827 828#define _mm_slli_si128(a, imm) __extension__ ({ \ 829 (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \ 830 (__v16qi)(__m128i)(a), \ 831 ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \ 832 ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \ 833 ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \ 834 ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \ 835 ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \ 836 ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \ 837 ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \ 838 ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \ 839 ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \ 840 ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \ 841 ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \ 842 ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \ 843 ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \ 844 ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \ 845 ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \ 846 ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); }) 847 848#define _mm_bslli_si128(a, imm) \ 849 _mm_slli_si128((a), (imm)) 850 851static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 852_mm_slli_epi16(__m128i __a, int __count) 853{ 854 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 855} 856 857static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 858_mm_sll_epi16(__m128i __a, __m128i __count) 859{ 860 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 861} 862 863static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 864_mm_slli_epi32(__m128i __a, int __count) 865{ 866 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 867} 868 869static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 870_mm_sll_epi32(__m128i __a, __m128i __count) 871{ 872 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 873} 874 875static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 876_mm_slli_epi64(__m128i __a, int __count) 877{ 878 return __builtin_ia32_psllqi128(__a, __count); 879} 880 881static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 882_mm_sll_epi64(__m128i __a, __m128i __count) 883{ 884 return __builtin_ia32_psllq128(__a, __count); 885} 886 887static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 888_mm_srai_epi16(__m128i __a, int __count) 889{ 890 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 891} 892 893static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 894_mm_sra_epi16(__m128i __a, __m128i __count) 895{ 896 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 897} 898 899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 900_mm_srai_epi32(__m128i __a, int __count) 901{ 902 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 903} 904 905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 906_mm_sra_epi32(__m128i __a, __m128i __count) 907{ 908 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 909} 910 911#define _mm_srli_si128(a, imm) __extension__ ({ \ 912 (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \ 913 (__v16qi)_mm_setzero_si128(), \ 914 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \ 915 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \ 916 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \ 917 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \ 918 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \ 919 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \ 920 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \ 921 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \ 922 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \ 923 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \ 924 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \ 925 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \ 926 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \ 927 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \ 928 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \ 929 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); }) 930 931#define _mm_bsrli_si128(a, imm) \ 932 _mm_srli_si128((a), (imm)) 933 934static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 935_mm_srli_epi16(__m128i __a, int __count) 936{ 937 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 938} 939 940static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 941_mm_srl_epi16(__m128i __a, __m128i __count) 942{ 943 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 944} 945 946static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 947_mm_srli_epi32(__m128i __a, int __count) 948{ 949 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 950} 951 952static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 953_mm_srl_epi32(__m128i __a, __m128i __count) 954{ 955 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 956} 957 958static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 959_mm_srli_epi64(__m128i __a, int __count) 960{ 961 return __builtin_ia32_psrlqi128(__a, __count); 962} 963 964static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 965_mm_srl_epi64(__m128i __a, __m128i __count) 966{ 967 return __builtin_ia32_psrlq128(__a, __count); 968} 969 970static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 971_mm_cmpeq_epi8(__m128i __a, __m128i __b) 972{ 973 return (__m128i)((__v16qi)__a == (__v16qi)__b); 974} 975 976static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 977_mm_cmpeq_epi16(__m128i __a, __m128i __b) 978{ 979 return (__m128i)((__v8hi)__a == (__v8hi)__b); 980} 981 982static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 983_mm_cmpeq_epi32(__m128i __a, __m128i __b) 984{ 985 return (__m128i)((__v4si)__a == (__v4si)__b); 986} 987 988static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 989_mm_cmpgt_epi8(__m128i __a, __m128i __b) 990{ 991 /* This function always performs a signed comparison, but __v16qi is a char 992 which may be signed or unsigned. */ 993 typedef signed char __v16qs __attribute__((__vector_size__(16))); 994 return (__m128i)((__v16qs)__a > (__v16qs)__b); 995} 996 997static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 998_mm_cmpgt_epi16(__m128i __a, __m128i __b) 999{ 1000 return (__m128i)((__v8hi)__a > (__v8hi)__b); 1001} 1002 1003static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1004_mm_cmpgt_epi32(__m128i __a, __m128i __b) 1005{ 1006 return (__m128i)((__v4si)__a > (__v4si)__b); 1007} 1008 1009static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1010_mm_cmplt_epi8(__m128i __a, __m128i __b) 1011{ 1012 return _mm_cmpgt_epi8(__b, __a); 1013} 1014 1015static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1016_mm_cmplt_epi16(__m128i __a, __m128i __b) 1017{ 1018 return _mm_cmpgt_epi16(__b, __a); 1019} 1020 1021static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1022_mm_cmplt_epi32(__m128i __a, __m128i __b) 1023{ 1024 return _mm_cmpgt_epi32(__b, __a); 1025} 1026 1027#ifdef __x86_64__ 1028static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1029_mm_cvtsi64_sd(__m128d __a, long long __b) 1030{ 1031 __a[0] = __b; 1032 return __a; 1033} 1034 1035static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1036_mm_cvtsd_si64(__m128d __a) 1037{ 1038 return __builtin_ia32_cvtsd2si64(__a); 1039} 1040 1041static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1042_mm_cvttsd_si64(__m128d __a) 1043{ 1044 return __a[0]; 1045} 1046#endif 1047 1048static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1049_mm_cvtepi32_ps(__m128i __a) 1050{ 1051 return __builtin_ia32_cvtdq2ps((__v4si)__a); 1052} 1053 1054static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1055_mm_cvtps_epi32(__m128 __a) 1056{ 1057 return (__m128i)__builtin_ia32_cvtps2dq(__a); 1058} 1059 1060static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1061_mm_cvttps_epi32(__m128 __a) 1062{ 1063 return (__m128i)__builtin_ia32_cvttps2dq(__a); 1064} 1065 1066static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1067_mm_cvtsi32_si128(int __a) 1068{ 1069 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 1070} 1071 1072#ifdef __x86_64__ 1073static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1074_mm_cvtsi64_si128(long long __a) 1075{ 1076 return (__m128i){ __a, 0 }; 1077} 1078#endif 1079 1080static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1081_mm_cvtsi128_si32(__m128i __a) 1082{ 1083 __v4si __b = (__v4si)__a; 1084 return __b[0]; 1085} 1086 1087#ifdef __x86_64__ 1088static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1089_mm_cvtsi128_si64(__m128i __a) 1090{ 1091 return __a[0]; 1092} 1093#endif 1094 1095static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1096_mm_load_si128(__m128i const *__p) 1097{ 1098 return *__p; 1099} 1100 1101static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1102_mm_loadu_si128(__m128i const *__p) 1103{ 1104 struct __loadu_si128 { 1105 __m128i __v; 1106 } __attribute__((__packed__, __may_alias__)); 1107 return ((struct __loadu_si128*)__p)->__v; 1108} 1109 1110static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1111_mm_loadl_epi64(__m128i const *__p) 1112{ 1113 struct __mm_loadl_epi64_struct { 1114 long long __u; 1115 } __attribute__((__packed__, __may_alias__)); 1116 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 1117} 1118 1119static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1120_mm_set_epi64x(long long q1, long long q0) 1121{ 1122 return (__m128i){ q0, q1 }; 1123} 1124 1125static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1126_mm_set_epi64(__m64 q1, __m64 q0) 1127{ 1128 return (__m128i){ (long long)q0, (long long)q1 }; 1129} 1130 1131static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1132_mm_set_epi32(int i3, int i2, int i1, int i0) 1133{ 1134 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1135} 1136 1137static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1138_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1139{ 1140 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1141} 1142 1143static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1144_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1145{ 1146 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1147} 1148 1149static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1150_mm_set1_epi64x(long long __q) 1151{ 1152 return (__m128i){ __q, __q }; 1153} 1154 1155static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1156_mm_set1_epi64(__m64 __q) 1157{ 1158 return (__m128i){ (long long)__q, (long long)__q }; 1159} 1160 1161static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1162_mm_set1_epi32(int __i) 1163{ 1164 return (__m128i)(__v4si){ __i, __i, __i, __i }; 1165} 1166 1167static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1168_mm_set1_epi16(short __w) 1169{ 1170 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 1171} 1172 1173static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1174_mm_set1_epi8(char __b) 1175{ 1176 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 1177} 1178 1179static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1180_mm_setr_epi64(__m64 q0, __m64 q1) 1181{ 1182 return (__m128i){ (long long)q0, (long long)q1 }; 1183} 1184 1185static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1186_mm_setr_epi32(int i0, int i1, int i2, int i3) 1187{ 1188 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1189} 1190 1191static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1192_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1193{ 1194 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1195} 1196 1197static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1198_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1199{ 1200 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1201} 1202 1203static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1204_mm_setzero_si128(void) 1205{ 1206 return (__m128i){ 0LL, 0LL }; 1207} 1208 1209static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1210_mm_store_si128(__m128i *__p, __m128i __b) 1211{ 1212 *__p = __b; 1213} 1214 1215static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1216_mm_storeu_si128(__m128i *__p, __m128i __b) 1217{ 1218 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); 1219} 1220 1221static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1222_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 1223{ 1224 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 1225} 1226 1227static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1228_mm_storel_epi64(__m128i *__p, __m128i __a) 1229{ 1230 struct __mm_storel_epi64_struct { 1231 long long __u; 1232 } __attribute__((__packed__, __may_alias__)); 1233 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 1234} 1235 1236static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1237_mm_stream_pd(double *__p, __m128d __a) 1238{ 1239 __builtin_ia32_movntpd(__p, __a); 1240} 1241 1242static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1243_mm_stream_si128(__m128i *__p, __m128i __a) 1244{ 1245 __builtin_ia32_movntdq(__p, __a); 1246} 1247 1248static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1249_mm_stream_si32(int *__p, int __a) 1250{ 1251 __builtin_ia32_movnti(__p, __a); 1252} 1253 1254#ifdef __x86_64__ 1255static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1256_mm_stream_si64(long long *__p, long long __a) 1257{ 1258 __builtin_ia32_movnti64(__p, __a); 1259} 1260#endif 1261 1262static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1263_mm_clflush(void const *__p) 1264{ 1265 __builtin_ia32_clflush(__p); 1266} 1267 1268static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1269_mm_lfence(void) 1270{ 1271 __builtin_ia32_lfence(); 1272} 1273 1274static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1275_mm_mfence(void) 1276{ 1277 __builtin_ia32_mfence(); 1278} 1279 1280static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1281_mm_packs_epi16(__m128i __a, __m128i __b) 1282{ 1283 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 1284} 1285 1286static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1287_mm_packs_epi32(__m128i __a, __m128i __b) 1288{ 1289 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 1290} 1291 1292static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1293_mm_packus_epi16(__m128i __a, __m128i __b) 1294{ 1295 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 1296} 1297 1298static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1299_mm_extract_epi16(__m128i __a, int __imm) 1300{ 1301 __v8hi __b = (__v8hi)__a; 1302 return (unsigned short)__b[__imm & 7]; 1303} 1304 1305static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1306_mm_insert_epi16(__m128i __a, int __b, int __imm) 1307{ 1308 __v8hi __c = (__v8hi)__a; 1309 __c[__imm & 7] = __b; 1310 return (__m128i)__c; 1311} 1312 1313static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1314_mm_movemask_epi8(__m128i __a) 1315{ 1316 return __builtin_ia32_pmovmskb128((__v16qi)__a); 1317} 1318 1319#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 1320 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ 1321 (__v4si)_mm_set1_epi32(0), \ 1322 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1323 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 1324 1325#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 1326 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 1327 (__v8hi)_mm_set1_epi16(0), \ 1328 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1329 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1330 4, 5, 6, 7); }) 1331 1332#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 1333 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ 1334 (__v8hi)_mm_set1_epi16(0), \ 1335 0, 1, 2, 3, \ 1336 4 + (((imm) & 0x03) >> 0), \ 1337 4 + (((imm) & 0x0c) >> 2), \ 1338 4 + (((imm) & 0x30) >> 4), \ 1339 4 + (((imm) & 0xc0) >> 6)); }) 1340 1341static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1342_mm_unpackhi_epi8(__m128i __a, __m128i __b) 1343{ 1344 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1345} 1346 1347static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1348_mm_unpackhi_epi16(__m128i __a, __m128i __b) 1349{ 1350 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1351} 1352 1353static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1354_mm_unpackhi_epi32(__m128i __a, __m128i __b) 1355{ 1356 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 1357} 1358 1359static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1360_mm_unpackhi_epi64(__m128i __a, __m128i __b) 1361{ 1362 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); 1363} 1364 1365static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1366_mm_unpacklo_epi8(__m128i __a, __m128i __b) 1367{ 1368 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1369} 1370 1371static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1372_mm_unpacklo_epi16(__m128i __a, __m128i __b) 1373{ 1374 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1375} 1376 1377static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1378_mm_unpacklo_epi32(__m128i __a, __m128i __b) 1379{ 1380 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 1381} 1382 1383static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1384_mm_unpacklo_epi64(__m128i __a, __m128i __b) 1385{ 1386 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); 1387} 1388 1389static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 1390_mm_movepi64_pi64(__m128i __a) 1391{ 1392 return (__m64)__a[0]; 1393} 1394 1395static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1396_mm_movpi64_epi64(__m64 __a) 1397{ 1398 return (__m128i){ (long long)__a, 0 }; 1399} 1400 1401static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1402_mm_move_epi64(__m128i __a) 1403{ 1404 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); 1405} 1406 1407static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1408_mm_unpackhi_pd(__m128d __a, __m128d __b) 1409{ 1410 return __builtin_shufflevector(__a, __b, 1, 2+1); 1411} 1412 1413static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1414_mm_unpacklo_pd(__m128d __a, __m128d __b) 1415{ 1416 return __builtin_shufflevector(__a, __b, 0, 2+0); 1417} 1418 1419static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1420_mm_movemask_pd(__m128d __a) 1421{ 1422 return __builtin_ia32_movmskpd(__a); 1423} 1424 1425#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 1426 __builtin_shufflevector((__m128d)(a), (__m128d)(b), \ 1427 (i) & 1, (((i) & 2) >> 1) + 2); }) 1428 1429static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1430_mm_castpd_ps(__m128d __a) 1431{ 1432 return (__m128)__a; 1433} 1434 1435static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1436_mm_castpd_si128(__m128d __a) 1437{ 1438 return (__m128i)__a; 1439} 1440 1441static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1442_mm_castps_pd(__m128 __a) 1443{ 1444 return (__m128d)__a; 1445} 1446 1447static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1448_mm_castps_si128(__m128 __a) 1449{ 1450 return (__m128i)__a; 1451} 1452 1453static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1454_mm_castsi128_ps(__m128i __a) 1455{ 1456 return (__m128)__a; 1457} 1458 1459static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1460_mm_castsi128_pd(__m128i __a) 1461{ 1462 return (__m128d)__a; 1463} 1464 1465static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1466_mm_pause(void) 1467{ 1468 __asm__ volatile ("pause"); 1469} 1470 1471#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1472 1473#endif /* __SSE2__ */ 1474 1475#endif /* __EMMINTRIN_H */ 1476