/* emmintrin.h revision f42f85ce6c2c1ddbe57535898dfbe3a37f7199af */
1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36/* Type defines. 
*/ 37typedef double __v2df __attribute__ ((__vector_size__ (16))); 38typedef long long __v2di __attribute__ ((__vector_size__ (16))); 39typedef short __v8hi __attribute__((__vector_size__(16))); 40typedef char __v16qi __attribute__((__vector_size__(16))); 41 42static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 43_mm_add_sd(__m128d a, __m128d b) 44{ 45 a[0] += b[0]; 46 return a; 47} 48 49static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 50_mm_add_pd(__m128d a, __m128d b) 51{ 52 return a + b; 53} 54 55static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 56_mm_sub_sd(__m128d a, __m128d b) 57{ 58 a[0] -= b[0]; 59 return a; 60} 61 62static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 63_mm_sub_pd(__m128d a, __m128d b) 64{ 65 return a - b; 66} 67 68static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 69_mm_mul_sd(__m128d a, __m128d b) 70{ 71 a[0] *= b[0]; 72 return a; 73} 74 75static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 76_mm_mul_pd(__m128d a, __m128d b) 77{ 78 return a * b; 79} 80 81static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 82_mm_div_sd(__m128d a, __m128d b) 83{ 84 a[0] /= b[0]; 85 return a; 86} 87 88static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 89_mm_div_pd(__m128d a, __m128d b) 90{ 91 return a / b; 92} 93 94static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 95_mm_sqrt_sd(__m128d a, __m128d b) 96{ 97 __m128d c = __builtin_ia32_sqrtsd(b); 98 return (__m128d) { c[0], a[1] }; 99} 100 101static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 102_mm_sqrt_pd(__m128d a) 103{ 104 return __builtin_ia32_sqrtpd(a); 105} 106 107static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 108_mm_min_sd(__m128d a, __m128d b) 109{ 110 return __builtin_ia32_minsd(a, b); 111} 112 113static __inline__ __m128d 
__attribute__((__always_inline__, __nodebug__)) 114_mm_min_pd(__m128d a, __m128d b) 115{ 116 return __builtin_ia32_minpd(a, b); 117} 118 119static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 120_mm_max_sd(__m128d a, __m128d b) 121{ 122 return __builtin_ia32_maxsd(a, b); 123} 124 125static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 126_mm_max_pd(__m128d a, __m128d b) 127{ 128 return __builtin_ia32_maxpd(a, b); 129} 130 131static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 132_mm_and_pd(__m128d a, __m128d b) 133{ 134 return (__m128d)((__v4si)a & (__v4si)b); 135} 136 137static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 138_mm_andnot_pd(__m128d a, __m128d b) 139{ 140 return (__m128d)(~(__v4si)a & (__v4si)b); 141} 142 143static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 144_mm_or_pd(__m128d a, __m128d b) 145{ 146 return (__m128d)((__v4si)a | (__v4si)b); 147} 148 149static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 150_mm_xor_pd(__m128d a, __m128d b) 151{ 152 return (__m128d)((__v4si)a ^ (__v4si)b); 153} 154 155static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 156_mm_cmpeq_pd(__m128d a, __m128d b) 157{ 158 return (__m128d)__builtin_ia32_cmppd(a, b, 0); 159} 160 161static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 162_mm_cmplt_pd(__m128d a, __m128d b) 163{ 164 return (__m128d)__builtin_ia32_cmppd(a, b, 1); 165} 166 167static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 168_mm_cmple_pd(__m128d a, __m128d b) 169{ 170 return (__m128d)__builtin_ia32_cmppd(a, b, 2); 171} 172 173static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 174_mm_cmpgt_pd(__m128d a, __m128d b) 175{ 176 return (__m128d)__builtin_ia32_cmppd(b, a, 1); 177} 178 179static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 180_mm_cmpge_pd(__m128d a, __m128d b) 181{ 
182 return (__m128d)__builtin_ia32_cmppd(b, a, 2); 183} 184 185static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 186_mm_cmpord_pd(__m128d a, __m128d b) 187{ 188 return (__m128d)__builtin_ia32_cmppd(a, b, 7); 189} 190 191static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 192_mm_cmpunord_pd(__m128d a, __m128d b) 193{ 194 return (__m128d)__builtin_ia32_cmppd(a, b, 3); 195} 196 197static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 198_mm_cmpneq_pd(__m128d a, __m128d b) 199{ 200 return (__m128d)__builtin_ia32_cmppd(a, b, 4); 201} 202 203static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 204_mm_cmpnlt_pd(__m128d a, __m128d b) 205{ 206 return (__m128d)__builtin_ia32_cmppd(a, b, 5); 207} 208 209static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 210_mm_cmpnle_pd(__m128d a, __m128d b) 211{ 212 return (__m128d)__builtin_ia32_cmppd(a, b, 6); 213} 214 215static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 216_mm_cmpngt_pd(__m128d a, __m128d b) 217{ 218 return (__m128d)__builtin_ia32_cmppd(b, a, 5); 219} 220 221static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 222_mm_cmpnge_pd(__m128d a, __m128d b) 223{ 224 return (__m128d)__builtin_ia32_cmppd(b, a, 6); 225} 226 227static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 228_mm_cmpeq_sd(__m128d a, __m128d b) 229{ 230 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); 231} 232 233static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 234_mm_cmplt_sd(__m128d a, __m128d b) 235{ 236 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); 237} 238 239static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 240_mm_cmple_sd(__m128d a, __m128d b) 241{ 242 return (__m128d)__builtin_ia32_cmpsd(a, b, 2); 243} 244 245static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 246_mm_cmpgt_sd(__m128d a, __m128d b) 247{ 248 return 
(__m128d)__builtin_ia32_cmpsd(b, a, 1); 249} 250 251static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 252_mm_cmpge_sd(__m128d a, __m128d b) 253{ 254 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); 255} 256 257static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 258_mm_cmpord_sd(__m128d a, __m128d b) 259{ 260 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); 261} 262 263static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 264_mm_cmpunord_sd(__m128d a, __m128d b) 265{ 266 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); 267} 268 269static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 270_mm_cmpneq_sd(__m128d a, __m128d b) 271{ 272 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); 273} 274 275static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 276_mm_cmpnlt_sd(__m128d a, __m128d b) 277{ 278 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); 279} 280 281static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 282_mm_cmpnle_sd(__m128d a, __m128d b) 283{ 284 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); 285} 286 287static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 288_mm_cmpngt_sd(__m128d a, __m128d b) 289{ 290 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); 291} 292 293static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 294_mm_cmpnge_sd(__m128d a, __m128d b) 295{ 296 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); 297} 298 299static __inline__ int __attribute__((__always_inline__, __nodebug__)) 300_mm_comieq_sd(__m128d a, __m128d b) 301{ 302 return __builtin_ia32_comisdeq(a, b); 303} 304 305static __inline__ int __attribute__((__always_inline__, __nodebug__)) 306_mm_comilt_sd(__m128d a, __m128d b) 307{ 308 return __builtin_ia32_comisdlt(a, b); 309} 310 311static __inline__ int __attribute__((__always_inline__, __nodebug__)) 312_mm_comile_sd(__m128d a, __m128d b) 313{ 314 return __builtin_ia32_comisdle(a, b); 315} 
316 317static __inline__ int __attribute__((__always_inline__, __nodebug__)) 318_mm_comigt_sd(__m128d a, __m128d b) 319{ 320 return __builtin_ia32_comisdgt(a, b); 321} 322 323static __inline__ int __attribute__((__always_inline__, __nodebug__)) 324_mm_comige_sd(__m128d a, __m128d b) 325{ 326 return __builtin_ia32_comisdge(a, b); 327} 328 329static __inline__ int __attribute__((__always_inline__, __nodebug__)) 330_mm_comineq_sd(__m128d a, __m128d b) 331{ 332 return __builtin_ia32_comisdneq(a, b); 333} 334 335static __inline__ int __attribute__((__always_inline__, __nodebug__)) 336_mm_ucomieq_sd(__m128d a, __m128d b) 337{ 338 return __builtin_ia32_ucomisdeq(a, b); 339} 340 341static __inline__ int __attribute__((__always_inline__, __nodebug__)) 342_mm_ucomilt_sd(__m128d a, __m128d b) 343{ 344 return __builtin_ia32_ucomisdlt(a, b); 345} 346 347static __inline__ int __attribute__((__always_inline__, __nodebug__)) 348_mm_ucomile_sd(__m128d a, __m128d b) 349{ 350 return __builtin_ia32_ucomisdle(a, b); 351} 352 353static __inline__ int __attribute__((__always_inline__, __nodebug__)) 354_mm_ucomigt_sd(__m128d a, __m128d b) 355{ 356 return __builtin_ia32_ucomisdgt(a, b); 357} 358 359static __inline__ int __attribute__((__always_inline__, __nodebug__)) 360_mm_ucomige_sd(__m128d a, __m128d b) 361{ 362 return __builtin_ia32_ucomisdge(a, b); 363} 364 365static __inline__ int __attribute__((__always_inline__, __nodebug__)) 366_mm_ucomineq_sd(__m128d a, __m128d b) 367{ 368 return __builtin_ia32_ucomisdneq(a, b); 369} 370 371static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 372_mm_cvtpd_ps(__m128d a) 373{ 374 return __builtin_ia32_cvtpd2ps(a); 375} 376 377static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 378_mm_cvtps_pd(__m128 a) 379{ 380 return __builtin_ia32_cvtps2pd(a); 381} 382 383static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 384_mm_cvtepi32_pd(__m128i a) 385{ 386 return 
__builtin_ia32_cvtdq2pd((__v4si)a); 387} 388 389static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 390_mm_cvtpd_epi32(__m128d a) 391{ 392 return __builtin_ia32_cvtpd2dq(a); 393} 394 395static __inline__ int __attribute__((__always_inline__, __nodebug__)) 396_mm_cvtsd_si32(__m128d a) 397{ 398 return __builtin_ia32_cvtsd2si(a); 399} 400 401static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 402_mm_cvtsd_ss(__m128 a, __m128d b) 403{ 404 a[0] = b[0]; 405 return a; 406} 407 408static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 409_mm_cvtsi32_sd(__m128d a, int b) 410{ 411 a[0] = b; 412 return a; 413} 414 415static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 416_mm_cvtss_sd(__m128d a, __m128 b) 417{ 418 a[0] = b[0]; 419 return a; 420} 421 422static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 423_mm_cvttpd_epi32(__m128d a) 424{ 425 return (__m128i)__builtin_ia32_cvttpd2dq(a); 426} 427 428static __inline__ int __attribute__((__always_inline__, __nodebug__)) 429_mm_cvttsd_si32(__m128d a) 430{ 431 return a[0]; 432} 433 434static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 435_mm_cvtpd_pi32(__m128d a) 436{ 437 return (__m64)__builtin_ia32_cvtpd2pi(a); 438} 439 440static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 441_mm_cvttpd_pi32(__m128d a) 442{ 443 return (__m64)__builtin_ia32_cvttpd2pi(a); 444} 445 446static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 447_mm_cvtpi32_pd(__m64 a) 448{ 449 return __builtin_ia32_cvtpi2pd((__v2si)a); 450} 451 452static __inline__ double __attribute__((__always_inline__, __nodebug__)) 453_mm_cvtsd_f64(__m128d a) 454{ 455 return a[0]; 456} 457 458static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 459_mm_load_pd(double const *dp) 460{ 461 return *(__m128d*)dp; 462} 463 464static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 
465_mm_load1_pd(double const *dp) 466{ 467 struct __mm_load1_pd_struct { 468 double u; 469 } __attribute__((__packed__, __may_alias__)); 470 double u = ((struct __mm_load1_pd_struct*)dp)->u; 471 return (__m128d){ u, u }; 472} 473 474#define _mm_load_pd1(dp) _mm_load1_pd(dp) 475 476static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 477_mm_loadr_pd(double const *dp) 478{ 479 __m128d u = *(__m128d*)dp; 480 return __builtin_shufflevector(u, u, 1, 0); 481} 482 483static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 484_mm_loadu_pd(double const *dp) 485{ 486 struct __loadu_pd { 487 __m128d v; 488 } __attribute__((packed, may_alias)); 489 return ((struct __loadu_pd*)dp)->v; 490} 491 492static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 493_mm_load_sd(double const *dp) 494{ 495 struct __mm_load_sd_struct { 496 double u; 497 } __attribute__((__packed__, __may_alias__)); 498 double u = ((struct __mm_load_sd_struct*)dp)->u; 499 return (__m128d){ u, 0 }; 500} 501 502static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 503_mm_loadh_pd(__m128d a, double const *dp) 504{ 505 struct __mm_loadh_pd_struct { 506 double u; 507 } __attribute__((__packed__, __may_alias__)); 508 double u = ((struct __mm_loadh_pd_struct*)dp)->u; 509 return (__m128d){ a[0], u }; 510} 511 512static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 513_mm_loadl_pd(__m128d a, double const *dp) 514{ 515 struct __mm_loadl_pd_struct { 516 double u; 517 } __attribute__((__packed__, __may_alias__)); 518 double u = ((struct __mm_loadl_pd_struct*)dp)->u; 519 return (__m128d){ u, a[1] }; 520} 521 522static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 523_mm_set_sd(double w) 524{ 525 return (__m128d){ w, 0 }; 526} 527 528static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 529_mm_set1_pd(double w) 530{ 531 return (__m128d){ w, w }; 532} 533 534static __inline__ __m128d 
__attribute__((__always_inline__, __nodebug__)) 535_mm_set_pd(double w, double x) 536{ 537 return (__m128d){ x, w }; 538} 539 540static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 541_mm_setr_pd(double w, double x) 542{ 543 return (__m128d){ w, x }; 544} 545 546static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 547_mm_setzero_pd(void) 548{ 549 return (__m128d){ 0, 0 }; 550} 551 552static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 553_mm_move_sd(__m128d a, __m128d b) 554{ 555 return (__m128d){ b[0], a[1] }; 556} 557 558static __inline__ void __attribute__((__always_inline__, __nodebug__)) 559_mm_store_sd(double *dp, __m128d a) 560{ 561 struct __mm_store_sd_struct { 562 double u; 563 } __attribute__((__packed__, __may_alias__)); 564 ((struct __mm_store_sd_struct*)dp)->u = a[0]; 565} 566 567static __inline__ void __attribute__((__always_inline__, __nodebug__)) 568_mm_store1_pd(double *dp, __m128d a) 569{ 570 struct __mm_store1_pd_struct { 571 double u[2]; 572 } __attribute__((__packed__, __may_alias__)); 573 ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0]; 574 ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0]; 575} 576 577static __inline__ void __attribute__((__always_inline__, __nodebug__)) 578_mm_store_pd(double *dp, __m128d a) 579{ 580 *(__m128d *)dp = a; 581} 582 583static __inline__ void __attribute__((__always_inline__, __nodebug__)) 584_mm_storeu_pd(double *dp, __m128d a) 585{ 586 __builtin_ia32_storeupd(dp, a); 587} 588 589static __inline__ void __attribute__((__always_inline__, __nodebug__)) 590_mm_storer_pd(double *dp, __m128d a) 591{ 592 a = __builtin_shufflevector(a, a, 1, 0); 593 *(__m128d *)dp = a; 594} 595 596static __inline__ void __attribute__((__always_inline__, __nodebug__)) 597_mm_storeh_pd(double *dp, __m128d a) 598{ 599 struct __mm_storeh_pd_struct { 600 double u; 601 } __attribute__((__packed__, __may_alias__)); 602 ((struct __mm_storeh_pd_struct*)dp)->u = a[1]; 603} 604 
605static __inline__ void __attribute__((__always_inline__, __nodebug__)) 606_mm_storel_pd(double *dp, __m128d a) 607{ 608 struct __mm_storeh_pd_struct { 609 double u; 610 } __attribute__((__packed__, __may_alias__)); 611 ((struct __mm_storeh_pd_struct*)dp)->u = a[0]; 612} 613 614static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 615_mm_add_epi8(__m128i a, __m128i b) 616{ 617 return (__m128i)((__v16qi)a + (__v16qi)b); 618} 619 620static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 621_mm_add_epi16(__m128i a, __m128i b) 622{ 623 return (__m128i)((__v8hi)a + (__v8hi)b); 624} 625 626static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 627_mm_add_epi32(__m128i a, __m128i b) 628{ 629 return (__m128i)((__v4si)a + (__v4si)b); 630} 631 632static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 633_mm_add_si64(__m64 a, __m64 b) 634{ 635 return a + b; 636} 637 638static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 639_mm_add_epi64(__m128i a, __m128i b) 640{ 641 return a + b; 642} 643 644static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 645_mm_adds_epi8(__m128i a, __m128i b) 646{ 647 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 648} 649 650static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 651_mm_adds_epi16(__m128i a, __m128i b) 652{ 653 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 654} 655 656static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 657_mm_adds_epu8(__m128i a, __m128i b) 658{ 659 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 660} 661 662static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 663_mm_adds_epu16(__m128i a, __m128i b) 664{ 665 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 666} 667 668static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 669_mm_avg_epu8(__m128i a, 
__m128i b) 670{ 671 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 672} 673 674static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 675_mm_avg_epu16(__m128i a, __m128i b) 676{ 677 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 678} 679 680static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 681_mm_madd_epi16(__m128i a, __m128i b) 682{ 683 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 684} 685 686static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 687_mm_max_epi16(__m128i a, __m128i b) 688{ 689 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 690} 691 692static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 693_mm_max_epu8(__m128i a, __m128i b) 694{ 695 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 696} 697 698static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 699_mm_min_epi16(__m128i a, __m128i b) 700{ 701 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 702} 703 704static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 705_mm_min_epu8(__m128i a, __m128i b) 706{ 707 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 708} 709 710static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 711_mm_mulhi_epi16(__m128i a, __m128i b) 712{ 713 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 714} 715 716static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 717_mm_mulhi_epu16(__m128i a, __m128i b) 718{ 719 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 720} 721 722static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 723_mm_mullo_epi16(__m128i a, __m128i b) 724{ 725 return (__m128i)((__v8hi)a * (__v8hi)b); 726} 727 728static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 729_mm_mul_su32(__m64 a, __m64 b) 730{ 731 return 
__builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 732} 733 734static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 735_mm_mul_epu32(__m128i a, __m128i b) 736{ 737 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 738} 739 740static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 741_mm_sad_epu8(__m128i a, __m128i b) 742{ 743 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 744} 745 746static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 747_mm_sub_epi8(__m128i a, __m128i b) 748{ 749 return (__m128i)((__v16qi)a - (__v16qi)b); 750} 751 752static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 753_mm_sub_epi16(__m128i a, __m128i b) 754{ 755 return (__m128i)((__v8hi)a - (__v8hi)b); 756} 757 758static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 759_mm_sub_epi32(__m128i a, __m128i b) 760{ 761 return (__m128i)((__v4si)a - (__v4si)b); 762} 763 764static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 765_mm_sub_si64(__m64 a, __m64 b) 766{ 767 return a - b; 768} 769 770static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 771_mm_sub_epi64(__m128i a, __m128i b) 772{ 773 return a - b; 774} 775 776static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 777_mm_subs_epi8(__m128i a, __m128i b) 778{ 779 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 780} 781 782static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 783_mm_subs_epi16(__m128i a, __m128i b) 784{ 785 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 786} 787 788static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 789_mm_subs_epu8(__m128i a, __m128i b) 790{ 791 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 792} 793 794static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 795_mm_subs_epu16(__m128i a, __m128i b) 796{ 797 return 
(__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 798} 799 800static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 801_mm_and_si128(__m128i a, __m128i b) 802{ 803 return a & b; 804} 805 806static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 807_mm_andnot_si128(__m128i a, __m128i b) 808{ 809 return ~a & b; 810} 811 812static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 813_mm_or_si128(__m128i a, __m128i b) 814{ 815 return a | b; 816} 817 818static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 819_mm_xor_si128(__m128i a, __m128i b) 820{ 821 return a ^ b; 822} 823 824#define _mm_slli_si128(a, count) __extension__ ({ \ 825 __m128i __a = (a); \ 826 (__m128i)__builtin_ia32_pslldqi128(__a, (count)*8); }) 827 828static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 829_mm_slli_epi16(__m128i a, int count) 830{ 831 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 832} 833 834static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 835_mm_sll_epi16(__m128i a, __m128i count) 836{ 837 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 838} 839 840static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 841_mm_slli_epi32(__m128i a, int count) 842{ 843 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 844} 845 846static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 847_mm_sll_epi32(__m128i a, __m128i count) 848{ 849 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 850} 851 852static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 853_mm_slli_epi64(__m128i a, int count) 854{ 855 return __builtin_ia32_psllqi128(a, count); 856} 857 858static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 859_mm_sll_epi64(__m128i a, __m128i count) 860{ 861 return __builtin_ia32_psllq128(a, count); 862} 863 864static __inline__ __m128i 
__attribute__((__always_inline__, __nodebug__)) 865_mm_srai_epi16(__m128i a, int count) 866{ 867 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 868} 869 870static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 871_mm_sra_epi16(__m128i a, __m128i count) 872{ 873 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 874} 875 876static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 877_mm_srai_epi32(__m128i a, int count) 878{ 879 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 880} 881 882static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 883_mm_sra_epi32(__m128i a, __m128i count) 884{ 885 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 886} 887 888 889#define _mm_srli_si128(a, count) __extension__ ({ \ 890 __m128i __a = (a); \ 891 (__m128i)__builtin_ia32_psrldqi128(__a, (count)*8); }) 892 893static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 894_mm_srli_epi16(__m128i a, int count) 895{ 896 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 897} 898 899static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 900_mm_srl_epi16(__m128i a, __m128i count) 901{ 902 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 903} 904 905static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 906_mm_srli_epi32(__m128i a, int count) 907{ 908 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 909} 910 911static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 912_mm_srl_epi32(__m128i a, __m128i count) 913{ 914 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 915} 916 917static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 918_mm_srli_epi64(__m128i a, int count) 919{ 920 return __builtin_ia32_psrlqi128(a, count); 921} 922 923static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 924_mm_srl_epi64(__m128i 
a, __m128i count) 925{ 926 return __builtin_ia32_psrlq128(a, count); 927} 928 929static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 930_mm_cmpeq_epi8(__m128i a, __m128i b) 931{ 932 return (__m128i)((__v16qi)a == (__v16qi)b); 933} 934 935static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 936_mm_cmpeq_epi16(__m128i a, __m128i b) 937{ 938 return (__m128i)((__v8hi)a == (__v8hi)b); 939} 940 941static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 942_mm_cmpeq_epi32(__m128i a, __m128i b) 943{ 944 return (__m128i)((__v4si)a == (__v4si)b); 945} 946 947static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 948_mm_cmpgt_epi8(__m128i a, __m128i b) 949{ 950 typedef signed char __v16qs __attribute__((__vector_size__(16))); 951 return (__m128i)((__v16qs)a > (__v16qs)b); 952} 953 954static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 955_mm_cmpgt_epi16(__m128i a, __m128i b) 956{ 957 return (__m128i)((__v8hi)a > (__v8hi)b); 958} 959 960static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 961_mm_cmpgt_epi32(__m128i a, __m128i b) 962{ 963 return (__m128i)((__v4si)a > (__v4si)b); 964} 965 966static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 967_mm_cmplt_epi8(__m128i a, __m128i b) 968{ 969 return _mm_cmpgt_epi8(b,a); 970} 971 972static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 973_mm_cmplt_epi16(__m128i a, __m128i b) 974{ 975 return _mm_cmpgt_epi16(b,a); 976} 977 978static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 979_mm_cmplt_epi32(__m128i a, __m128i b) 980{ 981 return _mm_cmpgt_epi32(b,a); 982} 983 984#ifdef __x86_64__ 985static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 986_mm_cvtsi64_sd(__m128d a, long long b) 987{ 988 a[0] = b; 989 return a; 990} 991 992static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 993_mm_cvtsd_si64(__m128d a) 
994{ 995 return __builtin_ia32_cvtsd2si64(a); 996} 997 998static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 999_mm_cvttsd_si64(__m128d a) 1000{ 1001 return a[0]; 1002} 1003#endif 1004 1005static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1006_mm_cvtepi32_ps(__m128i a) 1007{ 1008 return __builtin_ia32_cvtdq2ps((__v4si)a); 1009} 1010 1011static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1012_mm_cvtps_epi32(__m128 a) 1013{ 1014 return (__m128i)__builtin_ia32_cvtps2dq(a); 1015} 1016 1017static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1018_mm_cvttps_epi32(__m128 a) 1019{ 1020 return (__m128i)__builtin_ia32_cvttps2dq(a); 1021} 1022 1023static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1024_mm_cvtsi32_si128(int a) 1025{ 1026 return (__m128i)(__v4si){ a, 0, 0, 0 }; 1027} 1028 1029#ifdef __x86_64__ 1030static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1031_mm_cvtsi64_si128(long long a) 1032{ 1033 return (__m128i){ a, 0 }; 1034} 1035#endif 1036 1037static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1038_mm_cvtsi128_si32(__m128i a) 1039{ 1040 __v4si b = (__v4si)a; 1041 return b[0]; 1042} 1043 1044#ifdef __x86_64__ 1045static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 1046_mm_cvtsi128_si64(__m128i a) 1047{ 1048 return a[0]; 1049} 1050#endif 1051 1052static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1053_mm_load_si128(__m128i const *p) 1054{ 1055 return *p; 1056} 1057 1058static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1059_mm_loadu_si128(__m128i const *p) 1060{ 1061 struct __loadu_si128 { 1062 __m128i v; 1063 } __attribute__((packed, may_alias)); 1064 return ((struct __loadu_si128*)p)->v; 1065} 1066 1067static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1068_mm_loadl_epi64(__m128i const *p) 1069{ 1070 struct 
__mm_loadl_epi64_struct { 1071 long long u; 1072 } __attribute__((__packed__, __may_alias__)); 1073 return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0}; 1074} 1075 1076static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1077_mm_set_epi64x(long long q1, long long q0) 1078{ 1079 return (__m128i){ q0, q1 }; 1080} 1081 1082static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1083_mm_set_epi64(__m64 q1, __m64 q0) 1084{ 1085 return (__m128i){ (long long)q0, (long long)q1 }; 1086} 1087 1088static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1089_mm_set_epi32(int i3, int i2, int i1, int i0) 1090{ 1091 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1092} 1093 1094static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1095_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1096{ 1097 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1098} 1099 1100static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1101_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1102{ 1103 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1104} 1105 1106static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1107_mm_set1_epi64x(long long q) 1108{ 1109 return (__m128i){ q, q }; 1110} 1111 1112static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1113_mm_set1_epi64(__m64 q) 1114{ 1115 return (__m128i){ (long long)q, (long long)q }; 1116} 1117 1118static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1119_mm_set1_epi32(int i) 1120{ 1121 return (__m128i)(__v4si){ i, i, i, i }; 1122} 1123 1124static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1125_mm_set1_epi16(short w) 1126{ 1127 return (__m128i)(__v8hi){ 
w, w, w, w, w, w, w, w }; 1128} 1129 1130static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1131_mm_set1_epi8(char b) 1132{ 1133 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 1134} 1135 1136static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1137_mm_setr_epi64(__m64 q0, __m64 q1) 1138{ 1139 return (__m128i){ (long long)q0, (long long)q1 }; 1140} 1141 1142static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1143_mm_setr_epi32(int i0, int i1, int i2, int i3) 1144{ 1145 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1146} 1147 1148static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1149_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1150{ 1151 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1152} 1153 1154static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1155_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1156{ 1157 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1158} 1159 1160static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1161_mm_setzero_si128(void) 1162{ 1163 return (__m128i){ 0LL, 0LL }; 1164} 1165 1166static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1167_mm_store_si128(__m128i *p, __m128i b) 1168{ 1169 *p = b; 1170} 1171 1172static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1173_mm_storeu_si128(__m128i *p, __m128i b) 1174{ 1175 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 1176} 1177 1178static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1179_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 1180{ 1181 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 1182} 1183 1184static __inline__ void 
__attribute__((__always_inline__, __nodebug__)) 1185_mm_storel_epi64(__m128i *p, __m128i a) 1186{ 1187 __builtin_ia32_storelv4si((__v2si *)p, a); 1188} 1189 1190static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1191_mm_stream_pd(double *p, __m128d a) 1192{ 1193 __builtin_ia32_movntpd(p, a); 1194} 1195 1196static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1197_mm_stream_si128(__m128i *p, __m128i a) 1198{ 1199 __builtin_ia32_movntdq(p, a); 1200} 1201 1202static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1203_mm_stream_si32(int *p, int a) 1204{ 1205 __builtin_ia32_movnti(p, a); 1206} 1207 1208static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1209_mm_clflush(void const *p) 1210{ 1211 __builtin_ia32_clflush(p); 1212} 1213 1214static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1215_mm_lfence(void) 1216{ 1217 __builtin_ia32_lfence(); 1218} 1219 1220static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1221_mm_mfence(void) 1222{ 1223 __builtin_ia32_mfence(); 1224} 1225 1226static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1227_mm_packs_epi16(__m128i a, __m128i b) 1228{ 1229 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 1230} 1231 1232static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1233_mm_packs_epi32(__m128i a, __m128i b) 1234{ 1235 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 1236} 1237 1238static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1239_mm_packus_epi16(__m128i a, __m128i b) 1240{ 1241 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 1242} 1243 1244static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1245_mm_extract_epi16(__m128i a, int imm) 1246{ 1247 __v8hi b = (__v8hi)a; 1248 return (unsigned short)b[imm]; 1249} 1250 1251static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 
1252_mm_insert_epi16(__m128i a, int b, int imm) 1253{ 1254 __v8hi c = (__v8hi)a; 1255 c[imm & 7] = b; 1256 return (__m128i)c; 1257} 1258 1259static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1260_mm_movemask_epi8(__m128i a) 1261{ 1262 return __builtin_ia32_pmovmskb128((__v16qi)a); 1263} 1264 1265#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 1266 __m128i __a = (a); \ 1267 (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si) _mm_set1_epi32(0), \ 1268 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1269 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) 1270 1271#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ 1272 __m128i __a = (a); \ 1273 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 1274 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1275 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1276 4, 5, 6, 7); }) 1277 1278#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ 1279 __m128i __a = (a); \ 1280 (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi) _mm_set1_epi16(0), \ 1281 0, 1, 2, 3, \ 1282 4 + (((imm) & 0x03) >> 0), \ 1283 4 + (((imm) & 0x0c) >> 2), \ 1284 4 + (((imm) & 0x30) >> 4), \ 1285 4 + (((imm) & 0xc0) >> 6)); }) 1286 1287static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1288_mm_unpackhi_epi8(__m128i a, __m128i b) 1289{ 1290 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1291} 1292 1293static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1294_mm_unpackhi_epi16(__m128i a, __m128i b) 1295{ 1296 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1297} 1298 1299static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1300_mm_unpackhi_epi32(__m128i a, __m128i b) 1301{ 1302 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1303} 1304 1305static __inline__ __m128i 
__attribute__((__always_inline__, __nodebug__)) 1306_mm_unpackhi_epi64(__m128i a, __m128i b) 1307{ 1308 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1309} 1310 1311static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1312_mm_unpacklo_epi8(__m128i a, __m128i b) 1313{ 1314 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1315} 1316 1317static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1318_mm_unpacklo_epi16(__m128i a, __m128i b) 1319{ 1320 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1321} 1322 1323static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1324_mm_unpacklo_epi32(__m128i a, __m128i b) 1325{ 1326 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1327} 1328 1329static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1330_mm_unpacklo_epi64(__m128i a, __m128i b) 1331{ 1332 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1333} 1334 1335static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 1336_mm_movepi64_pi64(__m128i a) 1337{ 1338 return (__m64)a[0]; 1339} 1340 1341static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1342_mm_movpi64_pi64(__m64 a) 1343{ 1344 return (__m128i){ (long long)a, 0 }; 1345} 1346 1347static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1348_mm_move_epi64(__m128i a) 1349{ 1350 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); 1351} 1352 1353static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1354_mm_unpackhi_pd(__m128d a, __m128d b) 1355{ 1356 return __builtin_shufflevector(a, b, 1, 2+1); 1357} 1358 1359static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1360_mm_unpacklo_pd(__m128d a, __m128d b) 1361{ 1362 return __builtin_shufflevector(a, b, 0, 2+0); 1363} 1364 
1365static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1366_mm_movemask_pd(__m128d a) 1367{ 1368 return __builtin_ia32_movmskpd(a); 1369} 1370 1371#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 1372 __m128d __a = (a); \ 1373 __m128d __b = (b); \ 1374 __builtin_shufflevector(__a, __b, (i) & 1, (((i) & 2) >> 1) + 2); }) 1375 1376static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1377_mm_castpd_ps(__m128d in) 1378{ 1379 return (__m128)in; 1380} 1381 1382static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1383_mm_castpd_si128(__m128d in) 1384{ 1385 return (__m128i)in; 1386} 1387 1388static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1389_mm_castps_pd(__m128 in) 1390{ 1391 return (__m128d)in; 1392} 1393 1394static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1395_mm_castps_si128(__m128 in) 1396{ 1397 return (__m128i)in; 1398} 1399 1400static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1401_mm_castsi128_ps(__m128i in) 1402{ 1403 return (__m128)in; 1404} 1405 1406static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1407_mm_castsi128_pd(__m128i in) 1408{ 1409 return (__m128d)in; 1410} 1411 1412static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1413_mm_pause(void) 1414{ 1415 __asm__ volatile ("pause"); 1416} 1417 1418#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1419 1420#endif /* __SSE2__ */ 1421 1422#endif /* __EMMINTRIN_H */ 1423