emmintrin.h revision 4fd3e63cb043cbd140a3e8028374bd2e4312b90e
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36typedef int __v4si __attribute__((__vector_size__(16))); 37typedef short __v8hi __attribute__((__vector_size__(16))); 38typedef char __v16qi __attribute__((__vector_size__(16))); 39 40static inline __m128d __attribute__((__always_inline__)) _mm_add_sd(__m128d a, __m128d b) 41{ 42 return __builtin_ia32_addsd(a, b); 43} 44 45static inline __m128d __attribute__((__always_inline__)) _mm_add_pd(__m128d a, __m128d b) 46{ 47 return a + b; 48} 49 50static inline __m128d __attribute__((__always_inline__)) _mm_sub_sd(__m128d a, __m128d b) 51{ 52 return __builtin_ia32_subsd(a, b); 53} 54 55static inline __m128d __attribute__((__always_inline__)) _mm_sub_pd(__m128d a, __m128d b) 56{ 57 return a - b; 58} 59 60static inline __m128d __attribute__((__always_inline__)) _mm_mul_sd(__m128d a, __m128d b) 61{ 62 return __builtin_ia32_mulsd(a, b); 63} 64 65static inline __m128d __attribute__((__always_inline__)) _mm_mul_pd(__m128d a, __m128d b) 66{ 67 return a * b; 68} 69 70static inline __m128d __attribute__((__always_inline__)) _mm_div_sd(__m128d a, __m128d b) 71{ 72 return __builtin_ia32_divsd(a, b); 73} 74 75static inline __m128d __attribute__((__always_inline__)) _mm_div_pd(__m128d a, __m128d b) 76{ 77 return a / b; 78} 79 80static inline __m128d __attribute__((__always_inline__)) _mm_sqrt_sd(__m128d a, __m128d b) 81{ 82 __m128d c = __builtin_ia32_sqrtsd(b); 83 return (__m128d) { c[0], a[1] }; 84} 85 86static inline __m128d __attribute__((__always_inline__)) _mm_sqrt_pd(__m128d a) 87{ 88 return __builtin_ia32_sqrtpd(a); 89} 90 91static inline __m128d __attribute__((__always_inline__)) _mm_min_sd(__m128d a, __m128d b) 92{ 93 return __builtin_ia32_minsd(a, b); 94} 95 96static inline __m128d __attribute__((__always_inline__)) _mm_min_pd(__m128d a, __m128d b) 97{ 98 return __builtin_ia32_minpd(a, b); 99} 100 101static inline __m128d __attribute__((__always_inline__)) _mm_max_sd(__m128d a, __m128d b) 102{ 103 return __builtin_ia32_maxsd(a, b); 104} 105 106static inline __m128d __attribute__((__always_inline__)) _mm_max_pd(__m128d a, __m128d b) 107{ 108 return __builtin_ia32_maxpd(a, b); 109} 110 111static inline __m128d __attribute__((__always_inline__)) _mm_and_pd(__m128d a, __m128d b) 112{ 113 return __builtin_ia32_andpd(a, b); 114} 115 116static inline __m128d __attribute__((__always_inline__)) _mm_andnot_pd(__m128d a, __m128d b) 117{ 118 return __builtin_ia32_andnpd(a, b); 119} 120 121static inline __m128d __attribute__((__always_inline__)) _mm_or_pd(__m128d a, __m128d b) 122{ 123 return __builtin_ia32_orpd(a, b); 124} 125 126static inline __m128d __attribute__((__always_inline__)) _mm_xor_pd(__m128d a, __m128d b) 127{ 128 return __builtin_ia32_xorpd(a, b); 129} 130 131static inline __m128d __attribute__((__always_inline__)) _mm_cmpeq_pd(__m128d a, __m128d b) 132{ 133 return (__m128d)__builtin_ia32_cmpeqpd(a, b); 134} 135 136static inline __m128d __attribute__((__always_inline__)) _mm_cmplt_pd(__m128d a, __m128d b) 137{ 138 return (__m128d)__builtin_ia32_cmpltpd(a, b); 139} 140 141static inline __m128d __attribute__((__always_inline__)) _mm_cmple_pd(__m128d a, __m128d b) 142{ 143 return (__m128d)__builtin_ia32_cmplepd(a, b); 144} 145 146static inline __m128d __attribute__((__always_inline__)) _mm_cmpgt_pd(__m128d a, __m128d b) 147{ 148 return (__m128d)__builtin_ia32_cmpltpd(b, a); 149} 150 151static inline __m128d __attribute__((__always_inline__)) _mm_cmpge_pd(__m128d a, __m128d b) 152{ 153 return (__m128d)__builtin_ia32_cmplepd(b, a); 154} 155 156static inline __m128d __attribute__((__always_inline__)) _mm_cmpord_pd(__m128d a, __m128d b) 157{ 158 return (__m128d)__builtin_ia32_cmpordpd(a, b); 159} 160 161static inline __m128d __attribute__((__always_inline__)) _mm_cmpunord_pd(__m128d a, __m128d b) 162{ 163 return (__m128d)__builtin_ia32_cmpunordpd(a, b); 164} 165 166static inline __m128d __attribute__((__always_inline__)) _mm_cmpneq_pd(__m128d a, __m128d b) 167{ 168 return (__m128d)__builtin_ia32_cmpneqpd(a, b); 169} 170 171static inline __m128d __attribute__((__always_inline__)) _mm_cmpnlt_pd(__m128d a, __m128d b) 172{ 173 return (__m128d)__builtin_ia32_cmpnltpd(a, b); 174} 175 176static inline __m128d __attribute__((__always_inline__)) _mm_cmpnle_pd(__m128d a, __m128d b) 177{ 178 return (__m128d)__builtin_ia32_cmpnlepd(a, b); 179} 180 181static inline __m128d __attribute__((__always_inline__)) _mm_cmpngt_pd(__m128d a, __m128d b) 182{ 183 return (__m128d)__builtin_ia32_cmpnltpd(b, a); 184} 185 186static inline __m128d __attribute__((__always_inline__)) _mm_cmpnge_pd(__m128d a, __m128d b) 187{ 188 return (__m128d)__builtin_ia32_cmpnlepd(b, a); 189} 190 191static inline __m128d __attribute__((__always_inline__)) _mm_cmpeq_sd(__m128d a, __m128d b) 192{ 193 return (__m128d)__builtin_ia32_cmpeqsd(a, b); 194} 195 196static inline __m128d __attribute__((__always_inline__)) _mm_cmplt_sd(__m128d a, __m128d b) 197{ 198 return (__m128d)__builtin_ia32_cmpltsd(a, b); 199} 200 201static inline __m128d __attribute__((__always_inline__)) _mm_cmple_sd(__m128d a, __m128d b) 202{ 203 return (__m128d)__builtin_ia32_cmplesd(a, b); 204} 205 206static inline __m128d __attribute__((__always_inline__)) _mm_cmpgt_sd(__m128d a, __m128d b) 207{ 208 return (__m128d)__builtin_ia32_cmpltsd(b, a); 209} 210 211static inline __m128d __attribute__((__always_inline__)) _mm_cmpge_sd(__m128d a, __m128d b) 212{ 213 return (__m128d)__builtin_ia32_cmplesd(b, a); 214} 215 216static inline __m128d __attribute__((__always_inline__)) _mm_cmpord_sd(__m128d a, __m128d b) 217{ 218 return (__m128d)__builtin_ia32_cmpordsd(a, b); 219} 220 221static inline __m128d __attribute__((__always_inline__)) _mm_cmpunord_sd(__m128d a, __m128d b) 222{ 223 return (__m128d)__builtin_ia32_cmpunordsd(a, b); 224} 225 226static inline __m128d __attribute__((__always_inline__)) _mm_cmpneq_sd(__m128d a, __m128d b) 227{ 228 return (__m128d)__builtin_ia32_cmpneqsd(a, b); 229} 230 231static inline __m128d __attribute__((__always_inline__)) _mm_cmpnlt_sd(__m128d a, __m128d b) 232{ 233 return (__m128d)__builtin_ia32_cmpnltsd(a, b); 234} 235 236static inline __m128d __attribute__((__always_inline__)) _mm_cmpnle_sd(__m128d a, __m128d b) 237{ 238 return (__m128d)__builtin_ia32_cmpnlesd(a, b); 239} 240 241static inline __m128d __attribute__((__always_inline__)) _mm_cmpngt_sd(__m128d a, __m128d b) 242{ 243 return (__m128d)__builtin_ia32_cmpnltsd(b, a); 244} 245 246static inline __m128d __attribute__((__always_inline__)) _mm_cmpnge_sd(__m128d a, __m128d b) 247{ 248 return (__m128d)__builtin_ia32_cmpnlesd(b, a); 249} 250 251static inline int __attribute__((__always_inline__)) _mm_comieq_sd(__m128d a, __m128d b) 252{ 253 return __builtin_ia32_comisdeq(a, b); 254} 255 256static inline int __attribute__((__always_inline__)) _mm_comilt_sd(__m128d a, __m128d b) 257{ 258 return __builtin_ia32_comisdlt(a, b); 259} 260 261static inline int __attribute__((__always_inline__)) _mm_comile_sd(__m128d a, __m128d b) 262{ 263 return __builtin_ia32_comisdle(a, b); 264} 265 266static inline int __attribute__((__always_inline__)) _mm_comigt_sd(__m128d a, __m128d b) 267{ 268 return __builtin_ia32_comisdgt(a, b); 269} 270 271static inline int __attribute__((__always_inline__)) _mm_comineq_sd(__m128d a, __m128d b) 272{ 273 return __builtin_ia32_comisdneq(a, b); 274} 275 276static inline int __attribute__((__always_inline__)) _mm_ucomieq_sd(__m128d a, __m128d b) 277{ 278 return __builtin_ia32_ucomisdeq(a, b); 279} 280 281static inline int __attribute__((__always_inline__)) _mm_ucomilt_sd(__m128d a, __m128d b) 282{ 283 return __builtin_ia32_ucomisdlt(a, b); 284} 285 286static inline int __attribute__((__always_inline__)) _mm_ucomile_sd(__m128d a, __m128d b) 287{ 288 return __builtin_ia32_ucomisdle(a, b); 289} 290 291static inline int __attribute__((__always_inline__)) _mm_ucomigt_sd(__m128d a, __m128d b) 292{ 293 return __builtin_ia32_ucomisdgt(a, b); 294} 295 296static inline int __attribute__((__always_inline__)) _mm_ucomineq_sd(__m128d a, __m128d b) 297{ 298 return __builtin_ia32_ucomisdneq(a, b); 299} 300 301static inline __m128 __attribute__((__always_inline__)) _mm_cvtpd_ps(__m128d a) 302{ 303 return __builtin_ia32_cvtpd2ps(a); 304} 305 306static inline __m128d __attribute__((__always_inline__)) _mm_cvtps_pd(__m128 a) 307{ 308 return __builtin_ia32_cvtps2pd(a); 309} 310 311static inline __m128d __attribute__((__always_inline__)) _mm_cvtepi32_pd(__m128i a) 312{ 313 return __builtin_ia32_cvtdq2pd((__v4si)a); 314} 315 316static inline __m128i __attribute__((__always_inline__)) _mm_cvtpd_epi32(__m128d a) 317{ 318 return __builtin_ia32_cvtpd2dq(a); 319} 320 321static inline int __attribute__((__always_inline__)) _mm_cvtsd_si32(__m128d a) 322{ 323 return __builtin_ia32_cvtsd2si(a); 324} 325 326static inline __m128 __attribute__((__always_inline__)) _mm_cvtsd_ss(__m128 a, __m128d b) 327{ 328 return __builtin_ia32_cvtsd2ss(a, b); 329} 330 331static inline __m128d __attribute__((__always_inline__)) _mm_cvtsi32_sd(__m128d a, int b) 332{ 333 return __builtin_ia32_cvtsi2sd(a, b); 334} 335 336static inline __m128d __attribute__((__always_inline__)) _mm_cvtss_sd(__m128d a, __m128 b) 337{ 338 return __builtin_ia32_cvtss2sd(a, b); 339} 340 341static inline __m128i __attribute__((__always_inline__)) _mm_cvttpd_epi32(__m128d a) 342{ 343 return (__m128i)__builtin_ia32_cvttpd2dq(a); 344} 345 346static inline int __attribute__((__always_inline__)) _mm_cvttsd_si32(__m128d a) 347{ 348 return __builtin_ia32_cvttsd2si(a); 349} 350 351static inline __m64 __attribute__((__always_inline__)) _mm_cvtpd_pi32(__m128d a) 352{ 353 return (__m64)__builtin_ia32_cvtpd2pi(a); 354} 355 356static inline __m64 __attribute__((__always_inline__)) _mm_cvttpd_pi32(__m128d a) 357{ 358 return (__m64)__builtin_ia32_cvttpd2pi(a); 359} 360 361static inline __m128d __attribute__((__always_inline__)) _mm_cvtpi32_pd(__m64 a) 362{ 363 return __builtin_ia32_cvtpi2pd((__v2si)a); 364} 365 366static inline double __attribute__((__always_inline__)) _mm_cvtsd_f64(__m128d a) 367{ 368 return a[0]; 369} 370 371static inline __m128d __attribute__((__always_inline__)) _mm_load_pd(double const *dp) 372{ 373 return *(__m128d*)dp; 374} 375 376static inline __m128d __attribute__((__always_inline__)) _mm_load1_pd(double const *dp) 377{ 378 return (__m128d){ dp[0], dp[0] }; 379} 380 381static inline __m128d __attribute__((__always_inline__)) _mm_loadr_pd(double const *dp) 382{ 383 return (__m128d){ dp[1], dp[0] }; 384} 385 386static inline __m128d __attribute__((__always_inline__)) _mm_loadu_pd(double const *dp) 387{ 388 return __builtin_ia32_loadupd(dp); 389} 390 391static inline __m128d __attribute__((__always_inline__)) _mm_load_sd(double const *dp) 392{ 393 return (__m128d){ *dp, 0.0 }; 394} 395 396static inline __m128d __attribute__((__always_inline__)) _mm_loadh_pd(__m128d a, double const *dp) 397{ 398 return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2); 399} 400 401static inline __m128d __attribute__((__always_inline__)) _mm_loadl_pd(__m128d a, double const *dp) 402{ 403 return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1); 404} 405 406static inline __m128d __attribute__((__always_inline__)) _mm_set_sd(double w) 407{ 408 return (__m128d){ w, 0 }; 409} 410 411static inline __m128d __attribute__((__always_inline__)) _mm_set1_pd(double w) 412{ 413 return (__m128d){ w, w }; 414} 415 416static inline __m128d __attribute__((__always_inline__)) _mm_set_pd(double w, double x) 417{ 418 return (__m128d){ w, x }; 419} 420 421static inline __m128d __attribute__((__always_inline__)) _mm_setr_pd(double w, double x) 422{ 423 return (__m128d){ x, w }; 424} 425 426static inline __m128d __attribute__((__always_inline__)) _mm_setzero_pd(void) 427{ 428 return (__m128d){ 0, 0 }; 429} 430 431static inline __m128d __attribute__((__always_inline__)) _mm_move_sd(__m128d a, __m128d b) 432{ 433 return (__m128d){ b[0], a[1] }; 434} 435 436static inline void __attribute__((__always_inline__)) _mm_store_sd(double *dp, __m128d a) 437{ 438 dp[0] = a[0]; 439} 440 441static inline void __attribute__((__always_inline__)) _mm_store1_pd(double *dp, __m128d a) 442{ 443 dp[0] = a[0]; 444 dp[1] = a[0]; 445} 446 447static inline void __attribute__((__always_inline__)) _mm_store_pd(double *dp, __m128d a) 448{ 449 *(__m128d *)dp = a; 450} 451 452static inline void __attribute__((__always_inline__)) _mm_storeu_pd(double *dp, __m128d a) 453{ 454 __builtin_ia32_storeupd(dp, a); 455} 456 457static inline void __attribute__((__always_inline__)) _mm_storer_pd(double *dp, __m128d a) 458{ 459 dp[0] = a[1]; 460 dp[1] = a[0]; 461} 462 463static inline void __attribute__((__always_inline__)) _mm_storeh_pd(double *dp, __m128d a) 464{ 465 dp[0] = a[1]; 466} 467 468static inline void __attribute__((__always_inline__)) _mm_storel_pd(double *dp, __m128d a) 469{ 470 dp[0] = a[0]; 471} 472 473static inline __m128i __attribute__((__always_inline__)) _mm_add_epi8(__m128i a, __m128i b) 474{ 475 return (__m128i)((__v16qi)a + (__v16qi)b); 476} 477 478static inline __m128i __attribute__((__always_inline__)) _mm_add_epi16(__m128i a, __m128i b) 479{ 480 return (__m128i)((__v8hi)a + (__v8hi)b); 481} 482 483static inline __m128i __attribute__((__always_inline__)) _mm_add_epi32(__m128i a, __m128i b) 484{ 485 return (__m128i)((__v4si)a + (__v4si)b); 486} 487 488static inline __m64 __attribute__((__always_inline__)) _mm_add_si64(__m64 a, __m64 b) 489{ 490 return a + b; 491} 492 493static inline __m128i __attribute__((__always_inline__)) _mm_add_epi64(__m128i a, __m128i b) 494{ 495 return a + b; 496} 497 498static inline __m128i __attribute__((__always_inline__)) _mm_adds_epi8(__m128i a, __m128i b) 499{ 500 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 501} 502 503static inline __m128i __attribute__((__always_inline__)) _mm_adds_epi16(__m128i a, __m128i b) 504{ 505 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 506} 507 508static inline __m128i __attribute__((__always_inline__)) _mm_adds_epu8(__m128i a, __m128i b) 509{ 510 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 511} 512 513static inline __m128i __attribute__((__always_inline__)) _mm_adds_epu16(__m128i a, __m128i b) 514{ 515 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 516} 517 518static inline __m128i __attribute__((__always_inline__)) _mm_avg_epu8(__m128i a, __m128i b) 519{ 520 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 521} 522 523static inline __m128i __attribute__((__always_inline__)) _mm_avg_epu16(__m128i a, __m128i b) 524{ 525 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 526} 527 528static inline __m128i __attribute__((__always_inline__)) _mm_madd_epi16(__m128i a, __m128i b) 529{ 530 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 531} 532 533static inline __m128i __attribute__((__always_inline__)) _mm_max_epi16(__m128i a, __m128i b) 534{ 535 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 536} 537 538static inline __m128i __attribute__((__always_inline__)) _mm_max_epu8(__m128i a, __m128i b) 539{ 540 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 541} 542 543static inline __m128i __attribute__((__always_inline__)) _mm_min_epi16(__m128i a, __m128i b) 544{ 545 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 546} 547 548static inline __m128i __attribute__((__always_inline__)) _mm_min_epu8(__m128i a, __m128i b) 549{ 550 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 551} 552 553static inline __m128i __attribute__((__always_inline__)) _mm_mulhi_epi16(__m128i a, __m128i b) 554{ 555 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 556} 557 558static inline __m128i __attribute__((__always_inline__)) _mm_mulhi_epu16(__m128i a, __m128i b) 559{ 560 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 561} 562 563static inline __m128i __attribute__((__always_inline__)) _mm_mullo_epi16(__m128i a, __m128i b) 564{ 565 return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b); 566} 567 568static inline __m64 __attribute__((__always_inline__)) _mm_mul_su32(__m64 a, __m64 b) 569{ 570 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 571} 572 573static inline __m128i __attribute__((__always_inline__)) _mm_mul_epu32(__m128i a, __m128i b) 574{ 575 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 576} 577 578static inline __m128i __attribute__((__always_inline__)) _mm_sad_epu(__m128i a, __m128i b) 579{ 580 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 581} 582 583static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi8(__m128i a, __m128i b) 584{ 585 return (__m128i)((__v16qi)a - (__v16qi)b); 586} 587 588static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi16(__m128i a, __m128i b) 589{ 590 return (__m128i)((__v8hi)a - (__v8hi)b); 591} 592 593static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi32(__m128i a, __m128i b) 594{ 595 return (__m128i)((__v4si)a - (__v4si)b); 596} 597 598static inline __m64 __attribute__((__always_inline__)) _mm_sub_si64(__m64 a, __m64 b) 599{ 600 return a - b; 601} 602 603static inline __m128i __attribute__((__always_inline__)) _mm_sub_epi64(__m128i a, __m128i b) 604{ 605 return a - b; 606} 607 608static inline __m128i __attribute__((__always_inline__)) _mm_subs_epi8(__m128i a, __m128i b) 609{ 610 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 611} 612 613static inline __m128i __attribute__((__always_inline__)) _mm_subs_epi16(__m128i a, __m128i b) 614{ 615 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 616} 617 618static inline __m128i __attribute__((__always_inline__)) _mm_subs_epu8(__m128i a, __m128i b) 619{ 620 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 621} 622 623static inline __m128i __attribute__((__always_inline__)) _mm_subs_epu16(__m128i a, __m128i b) 624{ 625 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 626} 627 628static inline __m128i __attribute__((__always_inline__)) _mm_and_si128(__m128i a, __m128i b) 629{ 630 return __builtin_ia32_pand128(a, b); 631} 632 633static inline __m128i __attribute__((__always_inline__)) _mm_andnot_si128(__m128i a, __m128i b) 634{ 635 return __builtin_ia32_pandn128(a, b); 636} 637 638static inline __m128i __attribute__((__always_inline__)) _mm_or_si128(__m128i a, __m128i b) 639{ 640 return __builtin_ia32_por128(a, b); 641} 642 643static inline __m128i __attribute__((__always_inline__)) _mm_xor_si128(__m128i a, __m128i b) 644{ 645 return __builtin_ia32_pxor128(a, b); 646} 647 648static inline __m128i __attribute__((__always_inline__)) _mm_slli_si128(__m128i a, int imm) 649{ 650 return __builtin_ia32_pslldqi128(a, imm * 8); 651} 652 653static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi16(__m128i a, int count) 654{ 655 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 656} 657 658static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi16(__m128i a, __m128i count) 659{ 660 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 661} 662 663static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi32(__m128i a, int count) 664{ 665 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 666} 667 668static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi32(__m128i a, __m128i count) 669{ 670 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 671} 672 673static inline __m128i __attribute__((__always_inline__)) _mm_slli_epi64(__m128i a, int count) 674{ 675 return __builtin_ia32_psllqi128(a, count); 676} 677 678static inline __m128i __attribute__((__always_inline__)) _mm_sll_epi64(__m128i a, __m128i count) 679{ 680 return __builtin_ia32_psllq128(a, count); 681} 682 683static inline __m128i __attribute__((__always_inline__)) _mm_srai_epi16(__m128i a, int count) 684{ 685 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 686} 687 688static inline __m128i __attribute__((__always_inline__)) _mm_sra_epi16(__m128i a, __m128i count) 689{ 690 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 691} 692 693static inline __m128i __attribute__((__always_inline__)) _mm_srai_epi32(__m128i a, int count) 694{ 695 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 696} 697 698static inline __m128i __attribute__((__always_inline__)) _mm_sra_epi32(__m128i a, __m128i count) 699{ 700 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 701} 702 703static inline __m128i __attribute__((__always_inline__)) _mm_srli_si128(__m128i a, int imm) 704{ 705 return __builtin_ia32_psrldqi128(a, imm * 8); 706} 707 708static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi16(__m128i a, int count) 709{ 710 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 711} 712 713static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi16(__m128i a, __m128i count) 714{ 715 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count); 716} 717 718static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi32(__m128i a, int count) 719{ 720 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 721} 722 723static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi32(__m128i a, __m128i count) 724{ 725 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 726} 727 728static inline __m128i __attribute__((__always_inline__)) _mm_srli_epi64(__m128i a, int count) 729{ 730 return __builtin_ia32_psrlqi128(a, count); 731} 732 733static inline __m128i __attribute__((__always_inline__)) _mm_srl_epi64(__m128i a, __m128i count) 734{ 735 return __builtin_ia32_psrlq128(a, count); 736} 737 738static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi8(__m128i a, __m128i b) 739{ 740 return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b); 741} 742 743static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi16(__m128i a, __m128i b) 744{ 745 return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b); 746} 747 748static inline __m128i __attribute__((__always_inline__)) _mm_cmpeq_epi32(__m128i a, __m128i b) 749{ 750 return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b); 751} 752 753static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi8(__m128i a, __m128i b) 754{ 755 return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b); 756} 757 758static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi16(__m128i a, __m128i b) 759{ 760 return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b); 761} 762 763static inline __m128i __attribute__((__always_inline__)) _mm_cmpgt_epi32(__m128i a, __m128i b) 764{ 765 return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b); 766} 767 768static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi8(__m128i a, __m128i b) 769{ 770 return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a); 771} 772 773static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi16(__m128i a, __m128i b) 774{ 775 return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a); 776} 777 778static inline __m128i __attribute__((__always_inline__)) _mm_cmplt_epi32(__m128i a, __m128i b) 779{ 780 return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a); 781} 782 783#ifdef __x86_64__ 784static inline __m128d __attribute__((__always_inline__)) _mm_cvtsi64_sd(__m128d a, long long b) 785{ 786 return __builtin_ia32_cvtsi642sd(a, b); 787} 788 789static inline long long __attribute__((__always_inline__)) _mm_cvtsd_si64(__m128d a) 790{ 791 return __builtin_ia32_cvtsd2si64(a); 792} 793 794static inline long long __attribute__((__always_inline__)) _mm_cvttsd_si64(__m128d a) 795{ 796 return __builtin_ia32_cvttsd2si64(a); 797} 798#endif 799 800static inline __m128 __attribute__((__always_inline__)) _mm_cvtepi32_ps(__m128i a) 801{ 802 return __builtin_ia32_cvtdq2ps((__v4si)a); 803} 804 805static inline __m128i __attribute__((__always_inline__)) _mm_cvtps_epi32(__m128 a) 806{ 807 return (__m128i)__builtin_ia32_cvtps2dq(a); 808} 809 810static inline __m128i __attribute__((__always_inline__)) _mm_cvttps_epi32(__m128 a) 811{ 812 return (__m128i)__builtin_ia32_cvttps2dq(a); 813} 814 815static inline __m128i __attribute__((__always_inline__)) _mm_cvtsi32_si128(int a) 816{ 817 return (__m128i)(__v4si){ a, 0, 0, 0 }; 818} 819 820#ifdef __x86_64__ 821static inline __m128i __attribute__((__always_inline__)) _mm_cvtsi64_si128(long long a) 822{ 823 return (__m128i){ a, 0 }; 824} 825#endif 826 827static inline int __attribute__((__always_inline__)) _mm_cvtsi128_si32(__m128i a) 828{ 829 __v4si b = (__v4si)a; 830 return b[0]; 831} 832 833#ifdef __x86_64__ 834static inline long long __attribute__((__always_inline__)) _mm_cvtsi128_si64(__m128i a) 835{ 836 return a[0]; 837} 838#endif 839 840static inline __m128i __attribute__((__always_inline__)) _mm_load_si128(__m128i const *p) 841{ 842 return *p; 843} 844 845static inline __m128i __attribute__((__always_inline__)) _mm_loadu_si128(__m128i const *p) 846{ 847 return (__m128i)__builtin_ia32_loaddqu((char const *)p); 848} 849 850static inline __m128i __attribute__((__always_inline__)) _mm_loadl_epi64(__m128i const *p) 851{ 852 return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p); 853} 854 855static inline __m128i __attribute__((__always_inline__)) _mm_set_epi64(__m64 q1, __m64 q0) 856{ 857 return (__m128i){ (long long)q0, (long long)q1 }; 858} 859 860static inline __m128i __attribute__((__always_inline__)) _mm_set_epi32(int i3, int i2, int i1, int i0) 861{ 862 return (__m128i)(__v4si){ i0, i1, i2, i3}; 863} 864 865static inline __m128i __attribute__((__always_inline__)) _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 866{ 867 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 868} 869 870static inline __m128i __attribute__((__always_inline__)) _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 871{ 872 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 873} 874 875static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi64(__m64 q) 876{ 877 return (__m128i){ (long long)q, (long long)q }; 878} 879 880static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi32(int i) 881{ 882 return (__m128i)(__v4si){ i, i, i, i }; 883} 884 885static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi16(short w) 886{ 887 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; 888} 889 890static inline __m128i __attribute__((__always_inline__)) _mm_set1_epi8(char b) 891{ 892 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 893} 894 895static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi64(__m64 q0, __m64 q1) 896{ 897 return (__m128i){ (long long)q0, (long long)q1 }; 898} 899 900static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi32(int i0, int i1, int i2, int i3) 901{ 902 return (__m128i)(__v4si){ i0, i1, i2, i3}; 903} 904 905static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 906{ 907 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 908} 909 910static inline __m128i __attribute__((__always_inline__)) _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 911{ 912 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 913} 914 915static inline __m128i __attribute__((__always_inline__)) _mm_setzero_si128(void) 916{ 917 return (__m128i){ 0LL, 0LL }; 918} 919 920static inline void __attribute__((__always_inline__)) _mm_store_si128(__m128i *p, __m128i b) 921{ 922 *p = b; 923} 924 925static inline void __attribute__((__always_inline__)) _mm_storeu_si128(__m128i *p, __m128i b) 926{ 927 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 928} 929 930static inline void __attribute__((__always_inline__)) _mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 931{ 932 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 933} 934 935static inline void __attribute__((__always_inline__)) _mm_storel_epi64(__m128i *p, __m128i a) 936{ 937 __builtin_ia32_storelv4si((__v2si *)p, a); 938} 939 940static inline void __attribute__((__always_inline__)) _mm_stream_pd(double *p, __m128d a) 941{ 942 __builtin_ia32_movntpd(p, a); 943} 944 945static inline void __attribute__((__always_inline__)) _mm_stream_si128(__m128i *p, __m128i a) 946{ 947 __builtin_ia32_movntdq(p, a); 948} 949 950static inline void __attribute__((__always_inline__)) _mm_stream_si32(int *p, int a) 951{ 952 __builtin_ia32_movnti(p, a); 953} 954 955static inline void __attribute__((__always_inline__)) _mm_clflush(void const *p) 956{ 957 __builtin_ia32_clflush(p); 958} 959 960static inline void __attribute__((__always_inline__)) _mm_lfence(void) 961{ 962 __builtin_ia32_lfence(); 963} 964 965static inline void __attribute__((__always_inline__)) _mm_mfence(void) 966{ 967 __builtin_ia32_mfence(); 968} 969 970static inline __m128i __attribute__((__always_inline__)) _mm_packs_epi16(__m128i a, __m128i b) 971{ 972 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 973} 974 975static inline __m128i __attribute__((__always_inline__)) _mm_packs_epi32(__m128i a, __m128i b) 976{ 977 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 978} 979 980static inline __m128i __attribute__((__always_inline__)) _mm_packus_epi16(__m128i a, __m128i b) 981{ 982 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 983} 984 985static inline int __attribute__((__always_inline__)) _mm_extract_epi16(__m128i a, int imm) 986{ 987 __v8hi b = (__v8hi)a; 988 return b[imm]; 989} 990 991static inline __m128i __attribute__((__always_inline__)) _mm_insert_epi16(__m128i a, int b, int imm) 992{ 993 return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm); 994} 995 996static inline int __attribute__((__always_inline__)) _mm_movemask_epi8(__m128i a) 997{ 998 return __builtin_ia32_pmovmskb128((__v16qi)a); 999} 1000 1001#define _mm_shuffle_epi32(a, imm) ((__m128i)__builtin_ia32_pshufd((__v4si)(a), (imm))) 1002#define _mm_shufflehi_epi16(a, imm) ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm))) 1003#define _mm_shufflelo_epi16(a, imm) ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm))) 1004 1005static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi8(__m128i a, __m128i b) 1006{ 1007 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1008} 1009 1010static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi16(__m128i a, __m128i b) 1011{ 1012 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1013} 1014 1015static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi32(__m128i a, __m128i b) 1016{ 1017 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1018} 1019 1020static inline __m128i __attribute__((__always_inline__)) _mm_unpackhi_epi64(__m128i a, __m128i b) 1021{ 1022 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1023} 1024 1025static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi8(__m128i a, __m128i b) 1026{ 1027 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1028} 1029 1030static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi16(__m128i a, __m128i b) 1031{ 1032 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1033} 1034 1035static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi32(__m128i a, __m128i b) 1036{ 1037 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1038} 1039 1040static inline __m128i __attribute__((__always_inline__)) _mm_unpacklo_epi64(__m128i a, __m128i b) 1041{ 1042 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1043} 1044 1045static inline __m64 __attribute__((__always_inline__)) _mm_movepi64_pi64(__m128i a) 1046{ 1047 return (__m64)a[0]; 1048} 1049 1050static inline __m128i __attribute__((__always_inline__)) _mm_movpi64_pi64(__m64 a) 1051{ 1052 return (__m128i){ (long long)a, 0 }; 1053} 1054 1055static inline __m128i __attribute__((__always_inline__)) _mm_move_epi64(__m128i a) 1056{ 1057 return (__m128i){ a[0], 0 }; 1058} 1059 1060static inline __m128d __attribute__((__always_inline__)) _mm_unpackhi_pd(__m128d a, __m128d b) 1061{ 1062 return __builtin_shufflevector(a, b, 1, 2+1); 1063} 1064 1065static inline __m128d __attribute__((__always_inline__)) _mm_unpacklo_pd(__m128d a, __m128d b) 1066{ 1067 return __builtin_shufflevector(a, b, 0, 2+0); 1068} 1069 1070static inline int __attribute__((__always_inline__)) _mm_movemask_pd(__m128d a) 1071{ 1072 return __builtin_ia32_movmskpd(a); 1073} 1074 1075#define _mm_shuffle_pd(a, b, i) (__builtin_ia32_shufpd((a), (b), (i))) 1076 1077static inline __m128 __attribute__((__always_inline__)) _mm_castpd_ps(__m128d in) 1078{ 1079 return (__m128)in; 1080} 1081 1082static inline __m128i __attribute__((__always_inline__)) _mm_castpd_si128(__m128d in) 1083{ 1084 return (__m128i)in; 1085} 1086 1087static inline __m128d __attribute__((__always_inline__)) _mm_castps_pd(__m128 in) 1088{ 1089 return (__m128d)in; 1090} 1091 1092static inline __m128i __attribute__((__always_inline__)) _mm_castps_si128(__m128 in) 1093{ 1094 return (__m128i)in; 1095} 1096 1097static inline __m128 __attribute__((__always_inline__)) _mm_castsi128_ps(__m128i in) 1098{ 1099 return (__m128)in; 1100} 1101 1102static inline __m128d __attribute__((__always_inline__)) _mm_castsi128_pd(__m128i in) 1103{ 1104 return (__m128d)in; 1105} 1106 1107static inline void __attribute__((__always_inline__)) _mm_pause(void) 1108{ 1109 asm("pause"); 1110} 1111 1112#define _MM_SHUFFLE(x, y) (((x) << 1) | (y)) 1113 1114#endif /* __SSE2__ */ 1115 1116#endif /* __EMMINTRIN_H */ 1117