emmintrin.h revision 1bddbcbd11de09c7bcb48c3d661c447967db73a7
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#ifndef __SSE2__ 28#error "SSE2 instruction set not enabled" 29#else 30 31#include <xmmintrin.h> 32 33typedef double __m128d __attribute__((__vector_size__(16))); 34typedef long long __m128i __attribute__((__vector_size__(16))); 35 36typedef short __v8hi __attribute__((__vector_size__(16))); 37typedef char __v16qi __attribute__((__vector_size__(16))); 38 39static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 40_mm_add_sd(__m128d a, __m128d b) 41{ 42 a[0] += b[0]; 43 return a; 44} 45 46static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 47_mm_add_pd(__m128d a, __m128d b) 48{ 49 return a + b; 50} 51 52static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 53_mm_sub_sd(__m128d a, __m128d b) 54{ 55 a[0] -= b[0]; 56 return a; 57} 58 59static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 60_mm_sub_pd(__m128d a, __m128d b) 61{ 62 return a - b; 63} 64 65static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 66_mm_mul_sd(__m128d a, __m128d b) 67{ 68 a[0] *= b[0]; 69 return a; 70} 71 72static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 73_mm_mul_pd(__m128d a, __m128d b) 74{ 75 return a * b; 76} 77 78static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 79_mm_div_sd(__m128d a, __m128d b) 80{ 81 a[0] /= b[0]; 82 return a; 83} 84 85static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 86_mm_div_pd(__m128d a, __m128d b) 87{ 88 return a / b; 89} 90 91static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 92_mm_sqrt_sd(__m128d a, __m128d b) 93{ 94 __m128d c = __builtin_ia32_sqrtsd(b); 95 return (__m128d) { c[0], a[1] }; 96} 97 98static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 99_mm_sqrt_pd(__m128d a) 100{ 101 return 
__builtin_ia32_sqrtpd(a); 102} 103 104static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 105_mm_min_sd(__m128d a, __m128d b) 106{ 107 return __builtin_ia32_minsd(a, b); 108} 109 110static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 111_mm_min_pd(__m128d a, __m128d b) 112{ 113 return __builtin_ia32_minpd(a, b); 114} 115 116static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 117_mm_max_sd(__m128d a, __m128d b) 118{ 119 return __builtin_ia32_maxsd(a, b); 120} 121 122static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 123_mm_max_pd(__m128d a, __m128d b) 124{ 125 return __builtin_ia32_maxpd(a, b); 126} 127 128static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 129_mm_and_pd(__m128d a, __m128d b) 130{ 131 return (__m128d)((__v4si)a & (__v4si)b); 132} 133 134static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 135_mm_andnot_pd(__m128d a, __m128d b) 136{ 137 return (__m128d)(~(__v4si)a & (__v4si)b); 138} 139 140static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 141_mm_or_pd(__m128d a, __m128d b) 142{ 143 return (__m128d)((__v4si)a | (__v4si)b); 144} 145 146static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 147_mm_xor_pd(__m128d a, __m128d b) 148{ 149 return (__m128d)((__v4si)a ^ (__v4si)b); 150} 151 152static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 153_mm_cmpeq_pd(__m128d a, __m128d b) 154{ 155 return (__m128d)__builtin_ia32_cmppd(a, b, 0); 156} 157 158static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 159_mm_cmplt_pd(__m128d a, __m128d b) 160{ 161 return (__m128d)__builtin_ia32_cmppd(a, b, 1); 162} 163 164static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 165_mm_cmple_pd(__m128d a, __m128d b) 166{ 167 return (__m128d)__builtin_ia32_cmppd(a, b, 2); 168} 169 170static __inline__ __m128d __attribute__((__always_inline__, 
__nodebug__)) 171_mm_cmpgt_pd(__m128d a, __m128d b) 172{ 173 return (__m128d)__builtin_ia32_cmppd(b, a, 1); 174} 175 176static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 177_mm_cmpge_pd(__m128d a, __m128d b) 178{ 179 return (__m128d)__builtin_ia32_cmppd(b, a, 2); 180} 181 182static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 183_mm_cmpord_pd(__m128d a, __m128d b) 184{ 185 return (__m128d)__builtin_ia32_cmppd(a, b, 7); 186} 187 188static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 189_mm_cmpunord_pd(__m128d a, __m128d b) 190{ 191 return (__m128d)__builtin_ia32_cmppd(a, b, 3); 192} 193 194static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 195_mm_cmpneq_pd(__m128d a, __m128d b) 196{ 197 return (__m128d)__builtin_ia32_cmppd(a, b, 4); 198} 199 200static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 201_mm_cmpnlt_pd(__m128d a, __m128d b) 202{ 203 return (__m128d)__builtin_ia32_cmppd(a, b, 5); 204} 205 206static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 207_mm_cmpnle_pd(__m128d a, __m128d b) 208{ 209 return (__m128d)__builtin_ia32_cmppd(a, b, 6); 210} 211 212static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 213_mm_cmpngt_pd(__m128d a, __m128d b) 214{ 215 return (__m128d)__builtin_ia32_cmppd(b, a, 5); 216} 217 218static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 219_mm_cmpnge_pd(__m128d a, __m128d b) 220{ 221 return (__m128d)__builtin_ia32_cmppd(b, a, 6); 222} 223 224static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 225_mm_cmpeq_sd(__m128d a, __m128d b) 226{ 227 return (__m128d)__builtin_ia32_cmpsd(a, b, 0); 228} 229 230static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 231_mm_cmplt_sd(__m128d a, __m128d b) 232{ 233 return (__m128d)__builtin_ia32_cmpsd(a, b, 1); 234} 235 236static __inline__ __m128d __attribute__((__always_inline__, 
__nodebug__)) 237_mm_cmple_sd(__m128d a, __m128d b) 238{ 239 return (__m128d)__builtin_ia32_cmpsd(a, b, 2); 240} 241 242static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 243_mm_cmpgt_sd(__m128d a, __m128d b) 244{ 245 return (__m128d)__builtin_ia32_cmpsd(b, a, 1); 246} 247 248static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 249_mm_cmpge_sd(__m128d a, __m128d b) 250{ 251 return (__m128d)__builtin_ia32_cmpsd(b, a, 2); 252} 253 254static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 255_mm_cmpord_sd(__m128d a, __m128d b) 256{ 257 return (__m128d)__builtin_ia32_cmpsd(a, b, 7); 258} 259 260static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 261_mm_cmpunord_sd(__m128d a, __m128d b) 262{ 263 return (__m128d)__builtin_ia32_cmpsd(a, b, 3); 264} 265 266static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 267_mm_cmpneq_sd(__m128d a, __m128d b) 268{ 269 return (__m128d)__builtin_ia32_cmpsd(a, b, 4); 270} 271 272static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 273_mm_cmpnlt_sd(__m128d a, __m128d b) 274{ 275 return (__m128d)__builtin_ia32_cmpsd(a, b, 5); 276} 277 278static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 279_mm_cmpnle_sd(__m128d a, __m128d b) 280{ 281 return (__m128d)__builtin_ia32_cmpsd(a, b, 6); 282} 283 284static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 285_mm_cmpngt_sd(__m128d a, __m128d b) 286{ 287 return (__m128d)__builtin_ia32_cmpsd(b, a, 5); 288} 289 290static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 291_mm_cmpnge_sd(__m128d a, __m128d b) 292{ 293 return (__m128d)__builtin_ia32_cmpsd(b, a, 6); 294} 295 296static __inline__ int __attribute__((__always_inline__, __nodebug__)) 297_mm_comieq_sd(__m128d a, __m128d b) 298{ 299 return __builtin_ia32_comisdeq(a, b); 300} 301 302static __inline__ int __attribute__((__always_inline__, __nodebug__)) 
303_mm_comilt_sd(__m128d a, __m128d b) 304{ 305 return __builtin_ia32_comisdlt(a, b); 306} 307 308static __inline__ int __attribute__((__always_inline__, __nodebug__)) 309_mm_comile_sd(__m128d a, __m128d b) 310{ 311 return __builtin_ia32_comisdle(a, b); 312} 313 314static __inline__ int __attribute__((__always_inline__, __nodebug__)) 315_mm_comigt_sd(__m128d a, __m128d b) 316{ 317 return __builtin_ia32_comisdgt(a, b); 318} 319 320static __inline__ int __attribute__((__always_inline__, __nodebug__)) 321_mm_comineq_sd(__m128d a, __m128d b) 322{ 323 return __builtin_ia32_comisdneq(a, b); 324} 325 326static __inline__ int __attribute__((__always_inline__, __nodebug__)) 327_mm_ucomieq_sd(__m128d a, __m128d b) 328{ 329 return __builtin_ia32_ucomisdeq(a, b); 330} 331 332static __inline__ int __attribute__((__always_inline__, __nodebug__)) 333_mm_ucomilt_sd(__m128d a, __m128d b) 334{ 335 return __builtin_ia32_ucomisdlt(a, b); 336} 337 338static __inline__ int __attribute__((__always_inline__, __nodebug__)) 339_mm_ucomile_sd(__m128d a, __m128d b) 340{ 341 return __builtin_ia32_ucomisdle(a, b); 342} 343 344static __inline__ int __attribute__((__always_inline__, __nodebug__)) 345_mm_ucomigt_sd(__m128d a, __m128d b) 346{ 347 return __builtin_ia32_ucomisdgt(a, b); 348} 349 350static __inline__ int __attribute__((__always_inline__, __nodebug__)) 351_mm_ucomineq_sd(__m128d a, __m128d b) 352{ 353 return __builtin_ia32_ucomisdneq(a, b); 354} 355 356static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 357_mm_cvtpd_ps(__m128d a) 358{ 359 return __builtin_ia32_cvtpd2ps(a); 360} 361 362static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 363_mm_cvtps_pd(__m128 a) 364{ 365 return __builtin_ia32_cvtps2pd(a); 366} 367 368static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 369_mm_cvtepi32_pd(__m128i a) 370{ 371 return __builtin_ia32_cvtdq2pd((__v4si)a); 372} 373 374static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) 375_mm_cvtpd_epi32(__m128d a) 376{ 377 return __builtin_ia32_cvtpd2dq(a); 378} 379 380static __inline__ int __attribute__((__always_inline__, __nodebug__)) 381_mm_cvtsd_si32(__m128d a) 382{ 383 return __builtin_ia32_cvtsd2si(a); 384} 385 386static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 387_mm_cvtsd_ss(__m128 a, __m128d b) 388{ 389 a[0] = b[0]; 390 return a; 391} 392 393static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 394_mm_cvtsi32_sd(__m128d a, int b) 395{ 396 a[0] = b; 397 return a; 398} 399 400static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 401_mm_cvtss_sd(__m128d a, __m128 b) 402{ 403 a[0] = b[0]; 404 return a; 405} 406 407static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 408_mm_cvttpd_epi32(__m128d a) 409{ 410 return (__m128i)__builtin_ia32_cvttpd2dq(a); 411} 412 413static __inline__ int __attribute__((__always_inline__, __nodebug__)) 414_mm_cvttsd_si32(__m128d a) 415{ 416 return a[0]; 417} 418 419static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 420_mm_cvtpd_pi32(__m128d a) 421{ 422 return (__m64)__builtin_ia32_cvtpd2pi(a); 423} 424 425static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 426_mm_cvttpd_pi32(__m128d a) 427{ 428 return (__m64)__builtin_ia32_cvttpd2pi(a); 429} 430 431static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 432_mm_cvtpi32_pd(__m64 a) 433{ 434 return __builtin_ia32_cvtpi2pd((__v2si)a); 435} 436 437static __inline__ double __attribute__((__always_inline__, __nodebug__)) 438_mm_cvtsd_f64(__m128d a) 439{ 440 return a[0]; 441} 442 443static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 444_mm_load_pd(double const *dp) 445{ 446 return *(__m128d*)dp; 447} 448 449static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 450_mm_load1_pd(double const *dp) 451{ 452 return (__m128d){ dp[0], dp[0] }; 453} 454 455#define _mm_load_pd1(dp) 
_mm_load1_pd(dp) 456 457static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 458_mm_loadr_pd(double const *dp) 459{ 460 return (__m128d){ dp[1], dp[0] }; 461} 462 463static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 464_mm_loadu_pd(double const *dp) 465{ 466 return __builtin_ia32_loadupd(dp); 467} 468 469static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 470_mm_load_sd(double const *dp) 471{ 472 return (__m128d){ *dp, 0.0 }; 473} 474 475static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 476_mm_loadh_pd(__m128d a, double const *dp) 477{ 478 return __builtin_shufflevector(a, *(__m128d *)dp, 0, 2); 479} 480 481static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 482_mm_loadl_pd(__m128d a, double const *dp) 483{ 484 return __builtin_shufflevector(a, *(__m128d *)dp, 2, 1); 485} 486 487static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 488_mm_set_sd(double w) 489{ 490 return (__m128d){ w, 0 }; 491} 492 493static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 494_mm_set1_pd(double w) 495{ 496 return (__m128d){ w, w }; 497} 498 499static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 500_mm_set_pd(double w, double x) 501{ 502 return (__m128d){ x, w }; 503} 504 505static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 506_mm_setr_pd(double w, double x) 507{ 508 return (__m128d){ w, x }; 509} 510 511static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 512_mm_setzero_pd(void) 513{ 514 return (__m128d){ 0, 0 }; 515} 516 517static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 518_mm_move_sd(__m128d a, __m128d b) 519{ 520 return (__m128d){ b[0], a[1] }; 521} 522 523static __inline__ void __attribute__((__always_inline__, __nodebug__)) 524_mm_store_sd(double *dp, __m128d a) 525{ 526 dp[0] = a[0]; 527} 528 529static __inline__ void 
__attribute__((__always_inline__, __nodebug__)) 530_mm_store1_pd(double *dp, __m128d a) 531{ 532 dp[0] = a[0]; 533 dp[1] = a[0]; 534} 535 536static __inline__ void __attribute__((__always_inline__, __nodebug__)) 537_mm_store_pd(double *dp, __m128d a) 538{ 539 *(__m128d *)dp = a; 540} 541 542static __inline__ void __attribute__((__always_inline__, __nodebug__)) 543_mm_storeu_pd(double *dp, __m128d a) 544{ 545 __builtin_ia32_storeupd(dp, a); 546} 547 548static __inline__ void __attribute__((__always_inline__, __nodebug__)) 549_mm_storer_pd(double *dp, __m128d a) 550{ 551 dp[0] = a[1]; 552 dp[1] = a[0]; 553} 554 555static __inline__ void __attribute__((__always_inline__, __nodebug__)) 556_mm_storeh_pd(double *dp, __m128d a) 557{ 558 dp[0] = a[1]; 559} 560 561static __inline__ void __attribute__((__always_inline__, __nodebug__)) 562_mm_storel_pd(double *dp, __m128d a) 563{ 564 dp[0] = a[0]; 565} 566 567static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 568_mm_add_epi8(__m128i a, __m128i b) 569{ 570 return (__m128i)((__v16qi)a + (__v16qi)b); 571} 572 573static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 574_mm_add_epi16(__m128i a, __m128i b) 575{ 576 return (__m128i)((__v8hi)a + (__v8hi)b); 577} 578 579static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 580_mm_add_epi32(__m128i a, __m128i b) 581{ 582 return (__m128i)((__v4si)a + (__v4si)b); 583} 584 585static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 586_mm_add_si64(__m64 a, __m64 b) 587{ 588 return a + b; 589} 590 591static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 592_mm_add_epi64(__m128i a, __m128i b) 593{ 594 return a + b; 595} 596 597static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 598_mm_adds_epi8(__m128i a, __m128i b) 599{ 600 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b); 601} 602 603static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) 604_mm_adds_epi16(__m128i a, __m128i b) 605{ 606 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b); 607} 608 609static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 610_mm_adds_epu8(__m128i a, __m128i b) 611{ 612 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b); 613} 614 615static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 616_mm_adds_epu16(__m128i a, __m128i b) 617{ 618 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b); 619} 620 621static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 622_mm_avg_epu8(__m128i a, __m128i b) 623{ 624 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b); 625} 626 627static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 628_mm_avg_epu16(__m128i a, __m128i b) 629{ 630 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b); 631} 632 633static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 634_mm_madd_epi16(__m128i a, __m128i b) 635{ 636 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b); 637} 638 639static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 640_mm_max_epi16(__m128i a, __m128i b) 641{ 642 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b); 643} 644 645static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 646_mm_max_epu8(__m128i a, __m128i b) 647{ 648 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b); 649} 650 651static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 652_mm_min_epi16(__m128i a, __m128i b) 653{ 654 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b); 655} 656 657static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 658_mm_min_epu8(__m128i a, __m128i b) 659{ 660 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b); 661} 662 663static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) 664_mm_mulhi_epi16(__m128i a, __m128i b) 665{ 666 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b); 667} 668 669static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 670_mm_mulhi_epu16(__m128i a, __m128i b) 671{ 672 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b); 673} 674 675static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 676_mm_mullo_epi16(__m128i a, __m128i b) 677{ 678 return (__m128i)((__v8hi)a * (__v8hi)b); 679} 680 681static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 682_mm_mul_su32(__m64 a, __m64 b) 683{ 684 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b); 685} 686 687static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 688_mm_mul_epu32(__m128i a, __m128i b) 689{ 690 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b); 691} 692 693static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 694_mm_sad_epu8(__m128i a, __m128i b) 695{ 696 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b); 697} 698 699static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 700_mm_sub_epi8(__m128i a, __m128i b) 701{ 702 return (__m128i)((__v16qi)a - (__v16qi)b); 703} 704 705static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 706_mm_sub_epi16(__m128i a, __m128i b) 707{ 708 return (__m128i)((__v8hi)a - (__v8hi)b); 709} 710 711static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 712_mm_sub_epi32(__m128i a, __m128i b) 713{ 714 return (__m128i)((__v4si)a - (__v4si)b); 715} 716 717static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 718_mm_sub_si64(__m64 a, __m64 b) 719{ 720 return a - b; 721} 722 723static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 724_mm_sub_epi64(__m128i a, __m128i b) 725{ 726 return a - b; 727} 728 729static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 730_mm_subs_epi8(__m128i a, 
__m128i b) 731{ 732 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b); 733} 734 735static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 736_mm_subs_epi16(__m128i a, __m128i b) 737{ 738 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b); 739} 740 741static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 742_mm_subs_epu8(__m128i a, __m128i b) 743{ 744 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b); 745} 746 747static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 748_mm_subs_epu16(__m128i a, __m128i b) 749{ 750 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b); 751} 752 753static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 754_mm_and_si128(__m128i a, __m128i b) 755{ 756 return a & b; 757} 758 759static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 760_mm_andnot_si128(__m128i a, __m128i b) 761{ 762 return ~a & b; 763} 764 765static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 766_mm_or_si128(__m128i a, __m128i b) 767{ 768 return a | b; 769} 770 771static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 772_mm_xor_si128(__m128i a, __m128i b) 773{ 774 return a ^ b; 775} 776 777static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 778_mm_slli_si128(__m128i a, int imm) 779{ 780 return __builtin_ia32_pslldqi128(a, imm * 8); 781} 782 783static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 784_mm_slli_epi16(__m128i a, int count) 785{ 786 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count); 787} 788 789static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 790_mm_sll_epi16(__m128i a, __m128i count) 791{ 792 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count); 793} 794 795static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 796_mm_slli_epi32(__m128i a, int count) 797{ 798 
return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count); 799} 800 801static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 802_mm_sll_epi32(__m128i a, __m128i count) 803{ 804 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count); 805} 806 807static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 808_mm_slli_epi64(__m128i a, int count) 809{ 810 return __builtin_ia32_psllqi128(a, count); 811} 812 813static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 814_mm_sll_epi64(__m128i a, __m128i count) 815{ 816 return __builtin_ia32_psllq128(a, count); 817} 818 819static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 820_mm_srai_epi16(__m128i a, int count) 821{ 822 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count); 823} 824 825static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 826_mm_sra_epi16(__m128i a, __m128i count) 827{ 828 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count); 829} 830 831static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 832_mm_srai_epi32(__m128i a, int count) 833{ 834 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count); 835} 836 837static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 838_mm_sra_epi32(__m128i a, __m128i count) 839{ 840 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count); 841} 842 843static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 844_mm_srli_si128(__m128i a, int imm) 845{ 846 return __builtin_ia32_psrldqi128(a, imm * 8); 847} 848 849static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 850_mm_srli_epi16(__m128i a, int count) 851{ 852 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count); 853} 854 855static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 856_mm_srl_epi16(__m128i a, __m128i count) 857{ 858 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, 
(__v8hi)count); 859} 860 861static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 862_mm_srli_epi32(__m128i a, int count) 863{ 864 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count); 865} 866 867static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 868_mm_srl_epi32(__m128i a, __m128i count) 869{ 870 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count); 871} 872 873static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 874_mm_srli_epi64(__m128i a, int count) 875{ 876 return __builtin_ia32_psrlqi128(a, count); 877} 878 879static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 880_mm_srl_epi64(__m128i a, __m128i count) 881{ 882 return __builtin_ia32_psrlq128(a, count); 883} 884 885static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 886_mm_cmpeq_epi8(__m128i a, __m128i b) 887{ 888 return (__m128i)((__v16qi)a == (__v16qi)b); 889} 890 891static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 892_mm_cmpeq_epi16(__m128i a, __m128i b) 893{ 894 return (__m128i)((__v8hi)a == (__v8hi)b); 895} 896 897static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 898_mm_cmpeq_epi32(__m128i a, __m128i b) 899{ 900 return (__m128i)((__v4si)a == (__v4si)b); 901} 902 903static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 904_mm_cmpgt_epi8(__m128i a, __m128i b) 905{ 906 return (__m128i)((__v16qi)a > (__v16qi)b); 907} 908 909static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 910_mm_cmpgt_epi16(__m128i a, __m128i b) 911{ 912 return (__m128i)((__v8hi)a > (__v8hi)b); 913} 914 915static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 916_mm_cmpgt_epi32(__m128i a, __m128i b) 917{ 918 return (__m128i)((__v4si)a > (__v4si)b); 919} 920 921static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 922_mm_cmplt_epi8(__m128i a, __m128i b) 923{ 924 return 
_mm_cmpgt_epi8(b,a); 925} 926 927static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 928_mm_cmplt_epi16(__m128i a, __m128i b) 929{ 930 return _mm_cmpgt_epi16(b,a); 931} 932 933static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 934_mm_cmplt_epi32(__m128i a, __m128i b) 935{ 936 return _mm_cmpgt_epi32(b,a); 937} 938 939#ifdef __x86_64__ 940static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 941_mm_cvtsi64_sd(__m128d a, long long b) 942{ 943 a[0] = b; 944 return a; 945} 946 947static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 948_mm_cvtsd_si64(__m128d a) 949{ 950 return __builtin_ia32_cvtsd2si64(a); 951} 952 953static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 954_mm_cvttsd_si64(__m128d a) 955{ 956 return a[0]; 957} 958#endif 959 960static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 961_mm_cvtepi32_ps(__m128i a) 962{ 963 return __builtin_ia32_cvtdq2ps((__v4si)a); 964} 965 966static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 967_mm_cvtps_epi32(__m128 a) 968{ 969 return (__m128i)__builtin_ia32_cvtps2dq(a); 970} 971 972static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 973_mm_cvttps_epi32(__m128 a) 974{ 975 return (__m128i)__builtin_ia32_cvttps2dq(a); 976} 977 978static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 979_mm_cvtsi32_si128(int a) 980{ 981 return (__m128i)(__v4si){ a, 0, 0, 0 }; 982} 983 984#ifdef __x86_64__ 985static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 986_mm_cvtsi64_si128(long long a) 987{ 988 return (__m128i){ a, 0 }; 989} 990#endif 991 992static __inline__ int __attribute__((__always_inline__, __nodebug__)) 993_mm_cvtsi128_si32(__m128i a) 994{ 995 __v4si b = (__v4si)a; 996 return b[0]; 997} 998 999#ifdef __x86_64__ 1000static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 
1001_mm_cvtsi128_si64(__m128i a) 1002{ 1003 return a[0]; 1004} 1005#endif 1006 1007static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1008_mm_load_si128(__m128i const *p) 1009{ 1010 return *p; 1011} 1012 1013static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1014_mm_loadu_si128(__m128i const *p) 1015{ 1016 return (__m128i)__builtin_ia32_loaddqu((char const *)p); 1017} 1018 1019static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1020_mm_loadl_epi64(__m128i const *p) 1021{ 1022 return (__m128i) { *(long long*)p, 0}; 1023} 1024 1025static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1026_mm_set_epi64x(long long q1, long long q0) 1027{ 1028 return (__m128i){ q0, q1 }; 1029} 1030 1031static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1032_mm_set_epi64(__m64 q1, __m64 q0) 1033{ 1034 return (__m128i){ (long long)q0, (long long)q1 }; 1035} 1036 1037static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1038_mm_set_epi32(int i3, int i2, int i1, int i0) 1039{ 1040 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1041} 1042 1043static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1044_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0) 1045{ 1046 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1047} 1048 1049static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1050_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0) 1051{ 1052 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1053} 1054 1055static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1056_mm_set1_epi64x(long long q) 1057{ 1058 return (__m128i){ q, q }; 1059} 1060 1061static __inline__ __m128i 
__attribute__((__always_inline__, __nodebug__)) 1062_mm_set1_epi64(__m64 q) 1063{ 1064 return (__m128i){ (long long)q, (long long)q }; 1065} 1066 1067static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1068_mm_set1_epi32(int i) 1069{ 1070 return (__m128i)(__v4si){ i, i, i, i }; 1071} 1072 1073static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1074_mm_set1_epi16(short w) 1075{ 1076 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w }; 1077} 1078 1079static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1080_mm_set1_epi8(char b) 1081{ 1082 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b }; 1083} 1084 1085static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1086_mm_setr_epi64(__m64 q0, __m64 q1) 1087{ 1088 return (__m128i){ (long long)q0, (long long)q1 }; 1089} 1090 1091static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1092_mm_setr_epi32(int i0, int i1, int i2, int i3) 1093{ 1094 return (__m128i)(__v4si){ i0, i1, i2, i3}; 1095} 1096 1097static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1098_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) 1099{ 1100 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; 1101} 1102 1103static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1104_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15) 1105{ 1106 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 }; 1107} 1108 1109static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1110_mm_setzero_si128(void) 1111{ 1112 return (__m128i){ 0LL, 0LL }; 1113} 1114 1115static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1116_mm_store_si128(__m128i *p, __m128i b) 1117{ 1118 *p = b; 
1119} 1120 1121static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1122_mm_storeu_si128(__m128i *p, __m128i b) 1123{ 1124 __builtin_ia32_storedqu((char *)p, (__v16qi)b); 1125} 1126 1127static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1128_mm_maskmoveu_si128(__m128i d, __m128i n, char *p) 1129{ 1130 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p); 1131} 1132 1133static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1134_mm_storel_epi64(__m128i *p, __m128i a) 1135{ 1136 __builtin_ia32_storelv4si((__v2si *)p, a); 1137} 1138 1139static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1140_mm_stream_pd(double *p, __m128d a) 1141{ 1142 __builtin_ia32_movntpd(p, a); 1143} 1144 1145static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1146_mm_stream_si128(__m128i *p, __m128i a) 1147{ 1148 __builtin_ia32_movntdq(p, a); 1149} 1150 1151static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1152_mm_stream_si32(int *p, int a) 1153{ 1154 __builtin_ia32_movnti(p, a); 1155} 1156 1157static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1158_mm_clflush(void const *p) 1159{ 1160 __builtin_ia32_clflush(p); 1161} 1162 1163static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1164_mm_lfence(void) 1165{ 1166 __builtin_ia32_lfence(); 1167} 1168 1169static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1170_mm_mfence(void) 1171{ 1172 __builtin_ia32_mfence(); 1173} 1174 1175static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1176_mm_packs_epi16(__m128i a, __m128i b) 1177{ 1178 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b); 1179} 1180 1181static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1182_mm_packs_epi32(__m128i a, __m128i b) 1183{ 1184 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b); 1185} 1186 1187static __inline__ __m128i 
__attribute__((__always_inline__, __nodebug__)) 1188_mm_packus_epi16(__m128i a, __m128i b) 1189{ 1190 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b); 1191} 1192 1193static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1194_mm_extract_epi16(__m128i a, int imm) 1195{ 1196 __v8hi b = (__v8hi)a; 1197 return b[imm]; 1198} 1199 1200static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1201_mm_insert_epi16(__m128i a, int b, int imm) 1202{ 1203 __v8hi c = (__v8hi)a; 1204 c[imm & 7] = b; 1205 return (__m128i)c; 1206} 1207 1208static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1209_mm_movemask_epi8(__m128i a) 1210{ 1211 return __builtin_ia32_pmovmskb128((__v16qi)a); 1212} 1213 1214#define _mm_shuffle_epi32(a, imm) \ 1215 ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) {0}, \ 1216 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1217 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6)) 1218#define _mm_shufflelo_epi16(a, imm) \ 1219 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, \ 1220 (imm) & 0x3, ((imm) & 0xc) >> 2, \ 1221 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 1222 4, 5, 6, 7)) 1223#define _mm_shufflehi_epi16(a, imm) \ 1224 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) {0}, 0, 1, 2, 3, \ 1225 4 + ((imm) & 0x3), 4 + ((imm) & 0xc) >> 2, \ 1226 4 + ((imm) & 0x30) >> 4, \ 1227 4 + ((imm) & 0xc0) >> 6)) 1228 1229static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1230_mm_unpackhi_epi8(__m128i a, __m128i b) 1231{ 1232 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 1233} 1234 1235static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1236_mm_unpackhi_epi16(__m128i a, __m128i b) 1237{ 1238 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 1239} 1240 1241static __inline__ __m128i __attribute__((__always_inline__, 
__nodebug__)) 1242_mm_unpackhi_epi32(__m128i a, __m128i b) 1243{ 1244 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3); 1245} 1246 1247static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1248_mm_unpackhi_epi64(__m128i a, __m128i b) 1249{ 1250 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1); 1251} 1252 1253static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1254_mm_unpacklo_epi8(__m128i a, __m128i b) 1255{ 1256 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 1257} 1258 1259static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1260_mm_unpacklo_epi16(__m128i a, __m128i b) 1261{ 1262 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 1263} 1264 1265static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1266_mm_unpacklo_epi32(__m128i a, __m128i b) 1267{ 1268 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1); 1269} 1270 1271static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1272_mm_unpacklo_epi64(__m128i a, __m128i b) 1273{ 1274 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0); 1275} 1276 1277static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 1278_mm_movepi64_pi64(__m128i a) 1279{ 1280 return (__m64)a[0]; 1281} 1282 1283static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1284_mm_movpi64_pi64(__m64 a) 1285{ 1286 return (__m128i){ (long long)a, 0 }; 1287} 1288 1289static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1290_mm_move_epi64(__m128i a) 1291{ 1292 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2); 1293} 1294 1295static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1296_mm_unpackhi_pd(__m128d a, __m128d b) 1297{ 1298 return __builtin_shufflevector(a, b, 1, 2+1); 1299} 1300 
1301static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1302_mm_unpacklo_pd(__m128d a, __m128d b) 1303{ 1304 return __builtin_shufflevector(a, b, 0, 2+0); 1305} 1306 1307static __inline__ int __attribute__((__always_inline__, __nodebug__)) 1308_mm_movemask_pd(__m128d a) 1309{ 1310 return __builtin_ia32_movmskpd(a); 1311} 1312 1313#define _mm_shuffle_pd(a, b, i) (__builtin_shufflevector((a), (b), (i) & 1, \ 1314 (((i) & 2) >> 1) + 2)) 1315 1316static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1317_mm_castpd_ps(__m128d in) 1318{ 1319 return (__m128)in; 1320} 1321 1322static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1323_mm_castpd_si128(__m128d in) 1324{ 1325 return (__m128i)in; 1326} 1327 1328static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1329_mm_castps_pd(__m128 in) 1330{ 1331 return (__m128d)in; 1332} 1333 1334static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 1335_mm_castps_si128(__m128 in) 1336{ 1337 return (__m128i)in; 1338} 1339 1340static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 1341_mm_castsi128_ps(__m128i in) 1342{ 1343 return (__m128)in; 1344} 1345 1346static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) 1347_mm_castsi128_pd(__m128i in) 1348{ 1349 return (__m128d)in; 1350} 1351 1352static __inline__ void __attribute__((__always_inline__, __nodebug__)) 1353_mm_pause(void) 1354{ 1355 __asm__ volatile ("pause"); 1356} 1357 1358#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 1359 1360#endif /* __SSE2__ */ 1361 1362#endif /* __EMMINTRIN_H */ 1363