1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __XMMINTRIN_H 25#define __XMMINTRIN_H 26 27#include <mmintrin.h> 28 29typedef int __v4si __attribute__((__vector_size__(16))); 30typedef float __v4sf __attribute__((__vector_size__(16))); 31typedef float __m128 __attribute__((__vector_size__(16))); 32 33/* This header should only be included in a hosted environment as it depends on 34 * a standard library to provide allocation routines. */ 35#if __STDC_HOSTED__ 36#include <mm_malloc.h> 37#endif 38 39/* Define the default attributes for the functions in this file. */ 40#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"))) 41 42static __inline__ __m128 __DEFAULT_FN_ATTRS 43_mm_add_ss(__m128 __a, __m128 __b) 44{ 45 __a[0] += __b[0]; 46 return __a; 47} 48 49static __inline__ __m128 __DEFAULT_FN_ATTRS 50_mm_add_ps(__m128 __a, __m128 __b) 51{ 52 return __a + __b; 53} 54 55static __inline__ __m128 __DEFAULT_FN_ATTRS 56_mm_sub_ss(__m128 __a, __m128 __b) 57{ 58 __a[0] -= __b[0]; 59 return __a; 60} 61 62static __inline__ __m128 __DEFAULT_FN_ATTRS 63_mm_sub_ps(__m128 __a, __m128 __b) 64{ 65 return __a - __b; 66} 67 68static __inline__ __m128 __DEFAULT_FN_ATTRS 69_mm_mul_ss(__m128 __a, __m128 __b) 70{ 71 __a[0] *= __b[0]; 72 return __a; 73} 74 75static __inline__ __m128 __DEFAULT_FN_ATTRS 76_mm_mul_ps(__m128 __a, __m128 __b) 77{ 78 return __a * __b; 79} 80 81static __inline__ __m128 __DEFAULT_FN_ATTRS 82_mm_div_ss(__m128 __a, __m128 __b) 83{ 84 __a[0] /= __b[0]; 85 return __a; 86} 87 88static __inline__ __m128 __DEFAULT_FN_ATTRS 89_mm_div_ps(__m128 __a, __m128 __b) 90{ 91 return __a / __b; 92} 93 94static __inline__ __m128 __DEFAULT_FN_ATTRS 95_mm_sqrt_ss(__m128 __a) 96{ 97 __m128 __c = __builtin_ia32_sqrtss(__a); 98 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 99} 100 101static __inline__ __m128 __DEFAULT_FN_ATTRS 102_mm_sqrt_ps(__m128 __a) 103{ 104 return __builtin_ia32_sqrtps(__a); 105} 106 107static __inline__ __m128 __DEFAULT_FN_ATTRS 108_mm_rcp_ss(__m128 __a) 109{ 110 __m128 __c = __builtin_ia32_rcpss(__a); 111 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 112} 113 114static __inline__ __m128 __DEFAULT_FN_ATTRS 115_mm_rcp_ps(__m128 __a) 116{ 117 return __builtin_ia32_rcpps(__a); 118} 119 120static __inline__ __m128 __DEFAULT_FN_ATTRS 121_mm_rsqrt_ss(__m128 __a) 122{ 123 __m128 __c = __builtin_ia32_rsqrtss(__a); 124 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 125} 126 127static __inline__ __m128 __DEFAULT_FN_ATTRS 128_mm_rsqrt_ps(__m128 __a) 129{ 130 return __builtin_ia32_rsqrtps(__a); 131} 132 133static __inline__ __m128 __DEFAULT_FN_ATTRS 134_mm_min_ss(__m128 __a, __m128 __b) 135{ 136 return __builtin_ia32_minss(__a, __b); 137} 138 139static __inline__ __m128 __DEFAULT_FN_ATTRS 140_mm_min_ps(__m128 __a, __m128 __b) 141{ 142 return __builtin_ia32_minps(__a, __b); 143} 144 145static __inline__ __m128 __DEFAULT_FN_ATTRS 146_mm_max_ss(__m128 __a, __m128 __b) 147{ 148 return __builtin_ia32_maxss(__a, __b); 149} 150 151static __inline__ __m128 __DEFAULT_FN_ATTRS 152_mm_max_ps(__m128 __a, __m128 __b) 153{ 154 return __builtin_ia32_maxps(__a, __b); 155} 156 157static __inline__ __m128 __DEFAULT_FN_ATTRS 158_mm_and_ps(__m128 __a, __m128 __b) 159{ 160 return (__m128)((__v4si)__a & (__v4si)__b); 161} 162 163static __inline__ __m128 __DEFAULT_FN_ATTRS 164_mm_andnot_ps(__m128 __a, __m128 __b) 165{ 166 return (__m128)(~(__v4si)__a & (__v4si)__b); 167} 168 169static __inline__ __m128 __DEFAULT_FN_ATTRS 170_mm_or_ps(__m128 __a, __m128 __b) 171{ 172 return (__m128)((__v4si)__a | (__v4si)__b); 173} 174 175static __inline__ __m128 __DEFAULT_FN_ATTRS 176_mm_xor_ps(__m128 __a, __m128 __b) 177{ 178 return (__m128)((__v4si)__a ^ (__v4si)__b); 179} 180 181static __inline__ __m128 __DEFAULT_FN_ATTRS 182_mm_cmpeq_ss(__m128 __a, __m128 __b) 183{ 184 return (__m128)__builtin_ia32_cmpeqss(__a, __b); 185} 186 187static __inline__ __m128 __DEFAULT_FN_ATTRS 188_mm_cmpeq_ps(__m128 __a, __m128 __b) 189{ 190 return (__m128)__builtin_ia32_cmpeqps(__a, __b); 191} 192 193static __inline__ __m128 __DEFAULT_FN_ATTRS 194_mm_cmplt_ss(__m128 __a, __m128 __b) 195{ 196 return (__m128)__builtin_ia32_cmpltss(__a, __b); 197} 198 199static __inline__ __m128 __DEFAULT_FN_ATTRS 200_mm_cmplt_ps(__m128 __a, __m128 __b) 201{ 202 return (__m128)__builtin_ia32_cmpltps(__a, __b); 203} 204 205static __inline__ __m128 __DEFAULT_FN_ATTRS 206_mm_cmple_ss(__m128 __a, __m128 __b) 207{ 208 return (__m128)__builtin_ia32_cmpless(__a, __b); 209} 210 211static __inline__ __m128 __DEFAULT_FN_ATTRS 212_mm_cmple_ps(__m128 __a, __m128 __b) 213{ 214 return (__m128)__builtin_ia32_cmpleps(__a, __b); 215} 216 217static __inline__ __m128 __DEFAULT_FN_ATTRS 218_mm_cmpgt_ss(__m128 __a, __m128 __b) 219{ 220 return (__m128)__builtin_shufflevector(__a, 221 __builtin_ia32_cmpltss(__b, __a), 222 4, 1, 2, 3); 223} 224 225static __inline__ __m128 __DEFAULT_FN_ATTRS 226_mm_cmpgt_ps(__m128 __a, __m128 __b) 227{ 228 return (__m128)__builtin_ia32_cmpltps(__b, __a); 229} 230 231static __inline__ __m128 __DEFAULT_FN_ATTRS 232_mm_cmpge_ss(__m128 __a, __m128 __b) 233{ 234 return (__m128)__builtin_shufflevector(__a, 235 __builtin_ia32_cmpless(__b, __a), 236 4, 1, 2, 3); 237} 238 239static __inline__ __m128 __DEFAULT_FN_ATTRS 240_mm_cmpge_ps(__m128 __a, __m128 __b) 241{ 242 return (__m128)__builtin_ia32_cmpleps(__b, __a); 243} 244 245static __inline__ __m128 __DEFAULT_FN_ATTRS 246_mm_cmpneq_ss(__m128 __a, __m128 __b) 247{ 248 return (__m128)__builtin_ia32_cmpneqss(__a, __b); 249} 250 251static __inline__ __m128 __DEFAULT_FN_ATTRS 252_mm_cmpneq_ps(__m128 __a, __m128 __b) 253{ 254 return (__m128)__builtin_ia32_cmpneqps(__a, __b); 255} 256 257static __inline__ __m128 __DEFAULT_FN_ATTRS 258_mm_cmpnlt_ss(__m128 __a, __m128 __b) 259{ 260 return (__m128)__builtin_ia32_cmpnltss(__a, __b); 261} 262 263static __inline__ __m128 __DEFAULT_FN_ATTRS 264_mm_cmpnlt_ps(__m128 __a, __m128 __b) 265{ 266 return (__m128)__builtin_ia32_cmpnltps(__a, __b); 267} 268 269static __inline__ __m128 __DEFAULT_FN_ATTRS 270_mm_cmpnle_ss(__m128 __a, __m128 __b) 271{ 272 return (__m128)__builtin_ia32_cmpnless(__a, __b); 273} 274 275static __inline__ __m128 __DEFAULT_FN_ATTRS 276_mm_cmpnle_ps(__m128 __a, __m128 __b) 277{ 278 return (__m128)__builtin_ia32_cmpnleps(__a, __b); 279} 280 281static __inline__ __m128 __DEFAULT_FN_ATTRS 282_mm_cmpngt_ss(__m128 __a, __m128 __b) 283{ 284 return (__m128)__builtin_shufflevector(__a, 285 __builtin_ia32_cmpnltss(__b, __a), 286 4, 1, 2, 3); 287} 288 289static __inline__ __m128 __DEFAULT_FN_ATTRS 290_mm_cmpngt_ps(__m128 __a, __m128 __b) 291{ 292 return (__m128)__builtin_ia32_cmpnltps(__b, __a); 293} 294 295static __inline__ __m128 __DEFAULT_FN_ATTRS 296_mm_cmpnge_ss(__m128 __a, __m128 __b) 297{ 298 return (__m128)__builtin_shufflevector(__a, 299 __builtin_ia32_cmpnless(__b, __a), 300 4, 1, 2, 3); 301} 302 303static __inline__ __m128 __DEFAULT_FN_ATTRS 304_mm_cmpnge_ps(__m128 __a, __m128 __b) 305{ 306 return (__m128)__builtin_ia32_cmpnleps(__b, __a); 307} 308 309static __inline__ __m128 __DEFAULT_FN_ATTRS 310_mm_cmpord_ss(__m128 __a, __m128 __b) 311{ 312 return (__m128)__builtin_ia32_cmpordss(__a, __b); 313} 314 315static __inline__ __m128 __DEFAULT_FN_ATTRS 316_mm_cmpord_ps(__m128 __a, __m128 __b) 317{ 318 return (__m128)__builtin_ia32_cmpordps(__a, __b); 319} 320 321static __inline__ __m128 __DEFAULT_FN_ATTRS 322_mm_cmpunord_ss(__m128 __a, __m128 __b) 323{ 324 return (__m128)__builtin_ia32_cmpunordss(__a, __b); 325} 326 327static __inline__ __m128 __DEFAULT_FN_ATTRS 328_mm_cmpunord_ps(__m128 __a, __m128 __b) 329{ 330 return (__m128)__builtin_ia32_cmpunordps(__a, __b); 331} 332 333static __inline__ int __DEFAULT_FN_ATTRS 334_mm_comieq_ss(__m128 __a, __m128 __b) 335{ 336 return __builtin_ia32_comieq(__a, __b); 337} 338 339static __inline__ int __DEFAULT_FN_ATTRS 340_mm_comilt_ss(__m128 __a, __m128 __b) 341{ 342 return __builtin_ia32_comilt(__a, __b); 343} 344 345static __inline__ int __DEFAULT_FN_ATTRS 346_mm_comile_ss(__m128 __a, __m128 __b) 347{ 348 return __builtin_ia32_comile(__a, __b); 349} 350 351static __inline__ int __DEFAULT_FN_ATTRS 352_mm_comigt_ss(__m128 __a, __m128 __b) 353{ 354 return __builtin_ia32_comigt(__a, __b); 355} 356 357static __inline__ int __DEFAULT_FN_ATTRS 358_mm_comige_ss(__m128 __a, __m128 __b) 359{ 360 return __builtin_ia32_comige(__a, __b); 361} 362 363static __inline__ int __DEFAULT_FN_ATTRS 364_mm_comineq_ss(__m128 __a, __m128 __b) 365{ 366 return __builtin_ia32_comineq(__a, __b); 367} 368 369static __inline__ int __DEFAULT_FN_ATTRS 370_mm_ucomieq_ss(__m128 __a, __m128 __b) 371{ 372 return __builtin_ia32_ucomieq(__a, __b); 373} 374 375static __inline__ int __DEFAULT_FN_ATTRS 376_mm_ucomilt_ss(__m128 __a, __m128 __b) 377{ 378 return __builtin_ia32_ucomilt(__a, __b); 379} 380 381static __inline__ int __DEFAULT_FN_ATTRS 382_mm_ucomile_ss(__m128 __a, __m128 __b) 383{ 384 return __builtin_ia32_ucomile(__a, __b); 385} 386 387static __inline__ int __DEFAULT_FN_ATTRS 388_mm_ucomigt_ss(__m128 __a, __m128 __b) 389{ 390 return __builtin_ia32_ucomigt(__a, __b); 391} 392 393static __inline__ int __DEFAULT_FN_ATTRS 394_mm_ucomige_ss(__m128 __a, __m128 __b) 395{ 396 return __builtin_ia32_ucomige(__a, __b); 397} 398 399static __inline__ int __DEFAULT_FN_ATTRS 400_mm_ucomineq_ss(__m128 __a, __m128 __b) 401{ 402 return __builtin_ia32_ucomineq(__a, __b); 403} 404 405static __inline__ int __DEFAULT_FN_ATTRS 406_mm_cvtss_si32(__m128 __a) 407{ 408 return __builtin_ia32_cvtss2si(__a); 409} 410 411static __inline__ int __DEFAULT_FN_ATTRS 412_mm_cvt_ss2si(__m128 __a) 413{ 414 return _mm_cvtss_si32(__a); 415} 416 417#ifdef __x86_64__ 418 419static __inline__ long long __DEFAULT_FN_ATTRS 420_mm_cvtss_si64(__m128 __a) 421{ 422 return __builtin_ia32_cvtss2si64(__a); 423} 424 425#endif 426 427static __inline__ __m64 __DEFAULT_FN_ATTRS 428_mm_cvtps_pi32(__m128 __a) 429{ 430 return (__m64)__builtin_ia32_cvtps2pi(__a); 431} 432 433static __inline__ __m64 __DEFAULT_FN_ATTRS 434_mm_cvt_ps2pi(__m128 __a) 435{ 436 return _mm_cvtps_pi32(__a); 437} 438 439static __inline__ int __DEFAULT_FN_ATTRS 440_mm_cvttss_si32(__m128 __a) 441{ 442 return __a[0]; 443} 444 445static __inline__ int __DEFAULT_FN_ATTRS 446_mm_cvtt_ss2si(__m128 __a) 447{ 448 return _mm_cvttss_si32(__a); 449} 450 451static __inline__ long long __DEFAULT_FN_ATTRS 452_mm_cvttss_si64(__m128 __a) 453{ 454 return __a[0]; 455} 456 457static __inline__ __m64 __DEFAULT_FN_ATTRS 458_mm_cvttps_pi32(__m128 __a) 459{ 460 return (__m64)__builtin_ia32_cvttps2pi(__a); 461} 462 463static __inline__ __m64 __DEFAULT_FN_ATTRS 464_mm_cvtt_ps2pi(__m128 __a) 465{ 466 return _mm_cvttps_pi32(__a); 467} 468 469static __inline__ __m128 __DEFAULT_FN_ATTRS 470_mm_cvtsi32_ss(__m128 __a, int __b) 471{ 472 __a[0] = __b; 473 return __a; 474} 475 476static __inline__ __m128 __DEFAULT_FN_ATTRS 477_mm_cvt_si2ss(__m128 __a, int __b) 478{ 479 return _mm_cvtsi32_ss(__a, __b); 480} 481 482#ifdef __x86_64__ 483 484static __inline__ __m128 __DEFAULT_FN_ATTRS 485_mm_cvtsi64_ss(__m128 __a, long long __b) 486{ 487 __a[0] = __b; 488 return __a; 489} 490 491#endif 492 493static __inline__ __m128 __DEFAULT_FN_ATTRS 494_mm_cvtpi32_ps(__m128 __a, __m64 __b) 495{ 496 return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b); 497} 498 499static __inline__ __m128 __DEFAULT_FN_ATTRS 500_mm_cvt_pi2ps(__m128 __a, __m64 __b) 501{ 502 return _mm_cvtpi32_ps(__a, __b); 503} 504 505static __inline__ float __DEFAULT_FN_ATTRS 506_mm_cvtss_f32(__m128 __a) 507{ 508 return __a[0]; 509} 510 511static __inline__ __m128 __DEFAULT_FN_ATTRS 512_mm_loadh_pi(__m128 __a, const __m64 *__p) 513{ 514 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 515 struct __mm_loadh_pi_struct { 516 __mm_loadh_pi_v2f32 __u; 517 } __attribute__((__packed__, __may_alias__)); 518 __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 519 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 520 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 521} 522 523static __inline__ __m128 __DEFAULT_FN_ATTRS 524_mm_loadl_pi(__m128 __a, const __m64 *__p) 525{ 526 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 527 struct __mm_loadl_pi_struct { 528 __mm_loadl_pi_v2f32 __u; 529 } __attribute__((__packed__, __may_alias__)); 530 __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 531 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 532 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 533} 534 535static __inline__ __m128 __DEFAULT_FN_ATTRS 536_mm_load_ss(const float *__p) 537{ 538 struct __mm_load_ss_struct { 539 float __u; 540 } __attribute__((__packed__, __may_alias__)); 541 float __u = ((struct __mm_load_ss_struct*)__p)->__u; 542 return (__m128){ __u, 0, 0, 0 }; 543} 544 545static __inline__ __m128 __DEFAULT_FN_ATTRS 546_mm_load1_ps(const float *__p) 547{ 548 struct __mm_load1_ps_struct { 549 float __u; 550 } __attribute__((__packed__, __may_alias__)); 551 float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 552 return (__m128){ __u, __u, __u, __u }; 553} 554 555#define _mm_load_ps1(p) _mm_load1_ps(p) 556 557static __inline__ __m128 __DEFAULT_FN_ATTRS 558_mm_load_ps(const float *__p) 559{ 560 return *(__m128*)__p; 561} 562 563static __inline__ __m128 __DEFAULT_FN_ATTRS 564_mm_loadu_ps(const float *__p) 565{ 566 struct __loadu_ps { 567 __m128 __v; 568 } __attribute__((__packed__, __may_alias__)); 569 return ((struct __loadu_ps*)__p)->__v; 570} 571 572static __inline__ __m128 __DEFAULT_FN_ATTRS 573_mm_loadr_ps(const float *__p) 574{ 575 __m128 __a = _mm_load_ps(__p); 576 return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 577} 578 579static __inline__ __m128 __DEFAULT_FN_ATTRS 580_mm_undefined_ps() 581{ 582 return (__m128)__builtin_ia32_undef128(); 583} 584 585static __inline__ __m128 __DEFAULT_FN_ATTRS 586_mm_set_ss(float __w) 587{ 588 return (__m128){ __w, 0, 0, 0 }; 589} 590 591static __inline__ __m128 __DEFAULT_FN_ATTRS 592_mm_set1_ps(float __w) 593{ 594 return (__m128){ __w, __w, __w, __w }; 595} 596 597/* Microsoft specific. */ 598static __inline__ __m128 __DEFAULT_FN_ATTRS 599_mm_set_ps1(float __w) 600{ 601 return _mm_set1_ps(__w); 602} 603 604static __inline__ __m128 __DEFAULT_FN_ATTRS 605_mm_set_ps(float __z, float __y, float __x, float __w) 606{ 607 return (__m128){ __w, __x, __y, __z }; 608} 609 610static __inline__ __m128 __DEFAULT_FN_ATTRS 611_mm_setr_ps(float __z, float __y, float __x, float __w) 612{ 613 return (__m128){ __z, __y, __x, __w }; 614} 615 616static __inline__ __m128 __DEFAULT_FN_ATTRS 617_mm_setzero_ps(void) 618{ 619 return (__m128){ 0, 0, 0, 0 }; 620} 621 622static __inline__ void __DEFAULT_FN_ATTRS 623_mm_storeh_pi(__m64 *__p, __m128 __a) 624{ 625 __builtin_ia32_storehps((__v2si *)__p, __a); 626} 627 628static __inline__ void __DEFAULT_FN_ATTRS 629_mm_storel_pi(__m64 *__p, __m128 __a) 630{ 631 __builtin_ia32_storelps((__v2si *)__p, __a); 632} 633 634static __inline__ void __DEFAULT_FN_ATTRS 635_mm_store_ss(float *__p, __m128 __a) 636{ 637 struct __mm_store_ss_struct { 638 float __u; 639 } __attribute__((__packed__, __may_alias__)); 640 ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 641} 642 643static __inline__ void __DEFAULT_FN_ATTRS 644_mm_storeu_ps(float *__p, __m128 __a) 645{ 646 __builtin_ia32_storeups(__p, __a); 647} 648 649static __inline__ void __DEFAULT_FN_ATTRS 650_mm_store1_ps(float *__p, __m128 __a) 651{ 652 __a = __builtin_shufflevector(__a, __a, 0, 0, 0, 0); 653 _mm_storeu_ps(__p, __a); 654} 655 656static __inline__ void __DEFAULT_FN_ATTRS 657_mm_store_ps1(float *__p, __m128 __a) 658{ 659 return _mm_store1_ps(__p, __a); 660} 661 662static __inline__ void __DEFAULT_FN_ATTRS 663_mm_store_ps(float *__p, __m128 __a) 664{ 665 *(__m128 *)__p = __a; 666} 667 668static __inline__ void __DEFAULT_FN_ATTRS 669_mm_storer_ps(float *__p, __m128 __a) 670{ 671 __a = __builtin_shufflevector(__a, __a, 3, 2, 1, 0); 672 _mm_store_ps(__p, __a); 673} 674 675#define _MM_HINT_T0 3 676#define _MM_HINT_T1 2 677#define _MM_HINT_T2 1 678#define _MM_HINT_NTA 0 679 680#ifndef _MSC_VER 681/* FIXME: We have to #define this because "sel" must be a constant integer, and 682 Sema doesn't do any form of constant propagation yet. */ 683 684#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) 685#endif 686 687static __inline__ void __DEFAULT_FN_ATTRS 688_mm_stream_pi(__m64 *__p, __m64 __a) 689{ 690 __builtin_ia32_movntq(__p, __a); 691} 692 693static __inline__ void __DEFAULT_FN_ATTRS 694_mm_stream_ps(float *__p, __m128 __a) 695{ 696 __builtin_ia32_movntps(__p, __a); 697} 698 699static __inline__ void __DEFAULT_FN_ATTRS 700_mm_sfence(void) 701{ 702 __builtin_ia32_sfence(); 703} 704 705static __inline__ int __DEFAULT_FN_ATTRS 706_mm_extract_pi16(__m64 __a, int __n) 707{ 708 __v4hi __b = (__v4hi)__a; 709 return (unsigned short)__b[__n & 3]; 710} 711 712static __inline__ __m64 __DEFAULT_FN_ATTRS 713_mm_insert_pi16(__m64 __a, int __d, int __n) 714{ 715 __v4hi __b = (__v4hi)__a; 716 __b[__n & 3] = __d; 717 return (__m64)__b; 718} 719 720static __inline__ __m64 __DEFAULT_FN_ATTRS 721_mm_max_pi16(__m64 __a, __m64 __b) 722{ 723 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 724} 725 726static __inline__ __m64 __DEFAULT_FN_ATTRS 727_mm_max_pu8(__m64 __a, __m64 __b) 728{ 729 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); 730} 731 732static __inline__ __m64 __DEFAULT_FN_ATTRS 733_mm_min_pi16(__m64 __a, __m64 __b) 734{ 735 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 736} 737 738static __inline__ __m64 __DEFAULT_FN_ATTRS 739_mm_min_pu8(__m64 __a, __m64 __b) 740{ 741 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 742} 743 744static __inline__ int __DEFAULT_FN_ATTRS 745_mm_movemask_pi8(__m64 __a) 746{ 747 return __builtin_ia32_pmovmskb((__v8qi)__a); 748} 749 750static __inline__ __m64 __DEFAULT_FN_ATTRS 751_mm_mulhi_pu16(__m64 __a, __m64 __b) 752{ 753 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 754} 755 756#define _mm_shuffle_pi16(a, n) __extension__ ({ \ 757 (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) 758 759static __inline__ void __DEFAULT_FN_ATTRS 760_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 761{ 762 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 763} 764 765static __inline__ __m64 __DEFAULT_FN_ATTRS 766_mm_avg_pu8(__m64 __a, __m64 __b) 767{ 768 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 769} 770 771static __inline__ __m64 __DEFAULT_FN_ATTRS 772_mm_avg_pu16(__m64 __a, __m64 __b) 773{ 774 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 775} 776 777static __inline__ __m64 __DEFAULT_FN_ATTRS 778_mm_sad_pu8(__m64 __a, __m64 __b) 779{ 780 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 781} 782 783static __inline__ unsigned int __DEFAULT_FN_ATTRS 784_mm_getcsr(void) 785{ 786 return __builtin_ia32_stmxcsr(); 787} 788 789static __inline__ void __DEFAULT_FN_ATTRS 790_mm_setcsr(unsigned int __i) 791{ 792 __builtin_ia32_ldmxcsr(__i); 793} 794 795#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ 796 (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ 797 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 798 (((mask) & 0x30) >> 4) + 4, \ 799 (((mask) & 0xc0) >> 6) + 4); }) 800 801static __inline__ __m128 __DEFAULT_FN_ATTRS 802_mm_unpackhi_ps(__m128 __a, __m128 __b) 803{ 804 return __builtin_shufflevector(__a, __b, 2, 6, 3, 7); 805} 806 807static __inline__ __m128 __DEFAULT_FN_ATTRS 808_mm_unpacklo_ps(__m128 __a, __m128 __b) 809{ 810 return __builtin_shufflevector(__a, __b, 0, 4, 1, 5); 811} 812 813static __inline__ __m128 __DEFAULT_FN_ATTRS 814_mm_move_ss(__m128 __a, __m128 __b) 815{ 816 return __builtin_shufflevector(__a, __b, 4, 1, 2, 3); 817} 818 819static __inline__ __m128 __DEFAULT_FN_ATTRS 820_mm_movehl_ps(__m128 __a, __m128 __b) 821{ 822 return __builtin_shufflevector(__a, __b, 6, 7, 2, 3); 823} 824 825static __inline__ __m128 __DEFAULT_FN_ATTRS 826_mm_movelh_ps(__m128 __a, __m128 __b) 827{ 828 return __builtin_shufflevector(__a, __b, 0, 1, 4, 5); 829} 830 831static __inline__ __m128 __DEFAULT_FN_ATTRS 832_mm_cvtpi16_ps(__m64 __a) 833{ 834 __m64 __b, __c; 835 __m128 __r; 836 837 __b = _mm_setzero_si64(); 838 __b = _mm_cmpgt_pi16(__b, __a); 839 __c = _mm_unpackhi_pi16(__a, __b); 840 __r = _mm_setzero_ps(); 841 __r = _mm_cvtpi32_ps(__r, __c); 842 __r = _mm_movelh_ps(__r, __r); 843 __c = _mm_unpacklo_pi16(__a, __b); 844 __r = _mm_cvtpi32_ps(__r, __c); 845 846 return __r; 847} 848 849static __inline__ __m128 __DEFAULT_FN_ATTRS 850_mm_cvtpu16_ps(__m64 __a) 851{ 852 __m64 __b, __c; 853 __m128 __r; 854 855 __b = _mm_setzero_si64(); 856 __c = _mm_unpackhi_pi16(__a, __b); 857 __r = _mm_setzero_ps(); 858 __r = _mm_cvtpi32_ps(__r, __c); 859 __r = _mm_movelh_ps(__r, __r); 860 __c = _mm_unpacklo_pi16(__a, __b); 861 __r = _mm_cvtpi32_ps(__r, __c); 862 863 return __r; 864} 865 866static __inline__ __m128 __DEFAULT_FN_ATTRS 867_mm_cvtpi8_ps(__m64 __a) 868{ 869 __m64 __b; 870 871 __b = _mm_setzero_si64(); 872 __b = _mm_cmpgt_pi8(__b, __a); 873 __b = _mm_unpacklo_pi8(__a, __b); 874 875 return _mm_cvtpi16_ps(__b); 876} 877 878static __inline__ __m128 __DEFAULT_FN_ATTRS 879_mm_cvtpu8_ps(__m64 __a) 880{ 881 __m64 __b; 882 883 __b = _mm_setzero_si64(); 884 __b = _mm_unpacklo_pi8(__a, __b); 885 886 return _mm_cvtpi16_ps(__b); 887} 888 889static __inline__ __m128 __DEFAULT_FN_ATTRS 890_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 891{ 892 __m128 __c; 893 894 __c = _mm_setzero_ps(); 895 __c = _mm_cvtpi32_ps(__c, __b); 896 __c = _mm_movelh_ps(__c, __c); 897 898 return _mm_cvtpi32_ps(__c, __a); 899} 900 901static __inline__ __m64 __DEFAULT_FN_ATTRS 902_mm_cvtps_pi16(__m128 __a) 903{ 904 __m64 __b, __c; 905 906 __b = _mm_cvtps_pi32(__a); 907 __a = _mm_movehl_ps(__a, __a); 908 __c = _mm_cvtps_pi32(__a); 909 910 return _mm_packs_pi32(__b, __c); 911} 912 913static __inline__ __m64 __DEFAULT_FN_ATTRS 914_mm_cvtps_pi8(__m128 __a) 915{ 916 __m64 __b, __c; 917 918 __b = _mm_cvtps_pi16(__a); 919 __c = _mm_setzero_si64(); 920 921 return _mm_packs_pi16(__b, __c); 922} 923 924static __inline__ int __DEFAULT_FN_ATTRS 925_mm_movemask_ps(__m128 __a) 926{ 927 return __builtin_ia32_movmskps(__a); 928} 929 930 931#ifdef _MSC_VER 932#define _MM_ALIGN16 __declspec(align(16)) 933#endif 934 935#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 936 937#define _MM_EXCEPT_INVALID (0x0001) 938#define _MM_EXCEPT_DENORM (0x0002) 939#define _MM_EXCEPT_DIV_ZERO (0x0004) 940#define _MM_EXCEPT_OVERFLOW (0x0008) 941#define _MM_EXCEPT_UNDERFLOW (0x0010) 942#define _MM_EXCEPT_INEXACT (0x0020) 943#define _MM_EXCEPT_MASK (0x003f) 944 945#define _MM_MASK_INVALID (0x0080) 946#define _MM_MASK_DENORM (0x0100) 947#define _MM_MASK_DIV_ZERO (0x0200) 948#define _MM_MASK_OVERFLOW (0x0400) 949#define _MM_MASK_UNDERFLOW (0x0800) 950#define _MM_MASK_INEXACT (0x1000) 951#define _MM_MASK_MASK (0x1f80) 952 953#define _MM_ROUND_NEAREST (0x0000) 954#define _MM_ROUND_DOWN (0x2000) 955#define _MM_ROUND_UP (0x4000) 956#define _MM_ROUND_TOWARD_ZERO (0x6000) 957#define _MM_ROUND_MASK (0x6000) 958 959#define _MM_FLUSH_ZERO_MASK (0x8000) 960#define _MM_FLUSH_ZERO_ON (0x8000) 961#define _MM_FLUSH_ZERO_OFF (0x0000) 962 963#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 964#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 965#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 966#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 967 968#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 969#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 970#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 971#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 972 973#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 974do { \ 975 __m128 tmp3, tmp2, tmp1, tmp0; \ 976 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 977 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 978 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 979 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 980 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 981 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 982 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 983 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 984} while (0) 985 986/* Aliases for compatibility. */ 987#define _m_pextrw _mm_extract_pi16 988#define _m_pinsrw _mm_insert_pi16 989#define _m_pmaxsw _mm_max_pi16 990#define _m_pmaxub _mm_max_pu8 991#define _m_pminsw _mm_min_pi16 992#define _m_pminub _mm_min_pu8 993#define _m_pmovmskb _mm_movemask_pi8 994#define _m_pmulhuw _mm_mulhi_pu16 995#define _m_pshufw _mm_shuffle_pi16 996#define _m_maskmovq _mm_maskmove_si64 997#define _m_pavgb _mm_avg_pu8 998#define _m_pavgw _mm_avg_pu16 999#define _m_psadbw _mm_sad_pu8 1000#define _m_ _mm_ 1001#define _m_ _mm_ 1002 1003#undef __DEFAULT_FN_ATTRS 1004 1005/* Ugly hack for backwards-compatibility (compatible with gcc) */ 1006#if defined(__SSE2__) && !__has_feature(modules) 1007#include <emmintrin.h> 1008#endif 1009 1010#endif /* __XMMINTRIN_H */ 1011