/* xmmintrin.h revision 69993392d2e9e0560a184e65bdbe64527de3046f */
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

/* 128-bit vector types. __m128 is the public SSE type (4 x float);
 * __v4si/__v4sf are internal views used for bitwise casts. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

#include <mm_malloc.h>

/* --- Arithmetic ---------------------------------------------------------
 * "_ss" forms operate on element 0 only, passing elements 1-3 of the first
 * operand through unchanged; "_ps" forms operate on all four lanes. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

/* rcp/rsqrt are the fast approximate reciprocal / reciprocal-sqrt
 * instructions (about 12 bits of precision), not exact divisions. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

/* --- Bitwise logic ------------------------------------------------------
 * Performed through the integer vector type since C has no float bit-ops. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

/* Note the operand order: result is (~a) & b, matching the ANDNPS insn. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

/* --- Comparisons --------------------------------------------------------
 * The third argument to cmpss/cmpps is the CMPPS predicate immediate:
 * 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord.
 * gt/ge (and ngt/nge) have no direct predicate, so the operands are
 * swapped and lt/le (nlt/nle) is used instead. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

/* --- Scalar ordered comparisons (COMISS) -> int 0/1 --------------------- */

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

/* --- Scalar unordered comparisons (UCOMISS) -> int 0/1 ------------------ */

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

/* --- Conversions --------------------------------------------------------
 * cvt* round using the current MXCSR rounding mode; cvtt* truncate
 * (here expressed as a plain C float->int conversion, which truncates). */

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ps2pi(__m128 a)
{
  return _mm_cvtps_pi32(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ps2pi(__m128 a)
{
  return _mm_cvttps_pi32(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_si2ss(__m128 a, int b)
{
  return _mm_cvtsi32_ss(a, b);
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvt_pi2ps(__m128 a, __m64 b)
{
  return _mm_cvtpi32_ps(a, b);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

/* --- Loads --------------------------------------------------------------
 * _mm_load_ps requires a 16-byte-aligned pointer; _mm_loadu_ps does not. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  return __builtin_ia32_loadups(p);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

548static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 549_mm_set_ss(float w) 550{ 551 return (__m128){ w, 0, 0, 0 }; 552} 553 554static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 555_mm_set1_ps(float w) 556{ 557 return (__m128){ w, w, w, w }; 558} 559 560// Microsoft specific. 561static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 562_mm_set_ps1(float w) 563{ 564 return _mm_set1_ps(w); 565} 566 567static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 568_mm_set_ps(float z, float y, float x, float w) 569{ 570 return (__m128){ w, x, y, z }; 571} 572 573static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 574_mm_setr_ps(float z, float y, float x, float w) 575{ 576 return (__m128){ z, y, x, w }; 577} 578 579static __inline__ __m128 __attribute__((__always_inline__)) 580_mm_setzero_ps(void) 581{ 582 return (__m128){ 0, 0, 0, 0 }; 583} 584 585static __inline__ void __attribute__((__always_inline__)) 586_mm_storeh_pi(__m64 *p, __m128 a) 587{ 588 __builtin_ia32_storehps((__v2si *)p, a); 589} 590 591static __inline__ void __attribute__((__always_inline__)) 592_mm_storel_pi(__m64 *p, __m128 a) 593{ 594 __builtin_ia32_storelps((__v2si *)p, a); 595} 596 597static __inline__ void __attribute__((__always_inline__)) 598_mm_store_ss(float *p, __m128 a) 599{ 600 *p = a[0]; 601} 602 603static __inline__ void __attribute__((__always_inline__, __nodebug__)) 604_mm_storeu_ps(float *p, __m128 a) 605{ 606 __builtin_ia32_storeups(p, a); 607} 608 609static __inline__ void __attribute__((__always_inline__, __nodebug__)) 610_mm_store1_ps(float *p, __m128 a) 611{ 612 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 613 _mm_storeu_ps(p, a); 614} 615 616static __inline__ void __attribute__((__always_inline__, __nodebug__)) 617_mm_store_ps1(float *p, __m128 a) 618{ 619 return _mm_store1_ps(p, a); 620} 621 622static __inline__ void __attribute__((__always_inline__, __nodebug__)) 
623_mm_store_ps(float *p, __m128 a) 624{ 625 *(__m128 *)p = a; 626} 627 628static __inline__ void __attribute__((__always_inline__, __nodebug__)) 629_mm_storer_ps(float *p, __m128 a) 630{ 631 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 632 _mm_store_ps(p, a); 633} 634 635#define _MM_HINT_T0 3 636#define _MM_HINT_T1 2 637#define _MM_HINT_T2 1 638#define _MM_HINT_NTA 0 639 640/* FIXME: We have to #define this because "sel" must be a constant integer, and 641 Sema doesn't do any form of constant propagation yet. */ 642 643#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel)) 644 645static __inline__ void __attribute__((__always_inline__, __nodebug__)) 646_mm_stream_pi(__m64 *p, __m64 a) 647{ 648 __builtin_ia32_movntq(p, a); 649} 650 651static __inline__ void __attribute__((__always_inline__, __nodebug__)) 652_mm_stream_ps(float *p, __m128 a) 653{ 654 __builtin_ia32_movntps(p, a); 655} 656 657static __inline__ void __attribute__((__always_inline__, __nodebug__)) 658_mm_sfence(void) 659{ 660 __builtin_ia32_sfence(); 661} 662 663static __inline__ int __attribute__((__always_inline__, __nodebug__)) 664_mm_extract_pi16(__m64 a, int n) 665{ 666 __v4hi b = (__v4hi)a; 667 return (unsigned short)b[n & 3]; 668} 669 670static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 671_mm_insert_pi16(__m64 a, int d, int n) 672{ 673 __v4hi b = (__v4hi)a; 674 b[n & 3] = d; 675 return (__m64)b; 676} 677 678static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 679_mm_max_pi16(__m64 a, __m64 b) 680{ 681 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 682} 683 684static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 685_mm_max_pu8(__m64 a, __m64 b) 686{ 687 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 688} 689 690static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 691_mm_min_pi16(__m64 a, __m64 b) 692{ 693 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 694} 695 
696static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 697_mm_min_pu8(__m64 a, __m64 b) 698{ 699 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 700} 701 702static __inline__ int __attribute__((__always_inline__, __nodebug__)) 703_mm_movemask_pi8(__m64 a) 704{ 705 return __builtin_ia32_pmovmskb((__v8qi)a); 706} 707 708static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 709_mm_mulhi_pu16(__m64 a, __m64 b) 710{ 711 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 712} 713 714#define _mm_shuffle_pi16(a, n) \ 715 ((__m64)__builtin_ia32_pshufw(a, n)) 716 717static __inline__ void __attribute__((__always_inline__, __nodebug__)) 718_mm_maskmove_si64(__m64 d, __m64 n, char *p) 719{ 720 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 721} 722 723static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 724_mm_avg_pu8(__m64 a, __m64 b) 725{ 726 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 727} 728 729static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 730_mm_avg_pu16(__m64 a, __m64 b) 731{ 732 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 733} 734 735static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 736_mm_sad_pu8(__m64 a, __m64 b) 737{ 738 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 739} 740 741static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 742_mm_getcsr(void) 743{ 744 return __builtin_ia32_stmxcsr(); 745} 746 747static __inline__ void __attribute__((__always_inline__, __nodebug__)) 748_mm_setcsr(unsigned int i) 749{ 750 __builtin_ia32_ldmxcsr(i); 751} 752 753#define _mm_shuffle_ps(a, b, mask) \ 754 (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \ 755 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 756 (((mask) & 0x30) >> 4) + 4, \ 757 (((mask) & 0xc0) >> 6) + 4)) 758 759static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 760_mm_unpackhi_ps(__m128 a, __m128 b) 761{ 762 
return __builtin_shufflevector(a, b, 2, 6, 3, 7); 763} 764 765static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 766_mm_unpacklo_ps(__m128 a, __m128 b) 767{ 768 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 769} 770 771static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 772_mm_move_ss(__m128 a, __m128 b) 773{ 774 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 775} 776 777static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 778_mm_movehl_ps(__m128 a, __m128 b) 779{ 780 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 781} 782 783static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 784_mm_movelh_ps(__m128 a, __m128 b) 785{ 786 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 787} 788 789static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 790_mm_cvtpi16_ps(__m64 a) 791{ 792 __m64 b, c; 793 __m128 r; 794 795 b = _mm_setzero_si64(); 796 b = _mm_cmpgt_pi16(b, a); 797 c = _mm_unpackhi_pi16(a, b); 798 r = _mm_setzero_ps(); 799 r = _mm_cvtpi32_ps(r, c); 800 r = _mm_movelh_ps(r, r); 801 c = _mm_unpacklo_pi16(a, b); 802 r = _mm_cvtpi32_ps(r, c); 803 804 return r; 805} 806 807static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 808_mm_cvtpu16_ps(__m64 a) 809{ 810 __m64 b, c; 811 __m128 r; 812 813 b = _mm_setzero_si64(); 814 c = _mm_unpackhi_pi16(a, b); 815 r = _mm_setzero_ps(); 816 r = _mm_cvtpi32_ps(r, c); 817 r = _mm_movelh_ps(r, r); 818 c = _mm_unpacklo_pi16(a, b); 819 r = _mm_cvtpi32_ps(r, c); 820 821 return r; 822} 823 824static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 825_mm_cvtpi8_ps(__m64 a) 826{ 827 __m64 b; 828 829 b = _mm_setzero_si64(); 830 b = _mm_cmpgt_pi8(b, a); 831 b = _mm_unpacklo_pi8(a, b); 832 833 return _mm_cvtpi16_ps(b); 834} 835 836static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 837_mm_cvtpu8_ps(__m64 a) 838{ 839 __m64 b; 840 841 b = _mm_setzero_si64(); 842 b = 
_mm_unpacklo_pi8(a, b); 843 844 return _mm_cvtpi16_ps(b); 845} 846 847static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 848_mm_cvtpi32x2_ps(__m64 a, __m64 b) 849{ 850 __m128 c; 851 852 c = _mm_setzero_ps(); 853 c = _mm_cvtpi32_ps(c, b); 854 c = _mm_movelh_ps(c, c); 855 856 return _mm_cvtpi32_ps(c, a); 857} 858 859static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 860_mm_cvtps_pi16(__m128 a) 861{ 862 __m64 b, c; 863 864 b = _mm_cvtps_pi32(a); 865 a = _mm_movehl_ps(a, a); 866 c = _mm_cvtps_pi32(a); 867 868 return _mm_packs_pi16(b, c); 869} 870 871static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 872_mm_cvtps_pi8(__m128 a) 873{ 874 __m64 b, c; 875 876 b = _mm_cvtps_pi16(a); 877 c = _mm_setzero_si64(); 878 879 return _mm_packs_pi16(b, c); 880} 881 882static __inline__ int __attribute__((__always_inline__, __nodebug__)) 883_mm_movemask_ps(__m128 a) 884{ 885 return __builtin_ia32_movmskps(a); 886} 887 888#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 889 890#define _MM_EXCEPT_INVALID (0x0001) 891#define _MM_EXCEPT_DENORM (0x0002) 892#define _MM_EXCEPT_DIV_ZERO (0x0004) 893#define _MM_EXCEPT_OVERFLOW (0x0008) 894#define _MM_EXCEPT_UNDERFLOW (0x0010) 895#define _MM_EXCEPT_INEXACT (0x0020) 896#define _MM_EXCEPT_MASK (0x003f) 897 898#define _MM_MASK_INVALID (0x0080) 899#define _MM_MASK_DENORM (0x0100) 900#define _MM_MASK_DIV_ZERO (0x0200) 901#define _MM_MASK_OVERFLOW (0x0400) 902#define _MM_MASK_UNDERFLOW (0x0800) 903#define _MM_MASK_INEXACT (0x1000) 904#define _MM_MASK_MASK (0x1f80) 905 906#define _MM_ROUND_NEAREST (0x0000) 907#define _MM_ROUND_DOWN (0x2000) 908#define _MM_ROUND_UP (0x4000) 909#define _MM_ROUND_TOWARD_ZERO (0x6000) 910#define _MM_ROUND_MASK (0x6000) 911 912#define _MM_FLUSH_ZERO_MASK (0x8000) 913#define _MM_FLUSH_ZERO_ON (0x8000) 914#define _MM_FLUSH_ZERO_OFF (0x8000) 915 916#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 917#define 
_MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 918#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 919#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 920 921#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 922#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 923#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 924#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 925 926#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 927do { \ 928 __m128 tmp3, tmp2, tmp1, tmp0; \ 929 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 930 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 931 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 932 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 933 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 934 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 935 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 936 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 937} while (0) 938 939/* Aliases for compatibility. */ 940#define _m_pextrw _mm_extract_pi16 941#define _m_pinsrw _mm_insert_pi16 942#define _m_pmaxsw _mm_max_pi16 943#define _m_pmaxub _mm_max_pu8 944#define _m_pminsw _mm_min_pi16 945#define _m_pminub _mm_min_pu8 946#define _m_pmovmskb _mm_movemask_pi8 947#define _m_pmulhuw _mm_mulhi_pu16 948#define _m_pshufw _mm_shuffle_pi16 949#define _m_maskmovq _mm_maskmove_si64 950#define _m_pavgb _mm_avg_pu8 951#define _m_pavgw _mm_avg_pu16 952#define _m_psadbw _mm_sad_pu8 953#define _m_ _mm_ 954#define _m_ _mm_ 955 956/* Ugly hack for backwards-compatibility (compatible with gcc) */ 957#ifdef __SSE2__ 958#include <emmintrin.h> 959#endif 960 961#endif /* __SSE__ */ 962 963#endif /* __XMMINTRIN_H */ 964