/* xmmintrin.h — revision 79e5ab7a537987348a9ba01424d8bbe7080eac57 */
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

/* Internal 128-bit vector types: four 32-bit ints and four floats.
   __m128 is the public SSE register type. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

#include <mm_malloc.h>

/* Arithmetic.
   The _ss ("scalar single") forms operate on element 0 only; elements 1-3
   of the first operand pass through unchanged.  The _ps ("packed single")
   forms operate element-wise on all four lanes, expressed directly with
   vector-extension operators so the compiler emits the SSE instruction. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
  a[0] += b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
  a[0] -= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
  a[0] *= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
  a[0] /= b[0];
  return a;
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

/* Square root, reciprocal estimate, and reciprocal-square-root estimate.
   These map directly onto the corresponding hardware builtins; the rcp/rsqrt
   forms are approximations, per the instruction set definition. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

/* Bitwise logic on the raw 128 bits: performed through the __v4si integer
   view, then cast back to __m128. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a & (__v4si)b);
}

/* andnot: complement of the FIRST operand ANDed with the second (matches
   the ANDNPS instruction's operand order). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return (__m128)(~(__v4si)a & (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a | (__v4si)b);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return (__m128)((__v4si)a ^ (__v4si)b);
}

/* Comparisons.
   Each maps to cmpss/cmpps with an immediate predicate:
     0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq, 5 = nlt, 6 = nle, 7 = ord.
   The hardware has no gt/ge predicates, so gt/ge (and ngt/nge) are
   synthesized by swapping the operands of lt/le (nlt/nle).  Results are
   all-ones / all-zero lane masks returned as __m128. */

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 0);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 2);
}

/* a > b computed as b < a (operands swapped). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 1);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 1);
}

/* a >= b computed as b <= a (operands swapped). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 2);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 4);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 5);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(b, a, 6);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(b, a, 6);
}

/* ord: true when neither operand is NaN; unord: true when either is NaN. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 7);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpss(a, b, 3);
}

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpps(a, b, 3);
}

/* Scalar compares of element 0 returning 0/1 as int.
   The _comi* family uses the COMISS instruction; the _ucomi* family uses
   UCOMISS (the builtins differ; per the ISA the ucomi forms do not signal
   on quiet NaN). */

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

/* Conversions between float lanes and integers. */

/* Convert element 0 to int using the current MXCSR rounding mode. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvt_ss2si(__m128 a)
{
  return _mm_cvtss_si32(a);
}

#ifdef __x86_64__

static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

#endif

/* Convert the low two float lanes to two packed 32-bit ints in an MMX
   register (current rounding mode). */
static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

/* Truncating conversion of element 0, expressed as a C float-to-int cast
   (C truncates toward zero, matching cvttss for in-range values).
   NOTE(review): out-of-range input is undefined behavior in C whereas the
   CVTTSS2SI instruction returns the integer-indefinite value -- confirm
   whether the builtin should be used instead. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
  return a[0];
}

static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_cvtt_ss2si(__m128 a)
{
  return _mm_cvttss_si32(a);
}

/* NOTE(review): unlike _mm_cvtss_si64 above, this 64-bit variant is not
   guarded by __x86_64__ -- verify that is intentional. */
static __inline__ long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
  return a[0];
}

static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

/* Replace element 0 of a with (float)b; other lanes pass through. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  a[0] = b;
  return a;
}

#ifdef __x86_64__

static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  a[0] = b;
  return a;
}

#endif

/* Convert the two 32-bit ints in b into the low two lanes of a. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

/* Extract element 0 as a plain float. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}
/* Loads and register construction. */

/* Load the two floats at p into the high half of a; the low half of a
   passes through.
   NOTE(review): reads the __m64 object through a float*, a strict-aliasing
   violation in ISO C -- presumably tolerated as a compiler-private idiom in
   this header; confirm before reusing the pattern elsewhere. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

/* Load the two floats at p into the low half of a; the high half of a
   passes through.  Same aliasing caveat as _mm_loadh_pi. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, const __m64 *p)
{
  __m128 b;
  b[0] = *(float*)p;
  b[1] = *((float*)p+1);
  return __builtin_shufflevector(a, b, 4, 5, 2, 3);
}

/* Load one float into element 0; upper three lanes are zeroed. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(const float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

/* Broadcast the float at p to all four lanes. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(const float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

#define _mm_load_ps1(p) _mm_load1_ps(p)

/* Aligned 16-byte load.  p must be 16-byte aligned. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(const float *p)
{
  return *(__m128*)p;
}

/* Unaligned load (movups via builtin). */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
  return __builtin_ia32_loadups(p);
}

/* Aligned load with the four elements reversed. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(const float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

/* Set element 0 to w; upper three lanes are zeroed. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

/* Broadcast w to all four lanes. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
537static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 538_mm_set_ps1(float w) 539{ 540 return _mm_set1_ps(w); 541} 542 543static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 544_mm_set_ps(float z, float y, float x, float w) 545{ 546 return (__m128){ w, x, y, z }; 547} 548 549static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 550_mm_setr_ps(float z, float y, float x, float w) 551{ 552 return (__m128){ z, y, x, w }; 553} 554 555static __inline__ __m128 __attribute__((__always_inline__)) 556_mm_setzero_ps(void) 557{ 558 return (__m128){ 0, 0, 0, 0 }; 559} 560 561static __inline__ void __attribute__((__always_inline__)) 562_mm_storeh_pi(__m64 *p, __m128 a) 563{ 564 __builtin_ia32_storehps((__v2si *)p, a); 565} 566 567static __inline__ void __attribute__((__always_inline__)) 568_mm_storel_pi(__m64 *p, __m128 a) 569{ 570 __builtin_ia32_storelps((__v2si *)p, a); 571} 572 573static __inline__ void __attribute__((__always_inline__)) 574_mm_store_ss(float *p, __m128 a) 575{ 576 *p = a[0]; 577} 578 579static __inline__ void __attribute__((__always_inline__, __nodebug__)) 580_mm_storeu_ps(float *p, __m128 a) 581{ 582 __builtin_ia32_storeups(p, a); 583} 584 585static __inline__ void __attribute__((__always_inline__, __nodebug__)) 586_mm_store1_ps(float *p, __m128 a) 587{ 588 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 589 _mm_storeu_ps(p, a); 590} 591 592static __inline__ void __attribute__((__always_inline__, __nodebug__)) 593_mm_store_ps(float *p, __m128 a) 594{ 595 *(__m128 *)p = a; 596} 597 598static __inline__ void __attribute__((__always_inline__, __nodebug__)) 599_mm_storer_ps(float *p, __m128 a) 600{ 601 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 602 _mm_store_ps(p, a); 603} 604 605#define _MM_HINT_T0 1 606#define _MM_HINT_T1 2 607#define _MM_HINT_T2 3 608#define _MM_HINT_NTA 0 609 610/* FIXME: We have to #define this because "sel" must be a constant integer, and 611 Sema doesn't do any form of 
constant propagation yet. */ 612 613#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, sel)) 614 615static __inline__ void __attribute__((__always_inline__, __nodebug__)) 616_mm_stream_pi(__m64 *p, __m64 a) 617{ 618 __builtin_ia32_movntq(p, a); 619} 620 621static __inline__ void __attribute__((__always_inline__, __nodebug__)) 622_mm_stream_ps(float *p, __m128 a) 623{ 624 __builtin_ia32_movntps(p, a); 625} 626 627static __inline__ void __attribute__((__always_inline__, __nodebug__)) 628_mm_sfence(void) 629{ 630 __builtin_ia32_sfence(); 631} 632 633static __inline__ int __attribute__((__always_inline__, __nodebug__)) 634_mm_extract_pi16(__m64 a, int n) 635{ 636 __v4hi b = (__v4hi)a; 637 return (unsigned short)b[n & 3]; 638} 639 640static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 641_mm_insert_pi16(__m64 a, int d, int n) 642{ 643 __v4hi b = (__v4hi)a; 644 b[n & 3] = d; 645 return (__m64)b; 646} 647 648static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 649_mm_max_pi16(__m64 a, __m64 b) 650{ 651 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 652} 653 654static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 655_mm_max_pu8(__m64 a, __m64 b) 656{ 657 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 658} 659 660static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 661_mm_min_pi16(__m64 a, __m64 b) 662{ 663 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 664} 665 666static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 667_mm_min_pu8(__m64 a, __m64 b) 668{ 669 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 670} 671 672static __inline__ int __attribute__((__always_inline__, __nodebug__)) 673_mm_movemask_pi8(__m64 a) 674{ 675 return __builtin_ia32_pmovmskb((__v8qi)a); 676} 677 678static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 679_mm_mulhi_pu16(__m64 a, __m64 b) 680{ 681 return 
(__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 682} 683 684#define _mm_shuffle_pi16(a, n) \ 685 ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \ 686 (n) & 0x3, ((n) & 0xc) >> 2, \ 687 ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) 688 689static __inline__ void __attribute__((__always_inline__, __nodebug__)) 690_mm_maskmove_si64(__m64 d, __m64 n, char *p) 691{ 692 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 693} 694 695static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 696_mm_avg_pu8(__m64 a, __m64 b) 697{ 698 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 699} 700 701static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 702_mm_avg_pu16(__m64 a, __m64 b) 703{ 704 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 705} 706 707static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 708_mm_sad_pu8(__m64 a, __m64 b) 709{ 710 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 711} 712 713static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) 714_mm_getcsr(void) 715{ 716 return __builtin_ia32_stmxcsr(); 717} 718 719static __inline__ void __attribute__((__always_inline__, __nodebug__)) 720_mm_setcsr(unsigned int i) 721{ 722 __builtin_ia32_ldmxcsr(i); 723} 724 725#define _mm_shuffle_ps(a, b, mask) \ 726 (__builtin_shufflevector((__v4sf)(a), (__v4sf)(b), \ 727 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 728 (((mask) & 0x30) >> 4) + 4, \ 729 (((mask) & 0xc0) >> 6) + 4)) 730 731static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 732_mm_unpackhi_ps(__m128 a, __m128 b) 733{ 734 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 735} 736 737static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 738_mm_unpacklo_ps(__m128 a, __m128 b) 739{ 740 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 741} 742 743static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 744_mm_move_ss(__m128 a, __m128 b) 745{ 746 return 
__builtin_shufflevector(a, b, 4, 1, 2, 3); 747} 748 749static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 750_mm_movehl_ps(__m128 a, __m128 b) 751{ 752 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 753} 754 755static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 756_mm_movelh_ps(__m128 a, __m128 b) 757{ 758 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 759} 760 761static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 762_mm_cvtpi16_ps(__m64 a) 763{ 764 __m64 b, c; 765 __m128 r; 766 767 b = _mm_setzero_si64(); 768 b = _mm_cmpgt_pi16(b, a); 769 c = _mm_unpackhi_pi16(a, b); 770 r = _mm_setzero_ps(); 771 r = _mm_cvtpi32_ps(r, c); 772 r = _mm_movelh_ps(r, r); 773 c = _mm_unpacklo_pi16(a, b); 774 r = _mm_cvtpi32_ps(r, c); 775 776 return r; 777} 778 779static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 780_mm_cvtpu16_ps(__m64 a) 781{ 782 __m64 b, c; 783 __m128 r; 784 785 b = _mm_setzero_si64(); 786 c = _mm_unpackhi_pi16(a, b); 787 r = _mm_setzero_ps(); 788 r = _mm_cvtpi32_ps(r, c); 789 r = _mm_movelh_ps(r, r); 790 c = _mm_unpacklo_pi16(a, b); 791 r = _mm_cvtpi32_ps(r, c); 792 793 return r; 794} 795 796static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 797_mm_cvtpi8_ps(__m64 a) 798{ 799 __m64 b; 800 801 b = _mm_setzero_si64(); 802 b = _mm_cmpgt_pi8(b, a); 803 b = _mm_unpacklo_pi8(a, b); 804 805 return _mm_cvtpi16_ps(b); 806} 807 808static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 809_mm_cvtpu8_ps(__m64 a) 810{ 811 __m64 b; 812 813 b = _mm_setzero_si64(); 814 b = _mm_unpacklo_pi8(a, b); 815 816 return _mm_cvtpi16_ps(b); 817} 818 819static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 820_mm_cvtpi32x2_ps(__m64 a, __m64 b) 821{ 822 __m128 c; 823 824 c = _mm_setzero_ps(); 825 c = _mm_cvtpi32_ps(c, b); 826 c = _mm_movelh_ps(c, c); 827 828 return _mm_cvtpi32_ps(c, a); 829} 830 831static __inline__ __m64 
__attribute__((__always_inline__, __nodebug__)) 832_mm_cvtps_pi16(__m128 a) 833{ 834 __m64 b, c; 835 836 b = _mm_cvtps_pi32(a); 837 a = _mm_movehl_ps(a, a); 838 c = _mm_cvtps_pi32(a); 839 840 return _mm_packs_pi16(b, c); 841} 842 843static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 844_mm_cvtps_pi8(__m128 a) 845{ 846 __m64 b, c; 847 848 b = _mm_cvtps_pi16(a); 849 c = _mm_setzero_si64(); 850 851 return _mm_packs_pi16(b, c); 852} 853 854static __inline__ int __attribute__((__always_inline__, __nodebug__)) 855_mm_movemask_ps(__m128 a) 856{ 857 return __builtin_ia32_movmskps(a); 858} 859 860#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 861 862#define _MM_EXCEPT_INVALID (0x0001) 863#define _MM_EXCEPT_DENORM (0x0002) 864#define _MM_EXCEPT_DIV_ZERO (0x0004) 865#define _MM_EXCEPT_OVERFLOW (0x0008) 866#define _MM_EXCEPT_UNDERFLOW (0x0010) 867#define _MM_EXCEPT_INEXACT (0x0020) 868#define _MM_EXCEPT_MASK (0x003f) 869 870#define _MM_MASK_INVALID (0x0080) 871#define _MM_MASK_DENORM (0x0100) 872#define _MM_MASK_DIV_ZERO (0x0200) 873#define _MM_MASK_OVERFLOW (0x0400) 874#define _MM_MASK_UNDERFLOW (0x0800) 875#define _MM_MASK_INEXACT (0x1000) 876#define _MM_MASK_MASK (0x1f80) 877 878#define _MM_ROUND_NEAREST (0x0000) 879#define _MM_ROUND_DOWN (0x2000) 880#define _MM_ROUND_UP (0x4000) 881#define _MM_ROUND_TOWARD_ZERO (0x6000) 882#define _MM_ROUND_MASK (0x6000) 883 884#define _MM_FLUSH_ZERO_MASK (0x8000) 885#define _MM_FLUSH_ZERO_ON (0x8000) 886#define _MM_FLUSH_ZERO_OFF (0x8000) 887 888#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 889#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 890#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 891#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 892 893#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 894#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) 
| (x))) 895#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 896#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 897 898#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 899do { \ 900 __m128 tmp3, tmp2, tmp1, tmp0; \ 901 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 902 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 903 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 904 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 905 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 906 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 907 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 908 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 909} while (0) 910 911/* Ugly hack for backwards-compatibility (compatible with gcc) */ 912#ifdef __SSE2__ 913#include <emmintrin.h> 914#endif 915 916#endif /* __SSE__ */ 917 918#endif /* __XMMINTRIN_H */ 919