/* xmmintrin.h revision 2c48345cf9d4cfe5fe73a37de684825f6015518b */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __XMMINTRIN_H 25#define __XMMINTRIN_H 26 27#ifndef __SSE__ 28#error "SSE instruction set not enabled" 29#else 30 31#include <mmintrin.h> 32 33typedef float __v4sf __attribute__((__vector_size__(16))); 34typedef float __m128 __attribute__((__vector_size__(16))); 35 36#include <mm_malloc.h> 37 38static inline __m128 __attribute__((__always_inline__, __nodebug__)) 39_mm_add_ss(__m128 a, __m128 b) 40{ 41 a[0] += b[0]; 42 return a; 43} 44 45static inline __m128 __attribute__((__always_inline__, __nodebug__)) 46_mm_add_ps(__m128 a, __m128 b) 47{ 48 return a + b; 49} 50 51static inline __m128 __attribute__((__always_inline__, __nodebug__)) 52_mm_sub_ss(__m128 a, __m128 b) 53{ 54 a[0] -= b[0]; 55 return a; 56} 57 58static inline __m128 __attribute__((__always_inline__, __nodebug__)) 59_mm_sub_ps(__m128 a, __m128 b) 60{ 61 return a - b; 62} 63 64static inline __m128 __attribute__((__always_inline__, __nodebug__)) 65_mm_mul_ss(__m128 a, __m128 b) 66{ 67 a[0] *= b[0]; 68 return a; 69} 70 71static inline __m128 __attribute__((__always_inline__, __nodebug__)) 72_mm_mul_ps(__m128 a, __m128 b) 73{ 74 return a * b; 75} 76 77static inline __m128 __attribute__((__always_inline__, __nodebug__)) 78_mm_div_ss(__m128 a, __m128 b) 79{ 80 a[0] /= b[0]; 81 return a; 82} 83 84static inline __m128 __attribute__((__always_inline__, __nodebug__)) 85_mm_div_ps(__m128 a, __m128 b) 86{ 87 return a / b; 88} 89 90static inline __m128 __attribute__((__always_inline__, __nodebug__)) 91_mm_sqrt_ss(__m128 a) 92{ 93 return __builtin_ia32_sqrtss(a); 94} 95 96static inline __m128 __attribute__((__always_inline__, __nodebug__)) 97_mm_sqrt_ps(__m128 a) 98{ 99 return __builtin_ia32_sqrtps(a); 100} 101 102static inline __m128 __attribute__((__always_inline__, __nodebug__)) 103_mm_rcp_ss(__m128 a) 104{ 105 return __builtin_ia32_rcpss(a); 106} 107 108static inline __m128 __attribute__((__always_inline__, 
__nodebug__)) 109_mm_rcp_ps(__m128 a) 110{ 111 return __builtin_ia32_rcpps(a); 112} 113 114static inline __m128 __attribute__((__always_inline__, __nodebug__)) 115_mm_rsqrt_ss(__m128 a) 116{ 117 return __builtin_ia32_rsqrtss(a); 118} 119 120static inline __m128 __attribute__((__always_inline__, __nodebug__)) 121_mm_rsqrt_ps(__m128 a) 122{ 123 return __builtin_ia32_rsqrtps(a); 124} 125 126static inline __m128 __attribute__((__always_inline__, __nodebug__)) 127_mm_min_ss(__m128 a, __m128 b) 128{ 129 return __builtin_ia32_minss(a, b); 130} 131 132static inline __m128 __attribute__((__always_inline__, __nodebug__)) 133_mm_min_ps(__m128 a, __m128 b) 134{ 135 return __builtin_ia32_minps(a, b); 136} 137 138static inline __m128 __attribute__((__always_inline__, __nodebug__)) 139_mm_max_ss(__m128 a, __m128 b) 140{ 141 return __builtin_ia32_maxss(a, b); 142} 143 144static inline __m128 __attribute__((__always_inline__, __nodebug__)) 145_mm_max_ps(__m128 a, __m128 b) 146{ 147 return __builtin_ia32_maxps(a, b); 148} 149 150static inline __m128 __attribute__((__always_inline__, __nodebug__)) 151_mm_and_ps(__m128 a, __m128 b) 152{ 153 typedef int __v4si __attribute__((__vector_size__(16))); 154 return (__m128)((__v4si)a & (__v4si)b); 155} 156 157static inline __m128 __attribute__((__always_inline__, __nodebug__)) 158_mm_andnot_ps(__m128 a, __m128 b) 159{ 160 typedef int __v4si __attribute__((__vector_size__(16))); 161 return (__m128)(~(__v4si)a & (__v4si)b); 162} 163 164static inline __m128 __attribute__((__always_inline__, __nodebug__)) 165_mm_or_ps(__m128 a, __m128 b) 166{ 167 typedef int __v4si __attribute__((__vector_size__(16))); 168 return (__m128)((__v4si)a | (__v4si)b); 169} 170 171static inline __m128 __attribute__((__always_inline__, __nodebug__)) 172_mm_xor_ps(__m128 a, __m128 b) 173{ 174 typedef int __v4si __attribute__((__vector_size__(16))); 175 return (__m128)((__v4si)a ^ (__v4si)b); 176} 177 178static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
179_mm_cmpeq_ss(__m128 a, __m128 b) 180{ 181 return (__m128)__builtin_ia32_cmpss(a, b, 0); 182} 183 184static inline __m128 __attribute__((__always_inline__, __nodebug__)) 185_mm_cmpeq_ps(__m128 a, __m128 b) 186{ 187 return (__m128)__builtin_ia32_cmpps(a, b, 0); 188} 189 190static inline __m128 __attribute__((__always_inline__, __nodebug__)) 191_mm_cmplt_ss(__m128 a, __m128 b) 192{ 193 return (__m128)__builtin_ia32_cmpss(a, b, 1); 194} 195 196static inline __m128 __attribute__((__always_inline__, __nodebug__)) 197_mm_cmplt_ps(__m128 a, __m128 b) 198{ 199 return (__m128)__builtin_ia32_cmpps(a, b, 1); 200} 201 202static inline __m128 __attribute__((__always_inline__, __nodebug__)) 203_mm_cmple_ss(__m128 a, __m128 b) 204{ 205 return (__m128)__builtin_ia32_cmpss(a, b, 2); 206} 207 208static inline __m128 __attribute__((__always_inline__, __nodebug__)) 209_mm_cmple_ps(__m128 a, __m128 b) 210{ 211 return (__m128)__builtin_ia32_cmpps(a, b, 2); 212} 213 214static inline __m128 __attribute__((__always_inline__, __nodebug__)) 215_mm_cmpgt_ss(__m128 a, __m128 b) 216{ 217 return (__m128)__builtin_ia32_cmpss(b, a, 1); 218} 219 220static inline __m128 __attribute__((__always_inline__, __nodebug__)) 221_mm_cmpgt_ps(__m128 a, __m128 b) 222{ 223 return (__m128)__builtin_ia32_cmpps(b, a, 1); 224} 225 226static inline __m128 __attribute__((__always_inline__, __nodebug__)) 227_mm_cmpge_ss(__m128 a, __m128 b) 228{ 229 return (__m128)__builtin_ia32_cmpss(b, a, 2); 230} 231 232static inline __m128 __attribute__((__always_inline__, __nodebug__)) 233_mm_cmpge_ps(__m128 a, __m128 b) 234{ 235 return (__m128)__builtin_ia32_cmpps(b, a, 2); 236} 237 238static inline __m128 __attribute__((__always_inline__, __nodebug__)) 239_mm_cmpneq_ss(__m128 a, __m128 b) 240{ 241 return (__m128)__builtin_ia32_cmpss(a, b, 4); 242} 243 244static inline __m128 __attribute__((__always_inline__, __nodebug__)) 245_mm_cmpneq_ps(__m128 a, __m128 b) 246{ 247 return (__m128)__builtin_ia32_cmpps(a, b, 4); 248} 249 
250static inline __m128 __attribute__((__always_inline__, __nodebug__)) 251_mm_cmpnlt_ss(__m128 a, __m128 b) 252{ 253 return (__m128)__builtin_ia32_cmpss(a, b, 5); 254} 255 256static inline __m128 __attribute__((__always_inline__, __nodebug__)) 257_mm_cmpnlt_ps(__m128 a, __m128 b) 258{ 259 return (__m128)__builtin_ia32_cmpps(a, b, 5); 260} 261 262static inline __m128 __attribute__((__always_inline__, __nodebug__)) 263_mm_cmpnle_ss(__m128 a, __m128 b) 264{ 265 return (__m128)__builtin_ia32_cmpss(a, b, 6); 266} 267 268static inline __m128 __attribute__((__always_inline__, __nodebug__)) 269_mm_cmpnle_ps(__m128 a, __m128 b) 270{ 271 return (__m128)__builtin_ia32_cmpps(a, b, 6); 272} 273 274static inline __m128 __attribute__((__always_inline__, __nodebug__)) 275_mm_cmpngt_ss(__m128 a, __m128 b) 276{ 277 return (__m128)__builtin_ia32_cmpss(b, a, 5); 278} 279 280static inline __m128 __attribute__((__always_inline__, __nodebug__)) 281_mm_cmpngt_ps(__m128 a, __m128 b) 282{ 283 return (__m128)__builtin_ia32_cmpps(b, a, 5); 284} 285 286static inline __m128 __attribute__((__always_inline__, __nodebug__)) 287_mm_cmpnge_ss(__m128 a, __m128 b) 288{ 289 return (__m128)__builtin_ia32_cmpss(b, a, 6); 290} 291 292static inline __m128 __attribute__((__always_inline__, __nodebug__)) 293_mm_cmpnge_ps(__m128 a, __m128 b) 294{ 295 return (__m128)__builtin_ia32_cmpps(b, a, 6); 296} 297 298static inline __m128 __attribute__((__always_inline__, __nodebug__)) 299_mm_cmpord_ss(__m128 a, __m128 b) 300{ 301 return (__m128)__builtin_ia32_cmpss(a, b, 7); 302} 303 304static inline __m128 __attribute__((__always_inline__, __nodebug__)) 305_mm_cmpord_ps(__m128 a, __m128 b) 306{ 307 return (__m128)__builtin_ia32_cmpps(a, b, 7); 308} 309 310static inline __m128 __attribute__((__always_inline__, __nodebug__)) 311_mm_cmpunord_ss(__m128 a, __m128 b) 312{ 313 return (__m128)__builtin_ia32_cmpss(a, b, 3); 314} 315 316static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
317_mm_cmpunord_ps(__m128 a, __m128 b) 318{ 319 return (__m128)__builtin_ia32_cmpps(a, b, 3); 320} 321 322static inline int __attribute__((__always_inline__, __nodebug__)) 323_mm_comieq_ss(__m128 a, __m128 b) 324{ 325 return __builtin_ia32_comieq(a, b); 326} 327 328static inline int __attribute__((__always_inline__, __nodebug__)) 329_mm_comilt_ss(__m128 a, __m128 b) 330{ 331 return __builtin_ia32_comilt(a, b); 332} 333 334static inline int __attribute__((__always_inline__, __nodebug__)) 335_mm_comile_ss(__m128 a, __m128 b) 336{ 337 return __builtin_ia32_comile(a, b); 338} 339 340static inline int __attribute__((__always_inline__, __nodebug__)) 341_mm_comigt_ss(__m128 a, __m128 b) 342{ 343 return __builtin_ia32_comigt(a, b); 344} 345 346static inline int __attribute__((__always_inline__, __nodebug__)) 347_mm_comige_ss(__m128 a, __m128 b) 348{ 349 return __builtin_ia32_comige(a, b); 350} 351 352static inline int __attribute__((__always_inline__, __nodebug__)) 353_mm_comineq_ss(__m128 a, __m128 b) 354{ 355 return __builtin_ia32_comineq(a, b); 356} 357 358static inline int __attribute__((__always_inline__, __nodebug__)) 359_mm_ucomieq_ss(__m128 a, __m128 b) 360{ 361 return __builtin_ia32_ucomieq(a, b); 362} 363 364static inline int __attribute__((__always_inline__, __nodebug__)) 365_mm_ucomilt_ss(__m128 a, __m128 b) 366{ 367 return __builtin_ia32_ucomilt(a, b); 368} 369 370static inline int __attribute__((__always_inline__, __nodebug__)) 371_mm_ucomile_ss(__m128 a, __m128 b) 372{ 373 return __builtin_ia32_ucomile(a, b); 374} 375 376static inline int __attribute__((__always_inline__, __nodebug__)) 377_mm_ucomigt_ss(__m128 a, __m128 b) 378{ 379 return __builtin_ia32_ucomigt(a, b); 380} 381 382static inline int __attribute__((__always_inline__, __nodebug__)) 383_mm_ucomige_ss(__m128 a, __m128 b) 384{ 385 return __builtin_ia32_ucomige(a, b); 386} 387 388static inline int __attribute__((__always_inline__, __nodebug__)) 389_mm_ucomineq_ss(__m128 a, __m128 b) 390{ 391 return 
__builtin_ia32_ucomineq(a, b); 392} 393 394static inline int __attribute__((__always_inline__, __nodebug__)) 395_mm_cvtss_si32(__m128 a) 396{ 397 return __builtin_ia32_cvtss2si(a); 398} 399 400#ifdef __x86_64__ 401 402static inline long long __attribute__((__always_inline__, __nodebug__)) 403_mm_cvtss_si64(__m128 a) 404{ 405 return __builtin_ia32_cvtss2si64(a); 406} 407 408#endif 409 410static inline __m64 __attribute__((__always_inline__, __nodebug__)) 411_mm_cvtps_pi32(__m128 a) 412{ 413 return (__m64)__builtin_ia32_cvtps2pi(a); 414} 415 416static inline int __attribute__((__always_inline__, __nodebug__)) 417_mm_cvttss_si32(__m128 a) 418{ 419 return a[0]; 420} 421 422static inline long long __attribute__((__always_inline__, __nodebug__)) 423_mm_cvttss_si64(__m128 a) 424{ 425 return a[0]; 426} 427 428static inline __m64 __attribute__((__always_inline__, __nodebug__)) 429_mm_cvttps_pi32(__m128 a) 430{ 431 return (__m64)__builtin_ia32_cvttps2pi(a); 432} 433 434static inline __m128 __attribute__((__always_inline__, __nodebug__)) 435_mm_cvtsi32_ss(__m128 a, int b) 436{ 437 a[0] = b; 438 return a; 439} 440 441#ifdef __x86_64__ 442 443static inline __m128 __attribute__((__always_inline__, __nodebug__)) 444_mm_cvtsi64_ss(__m128 a, long long b) 445{ 446 a[0] = b; 447 return a; 448} 449 450#endif 451 452static inline __m128 __attribute__((__always_inline__, __nodebug__)) 453_mm_cvtpi32_ps(__m128 a, __m64 b) 454{ 455 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 456} 457 458static inline float __attribute__((__always_inline__, __nodebug__)) 459_mm_cvtss_f32(__m128 a) 460{ 461 return a[0]; 462} 463 464static inline __m128 __attribute__((__always_inline__, __nodebug__)) 465_mm_loadh_pi(__m128 a, __m64 const *p) 466{ 467 __m128 b; 468 b[0] = *(float*)p; 469 b[1] = *((float*)p+1); 470 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 471} 472 473static inline __m128 __attribute__((__always_inline__, __nodebug__)) 474_mm_loadl_pi(__m128 a, __m64 const *p) 475{ 476 __m128 b; 477 
b[0] = *(float*)p; 478 b[1] = *((float*)p+1); 479 return __builtin_shufflevector(a, b, 4, 5, 2, 3); 480} 481 482static inline __m128 __attribute__((__always_inline__, __nodebug__)) 483_mm_load_ss(float *p) 484{ 485 return (__m128){ *p, 0, 0, 0 }; 486} 487 488static inline __m128 __attribute__((__always_inline__, __nodebug__)) 489_mm_load1_ps(float *p) 490{ 491 return (__m128){ *p, *p, *p, *p }; 492} 493 494#define _mm_load_ps1(p) _mm_load1_ps(p) 495 496static inline __m128 __attribute__((__always_inline__, __nodebug__)) 497_mm_load_ps(float *p) 498{ 499 return *(__m128*)p; 500} 501 502static inline __m128 __attribute__((__always_inline__, __nodebug__)) 503_mm_loadu_ps(float *p) 504{ 505 return __builtin_ia32_loadups(p); 506} 507 508static inline __m128 __attribute__((__always_inline__, __nodebug__)) 509_mm_loadr_ps(float *p) 510{ 511 __m128 a = _mm_load_ps(p); 512 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 513} 514 515static inline __m128 __attribute__((__always_inline__, __nodebug__)) 516_mm_set_ss(float w) 517{ 518 return (__m128){ w, 0, 0, 0 }; 519} 520 521static inline __m128 __attribute__((__always_inline__, __nodebug__)) 522_mm_set1_ps(float w) 523{ 524 return (__m128){ w, w, w, w }; 525} 526 527// Microsoft specific. 
528static inline __m128 __attribute__((__always_inline__, __nodebug__)) 529_mm_set_ps1(float w) 530{ 531 return _mm_set1_ps(w); 532} 533 534static inline __m128 __attribute__((__always_inline__, __nodebug__)) 535_mm_set_ps(float z, float y, float x, float w) 536{ 537 return (__m128){ w, x, y, z }; 538} 539 540static inline __m128 __attribute__((__always_inline__, __nodebug__)) 541_mm_setr_ps(float z, float y, float x, float w) 542{ 543 return (__m128){ z, y, x, w }; 544} 545 546static inline __m128 __attribute__((__always_inline__)) 547_mm_setzero_ps(void) 548{ 549 return (__m128){ 0, 0, 0, 0 }; 550} 551 552static inline void __attribute__((__always_inline__)) 553_mm_storeh_pi(__m64 *p, __m128 a) 554{ 555 __builtin_ia32_storehps((__v2si *)p, a); 556} 557 558static inline void __attribute__((__always_inline__)) 559_mm_storel_pi(__m64 *p, __m128 a) 560{ 561 __builtin_ia32_storelps((__v2si *)p, a); 562} 563 564static inline void __attribute__((__always_inline__)) 565_mm_store_ss(float *p, __m128 a) 566{ 567 *p = a[0]; 568} 569 570static inline void __attribute__((__always_inline__, __nodebug__)) 571_mm_storeu_ps(float *p, __m128 a) 572{ 573 __builtin_ia32_storeups(p, a); 574} 575 576static inline void __attribute__((__always_inline__, __nodebug__)) 577_mm_store1_ps(float *p, __m128 a) 578{ 579 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 580 _mm_storeu_ps(p, a); 581} 582 583static inline void __attribute__((__always_inline__, __nodebug__)) 584_mm_store_ps(float *p, __m128 a) 585{ 586 *(__m128 *)p = a; 587} 588 589static inline void __attribute__((__always_inline__, __nodebug__)) 590_mm_storer_ps(float *p, __m128 a) 591{ 592 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 593 _mm_store_ps(p, a); 594} 595 596#define _MM_HINT_T0 1 597#define _MM_HINT_T1 2 598#define _MM_HINT_T2 3 599#define _MM_HINT_NTA 0 600 601/* FIXME: We have to #define this because "sel" must be a constant integer, and 602 Sema doesn't do any form of constant propagation yet. 
*/ 603 604#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 605 606static inline void __attribute__((__always_inline__, __nodebug__)) 607_mm_stream_pi(__m64 *p, __m64 a) 608{ 609 __builtin_ia32_movntq(p, a); 610} 611 612static inline void __attribute__((__always_inline__, __nodebug__)) 613_mm_stream_ps(float *p, __m128 a) 614{ 615 __builtin_ia32_movntps(p, a); 616} 617 618static inline void __attribute__((__always_inline__, __nodebug__)) 619_mm_sfence(void) 620{ 621 __builtin_ia32_sfence(); 622} 623 624static inline int __attribute__((__always_inline__, __nodebug__)) 625_mm_extract_pi16(__m64 a, int n) 626{ 627 __v4hi b = (__v4hi)a; 628 return (unsigned short)b[n & 3]; 629} 630 631static inline __m64 __attribute__((__always_inline__, __nodebug__)) 632_mm_insert_pi16(__m64 a, int d, int n) 633{ 634 __v4hi b = (__v4hi)a; 635 b[n & 3] = d; 636 return (__m64)b; 637} 638 639static inline __m64 __attribute__((__always_inline__, __nodebug__)) 640_mm_max_pi16(__m64 a, __m64 b) 641{ 642 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 643} 644 645static inline __m64 __attribute__((__always_inline__, __nodebug__)) 646_mm_max_pu8(__m64 a, __m64 b) 647{ 648 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 649} 650 651static inline __m64 __attribute__((__always_inline__, __nodebug__)) 652_mm_min_pi16(__m64 a, __m64 b) 653{ 654 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 655} 656 657static inline __m64 __attribute__((__always_inline__, __nodebug__)) 658_mm_min_pu8(__m64 a, __m64 b) 659{ 660 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 661} 662 663static inline int __attribute__((__always_inline__, __nodebug__)) 664_mm_movemask_pi8(__m64 a) 665{ 666 return __builtin_ia32_pmovmskb((__v8qi)a); 667} 668 669static inline __m64 __attribute__((__always_inline__, __nodebug__)) 670_mm_mulhi_pu16(__m64 a, __m64 b) 671{ 672 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 673} 674 675#define _mm_shuffle_pi16(a, 
n) \ 676 ((__m64)__builtin_shufflevector((__v4hi)(a), (__v4hi) {0}, \ 677 (n) & 0x3, ((n) & 0xc) >> 2, \ 678 ((n) & 0x30) >> 4, ((n) & 0xc0) >> 6)) 679 680static inline void __attribute__((__always_inline__, __nodebug__)) 681_mm_maskmove_si64(__m64 d, __m64 n, char *p) 682{ 683 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 684} 685 686static inline __m64 __attribute__((__always_inline__, __nodebug__)) 687_mm_avg_pu8(__m64 a, __m64 b) 688{ 689 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 690} 691 692static inline __m64 __attribute__((__always_inline__, __nodebug__)) 693_mm_avg_pu16(__m64 a, __m64 b) 694{ 695 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 696} 697 698static inline __m64 __attribute__((__always_inline__, __nodebug__)) 699_mm_sad_pu8(__m64 a, __m64 b) 700{ 701 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 702} 703 704static inline unsigned int __attribute__((__always_inline__, __nodebug__)) 705_mm_getcsr(void) 706{ 707 return __builtin_ia32_stmxcsr(); 708} 709 710static inline void __attribute__((__always_inline__, __nodebug__)) 711_mm_setcsr(unsigned int i) 712{ 713 __builtin_ia32_ldmxcsr(i); 714} 715 716#define _mm_shuffle_ps(a, b, mask) \ 717 (__builtin_shufflevector(a, b, (mask) & 0x3, ((mask) & 0xc) >> 2, \ 718 (((mask) & 0x30) >> 4) + 4, \ 719 (((mask) & 0xc0) >> 6) + 4)) 720 721static inline __m128 __attribute__((__always_inline__, __nodebug__)) 722_mm_unpackhi_ps(__m128 a, __m128 b) 723{ 724 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 725} 726 727static inline __m128 __attribute__((__always_inline__, __nodebug__)) 728_mm_unpacklo_ps(__m128 a, __m128 b) 729{ 730 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 731} 732 733static inline __m128 __attribute__((__always_inline__, __nodebug__)) 734_mm_move_ss(__m128 a, __m128 b) 735{ 736 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 737} 738 739static inline __m128 __attribute__((__always_inline__, __nodebug__)) 740_mm_movehl_ps(__m128 a, __m128 b) 
741{ 742 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 743} 744 745static inline __m128 __attribute__((__always_inline__, __nodebug__)) 746_mm_movelh_ps(__m128 a, __m128 b) 747{ 748 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 749} 750 751static inline __m128 __attribute__((__always_inline__, __nodebug__)) 752_mm_cvtpi16_ps(__m64 a) 753{ 754 __m64 b, c; 755 __m128 r; 756 757 b = _mm_setzero_si64(); 758 b = _mm_cmpgt_pi16(b, a); 759 c = _mm_unpackhi_pi16(a, b); 760 r = _mm_setzero_ps(); 761 r = _mm_cvtpi32_ps(r, c); 762 r = _mm_movelh_ps(r, r); 763 c = _mm_unpacklo_pi16(a, b); 764 r = _mm_cvtpi32_ps(r, c); 765 766 return r; 767} 768 769static inline __m128 __attribute__((__always_inline__, __nodebug__)) 770_mm_cvtpu16_ps(__m64 a) 771{ 772 __m64 b, c; 773 __m128 r; 774 775 b = _mm_setzero_si64(); 776 c = _mm_unpackhi_pi16(a, b); 777 r = _mm_setzero_ps(); 778 r = _mm_cvtpi32_ps(r, c); 779 r = _mm_movelh_ps(r, r); 780 c = _mm_unpacklo_pi16(a, b); 781 r = _mm_cvtpi32_ps(r, c); 782 783 return r; 784} 785 786static inline __m128 __attribute__((__always_inline__, __nodebug__)) 787_mm_cvtpi8_ps(__m64 a) 788{ 789 __m64 b; 790 791 b = _mm_setzero_si64(); 792 b = _mm_cmpgt_pi8(b, a); 793 b = _mm_unpacklo_pi8(a, b); 794 795 return _mm_cvtpi16_ps(b); 796} 797 798static inline __m128 __attribute__((__always_inline__, __nodebug__)) 799_mm_cvtpu8_ps(__m64 a) 800{ 801 __m64 b; 802 803 b = _mm_setzero_si64(); 804 b = _mm_unpacklo_pi8(a, b); 805 806 return _mm_cvtpi16_ps(b); 807} 808 809static inline __m128 __attribute__((__always_inline__, __nodebug__)) 810_mm_cvtpi32x2_ps(__m64 a, __m64 b) 811{ 812 __m128 c; 813 814 c = _mm_setzero_ps(); 815 c = _mm_cvtpi32_ps(c, b); 816 c = _mm_movelh_ps(c, c); 817 818 return _mm_cvtpi32_ps(c, a); 819} 820 821static inline __m64 __attribute__((__always_inline__, __nodebug__)) 822_mm_cvtps_pi16(__m128 a) 823{ 824 __m64 b, c; 825 826 b = _mm_cvtps_pi32(a); 827 a = _mm_movehl_ps(a, a); 828 c = _mm_cvtps_pi32(a); 829 830 return 
_mm_packs_pi16(b, c); 831} 832 833static inline __m64 __attribute__((__always_inline__, __nodebug__)) 834_mm_cvtps_pi8(__m128 a) 835{ 836 __m64 b, c; 837 838 b = _mm_cvtps_pi16(a); 839 c = _mm_setzero_si64(); 840 841 return _mm_packs_pi16(b, c); 842} 843 844static inline int __attribute__((__always_inline__, __nodebug__)) 845_mm_movemask_ps(__m128 a) 846{ 847 return __builtin_ia32_movmskps(a); 848} 849 850#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 851 852#define _MM_EXCEPT_INVALID (0x0001) 853#define _MM_EXCEPT_DENORM (0x0002) 854#define _MM_EXCEPT_DIV_ZERO (0x0004) 855#define _MM_EXCEPT_OVERFLOW (0x0008) 856#define _MM_EXCEPT_UNDERFLOW (0x0010) 857#define _MM_EXCEPT_INEXACT (0x0020) 858#define _MM_EXCEPT_MASK (0x003f) 859 860#define _MM_MASK_INVALID (0x0080) 861#define _MM_MASK_DENORM (0x0100) 862#define _MM_MASK_DIV_ZERO (0x0200) 863#define _MM_MASK_OVERFLOW (0x0400) 864#define _MM_MASK_UNDERFLOW (0x0800) 865#define _MM_MASK_INEXACT (0x1000) 866#define _MM_MASK_MASK (0x1f80) 867 868#define _MM_ROUND_NEAREST (0x0000) 869#define _MM_ROUND_DOWN (0x2000) 870#define _MM_ROUND_UP (0x4000) 871#define _MM_ROUND_TOWARD_ZERO (0x6000) 872#define _MM_ROUND_MASK (0x6000) 873 874#define _MM_FLUSH_ZERO_MASK (0x8000) 875#define _MM_FLUSH_ZERO_ON (0x8000) 876#define _MM_FLUSH_ZERO_OFF (0x8000) 877 878#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 879#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 880#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 881#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 882 883#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 884#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 885#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 886#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 887 
888#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 889do { \ 890 __m128 tmp3, tmp2, tmp1, tmp0; \ 891 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 892 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 893 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 894 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 895 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 896 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 897 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 898 (row3) = _mm_movelh_ps(tmp3, tmp1); \ 899} while (0) 900 901/* Ugly hack for backwards-compatibility (compatible with gcc) */ 902#ifdef __SSE2__ 903#include <emmintrin.h> 904#endif 905 906#endif /* __SSE__ */ 907 908#endif /* __XMMINTRIN_H */ 909