/* xmmintrin.h revision 62005c16bc3a416c1ecfe0ceaeda9a8dd3e5b0be */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __XMMINTRIN_H 25#define __XMMINTRIN_H 26 27#ifndef __SSE__ 28#error "SSE instruction set not enabled" 29#else 30 31#include <mmintrin.h> 32 33typedef float __v4sf __attribute__((__vector_size__(16))); 34typedef float __m128 __attribute__((__vector_size__(16))); 35 36#include <mm_malloc.h> 37 38static inline __m128 __attribute__((__always_inline__)) _mm_add_ss(__m128 a, __m128 b) 39{ 40 return __builtin_ia32_addss(a, b); 41} 42 43static inline __m128 __attribute__((__always_inline__)) _mm_add_ps(__m128 a, __m128 b) 44{ 45 return a + b; 46} 47 48static inline __m128 __attribute__((__always_inline__)) _mm_sub_ss(__m128 a, __m128 b) 49{ 50 return __builtin_ia32_subss(a, b); 51} 52 53static inline __m128 __attribute__((__always_inline__)) _mm_sub_ps(__m128 a, __m128 b) 54{ 55 return a - b; 56} 57 58static inline __m128 __attribute__((__always_inline__)) _mm_mul_ss(__m128 a, __m128 b) 59{ 60 return __builtin_ia32_mulss(a, b); 61} 62 63static inline __m128 __attribute__((__always_inline__)) _mm_mul_ps(__m128 a, __m128 b) 64{ 65 return a * b; 66} 67 68static inline __m128 __attribute__((__always_inline__)) _mm_div_ss(__m128 a, __m128 b) 69{ 70 return __builtin_ia32_divss(a, b); 71} 72 73static inline __m128 __attribute__((__always_inline__)) _mm_div_ps(__m128 a, __m128 b) 74{ 75 return a / b; 76} 77 78static inline __m128 __attribute__((__always_inline__)) _mm_sqrt_ss(__m128 a) 79{ 80 return __builtin_ia32_sqrtss(a); 81} 82 83static inline __m128 __attribute__((__always_inline__)) _mm_sqrt_ps(__m128 a) 84{ 85 return __builtin_ia32_sqrtps(a); 86} 87 88static inline __m128 __attribute__((__always_inline__)) _mm_rcp_ss(__m128 a) 89{ 90 return __builtin_ia32_rcpss(a); 91} 92 93static inline __m128 __attribute__((__always_inline__)) _mm_rcp_ps(__m128 a) 94{ 95 return __builtin_ia32_rcpps(a); 96} 97 98static inline __m128 __attribute__((__always_inline__)) 
_mm_rsqrt_ss(__m128 a) 99{ 100 return __builtin_ia32_rsqrtss(a); 101} 102 103static inline __m128 __attribute__((__always_inline__)) _mm_rsqrt_ps(__m128 a) 104{ 105 return __builtin_ia32_rsqrtps(a); 106} 107 108static inline __m128 __attribute__((__always_inline__)) _mm_min_ss(__m128 a, __m128 b) 109{ 110 return __builtin_ia32_minss(a, b); 111} 112 113static inline __m128 __attribute__((__always_inline__)) _mm_min_ps(__m128 a, __m128 b) 114{ 115 return __builtin_ia32_minps(a, b); 116} 117 118static inline __m128 __attribute__((__always_inline__)) _mm_max_ss(__m128 a, __m128 b) 119{ 120 return __builtin_ia32_maxss(a, b); 121} 122 123static inline __m128 __attribute__((__always_inline__)) _mm_max_ps(__m128 a, __m128 b) 124{ 125 return __builtin_ia32_maxps(a, b); 126} 127 128static inline __m128 __attribute__((__always_inline__)) _mm_and_ps(__m128 a, __m128 b) 129{ 130 return __builtin_ia32_andps(a, b); 131} 132 133static inline __m128 __attribute__((__always_inline__)) _mm_andnot_ps(__m128 a, __m128 b) 134{ 135 return __builtin_ia32_andnps(a, b); 136} 137 138static inline __m128 __attribute__((__always_inline__)) _mm_or_ps(__m128 a, __m128 b) 139{ 140 return __builtin_ia32_orps(a, b); 141} 142 143static inline __m128 __attribute__((__always_inline__)) _mm_xor_ps(__m128 a, __m128 b) 144{ 145 return __builtin_ia32_xorps(a, b); 146} 147 148static inline __m128 __attribute__((__always_inline__)) _mm_cmpeq_ss(__m128 a, __m128 b) 149{ 150 return (__m128)__builtin_ia32_cmpeqss(a, b); 151} 152 153static inline __m128 __attribute__((__always_inline__)) _mm_cmpeq_ps(__m128 a, __m128 b) 154{ 155 return (__m128)__builtin_ia32_cmpeqps(a, b); 156} 157 158static inline __m128 __attribute__((__always_inline__)) _mm_cmplt_ss(__m128 a, __m128 b) 159{ 160 return (__m128)__builtin_ia32_cmpltss(a, b); 161} 162 163static inline __m128 __attribute__((__always_inline__)) _mm_cmplt_ps(__m128 a, __m128 b) 164{ 165 return (__m128)__builtin_ia32_cmpltps(a, b); 166} 167 168static inline __m128 
__attribute__((__always_inline__)) _mm_cmple_ss(__m128 a, __m128 b) 169{ 170 return (__m128)__builtin_ia32_cmpless(a, b); 171} 172 173static inline __m128 __attribute__((__always_inline__)) _mm_cmple_ps(__m128 a, __m128 b) 174{ 175 return (__m128)__builtin_ia32_cmpleps(a, b); 176} 177 178static inline __m128 __attribute__((__always_inline__)) _mm_cmpgt_ss(__m128 a, __m128 b) 179{ 180 return (__m128)__builtin_ia32_cmpltss(b, a); 181} 182 183static inline __m128 __attribute__((__always_inline__)) _mm_cmpgt_ps(__m128 a, __m128 b) 184{ 185 return (__m128)__builtin_ia32_cmpltps(b, a); 186} 187 188static inline __m128 __attribute__((__always_inline__)) _mm_cmpge_ss(__m128 a, __m128 b) 189{ 190 return (__m128)__builtin_ia32_cmpless(b, a); 191} 192 193static inline __m128 __attribute__((__always_inline__)) _mm_cmpge_ps(__m128 a, __m128 b) 194{ 195 return (__m128)__builtin_ia32_cmpleps(b, a); 196} 197 198static inline __m128 __attribute__((__always_inline__)) _mm_cmpneq_ss(__m128 a, __m128 b) 199{ 200 return (__m128)__builtin_ia32_cmpneqss(a, b); 201} 202 203static inline __m128 __attribute__((__always_inline__)) _mm_cmpneq_ps(__m128 a, __m128 b) 204{ 205 return (__m128)__builtin_ia32_cmpneqps(a, b); 206} 207 208static inline __m128 __attribute__((__always_inline__)) _mm_cmpnlt_ss(__m128 a, __m128 b) 209{ 210 return (__m128)__builtin_ia32_cmpnltss(a, b); 211} 212 213static inline __m128 __attribute__((__always_inline__)) _mm_cmpnlt_ps(__m128 a, __m128 b) 214{ 215 return (__m128)__builtin_ia32_cmpnltps(a, b); 216} 217 218static inline __m128 __attribute__((__always_inline__)) _mm_cmpnle_ss(__m128 a, __m128 b) 219{ 220 return (__m128)__builtin_ia32_cmpnless(a, b); 221} 222 223static inline __m128 __attribute__((__always_inline__)) _mm_cmpnle_ps(__m128 a, __m128 b) 224{ 225 return (__m128)__builtin_ia32_cmpnleps(a, b); 226} 227 228static inline __m128 __attribute__((__always_inline__)) _mm_cmpngt_ss(__m128 a, __m128 b) 229{ 230 return (__m128)__builtin_ia32_cmpnltss(b, a); 
231} 232 233static inline __m128 __attribute__((__always_inline__)) _mm_cmpngt_ps(__m128 a, __m128 b) 234{ 235 return (__m128)__builtin_ia32_cmpnltps(b, a); 236} 237 238static inline __m128 __attribute__((__always_inline__)) _mm_cmpnge_ss(__m128 a, __m128 b) 239{ 240 return (__m128)__builtin_ia32_cmpnless(b, a); 241} 242 243static inline __m128 __attribute__((__always_inline__)) _mm_cmpnge_ps(__m128 a, __m128 b) 244{ 245 return (__m128)__builtin_ia32_cmpnleps(b, a); 246} 247 248static inline __m128 __attribute__((__always_inline__)) _mm_cmpord_ss(__m128 a, __m128 b) 249{ 250 return (__m128)__builtin_ia32_cmpordss(a, b); 251} 252 253static inline __m128 __attribute__((__always_inline__)) _mm_cmpord_ps(__m128 a, __m128 b) 254{ 255 return (__m128)__builtin_ia32_cmpordps(a, b); 256} 257 258static inline __m128 __attribute__((__always_inline__)) _mm_cmpunord_ss(__m128 a, __m128 b) 259{ 260 return (__m128)__builtin_ia32_cmpunordss(a, b); 261} 262 263static inline __m128 __attribute__((__always_inline__)) _mm_cmpunord_ps(__m128 a, __m128 b) 264{ 265 return (__m128)__builtin_ia32_cmpunordps(a, b); 266} 267 268static inline int __attribute__((__always_inline__)) _mm_comieq_ss(__m128 a, __m128 b) 269{ 270 return __builtin_ia32_comieq(a, b); 271} 272 273static inline int __attribute__((__always_inline__)) _mm_comilt_ss(__m128 a, __m128 b) 274{ 275 return __builtin_ia32_comilt(a, b); 276} 277 278static inline int __attribute__((__always_inline__)) _mm_comile_ss(__m128 a, __m128 b) 279{ 280 return __builtin_ia32_comile(a, b); 281} 282 283static inline int __attribute__((__always_inline__)) _mm_comigt_ss(__m128 a, __m128 b) 284{ 285 return __builtin_ia32_comigt(a, b); 286} 287 288static inline int __attribute__((__always_inline__)) _mm_comige_ss(__m128 a, __m128 b) 289{ 290 return __builtin_ia32_comige(a, b); 291} 292 293static inline int __attribute__((__always_inline__)) _mm_comineq_ss(__m128 a, __m128 b) 294{ 295 return __builtin_ia32_comineq(a, b); 296} 297 298static inline 
int __attribute__((__always_inline__)) _mm_ucomieq_ss(__m128 a, __m128 b) 299{ 300 return __builtin_ia32_ucomieq(a, b); 301} 302 303static inline int __attribute__((__always_inline__)) _mm_ucomilt_ss(__m128 a, __m128 b) 304{ 305 return __builtin_ia32_ucomilt(a, b); 306} 307 308static inline int __attribute__((__always_inline__)) _mm_ucomile_ss(__m128 a, __m128 b) 309{ 310 return __builtin_ia32_ucomile(a, b); 311} 312 313static inline int __attribute__((__always_inline__)) _mm_ucomigt_ss(__m128 a, __m128 b) 314{ 315 return __builtin_ia32_ucomigt(a, b); 316} 317 318static inline int __attribute__((__always_inline__)) _mm_ucomige_ss(__m128 a, __m128 b) 319{ 320 return __builtin_ia32_ucomige(a, b); 321} 322 323static inline int __attribute__((__always_inline__)) _mm_ucomineq_ss(__m128 a, __m128 b) 324{ 325 return __builtin_ia32_ucomineq(a, b); 326} 327 328static inline int __attribute__((__always_inline__)) _mm_cvtss_si32(__m128 a) 329{ 330 return __builtin_ia32_cvtss2si(a); 331} 332 333static inline long long __attribute__((__always_inline__)) _mm_cvtss_si64(__m128 a) 334{ 335 return __builtin_ia32_cvtss2si64(a); 336} 337 338static inline __m64 __attribute__((__always_inline__)) _mm_cvtps_pi32(__m128 a) 339{ 340 return (__m64)__builtin_ia32_cvtps2pi(a); 341} 342 343static inline int __attribute__((__always_inline__)) _mm_cvttss_si32(__m128 a) 344{ 345 return __builtin_ia32_cvttss2si(a); 346} 347 348static inline long long __attribute__((__always_inline__)) _mm_cvttss_si64(__m128 a) 349{ 350 return __builtin_ia32_cvttss2si64(a); 351} 352 353static inline __m64 __attribute__((__always_inline__)) _mm_cvttps_pi32(__m128 a) 354{ 355 return (__m64)__builtin_ia32_cvttps2pi(a); 356} 357 358static inline __m128 __attribute__((__always_inline__)) _mm_cvtsi32_ss(__m128 a, int b) 359{ 360 return __builtin_ia32_cvtsi2ss(a, b); 361} 362 363#ifdef __x86_64__ 364 365static inline __m128 __attribute__((__always_inline__)) _mm_cvtsi64_ss(__m128 a, long long b) 366{ 367 return 
__builtin_ia32_cvtsi642ss(a, b); 368} 369 370#endif 371 372static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi32_ps(__m128 a, __m64 b) 373{ 374 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 375} 376 377static inline float __attribute__((__always_inline__)) _mm_cvtss_f32(__m128 a) 378{ 379 return a[0]; 380} 381 382static inline __m128 __attribute__((__always_inline__)) _mm_loadh_pi(__m128 a, __m64 const *p) 383{ 384 return __builtin_ia32_loadhps(a, (__v2si *)p); 385} 386 387static inline __m128 __attribute__((__always_inline__)) _mm_loadl_pi(__m128 a, __m64 const *p) 388{ 389 return __builtin_ia32_loadlps(a, (__v2si *)p); 390} 391 392static inline __m128 __attribute__((__always_inline__)) _mm_load_ss(float *p) 393{ 394 return (__m128){ *p, 0, 0, 0 }; 395} 396 397static inline __m128 __attribute__((__always_inline__)) _mm_load1_ps(float *p) 398{ 399 return (__m128){ *p, *p, *p, *p }; 400} 401 402static inline __m128 __attribute__((__always_inline__)) _mm_load_ps(float *p) 403{ 404 return *(__m128*)p; 405} 406 407static inline __m128 __attribute__((__always_inline__)) _mm_loadu_ps(float *p) 408{ 409 return __builtin_ia32_loadups(p); 410} 411 412static inline __m128 __attribute__((__always_inline__)) _mm_loadr_ps(float *p) 413{ 414 __m128 a = _mm_load_ps(p); 415 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 416} 417 418static inline __m128 __attribute__((__always_inline__)) _mm_set_ss(float w) 419{ 420 return (__m128){ w, 0, 0, 0 }; 421} 422 423static inline __m128 __attribute__((__always_inline__)) _mm_set1_ps(float w) 424{ 425 return (__m128){ w, w, w, w }; 426} 427 428// Microsoft specific. 
429static inline __m128 __attribute__((__always_inline__)) _mm_set_ps1(float w) 430{ 431 return _mm_set1_ps(w); 432} 433 434static inline __m128 __attribute__((__always_inline__)) _mm_set_ps(float z, float y, float x, float w) 435{ 436 return (__m128){ w, x, y, z }; 437} 438 439static inline __m128 __attribute__((__always_inline__)) _mm_setr_ps(float z, float y, float x, float w) 440{ 441 return (__m128){ z, y, x, w }; 442} 443 444static inline __m128 __attribute__((__always__inline__)) _mm_setzero_ps(void) 445{ 446 return (__m128){ 0, 0, 0, 0 }; 447} 448 449static inline void __attribute__((__always__inline__)) _mm_storeh_pi(__m64 *p, __m128 a) 450{ 451 __builtin_ia32_storehps((__v2si *)p, a); 452} 453 454static inline void __attribute__((__always__inline__)) _mm_storel_pi(__m64 *p, __m128 a) 455{ 456 __builtin_ia32_storelps((__v2si *)p, a); 457} 458 459static inline void __attribute__((__always__inline__)) _mm_store_ss(float *p, __m128 a) 460{ 461 *p = a[0]; 462} 463 464static inline void __attribute__((__always_inline__)) _mm_storeu_ps(float *p, __m128 a) 465{ 466 __builtin_ia32_storeups(p, a); 467} 468 469static inline void __attribute__((__always_inline__)) _mm_store1_ps(float *p, __m128 a) 470{ 471 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 472 _mm_storeu_ps(p, a); 473} 474 475static inline void __attribute__((__always_inline__)) _mm_store_ps(float *p, __m128 a) 476{ 477 *(__m128 *)p = a; 478} 479 480static inline void __attribute__((__always_inline__)) _mm_storer_ps(float *p, __m128 a) 481{ 482 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 483 _mm_store_ps(p, a); 484} 485 486#define _MM_HINT_T0 1 487#define _MM_HINT_T1 2 488#define _MM_HINT_T2 3 489#define _MM_HINT_NTA 0 490 491/* FIXME: We have to #define this because "sel" must be a constant integer, and 492 Sema doesn't do any form of constant propagation yet. 
*/ 493 494#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 495 496static inline void __attribute__((__always_inline__)) _mm_stream_pi(__m64 *p, __m64 a) 497{ 498 __builtin_ia32_movntq(p, a); 499} 500 501static inline void __attribute__((__always_inline__)) _mm_stream_ps(float *p, __m128 a) 502{ 503 __builtin_ia32_movntps(p, a); 504} 505 506static inline void __attribute__((__always_inline__)) _mm_sfence(void) 507{ 508 __builtin_ia32_sfence(); 509} 510 511static inline int __attribute__((__always_inline__)) _mm_extract_pi16(__m64 a, int n) 512{ 513 /* FIXME: 514 * This should force n to be an immediate. 515 * This does not use the PEXTRW instruction. From looking at the LLVM source, the 516 instruction doesn't seem to be hooked up. 517 * The code could probably be made better :) 518 */ 519 __v4hi b = (__v4hi)a; 520 return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))]; 521} 522 523/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to 524 the already existing __builtin_shufflevector. 
525*/ 526/* 527static inline __m64 __attribute__((__always_inline__)) _mm_insert_pi16(__m64 a, int d, int n) 528{ 529 return (__m64){ 0LL }; 530} 531*/ 532 533static inline __m64 __attribute__((__always_inline__)) _mm_max_pi16(__m64 a, __m64 b) 534{ 535 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 536} 537 538static inline __m64 __attribute__((__always_inline__)) _mm_max_pu8(__m64 a, __m64 b) 539{ 540 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 541} 542 543static inline __m64 __attribute__((__always_inline__)) _mm_min_pi16(__m64 a, __m64 b) 544{ 545 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 546} 547 548static inline __m64 __attribute__((__always_inline__)) _mm_min_pu8(__m64 a, __m64 b) 549{ 550 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 551} 552 553static inline int __attribute__((__always_inline__)) _mm_movemask_pi8(__m64 a) 554{ 555 return __builtin_ia32_pmovmskb((__v8qi)a); 556} 557 558static inline __m64 __attribute__((__always_inline__)) _mm_mulhi_pu16(__m64 a, __m64 b) 559{ 560 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 561} 562 563#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)a, n)) 564 565static inline void __attribute__((__always_inline__)) _mm_maskmove_si64(__m64 d, __m64 n, char *p) 566{ 567 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 568} 569 570static inline __m64 __attribute__((__always_inline__)) _mm_avg_pu8(__m64 a, __m64 b) 571{ 572 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 573} 574 575static inline __m64 __attribute__((__always_inline__)) _mm_avg_pu16(__m64 a, __m64 b) 576{ 577 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 578} 579 580static inline __m64 __attribute__((__always_inline___)) _mm_sad_pu8(__m64 a, __m64 b) 581{ 582 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 583} 584 585static inline unsigned int __attribute__((__always_inline___)) _mm_getcsr(void) 586{ 587 return 
__builtin_ia32_stmxcsr(); 588} 589 590static inline void __attribute__((__always_inline__)) _mm_setcsr(unsigned int i) 591{ 592 __builtin_ia32_ldmxcsr(i); 593} 594 595#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask)) 596 597static inline __m128 __attribute__((__always_inline__)) _mm_unpackhi_ps(__m128 a, __m128 b) 598{ 599 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 600} 601 602static inline __m128 __attribute__((__always_inline__)) _mm_unpacklo_ps(__m128 a, __m128 b) 603{ 604 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 605} 606 607static inline __m128 __attribute__((__always_inline__)) _mm_move_ss(__m128 a, __m128 b) 608{ 609 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 610} 611 612static inline __m128 __attribute__((__always_inline__)) _mm_movehl_ps(__m128 a, __m128 b) 613{ 614 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 615} 616 617static inline __m128 __attribute__((__always_inline__)) _mm_movelh_ps(__m128 a, __m128 b) 618{ 619 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 620} 621 622static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi16_ps(__m64 a) 623{ 624 __m64 b, c; 625 __m128 r; 626 627 b = _mm_setzero_si64(); 628 b = _mm_cmpgt_pi16(b, a); 629 c = _mm_unpackhi_pi16(a, b); 630 r = _mm_setzero_ps(); 631 r = _mm_cvtpi32_ps(r, c); 632 r = _mm_movelh_ps(r, r); 633 c = _mm_unpacklo_pi16(a, b); 634 r = _mm_cvtpi32_ps(r, c); 635 636 return r; 637} 638 639static inline __m128 __attribute__((__always_inline__)) _mm_cvtpu16_ps(__m64 a) 640{ 641 __m64 b, c; 642 __m128 r; 643 644 b = _mm_setzero_si64(); 645 c = _mm_unpackhi_pi16(a, b); 646 r = _mm_setzero_ps(); 647 r = _mm_cvtpi32_ps(r, c); 648 r = _mm_movelh_ps(r, r); 649 c = _mm_unpacklo_pi16(a, b); 650 r = _mm_cvtpi32_ps(r, c); 651 652 return r; 653} 654 655static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi8_ps(__m64 a) 656{ 657 __m64 b; 658 659 b = _mm_setzero_si64(); 660 b = _mm_cmpgt_pi8(b, a); 661 b = _mm_unpacklo_pi8(a, b); 662 663 
return _mm_cvtpi16_ps(b); 664} 665 666static inline __m128 __attribute__((__always_inline__)) _mm_cvtpu8_ps(__m64 a) 667{ 668 __m64 b; 669 670 b = _mm_setzero_si64(); 671 b = _mm_unpacklo_pi8(a, b); 672 673 return _mm_cvtpi16_ps(b); 674} 675 676static inline __m128 __attribute__((__always_inline__)) _mm_cvtpi32x2_ps(__m64 a, __m64 b) 677{ 678 __m128 c; 679 680 c = _mm_setzero_ps(); 681 c = _mm_cvtpi32_ps(c, b); 682 c = _mm_movelh_ps(c, c); 683 684 return _mm_cvtpi32_ps(c, a); 685} 686 687static inline __m64 __attribute__((__always_inline__)) _mm_cvtps_pi16(__m128 a) 688{ 689 __m64 b, c; 690 691 b = _mm_cvtps_pi32(a); 692 a = _mm_movehl_ps(a, a); 693 c = _mm_cvtps_pi32(a); 694 695 return _mm_packs_pi16(b, c); 696} 697 698static inline __m64 __attribute__((__always_inline__)) _mm_cvtps_pi8(__m128 a) 699{ 700 __m64 b, c; 701 702 b = _mm_cvtps_pi16(a); 703 c = _mm_setzero_si64(); 704 705 return _mm_packs_pi16(b, c); 706} 707 708static inline int __attribute__((__always_inline__)) _mm_movemask_ps(__m128 a) 709{ 710 return __builtin_ia32_movmskps(a); 711} 712 713#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 714 715#define _MM_MASK_MASK (0x1f80) 716#define _MM_EXCEPT_MASK (0x003f) 717#define _MM_FLUSH_ZERO_MASK (0x8000) 718#define _MM_ROUND_MASK (0x6000) 719 720#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 721#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 722#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 723#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 724 725#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 726#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 727#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 728#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 729 730#define _MM_TRANSPOSE4_PS(row0, row1, 
row2, row3) \ 731do { \ 732 __m128 tmp3, tmp2, tmp1, tmp0; \ 733 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 734 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 735 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 736 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 737 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 738 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 739 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 740 (row3) = _mm_movelh_ps(tmp3, tmp1); \ 741} while (0) 742 743#include <emmintrin.h> 744 745#endif /* __SSE__ */ 746 747#endif /* __XMMINTRIN_H */ 748