/* xmmintrin.h revision a2f12ae0e3893cfa703abbe43c74d513abebe1a1 */
1/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __XMMINTRIN_H 25#define __XMMINTRIN_H 26 27#ifndef __SSE__ 28#error "SSE instruction set not enabled" 29#else 30 31#include <mmintrin.h> 32 33typedef float __v4sf __attribute__((__vector_size__(16))); 34typedef float __m128 __attribute__((__vector_size__(16))); 35 36#include <mm_malloc.h> 37 38static inline __m128 __attribute__((__always_inline__, __nodebug__)) 39_mm_add_ss(__m128 a, __m128 b) 40{ 41 return __builtin_ia32_addss(a, b); 42} 43 44static inline __m128 __attribute__((__always_inline__, __nodebug__)) 45_mm_add_ps(__m128 a, __m128 b) 46{ 47 return a + b; 48} 49 50static inline __m128 __attribute__((__always_inline__, __nodebug__)) 51_mm_sub_ss(__m128 a, __m128 b) 52{ 53 return __builtin_ia32_subss(a, b); 54} 55 56static inline __m128 __attribute__((__always_inline__, __nodebug__)) 57_mm_sub_ps(__m128 a, __m128 b) 58{ 59 return a - b; 60} 61 62static inline __m128 __attribute__((__always_inline__, __nodebug__)) 63_mm_mul_ss(__m128 a, __m128 b) 64{ 65 return __builtin_ia32_mulss(a, b); 66} 67 68static inline __m128 __attribute__((__always_inline__, __nodebug__)) 69_mm_mul_ps(__m128 a, __m128 b) 70{ 71 return a * b; 72} 73 74static inline __m128 __attribute__((__always_inline__, __nodebug__)) 75_mm_div_ss(__m128 a, __m128 b) 76{ 77 return __builtin_ia32_divss(a, b); 78} 79 80static inline __m128 __attribute__((__always_inline__, __nodebug__)) 81_mm_div_ps(__m128 a, __m128 b) 82{ 83 return a / b; 84} 85 86static inline __m128 __attribute__((__always_inline__, __nodebug__)) 87_mm_sqrt_ss(__m128 a) 88{ 89 return __builtin_ia32_sqrtss(a); 90} 91 92static inline __m128 __attribute__((__always_inline__, __nodebug__)) 93_mm_sqrt_ps(__m128 a) 94{ 95 return __builtin_ia32_sqrtps(a); 96} 97 98static inline __m128 __attribute__((__always_inline__, __nodebug__)) 99_mm_rcp_ss(__m128 a) 100{ 101 return __builtin_ia32_rcpss(a); 102} 103 104static inline __m128 
__attribute__((__always_inline__, __nodebug__)) 105_mm_rcp_ps(__m128 a) 106{ 107 return __builtin_ia32_rcpps(a); 108} 109 110static inline __m128 __attribute__((__always_inline__, __nodebug__)) 111_mm_rsqrt_ss(__m128 a) 112{ 113 return __builtin_ia32_rsqrtss(a); 114} 115 116static inline __m128 __attribute__((__always_inline__, __nodebug__)) 117_mm_rsqrt_ps(__m128 a) 118{ 119 return __builtin_ia32_rsqrtps(a); 120} 121 122static inline __m128 __attribute__((__always_inline__, __nodebug__)) 123_mm_min_ss(__m128 a, __m128 b) 124{ 125 return __builtin_ia32_minss(a, b); 126} 127 128static inline __m128 __attribute__((__always_inline__, __nodebug__)) 129_mm_min_ps(__m128 a, __m128 b) 130{ 131 return __builtin_ia32_minps(a, b); 132} 133 134static inline __m128 __attribute__((__always_inline__, __nodebug__)) 135_mm_max_ss(__m128 a, __m128 b) 136{ 137 return __builtin_ia32_maxss(a, b); 138} 139 140static inline __m128 __attribute__((__always_inline__, __nodebug__)) 141_mm_max_ps(__m128 a, __m128 b) 142{ 143 return __builtin_ia32_maxps(a, b); 144} 145 146static inline __m128 __attribute__((__always_inline__, __nodebug__)) 147_mm_and_ps(__m128 a, __m128 b) 148{ 149 return __builtin_ia32_andps(a, b); 150} 151 152static inline __m128 __attribute__((__always_inline__, __nodebug__)) 153_mm_andnot_ps(__m128 a, __m128 b) 154{ 155 return __builtin_ia32_andnps(a, b); 156} 157 158static inline __m128 __attribute__((__always_inline__, __nodebug__)) 159_mm_or_ps(__m128 a, __m128 b) 160{ 161 return __builtin_ia32_orps(a, b); 162} 163 164static inline __m128 __attribute__((__always_inline__, __nodebug__)) 165_mm_xor_ps(__m128 a, __m128 b) 166{ 167 return __builtin_ia32_xorps(a, b); 168} 169 170static inline __m128 __attribute__((__always_inline__, __nodebug__)) 171_mm_cmpeq_ss(__m128 a, __m128 b) 172{ 173 return (__m128)__builtin_ia32_cmpeqss(a, b); 174} 175 176static inline __m128 __attribute__((__always_inline__, __nodebug__)) 177_mm_cmpeq_ps(__m128 a, __m128 b) 178{ 179 return 
(__m128)__builtin_ia32_cmpeqps(a, b); 180} 181 182static inline __m128 __attribute__((__always_inline__, __nodebug__)) 183_mm_cmplt_ss(__m128 a, __m128 b) 184{ 185 return (__m128)__builtin_ia32_cmpltss(a, b); 186} 187 188static inline __m128 __attribute__((__always_inline__, __nodebug__)) 189_mm_cmplt_ps(__m128 a, __m128 b) 190{ 191 return (__m128)__builtin_ia32_cmpltps(a, b); 192} 193 194static inline __m128 __attribute__((__always_inline__, __nodebug__)) 195_mm_cmple_ss(__m128 a, __m128 b) 196{ 197 return (__m128)__builtin_ia32_cmpless(a, b); 198} 199 200static inline __m128 __attribute__((__always_inline__, __nodebug__)) 201_mm_cmple_ps(__m128 a, __m128 b) 202{ 203 return (__m128)__builtin_ia32_cmpleps(a, b); 204} 205 206static inline __m128 __attribute__((__always_inline__, __nodebug__)) 207_mm_cmpgt_ss(__m128 a, __m128 b) 208{ 209 return (__m128)__builtin_ia32_cmpltss(b, a); 210} 211 212static inline __m128 __attribute__((__always_inline__, __nodebug__)) 213_mm_cmpgt_ps(__m128 a, __m128 b) 214{ 215 return (__m128)__builtin_ia32_cmpltps(b, a); 216} 217 218static inline __m128 __attribute__((__always_inline__, __nodebug__)) 219_mm_cmpge_ss(__m128 a, __m128 b) 220{ 221 return (__m128)__builtin_ia32_cmpless(b, a); 222} 223 224static inline __m128 __attribute__((__always_inline__, __nodebug__)) 225_mm_cmpge_ps(__m128 a, __m128 b) 226{ 227 return (__m128)__builtin_ia32_cmpleps(b, a); 228} 229 230static inline __m128 __attribute__((__always_inline__, __nodebug__)) 231_mm_cmpneq_ss(__m128 a, __m128 b) 232{ 233 return (__m128)__builtin_ia32_cmpneqss(a, b); 234} 235 236static inline __m128 __attribute__((__always_inline__, __nodebug__)) 237_mm_cmpneq_ps(__m128 a, __m128 b) 238{ 239 return (__m128)__builtin_ia32_cmpneqps(a, b); 240} 241 242static inline __m128 __attribute__((__always_inline__, __nodebug__)) 243_mm_cmpnlt_ss(__m128 a, __m128 b) 244{ 245 return (__m128)__builtin_ia32_cmpnltss(a, b); 246} 247 248static inline __m128 __attribute__((__always_inline__, 
__nodebug__)) 249_mm_cmpnlt_ps(__m128 a, __m128 b) 250{ 251 return (__m128)__builtin_ia32_cmpnltps(a, b); 252} 253 254static inline __m128 __attribute__((__always_inline__, __nodebug__)) 255_mm_cmpnle_ss(__m128 a, __m128 b) 256{ 257 return (__m128)__builtin_ia32_cmpnless(a, b); 258} 259 260static inline __m128 __attribute__((__always_inline__, __nodebug__)) 261_mm_cmpnle_ps(__m128 a, __m128 b) 262{ 263 return (__m128)__builtin_ia32_cmpnleps(a, b); 264} 265 266static inline __m128 __attribute__((__always_inline__, __nodebug__)) 267_mm_cmpngt_ss(__m128 a, __m128 b) 268{ 269 return (__m128)__builtin_ia32_cmpnltss(b, a); 270} 271 272static inline __m128 __attribute__((__always_inline__, __nodebug__)) 273_mm_cmpngt_ps(__m128 a, __m128 b) 274{ 275 return (__m128)__builtin_ia32_cmpnltps(b, a); 276} 277 278static inline __m128 __attribute__((__always_inline__, __nodebug__)) 279_mm_cmpnge_ss(__m128 a, __m128 b) 280{ 281 return (__m128)__builtin_ia32_cmpnless(b, a); 282} 283 284static inline __m128 __attribute__((__always_inline__, __nodebug__)) 285_mm_cmpnge_ps(__m128 a, __m128 b) 286{ 287 return (__m128)__builtin_ia32_cmpnleps(b, a); 288} 289 290static inline __m128 __attribute__((__always_inline__, __nodebug__)) 291_mm_cmpord_ss(__m128 a, __m128 b) 292{ 293 return (__m128)__builtin_ia32_cmpordss(a, b); 294} 295 296static inline __m128 __attribute__((__always_inline__, __nodebug__)) 297_mm_cmpord_ps(__m128 a, __m128 b) 298{ 299 return (__m128)__builtin_ia32_cmpordps(a, b); 300} 301 302static inline __m128 __attribute__((__always_inline__, __nodebug__)) 303_mm_cmpunord_ss(__m128 a, __m128 b) 304{ 305 return (__m128)__builtin_ia32_cmpunordss(a, b); 306} 307 308static inline __m128 __attribute__((__always_inline__, __nodebug__)) 309_mm_cmpunord_ps(__m128 a, __m128 b) 310{ 311 return (__m128)__builtin_ia32_cmpunordps(a, b); 312} 313 314static inline int __attribute__((__always_inline__, __nodebug__)) 315_mm_comieq_ss(__m128 a, __m128 b) 316{ 317 return __builtin_ia32_comieq(a, 
b); 318} 319 320static inline int __attribute__((__always_inline__, __nodebug__)) 321_mm_comilt_ss(__m128 a, __m128 b) 322{ 323 return __builtin_ia32_comilt(a, b); 324} 325 326static inline int __attribute__((__always_inline__, __nodebug__)) 327_mm_comile_ss(__m128 a, __m128 b) 328{ 329 return __builtin_ia32_comile(a, b); 330} 331 332static inline int __attribute__((__always_inline__, __nodebug__)) 333_mm_comigt_ss(__m128 a, __m128 b) 334{ 335 return __builtin_ia32_comigt(a, b); 336} 337 338static inline int __attribute__((__always_inline__, __nodebug__)) 339_mm_comige_ss(__m128 a, __m128 b) 340{ 341 return __builtin_ia32_comige(a, b); 342} 343 344static inline int __attribute__((__always_inline__, __nodebug__)) 345_mm_comineq_ss(__m128 a, __m128 b) 346{ 347 return __builtin_ia32_comineq(a, b); 348} 349 350static inline int __attribute__((__always_inline__, __nodebug__)) 351_mm_ucomieq_ss(__m128 a, __m128 b) 352{ 353 return __builtin_ia32_ucomieq(a, b); 354} 355 356static inline int __attribute__((__always_inline__, __nodebug__)) 357_mm_ucomilt_ss(__m128 a, __m128 b) 358{ 359 return __builtin_ia32_ucomilt(a, b); 360} 361 362static inline int __attribute__((__always_inline__, __nodebug__)) 363_mm_ucomile_ss(__m128 a, __m128 b) 364{ 365 return __builtin_ia32_ucomile(a, b); 366} 367 368static inline int __attribute__((__always_inline__, __nodebug__)) 369_mm_ucomigt_ss(__m128 a, __m128 b) 370{ 371 return __builtin_ia32_ucomigt(a, b); 372} 373 374static inline int __attribute__((__always_inline__, __nodebug__)) 375_mm_ucomige_ss(__m128 a, __m128 b) 376{ 377 return __builtin_ia32_ucomige(a, b); 378} 379 380static inline int __attribute__((__always_inline__, __nodebug__)) 381_mm_ucomineq_ss(__m128 a, __m128 b) 382{ 383 return __builtin_ia32_ucomineq(a, b); 384} 385 386static inline int __attribute__((__always_inline__, __nodebug__)) 387_mm_cvtss_si32(__m128 a) 388{ 389 return __builtin_ia32_cvtss2si(a); 390} 391 392static inline long long __attribute__((__always_inline__, 
__nodebug__)) 393_mm_cvtss_si64(__m128 a) 394{ 395 return __builtin_ia32_cvtss2si64(a); 396} 397 398static inline __m64 __attribute__((__always_inline__, __nodebug__)) 399_mm_cvtps_pi32(__m128 a) 400{ 401 return (__m64)__builtin_ia32_cvtps2pi(a); 402} 403 404static inline int __attribute__((__always_inline__, __nodebug__)) 405_mm_cvttss_si32(__m128 a) 406{ 407 return __builtin_ia32_cvttss2si(a); 408} 409 410static inline long long __attribute__((__always_inline__, __nodebug__)) 411_mm_cvttss_si64(__m128 a) 412{ 413 return __builtin_ia32_cvttss2si64(a); 414} 415 416static inline __m64 __attribute__((__always_inline__, __nodebug__)) 417_mm_cvttps_pi32(__m128 a) 418{ 419 return (__m64)__builtin_ia32_cvttps2pi(a); 420} 421 422static inline __m128 __attribute__((__always_inline__, __nodebug__)) 423_mm_cvtsi32_ss(__m128 a, int b) 424{ 425 return __builtin_ia32_cvtsi2ss(a, b); 426} 427 428#ifdef __x86_64__ 429 430static inline __m128 __attribute__((__always_inline__, __nodebug__)) 431_mm_cvtsi64_ss(__m128 a, long long b) 432{ 433 return __builtin_ia32_cvtsi642ss(a, b); 434} 435 436#endif 437 438static inline __m128 __attribute__((__always_inline__, __nodebug__)) 439_mm_cvtpi32_ps(__m128 a, __m64 b) 440{ 441 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 442} 443 444static inline float __attribute__((__always_inline__, __nodebug__)) 445_mm_cvtss_f32(__m128 a) 446{ 447 return a[0]; 448} 449 450static inline __m128 __attribute__((__always_inline__, __nodebug__)) 451_mm_loadh_pi(__m128 a, __m64 const *p) 452{ 453 return __builtin_ia32_loadhps(a, (__v2si *)p); 454} 455 456static inline __m128 __attribute__((__always_inline__, __nodebug__)) 457_mm_loadl_pi(__m128 a, __m64 const *p) 458{ 459 return __builtin_ia32_loadlps(a, (__v2si *)p); 460} 461 462static inline __m128 __attribute__((__always_inline__, __nodebug__)) 463_mm_load_ss(float *p) 464{ 465 return (__m128){ *p, 0, 0, 0 }; 466} 467 468static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
469_mm_load1_ps(float *p) 470{ 471 return (__m128){ *p, *p, *p, *p }; 472} 473 474static inline __m128 __attribute__((__always_inline__, __nodebug__)) 475_mm_load_ps(float *p) 476{ 477 return *(__m128*)p; 478} 479 480static inline __m128 __attribute__((__always_inline__, __nodebug__)) 481_mm_loadu_ps(float *p) 482{ 483 return __builtin_ia32_loadups(p); 484} 485 486static inline __m128 __attribute__((__always_inline__, __nodebug__)) 487_mm_loadr_ps(float *p) 488{ 489 __m128 a = _mm_load_ps(p); 490 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 491} 492 493static inline __m128 __attribute__((__always_inline__, __nodebug__)) 494_mm_set_ss(float w) 495{ 496 return (__m128){ w, 0, 0, 0 }; 497} 498 499static inline __m128 __attribute__((__always_inline__, __nodebug__)) 500_mm_set1_ps(float w) 501{ 502 return (__m128){ w, w, w, w }; 503} 504 505// Microsoft specific. 506static inline __m128 __attribute__((__always_inline__, __nodebug__)) 507_mm_set_ps1(float w) 508{ 509 return _mm_set1_ps(w); 510} 511 512static inline __m128 __attribute__((__always_inline__, __nodebug__)) 513_mm_set_ps(float z, float y, float x, float w) 514{ 515 return (__m128){ w, x, y, z }; 516} 517 518static inline __m128 __attribute__((__always_inline__, __nodebug__)) 519_mm_setr_ps(float z, float y, float x, float w) 520{ 521 return (__m128){ z, y, x, w }; 522} 523 524static inline __m128 __attribute__((__always__inline__)) 525_mm_setzero_ps(void) 526{ 527 return (__m128){ 0, 0, 0, 0 }; 528} 529 530static inline void __attribute__((__always__inline__)) 531_mm_storeh_pi(__m64 *p, __m128 a) 532{ 533 __builtin_ia32_storehps((__v2si *)p, a); 534} 535 536static inline void __attribute__((__always__inline__)) 537_mm_storel_pi(__m64 *p, __m128 a) 538{ 539 __builtin_ia32_storelps((__v2si *)p, a); 540} 541 542static inline void __attribute__((__always__inline__)) 543_mm_store_ss(float *p, __m128 a) 544{ 545 *p = a[0]; 546} 547 548static inline void __attribute__((__always_inline__, __nodebug__)) 
549_mm_storeu_ps(float *p, __m128 a) 550{ 551 __builtin_ia32_storeups(p, a); 552} 553 554static inline void __attribute__((__always_inline__, __nodebug__)) 555_mm_store1_ps(float *p, __m128 a) 556{ 557 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 558 _mm_storeu_ps(p, a); 559} 560 561static inline void __attribute__((__always_inline__, __nodebug__)) 562_mm_store_ps(float *p, __m128 a) 563{ 564 *(__m128 *)p = a; 565} 566 567static inline void __attribute__((__always_inline__, __nodebug__)) 568_mm_storer_ps(float *p, __m128 a) 569{ 570 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 571 _mm_store_ps(p, a); 572} 573 574#define _MM_HINT_T0 1 575#define _MM_HINT_T1 2 576#define _MM_HINT_T2 3 577#define _MM_HINT_NTA 0 578 579/* FIXME: We have to #define this because "sel" must be a constant integer, and 580 Sema doesn't do any form of constant propagation yet. */ 581 582#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 583 584static inline void __attribute__((__always_inline__, __nodebug__)) 585_mm_stream_pi(__m64 *p, __m64 a) 586{ 587 __builtin_ia32_movntq(p, a); 588} 589 590static inline void __attribute__((__always_inline__, __nodebug__)) 591_mm_stream_ps(float *p, __m128 a) 592{ 593 __builtin_ia32_movntps(p, a); 594} 595 596static inline void __attribute__((__always_inline__, __nodebug__)) 597_mm_sfence(void) 598{ 599 __builtin_ia32_sfence(); 600} 601 602static inline int __attribute__((__always_inline__, __nodebug__)) 603_mm_extract_pi16(__m64 a, int n) 604{ 605 /* FIXME: 606 * This should force n to be an immediate. 607 * This does not use the PEXTRW instruction. From looking at the LLVM source, the 608 instruction doesn't seem to be hooked up. 609 * The code could probably be made better :) 610 */ 611 __v4hi b = (__v4hi)a; 612 return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))]; 613} 614 615/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to 616 the already existing __builtin_shufflevector. 
617*/ 618/* 619static inline __m64 __attribute__((__always_inline__, __nodebug__)) 620_mm_insert_pi16(__m64 a, int d, int n) 621{ 622 return (__m64){ 0LL }; 623} 624*/ 625 626static inline __m64 __attribute__((__always_inline__, __nodebug__)) 627_mm_max_pi16(__m64 a, __m64 b) 628{ 629 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 630} 631 632static inline __m64 __attribute__((__always_inline__, __nodebug__)) 633_mm_max_pu8(__m64 a, __m64 b) 634{ 635 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 636} 637 638static inline __m64 __attribute__((__always_inline__, __nodebug__)) 639_mm_min_pi16(__m64 a, __m64 b) 640{ 641 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 642} 643 644static inline __m64 __attribute__((__always_inline__, __nodebug__)) 645_mm_min_pu8(__m64 a, __m64 b) 646{ 647 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 648} 649 650static inline int __attribute__((__always_inline__, __nodebug__)) 651_mm_movemask_pi8(__m64 a) 652{ 653 return __builtin_ia32_pmovmskb((__v8qi)a); 654} 655 656static inline __m64 __attribute__((__always_inline__, __nodebug__)) 657_mm_mulhi_pu16(__m64 a, __m64 b) 658{ 659 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 660} 661 662#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)a, n)) 663 664static inline void __attribute__((__always_inline__, __nodebug__)) 665_mm_maskmove_si64(__m64 d, __m64 n, char *p) 666{ 667 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 668} 669 670static inline __m64 __attribute__((__always_inline__, __nodebug__)) 671_mm_avg_pu8(__m64 a, __m64 b) 672{ 673 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 674} 675 676static inline __m64 __attribute__((__always_inline__, __nodebug__)) 677_mm_avg_pu16(__m64 a, __m64 b) 678{ 679 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 680} 681 682static inline __m64 __attribute__((__always_inline__, __nodebug___)) 683_mm_sad_pu8(__m64 a, __m64 b) 684{ 685 return 
(__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 686} 687 688static inline unsigned int __attribute__((__always_inline__, __nodebug___)) 689_mm_getcsr(void) 690{ 691 return __builtin_ia32_stmxcsr(); 692} 693 694static inline void __attribute__((__always_inline__, __nodebug__)) 695_mm_setcsr(unsigned int i) 696{ 697 __builtin_ia32_ldmxcsr(i); 698} 699 700#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask)) 701 702static inline __m128 __attribute__((__always_inline__, __nodebug__)) 703_mm_unpackhi_ps(__m128 a, __m128 b) 704{ 705 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 706} 707 708static inline __m128 __attribute__((__always_inline__, __nodebug__)) 709_mm_unpacklo_ps(__m128 a, __m128 b) 710{ 711 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 712} 713 714static inline __m128 __attribute__((__always_inline__, __nodebug__)) 715_mm_move_ss(__m128 a, __m128 b) 716{ 717 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 718} 719 720static inline __m128 __attribute__((__always_inline__, __nodebug__)) 721_mm_movehl_ps(__m128 a, __m128 b) 722{ 723 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 724} 725 726static inline __m128 __attribute__((__always_inline__, __nodebug__)) 727_mm_movelh_ps(__m128 a, __m128 b) 728{ 729 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 730} 731 732static inline __m128 __attribute__((__always_inline__, __nodebug__)) 733_mm_cvtpi16_ps(__m64 a) 734{ 735 __m64 b, c; 736 __m128 r; 737 738 b = _mm_setzero_si64(); 739 b = _mm_cmpgt_pi16(b, a); 740 c = _mm_unpackhi_pi16(a, b); 741 r = _mm_setzero_ps(); 742 r = _mm_cvtpi32_ps(r, c); 743 r = _mm_movelh_ps(r, r); 744 c = _mm_unpacklo_pi16(a, b); 745 r = _mm_cvtpi32_ps(r, c); 746 747 return r; 748} 749 750static inline __m128 __attribute__((__always_inline__, __nodebug__)) 751_mm_cvtpu16_ps(__m64 a) 752{ 753 __m64 b, c; 754 __m128 r; 755 756 b = _mm_setzero_si64(); 757 c = _mm_unpackhi_pi16(a, b); 758 r = _mm_setzero_ps(); 759 r = _mm_cvtpi32_ps(r, c); 760 r = 
_mm_movelh_ps(r, r); 761 c = _mm_unpacklo_pi16(a, b); 762 r = _mm_cvtpi32_ps(r, c); 763 764 return r; 765} 766 767static inline __m128 __attribute__((__always_inline__, __nodebug__)) 768_mm_cvtpi8_ps(__m64 a) 769{ 770 __m64 b; 771 772 b = _mm_setzero_si64(); 773 b = _mm_cmpgt_pi8(b, a); 774 b = _mm_unpacklo_pi8(a, b); 775 776 return _mm_cvtpi16_ps(b); 777} 778 779static inline __m128 __attribute__((__always_inline__, __nodebug__)) 780_mm_cvtpu8_ps(__m64 a) 781{ 782 __m64 b; 783 784 b = _mm_setzero_si64(); 785 b = _mm_unpacklo_pi8(a, b); 786 787 return _mm_cvtpi16_ps(b); 788} 789 790static inline __m128 __attribute__((__always_inline__, __nodebug__)) 791_mm_cvtpi32x2_ps(__m64 a, __m64 b) 792{ 793 __m128 c; 794 795 c = _mm_setzero_ps(); 796 c = _mm_cvtpi32_ps(c, b); 797 c = _mm_movelh_ps(c, c); 798 799 return _mm_cvtpi32_ps(c, a); 800} 801 802static inline __m64 __attribute__((__always_inline__, __nodebug__)) 803_mm_cvtps_pi16(__m128 a) 804{ 805 __m64 b, c; 806 807 b = _mm_cvtps_pi32(a); 808 a = _mm_movehl_ps(a, a); 809 c = _mm_cvtps_pi32(a); 810 811 return _mm_packs_pi16(b, c); 812} 813 814static inline __m64 __attribute__((__always_inline__, __nodebug__)) 815_mm_cvtps_pi8(__m128 a) 816{ 817 __m64 b, c; 818 819 b = _mm_cvtps_pi16(a); 820 c = _mm_setzero_si64(); 821 822 return _mm_packs_pi16(b, c); 823} 824 825static inline int __attribute__((__always_inline__, __nodebug__)) 826_mm_movemask_ps(__m128 a) 827{ 828 return __builtin_ia32_movmskps(a); 829} 830 831#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 832 833#define _MM_EXCEPT_INVALID (0x0001) 834#define _MM_EXCEPT_DENORM (0x0002) 835#define _MM_EXCEPT_DIV_ZERO (0x0004) 836#define _MM_EXCEPT_OVERFLOW (0x0008) 837#define _MM_EXCEPT_UNDERFLOW (0x0010) 838#define _MM_EXCEPT_INEXACT (0x0020) 839#define _MM_EXCEPT_MASK (0x003f) 840 841#define _MM_MASK_INVALID (0x0080) 842#define _MM_MASK_DENORM (0x0100) 843#define _MM_EXCEPT_DIV_ZERO (0x0200) 844#define _MM_EXCEPT_OVERFLOW (0x0400) 
845#define _MM_EXCEPT_UNDERFLOW (0x0800) 846#define _MM_EXCEPT_INEXACT (0x1000) 847#define _MM_MASK_MASK (0x1f80) 848 849#define _MM_ROUND_NEAREST (0x0000) 850#define _MM_ROUND_DOWN (0x2000) 851#define _MM_ROUND_UP (0x4000) 852#define _MM_ROUND_TOWARD_ZERO (0x6000) 853#define _MM_ROUND_MASK (0x6000) 854 855#define _MM_FLUSH_ZERO_MASK (0x8000) 856#define _MM_FLUSH_ZERO_ON (0x8000) 857#define _MM_FLUSH_ZERO_OFF (0x8000) 858 859#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 860#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 861#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 862#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 863 864#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 865#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 866#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 867#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 868 869#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 870do { \ 871 __m128 tmp3, tmp2, tmp1, tmp0; \ 872 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 873 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 874 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 875 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 876 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 877 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 878 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 879 (row3) = _mm_movelh_ps(tmp3, tmp1); \ 880} while (0) 881 882#include <emmintrin.h> 883 884#endif /* __SSE__ */ 885 886#endif /* __XMMINTRIN_H */ 887