/* xmmintrin.h — revision 17d2e3a7d15dc809a25896973d4aa2205e63c122 */
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __XMMINTRIN_H 25#define __XMMINTRIN_H 26 27#ifndef __SSE__ 28#error "SSE instruction set not enabled" 29#else 30 31#include <mmintrin.h> 32 33typedef float __v4sf __attribute__((__vector_size__(16))); 34typedef float __m128 __attribute__((__vector_size__(16))); 35 36#include <mm_malloc.h> 37 38static inline __m128 __attribute__((__always_inline__, __nodebug__)) 39_mm_add_ss(__m128 a, __m128 b) 40{ 41 a[0] += b[0]; 42 return a; 43} 44 45static inline __m128 __attribute__((__always_inline__, __nodebug__)) 46_mm_add_ps(__m128 a, __m128 b) 47{ 48 return a + b; 49} 50 51static inline __m128 __attribute__((__always_inline__, __nodebug__)) 52_mm_sub_ss(__m128 a, __m128 b) 53{ 54 a[0] -= b[0]; 55 return a; 56} 57 58static inline __m128 __attribute__((__always_inline__, __nodebug__)) 59_mm_sub_ps(__m128 a, __m128 b) 60{ 61 return a - b; 62} 63 64static inline __m128 __attribute__((__always_inline__, __nodebug__)) 65_mm_mul_ss(__m128 a, __m128 b) 66{ 67 a[0] *= b[0]; 68 return a; 69} 70 71static inline __m128 __attribute__((__always_inline__, __nodebug__)) 72_mm_mul_ps(__m128 a, __m128 b) 73{ 74 return a * b; 75} 76 77static inline __m128 __attribute__((__always_inline__, __nodebug__)) 78_mm_div_ss(__m128 a, __m128 b) 79{ 80 a[0] /= b[0]; 81 return a; 82} 83 84static inline __m128 __attribute__((__always_inline__, __nodebug__)) 85_mm_div_ps(__m128 a, __m128 b) 86{ 87 return a / b; 88} 89 90static inline __m128 __attribute__((__always_inline__, __nodebug__)) 91_mm_sqrt_ss(__m128 a) 92{ 93 return __builtin_ia32_sqrtss(a); 94} 95 96static inline __m128 __attribute__((__always_inline__, __nodebug__)) 97_mm_sqrt_ps(__m128 a) 98{ 99 return __builtin_ia32_sqrtps(a); 100} 101 102static inline __m128 __attribute__((__always_inline__, __nodebug__)) 103_mm_rcp_ss(__m128 a) 104{ 105 return __builtin_ia32_rcpss(a); 106} 107 108static inline __m128 __attribute__((__always_inline__, 
__nodebug__)) 109_mm_rcp_ps(__m128 a) 110{ 111 return __builtin_ia32_rcpps(a); 112} 113 114static inline __m128 __attribute__((__always_inline__, __nodebug__)) 115_mm_rsqrt_ss(__m128 a) 116{ 117 return __builtin_ia32_rsqrtss(a); 118} 119 120static inline __m128 __attribute__((__always_inline__, __nodebug__)) 121_mm_rsqrt_ps(__m128 a) 122{ 123 return __builtin_ia32_rsqrtps(a); 124} 125 126static inline __m128 __attribute__((__always_inline__, __nodebug__)) 127_mm_min_ss(__m128 a, __m128 b) 128{ 129 return __builtin_ia32_minss(a, b); 130} 131 132static inline __m128 __attribute__((__always_inline__, __nodebug__)) 133_mm_min_ps(__m128 a, __m128 b) 134{ 135 return __builtin_ia32_minps(a, b); 136} 137 138static inline __m128 __attribute__((__always_inline__, __nodebug__)) 139_mm_max_ss(__m128 a, __m128 b) 140{ 141 return __builtin_ia32_maxss(a, b); 142} 143 144static inline __m128 __attribute__((__always_inline__, __nodebug__)) 145_mm_max_ps(__m128 a, __m128 b) 146{ 147 return __builtin_ia32_maxps(a, b); 148} 149 150static inline __m128 __attribute__((__always_inline__, __nodebug__)) 151_mm_and_ps(__m128 a, __m128 b) 152{ 153 typedef int __v4si __attribute__((__vector_size__(16))); 154 return (__m128)((__v4si)a & (__v4si)b); 155} 156 157static inline __m128 __attribute__((__always_inline__, __nodebug__)) 158_mm_andnot_ps(__m128 a, __m128 b) 159{ 160 typedef int __v4si __attribute__((__vector_size__(16))); 161 return (__m128)(~(__v4si)a & (__v4si)b); 162} 163 164static inline __m128 __attribute__((__always_inline__, __nodebug__)) 165_mm_or_ps(__m128 a, __m128 b) 166{ 167 typedef int __v4si __attribute__((__vector_size__(16))); 168 return (__m128)((__v4si)a | (__v4si)b); 169} 170 171static inline __m128 __attribute__((__always_inline__, __nodebug__)) 172_mm_xor_ps(__m128 a, __m128 b) 173{ 174 typedef int __v4si __attribute__((__vector_size__(16))); 175 return (__m128)((__v4si)a ^ ~(__v4si)b); 176} 177 178static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
179_mm_cmpeq_ss(__m128 a, __m128 b) 180{ 181 return (__m128)__builtin_ia32_cmpss(a, b, 0); 182} 183 184static inline __m128 __attribute__((__always_inline__, __nodebug__)) 185_mm_cmpeq_ps(__m128 a, __m128 b) 186{ 187 return (__m128)__builtin_ia32_cmpps(a, b, 0); 188} 189 190static inline __m128 __attribute__((__always_inline__, __nodebug__)) 191_mm_cmplt_ss(__m128 a, __m128 b) 192{ 193 return (__m128)__builtin_ia32_cmpss(a, b, 1); 194} 195 196static inline __m128 __attribute__((__always_inline__, __nodebug__)) 197_mm_cmplt_ps(__m128 a, __m128 b) 198{ 199 return (__m128)__builtin_ia32_cmpps(a, b, 1); 200} 201 202static inline __m128 __attribute__((__always_inline__, __nodebug__)) 203_mm_cmple_ss(__m128 a, __m128 b) 204{ 205 return (__m128)__builtin_ia32_cmpss(a, b, 2); 206} 207 208static inline __m128 __attribute__((__always_inline__, __nodebug__)) 209_mm_cmple_ps(__m128 a, __m128 b) 210{ 211 return (__m128)__builtin_ia32_cmpps(a, b, 2); 212} 213 214static inline __m128 __attribute__((__always_inline__, __nodebug__)) 215_mm_cmpgt_ss(__m128 a, __m128 b) 216{ 217 return (__m128)__builtin_ia32_cmpss(b, a, 1); 218} 219 220static inline __m128 __attribute__((__always_inline__, __nodebug__)) 221_mm_cmpgt_ps(__m128 a, __m128 b) 222{ 223 return (__m128)__builtin_ia32_cmpps(b, a, 1); 224} 225 226static inline __m128 __attribute__((__always_inline__, __nodebug__)) 227_mm_cmpge_ss(__m128 a, __m128 b) 228{ 229 return (__m128)__builtin_ia32_cmpss(b, a, 2); 230} 231 232static inline __m128 __attribute__((__always_inline__, __nodebug__)) 233_mm_cmpge_ps(__m128 a, __m128 b) 234{ 235 return (__m128)__builtin_ia32_cmpps(b, a, 2); 236} 237 238static inline __m128 __attribute__((__always_inline__, __nodebug__)) 239_mm_cmpneq_ss(__m128 a, __m128 b) 240{ 241 return (__m128)__builtin_ia32_cmpss(a, b, 4); 242} 243 244static inline __m128 __attribute__((__always_inline__, __nodebug__)) 245_mm_cmpneq_ps(__m128 a, __m128 b) 246{ 247 return (__m128)__builtin_ia32_cmpps(a, b, 4); 248} 249 
250static inline __m128 __attribute__((__always_inline__, __nodebug__)) 251_mm_cmpnlt_ss(__m128 a, __m128 b) 252{ 253 return (__m128)__builtin_ia32_cmpss(a, b, 5); 254} 255 256static inline __m128 __attribute__((__always_inline__, __nodebug__)) 257_mm_cmpnlt_ps(__m128 a, __m128 b) 258{ 259 return (__m128)__builtin_ia32_cmpps(a, b, 5); 260} 261 262static inline __m128 __attribute__((__always_inline__, __nodebug__)) 263_mm_cmpnle_ss(__m128 a, __m128 b) 264{ 265 return (__m128)__builtin_ia32_cmpss(a, b, 6); 266} 267 268static inline __m128 __attribute__((__always_inline__, __nodebug__)) 269_mm_cmpnle_ps(__m128 a, __m128 b) 270{ 271 return (__m128)__builtin_ia32_cmpps(a, b, 6); 272} 273 274static inline __m128 __attribute__((__always_inline__, __nodebug__)) 275_mm_cmpngt_ss(__m128 a, __m128 b) 276{ 277 return (__m128)__builtin_ia32_cmpss(b, a, 5); 278} 279 280static inline __m128 __attribute__((__always_inline__, __nodebug__)) 281_mm_cmpngt_ps(__m128 a, __m128 b) 282{ 283 return (__m128)__builtin_ia32_cmpps(b, a, 5); 284} 285 286static inline __m128 __attribute__((__always_inline__, __nodebug__)) 287_mm_cmpnge_ss(__m128 a, __m128 b) 288{ 289 return (__m128)__builtin_ia32_cmpss(b, a, 6); 290} 291 292static inline __m128 __attribute__((__always_inline__, __nodebug__)) 293_mm_cmpnge_ps(__m128 a, __m128 b) 294{ 295 return (__m128)__builtin_ia32_cmpps(b, a, 6); 296} 297 298static inline __m128 __attribute__((__always_inline__, __nodebug__)) 299_mm_cmpord_ss(__m128 a, __m128 b) 300{ 301 return (__m128)__builtin_ia32_cmpss(a, b, 7); 302} 303 304static inline __m128 __attribute__((__always_inline__, __nodebug__)) 305_mm_cmpord_ps(__m128 a, __m128 b) 306{ 307 return (__m128)__builtin_ia32_cmpps(a, b, 7); 308} 309 310static inline __m128 __attribute__((__always_inline__, __nodebug__)) 311_mm_cmpunord_ss(__m128 a, __m128 b) 312{ 313 return (__m128)__builtin_ia32_cmpss(a, b, 3); 314} 315 316static inline __m128 __attribute__((__always_inline__, __nodebug__)) 
317_mm_cmpunord_ps(__m128 a, __m128 b) 318{ 319 return (__m128)__builtin_ia32_cmpps(a, b, 3); 320} 321 322static inline int __attribute__((__always_inline__, __nodebug__)) 323_mm_comieq_ss(__m128 a, __m128 b) 324{ 325 return __builtin_ia32_comieq(a, b); 326} 327 328static inline int __attribute__((__always_inline__, __nodebug__)) 329_mm_comilt_ss(__m128 a, __m128 b) 330{ 331 return __builtin_ia32_comilt(a, b); 332} 333 334static inline int __attribute__((__always_inline__, __nodebug__)) 335_mm_comile_ss(__m128 a, __m128 b) 336{ 337 return __builtin_ia32_comile(a, b); 338} 339 340static inline int __attribute__((__always_inline__, __nodebug__)) 341_mm_comigt_ss(__m128 a, __m128 b) 342{ 343 return __builtin_ia32_comigt(a, b); 344} 345 346static inline int __attribute__((__always_inline__, __nodebug__)) 347_mm_comige_ss(__m128 a, __m128 b) 348{ 349 return __builtin_ia32_comige(a, b); 350} 351 352static inline int __attribute__((__always_inline__, __nodebug__)) 353_mm_comineq_ss(__m128 a, __m128 b) 354{ 355 return __builtin_ia32_comineq(a, b); 356} 357 358static inline int __attribute__((__always_inline__, __nodebug__)) 359_mm_ucomieq_ss(__m128 a, __m128 b) 360{ 361 return __builtin_ia32_ucomieq(a, b); 362} 363 364static inline int __attribute__((__always_inline__, __nodebug__)) 365_mm_ucomilt_ss(__m128 a, __m128 b) 366{ 367 return __builtin_ia32_ucomilt(a, b); 368} 369 370static inline int __attribute__((__always_inline__, __nodebug__)) 371_mm_ucomile_ss(__m128 a, __m128 b) 372{ 373 return __builtin_ia32_ucomile(a, b); 374} 375 376static inline int __attribute__((__always_inline__, __nodebug__)) 377_mm_ucomigt_ss(__m128 a, __m128 b) 378{ 379 return __builtin_ia32_ucomigt(a, b); 380} 381 382static inline int __attribute__((__always_inline__, __nodebug__)) 383_mm_ucomige_ss(__m128 a, __m128 b) 384{ 385 return __builtin_ia32_ucomige(a, b); 386} 387 388static inline int __attribute__((__always_inline__, __nodebug__)) 389_mm_ucomineq_ss(__m128 a, __m128 b) 390{ 391 return 
__builtin_ia32_ucomineq(a, b); 392} 393 394static inline int __attribute__((__always_inline__, __nodebug__)) 395_mm_cvtss_si32(__m128 a) 396{ 397 return __builtin_ia32_cvtss2si(a); 398} 399 400#ifdef __x86_64__ 401 402static inline long long __attribute__((__always_inline__, __nodebug__)) 403_mm_cvtss_si64(__m128 a) 404{ 405 return __builtin_ia32_cvtss2si64(a); 406} 407 408#endif 409 410static inline __m64 __attribute__((__always_inline__, __nodebug__)) 411_mm_cvtps_pi32(__m128 a) 412{ 413 return (__m64)__builtin_ia32_cvtps2pi(a); 414} 415 416static inline int __attribute__((__always_inline__, __nodebug__)) 417_mm_cvttss_si32(__m128 a) 418{ 419 return a[0]; 420} 421 422static inline long long __attribute__((__always_inline__, __nodebug__)) 423_mm_cvttss_si64(__m128 a) 424{ 425 return a[0]; 426} 427 428static inline __m64 __attribute__((__always_inline__, __nodebug__)) 429_mm_cvttps_pi32(__m128 a) 430{ 431 return (__m64)__builtin_ia32_cvttps2pi(a); 432} 433 434static inline __m128 __attribute__((__always_inline__, __nodebug__)) 435_mm_cvtsi32_ss(__m128 a, int b) 436{ 437 a[0] = b; 438 return a; 439} 440 441#ifdef __x86_64__ 442 443static inline __m128 __attribute__((__always_inline__, __nodebug__)) 444_mm_cvtsi64_ss(__m128 a, long long b) 445{ 446 a[0] = b; 447 return a; 448} 449 450#endif 451 452static inline __m128 __attribute__((__always_inline__, __nodebug__)) 453_mm_cvtpi32_ps(__m128 a, __m64 b) 454{ 455 return __builtin_ia32_cvtpi2ps(a, (__v2si)b); 456} 457 458static inline float __attribute__((__always_inline__, __nodebug__)) 459_mm_cvtss_f32(__m128 a) 460{ 461 return a[0]; 462} 463 464static inline __m128 __attribute__((__always_inline__, __nodebug__)) 465_mm_loadh_pi(__m128 a, __m64 const *p) 466{ 467 return __builtin_ia32_loadhps(a, (__v2si *)p); 468} 469 470static inline __m128 __attribute__((__always_inline__, __nodebug__)) 471_mm_loadl_pi(__m128 a, __m64 const *p) 472{ 473#if 0 474 // FIXME: This should work, but gives really crappy code at the moment 
475 __m128 b; 476 b[0] = *(float*)p; 477 b[1] = *((float*)p+1); 478 return __builtin_shufflevector(a, b, 0, 1, 4, 5); 479#endif 480 return __builtin_ia32_loadlps(a, (__v2si *)p); 481} 482 483static inline __m128 __attribute__((__always_inline__, __nodebug__)) 484_mm_load_ss(float *p) 485{ 486 return (__m128){ *p, 0, 0, 0 }; 487} 488 489static inline __m128 __attribute__((__always_inline__, __nodebug__)) 490_mm_load1_ps(float *p) 491{ 492 return (__m128){ *p, *p, *p, *p }; 493} 494 495#define _mm_load_ps1(p) _mm_load1_ps(p) 496 497static inline __m128 __attribute__((__always_inline__, __nodebug__)) 498_mm_load_ps(float *p) 499{ 500 return *(__m128*)p; 501} 502 503static inline __m128 __attribute__((__always_inline__, __nodebug__)) 504_mm_loadu_ps(float *p) 505{ 506 return __builtin_ia32_loadups(p); 507} 508 509static inline __m128 __attribute__((__always_inline__, __nodebug__)) 510_mm_loadr_ps(float *p) 511{ 512 __m128 a = _mm_load_ps(p); 513 return __builtin_shufflevector(a, a, 3, 2, 1, 0); 514} 515 516static inline __m128 __attribute__((__always_inline__, __nodebug__)) 517_mm_set_ss(float w) 518{ 519 return (__m128){ w, 0, 0, 0 }; 520} 521 522static inline __m128 __attribute__((__always_inline__, __nodebug__)) 523_mm_set1_ps(float w) 524{ 525 return (__m128){ w, w, w, w }; 526} 527 528// Microsoft specific. 
529static inline __m128 __attribute__((__always_inline__, __nodebug__)) 530_mm_set_ps1(float w) 531{ 532 return _mm_set1_ps(w); 533} 534 535static inline __m128 __attribute__((__always_inline__, __nodebug__)) 536_mm_set_ps(float z, float y, float x, float w) 537{ 538 return (__m128){ w, x, y, z }; 539} 540 541static inline __m128 __attribute__((__always_inline__, __nodebug__)) 542_mm_setr_ps(float z, float y, float x, float w) 543{ 544 return (__m128){ z, y, x, w }; 545} 546 547static inline __m128 __attribute__((__always_inline__)) 548_mm_setzero_ps(void) 549{ 550 return (__m128){ 0, 0, 0, 0 }; 551} 552 553static inline void __attribute__((__always_inline__)) 554_mm_storeh_pi(__m64 *p, __m128 a) 555{ 556 __builtin_ia32_storehps((__v2si *)p, a); 557} 558 559static inline void __attribute__((__always_inline__)) 560_mm_storel_pi(__m64 *p, __m128 a) 561{ 562 __builtin_ia32_storelps((__v2si *)p, a); 563} 564 565static inline void __attribute__((__always_inline__)) 566_mm_store_ss(float *p, __m128 a) 567{ 568 *p = a[0]; 569} 570 571static inline void __attribute__((__always_inline__, __nodebug__)) 572_mm_storeu_ps(float *p, __m128 a) 573{ 574 __builtin_ia32_storeups(p, a); 575} 576 577static inline void __attribute__((__always_inline__, __nodebug__)) 578_mm_store1_ps(float *p, __m128 a) 579{ 580 a = __builtin_shufflevector(a, a, 0, 0, 0, 0); 581 _mm_storeu_ps(p, a); 582} 583 584static inline void __attribute__((__always_inline__, __nodebug__)) 585_mm_store_ps(float *p, __m128 a) 586{ 587 *(__m128 *)p = a; 588} 589 590static inline void __attribute__((__always_inline__, __nodebug__)) 591_mm_storer_ps(float *p, __m128 a) 592{ 593 a = __builtin_shufflevector(a, a, 3, 2, 1, 0); 594 _mm_store_ps(p, a); 595} 596 597#define _MM_HINT_T0 1 598#define _MM_HINT_T1 2 599#define _MM_HINT_T2 3 600#define _MM_HINT_NTA 0 601 602/* FIXME: We have to #define this because "sel" must be a constant integer, and 603 Sema doesn't do any form of constant propagation yet. 
*/ 604 605#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel)) 606 607static inline void __attribute__((__always_inline__, __nodebug__)) 608_mm_stream_pi(__m64 *p, __m64 a) 609{ 610 __builtin_ia32_movntq(p, a); 611} 612 613static inline void __attribute__((__always_inline__, __nodebug__)) 614_mm_stream_ps(float *p, __m128 a) 615{ 616 __builtin_ia32_movntps(p, a); 617} 618 619static inline void __attribute__((__always_inline__, __nodebug__)) 620_mm_sfence(void) 621{ 622 __builtin_ia32_sfence(); 623} 624 625static inline int __attribute__((__always_inline__, __nodebug__)) 626_mm_extract_pi16(__m64 a, int n) 627{ 628 __v4hi b = (__v4hi)a; 629 return (unsigned short)b[n & 3]; 630} 631 632static inline __m64 __attribute__((__always_inline__, __nodebug__)) 633_mm_insert_pi16(__m64 a, int d, int n) 634{ 635 __v4hi b = (__v4hi)a; 636 b[n & 3] = d; 637 return (__m64)b; 638} 639 640static inline __m64 __attribute__((__always_inline__, __nodebug__)) 641_mm_max_pi16(__m64 a, __m64 b) 642{ 643 return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b); 644} 645 646static inline __m64 __attribute__((__always_inline__, __nodebug__)) 647_mm_max_pu8(__m64 a, __m64 b) 648{ 649 return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b); 650} 651 652static inline __m64 __attribute__((__always_inline__, __nodebug__)) 653_mm_min_pi16(__m64 a, __m64 b) 654{ 655 return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b); 656} 657 658static inline __m64 __attribute__((__always_inline__, __nodebug__)) 659_mm_min_pu8(__m64 a, __m64 b) 660{ 661 return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b); 662} 663 664static inline int __attribute__((__always_inline__, __nodebug__)) 665_mm_movemask_pi8(__m64 a) 666{ 667 return __builtin_ia32_pmovmskb((__v8qi)a); 668} 669 670static inline __m64 __attribute__((__always_inline__, __nodebug__)) 671_mm_mulhi_pu16(__m64 a, __m64 b) 672{ 673 return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b); 674} 675 676#define _mm_shuffle_pi16(a, 
n) ((__m64)__builtin_ia32_pshufw((__v4hi)a, n)) 677 678static inline void __attribute__((__always_inline__, __nodebug__)) 679_mm_maskmove_si64(__m64 d, __m64 n, char *p) 680{ 681 __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p); 682} 683 684static inline __m64 __attribute__((__always_inline__, __nodebug__)) 685_mm_avg_pu8(__m64 a, __m64 b) 686{ 687 return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b); 688} 689 690static inline __m64 __attribute__((__always_inline__, __nodebug__)) 691_mm_avg_pu16(__m64 a, __m64 b) 692{ 693 return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b); 694} 695 696static inline __m64 __attribute__((__always_inline__, __nodebug__)) 697_mm_sad_pu8(__m64 a, __m64 b) 698{ 699 return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b); 700} 701 702static inline unsigned int __attribute__((__always_inline__, __nodebug__)) 703_mm_getcsr(void) 704{ 705 return __builtin_ia32_stmxcsr(); 706} 707 708static inline void __attribute__((__always_inline__, __nodebug__)) 709_mm_setcsr(unsigned int i) 710{ 711 __builtin_ia32_ldmxcsr(i); 712} 713 714#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask)) 715 716static inline __m128 __attribute__((__always_inline__, __nodebug__)) 717_mm_unpackhi_ps(__m128 a, __m128 b) 718{ 719 return __builtin_shufflevector(a, b, 2, 6, 3, 7); 720} 721 722static inline __m128 __attribute__((__always_inline__, __nodebug__)) 723_mm_unpacklo_ps(__m128 a, __m128 b) 724{ 725 return __builtin_shufflevector(a, b, 0, 4, 1, 5); 726} 727 728static inline __m128 __attribute__((__always_inline__, __nodebug__)) 729_mm_move_ss(__m128 a, __m128 b) 730{ 731 return __builtin_shufflevector(a, b, 4, 1, 2, 3); 732} 733 734static inline __m128 __attribute__((__always_inline__, __nodebug__)) 735_mm_movehl_ps(__m128 a, __m128 b) 736{ 737 return __builtin_shufflevector(a, b, 6, 7, 2, 3); 738} 739 740static inline __m128 __attribute__((__always_inline__, __nodebug__)) 741_mm_movelh_ps(__m128 a, __m128 b) 742{ 743 return 
__builtin_shufflevector(a, b, 0, 1, 4, 5); 744} 745 746static inline __m128 __attribute__((__always_inline__, __nodebug__)) 747_mm_cvtpi16_ps(__m64 a) 748{ 749 __m64 b, c; 750 __m128 r; 751 752 b = _mm_setzero_si64(); 753 b = _mm_cmpgt_pi16(b, a); 754 c = _mm_unpackhi_pi16(a, b); 755 r = _mm_setzero_ps(); 756 r = _mm_cvtpi32_ps(r, c); 757 r = _mm_movelh_ps(r, r); 758 c = _mm_unpacklo_pi16(a, b); 759 r = _mm_cvtpi32_ps(r, c); 760 761 return r; 762} 763 764static inline __m128 __attribute__((__always_inline__, __nodebug__)) 765_mm_cvtpu16_ps(__m64 a) 766{ 767 __m64 b, c; 768 __m128 r; 769 770 b = _mm_setzero_si64(); 771 c = _mm_unpackhi_pi16(a, b); 772 r = _mm_setzero_ps(); 773 r = _mm_cvtpi32_ps(r, c); 774 r = _mm_movelh_ps(r, r); 775 c = _mm_unpacklo_pi16(a, b); 776 r = _mm_cvtpi32_ps(r, c); 777 778 return r; 779} 780 781static inline __m128 __attribute__((__always_inline__, __nodebug__)) 782_mm_cvtpi8_ps(__m64 a) 783{ 784 __m64 b; 785 786 b = _mm_setzero_si64(); 787 b = _mm_cmpgt_pi8(b, a); 788 b = _mm_unpacklo_pi8(a, b); 789 790 return _mm_cvtpi16_ps(b); 791} 792 793static inline __m128 __attribute__((__always_inline__, __nodebug__)) 794_mm_cvtpu8_ps(__m64 a) 795{ 796 __m64 b; 797 798 b = _mm_setzero_si64(); 799 b = _mm_unpacklo_pi8(a, b); 800 801 return _mm_cvtpi16_ps(b); 802} 803 804static inline __m128 __attribute__((__always_inline__, __nodebug__)) 805_mm_cvtpi32x2_ps(__m64 a, __m64 b) 806{ 807 __m128 c; 808 809 c = _mm_setzero_ps(); 810 c = _mm_cvtpi32_ps(c, b); 811 c = _mm_movelh_ps(c, c); 812 813 return _mm_cvtpi32_ps(c, a); 814} 815 816static inline __m64 __attribute__((__always_inline__, __nodebug__)) 817_mm_cvtps_pi16(__m128 a) 818{ 819 __m64 b, c; 820 821 b = _mm_cvtps_pi32(a); 822 a = _mm_movehl_ps(a, a); 823 c = _mm_cvtps_pi32(a); 824 825 return _mm_packs_pi16(b, c); 826} 827 828static inline __m64 __attribute__((__always_inline__, __nodebug__)) 829_mm_cvtps_pi8(__m128 a) 830{ 831 __m64 b, c; 832 833 b = _mm_cvtps_pi16(a); 834 c = _mm_setzero_si64(); 
835 836 return _mm_packs_pi16(b, c); 837} 838 839static inline int __attribute__((__always_inline__, __nodebug__)) 840_mm_movemask_ps(__m128 a) 841{ 842 return __builtin_ia32_movmskps(a); 843} 844 845#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 846 847#define _MM_EXCEPT_INVALID (0x0001) 848#define _MM_EXCEPT_DENORM (0x0002) 849#define _MM_EXCEPT_DIV_ZERO (0x0004) 850#define _MM_EXCEPT_OVERFLOW (0x0008) 851#define _MM_EXCEPT_UNDERFLOW (0x0010) 852#define _MM_EXCEPT_INEXACT (0x0020) 853#define _MM_EXCEPT_MASK (0x003f) 854 855#define _MM_MASK_INVALID (0x0080) 856#define _MM_MASK_DENORM (0x0100) 857#define _MM_MASK_DIV_ZERO (0x0200) 858#define _MM_MASK_OVERFLOW (0x0400) 859#define _MM_MASK_UNDERFLOW (0x0800) 860#define _MM_MASK_INEXACT (0x1000) 861#define _MM_MASK_MASK (0x1f80) 862 863#define _MM_ROUND_NEAREST (0x0000) 864#define _MM_ROUND_DOWN (0x2000) 865#define _MM_ROUND_UP (0x4000) 866#define _MM_ROUND_TOWARD_ZERO (0x6000) 867#define _MM_ROUND_MASK (0x6000) 868 869#define _MM_FLUSH_ZERO_MASK (0x8000) 870#define _MM_FLUSH_ZERO_ON (0x8000) 871#define _MM_FLUSH_ZERO_OFF (0x8000) 872 873#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 874#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 875#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 876#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 877 878#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 879#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 880#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 881#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 882 883#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 884do { \ 885 __m128 tmp3, tmp2, tmp1, tmp0; \ 886 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 887 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 888 tmp1 = 
_mm_unpackhi_ps((row0), (row1)); \ 889 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 890 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 891 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 892 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 893 (row3) = _mm_movelh_ps(tmp3, tmp1); \ 894} while (0) 895 896#include <emmintrin.h> 897 898#endif /* __SSE__ */ 899 900#endif /* __XMMINTRIN_H */ 901