/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#include <mmintrin.h>

/* 16-byte vector types. __v4si and __v4sf are the working types that the
 * intrinsic implementations cast their operands to; __m128 is the type used
 * in the intrinsic signatures. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

/* <mm_malloc.h> is only pulled in for a hosted environment, because it depends
 * on a standard library to provide allocation routines. */
#if __STDC_HOSTED__
#include <mm_malloc.h>
#endif

/* Default attributes applied to every function defined in this file: always
 * inlined, no debug info, and compiled for the "sse" target feature. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))

/// \brief Adds the 32-bit float values in the low-order bits of the operands.
43/// 44/// \headerfile <x86intrin.h> 45/// 46/// This intrinsic corresponds to the \c VADDSS / ADDSS instructions. 47/// 48/// \param __a 49/// A 128-bit vector of [4 x float] containing one of the source operands. 50/// The lower 32 bits of this operand are used in the calculation. 51/// \param __b 52/// A 128-bit vector of [4 x float] containing one of the source operands. 53/// The lower 32 bits of this operand are used in the calculation. 54/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum 55/// of the lower 32 bits of both operands. The upper 96 bits are copied from 56/// the upper 96 bits of the first source operand. 57static __inline__ __m128 __DEFAULT_FN_ATTRS 58_mm_add_ss(__m128 __a, __m128 __b) 59{ 60 __a[0] += __b[0]; 61 return __a; 62} 63 64/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of 65/// the addition. 66/// 67/// \headerfile <x86intrin.h> 68/// 69/// This intrinsic corresponds to the \c VADDPS / ADDPS instructions. 70/// 71/// \param __a 72/// A 128-bit vector of [4 x float] containing one of the source operands. 73/// \param __b 74/// A 128-bit vector of [4 x float] containing one of the source operands. 75/// \returns A 128-bit vector of [4 x float] containing the sums of both 76/// operands. 77static __inline__ __m128 __DEFAULT_FN_ATTRS 78_mm_add_ps(__m128 __a, __m128 __b) 79{ 80 return (__m128)((__v4sf)__a + (__v4sf)__b); 81} 82 83/// \brief Subtracts the 32-bit float value in the low-order bits of the second 84/// operand from the corresponding value in the first operand. 85/// 86/// \headerfile <x86intrin.h> 87/// 88/// This intrinsic corresponds to the \c VSUBSS / SUBSS instructions. 89/// 90/// \param __a 91/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits 92/// of this operand are used in the calculation. 93/// \param __b 94/// A 128-bit vector of [4 x float] containing the subtrahend. 
The lower 32 95/// bits of this operand are used in the calculation. 96/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 97/// difference of the lower 32 bits of both operands. The upper 96 bits are 98/// copied from the upper 96 bits of the first source operand. 99static __inline__ __m128 __DEFAULT_FN_ATTRS 100_mm_sub_ss(__m128 __a, __m128 __b) 101{ 102 __a[0] -= __b[0]; 103 return __a; 104} 105 106/// \brief Subtracts each of the values of the second operand from the first 107/// operand, both of which are 128-bit vectors of [4 x float] and returns 108/// the results of the subtraction. 109/// 110/// \headerfile <x86intrin.h> 111/// 112/// This intrinsic corresponds to the \c VSUBPS / SUBPS instructions. 113/// 114/// \param __a 115/// A 128-bit vector of [4 x float] containing the minuend. 116/// \param __b 117/// A 128-bit vector of [4 x float] containing the subtrahend. 118/// \returns A 128-bit vector of [4 x float] containing the differences between 119/// both operands. 120static __inline__ __m128 __DEFAULT_FN_ATTRS 121_mm_sub_ps(__m128 __a, __m128 __b) 122{ 123 return (__m128)((__v4sf)__a - (__v4sf)__b); 124} 125 126/// \brief Multiplies two 32-bit float values in the low-order bits of the 127/// operands. 128/// 129/// \headerfile <x86intrin.h> 130/// 131/// This intrinsic corresponds to the \c VMULSS / MULSS instructions. 132/// 133/// \param __a 134/// A 128-bit vector of [4 x float] containing one of the source operands. 135/// The lower 32 bits of this operand are used in the calculation. 136/// \param __b 137/// A 128-bit vector of [4 x float] containing one of the source operands. 138/// The lower 32 bits of this operand are used in the calculation. 139/// \returns A 128-bit vector of [4 x float] containing the product of the lower 140/// 32 bits of both operands. The upper 96 bits are copied from the upper 96 141/// bits of the first source operand. 
142static __inline__ __m128 __DEFAULT_FN_ATTRS 143_mm_mul_ss(__m128 __a, __m128 __b) 144{ 145 __a[0] *= __b[0]; 146 return __a; 147} 148 149/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the 150/// results of the multiplication. 151/// 152/// \headerfile <x86intrin.h> 153/// 154/// This intrinsic corresponds to the \c VMULPS / MULPS instructions. 155/// 156/// \param __a 157/// A 128-bit vector of [4 x float] containing one of the source operands. 158/// \param __b 159/// A 128-bit vector of [4 x float] containing one of the source operands. 160/// \returns A 128-bit vector of [4 x float] containing the products of both 161/// operands. 162static __inline__ __m128 __DEFAULT_FN_ATTRS 163_mm_mul_ps(__m128 __a, __m128 __b) 164{ 165 return (__m128)((__v4sf)__a * (__v4sf)__b); 166} 167 168/// \brief Divides the value in the low-order 32 bits of the first operand by 169/// the corresponding value in the second operand. 170/// 171/// \headerfile <x86intrin.h> 172/// 173/// This intrinsic corresponds to the \c VDIVSS / DIVSS instructions. 174/// 175/// \param __a 176/// A 128-bit vector of [4 x float] containing the dividend. The lower 32 177/// bits of this operand are used in the calculation. 178/// \param __b 179/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits 180/// of this operand are used in the calculation. 181/// \returns A 128-bit vector of [4 x float] containing the quotients of the 182/// lower 32 bits of both operands. The upper 96 bits are copied from the 183/// upper 96 bits of the first source operand. 184static __inline__ __m128 __DEFAULT_FN_ATTRS 185_mm_div_ss(__m128 __a, __m128 __b) 186{ 187 __a[0] /= __b[0]; 188 return __a; 189} 190 191/// \brief Divides two 128-bit vectors of [4 x float]. 192/// 193/// \headerfile <x86intrin.h> 194/// 195/// This intrinsic corresponds to the \c VDIVPS / DIVPS instructions. 196/// 197/// \param __a 198/// A 128-bit vector of [4 x float] containing the dividend. 
199/// \param __b 200/// A 128-bit vector of [4 x float] containing the divisor. 201/// \returns A 128-bit vector of [4 x float] containing the quotients of both 202/// operands. 203static __inline__ __m128 __DEFAULT_FN_ATTRS 204_mm_div_ps(__m128 __a, __m128 __b) 205{ 206 return (__m128)((__v4sf)__a / (__v4sf)__b); 207} 208 209/// \brief Calculates the square root of the value stored in the low-order bits 210/// of a 128-bit vector of [4 x float]. 211/// 212/// \headerfile <x86intrin.h> 213/// 214/// This intrinsic corresponds to the \c VSQRTSS / SQRTSS instructions. 215/// 216/// \param __a 217/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 218/// used in the calculation. 219/// \returns A 128-bit vector of [4 x float] containing the square root of the 220/// value in the low-order bits of the operand. 221static __inline__ __m128 __DEFAULT_FN_ATTRS 222_mm_sqrt_ss(__m128 __a) 223{ 224 __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a); 225 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 226} 227 228/// \brief Calculates the square roots of the values stored in a 128-bit vector 229/// of [4 x float]. 230/// 231/// \headerfile <x86intrin.h> 232/// 233/// This intrinsic corresponds to the \c VSQRTPS / SQRTPS instructions. 234/// 235/// \param __a 236/// A 128-bit vector of [4 x float]. 237/// \returns A 128-bit vector of [4 x float] containing the square roots of the 238/// values in the operand. 239static __inline__ __m128 __DEFAULT_FN_ATTRS 240_mm_sqrt_ps(__m128 __a) 241{ 242 return __builtin_ia32_sqrtps((__v4sf)__a); 243} 244 245/// \brief Calculates the approximate reciprocal of the value stored in the 246/// low-order bits of a 128-bit vector of [4 x float]. 247/// 248/// \headerfile <x86intrin.h> 249/// 250/// This intrinsic corresponds to the \c VRCPSS / RCPSS instructions. 251/// 252/// \param __a 253/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 254/// used in the calculation. 
255/// \returns A 128-bit vector of [4 x float] containing the approximate 256/// reciprocal of the value in the low-order bits of the operand. 257static __inline__ __m128 __DEFAULT_FN_ATTRS 258_mm_rcp_ss(__m128 __a) 259{ 260 __m128 __c = __builtin_ia32_rcpss((__v4sf)__a); 261 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 262} 263 264/// \brief Calculates the approximate reciprocals of the values stored in a 265/// 128-bit vector of [4 x float]. 266/// 267/// \headerfile <x86intrin.h> 268/// 269/// This intrinsic corresponds to the \c VRCPPS / RCPPS instructions. 270/// 271/// \param __a 272/// A 128-bit vector of [4 x float]. 273/// \returns A 128-bit vector of [4 x float] containing the approximate 274/// reciprocals of the values in the operand. 275static __inline__ __m128 __DEFAULT_FN_ATTRS 276_mm_rcp_ps(__m128 __a) 277{ 278 return __builtin_ia32_rcpps((__v4sf)__a); 279} 280 281/// \brief Calculates the approximate reciprocal of the square root of the value 282/// stored in the low-order bits of a 128-bit vector of [4 x float]. 283/// 284/// \headerfile <x86intrin.h> 285/// 286/// This intrinsic corresponds to the \c VRSQRTSS / RSQRTSS instructions. 287/// 288/// \param __a 289/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 290/// used in the calculation. 291/// \returns A 128-bit vector of [4 x float] containing the approximate 292/// reciprocal of the square root of the value in the low-order bits of the 293/// operand. 294static __inline__ __m128 __DEFAULT_FN_ATTRS 295_mm_rsqrt_ss(__m128 __a) 296{ 297 __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a); 298 return (__m128) { __c[0], __a[1], __a[2], __a[3] }; 299} 300 301/// \brief Calculates the approximate reciprocals of the square roots of the 302/// values stored in a 128-bit vector of [4 x float]. 303/// 304/// \headerfile <x86intrin.h> 305/// 306/// This intrinsic corresponds to the \c VRSQRTPS / RSQRTPS instructions. 
307/// 308/// \param __a 309/// A 128-bit vector of [4 x float]. 310/// \returns A 128-bit vector of [4 x float] containing the approximate 311/// reciprocals of the square roots of the values in the operand. 312static __inline__ __m128 __DEFAULT_FN_ATTRS 313_mm_rsqrt_ps(__m128 __a) 314{ 315 return __builtin_ia32_rsqrtps((__v4sf)__a); 316} 317 318/// \brief Compares two 32-bit float values in the low-order bits of both 319/// operands and returns the lesser value in the low-order bits of the 320/// vector of [4 x float]. 321/// 322/// \headerfile <x86intrin.h> 323/// 324/// This intrinsic corresponds to the \c VMINSS / MINSS instructions. 325/// 326/// \param __a 327/// A 128-bit vector of [4 x float] containing one of the operands. The lower 328/// 32 bits of this operand are used in the comparison. 329/// \param __b 330/// A 128-bit vector of [4 x float] containing one of the operands. The lower 331/// 32 bits of this operand are used in the comparison. 332/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 333/// minimum value between both operands. The upper 96 bits are copied from 334/// the upper 96 bits of the first source operand. 335static __inline__ __m128 __DEFAULT_FN_ATTRS 336_mm_min_ss(__m128 __a, __m128 __b) 337{ 338 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); 339} 340 341/// \brief Compares two 128-bit vectors of [4 x float] and returns the 342/// lesser of each pair of values. 343/// 344/// \headerfile <x86intrin.h> 345/// 346/// This intrinsic corresponds to the \c VMINPS / MINPS instructions. 347/// 348/// \param __a 349/// A 128-bit vector of [4 x float] containing one of the operands. 350/// \param __b 351/// A 128-bit vector of [4 x float] containing one of the operands. 352/// \returns A 128-bit vector of [4 x float] containing the minimum values 353/// between both operands. 
354static __inline__ __m128 __DEFAULT_FN_ATTRS 355_mm_min_ps(__m128 __a, __m128 __b) 356{ 357 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); 358} 359 360/// \brief Compares two 32-bit float values in the low-order bits of both 361/// operands and returns the greater value in the low-order bits of 362/// a vector [4 x float]. 363/// 364/// \headerfile <x86intrin.h> 365/// 366/// This intrinsic corresponds to the \c VMAXSS / MAXSS instructions. 367/// 368/// \param __a 369/// A 128-bit vector of [4 x float] containing one of the operands. The lower 370/// 32 bits of this operand are used in the comparison. 371/// \param __b 372/// A 128-bit vector of [4 x float] containing one of the operands. The lower 373/// 32 bits of this operand are used in the comparison. 374/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the 375/// maximum value between both operands. The upper 96 bits are copied from 376/// the upper 96 bits of the first source operand. 377static __inline__ __m128 __DEFAULT_FN_ATTRS 378_mm_max_ss(__m128 __a, __m128 __b) 379{ 380 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); 381} 382 383/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater 384/// of each pair of values. 385/// 386/// \headerfile <x86intrin.h> 387/// 388/// This intrinsic corresponds to the \c VMAXPS / MAXPS instructions. 389/// 390/// \param __a 391/// A 128-bit vector of [4 x float] containing one of the operands. 392/// \param __b 393/// A 128-bit vector of [4 x float] containing one of the operands. 394/// \returns A 128-bit vector of [4 x float] containing the maximum values 395/// between both operands. 396static __inline__ __m128 __DEFAULT_FN_ATTRS 397_mm_max_ps(__m128 __a, __m128 __b) 398{ 399 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); 400} 401 402/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float]. 
403/// 404/// \headerfile <x86intrin.h> 405/// 406/// This intrinsic corresponds to the \c VANDPS / ANDPS instructions. 407/// 408/// \param __a 409/// A 128-bit vector containing one of the source operands. 410/// \param __b 411/// A 128-bit vector containing one of the source operands. 412/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 413/// values between both operands. 414static __inline__ __m128 __DEFAULT_FN_ATTRS 415_mm_and_ps(__m128 __a, __m128 __b) 416{ 417 return (__m128)((__v4si)__a & (__v4si)__b); 418} 419 420/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using 421/// the one's complement of the values contained in the first source 422/// operand. 423/// 424/// \headerfile <x86intrin.h> 425/// 426/// This intrinsic corresponds to the \c VANDNPS / ANDNPS instructions. 427/// 428/// \param __a 429/// A 128-bit vector of [4 x float] containing the first source operand. The 430/// one's complement of this value is used in the bitwise AND. 431/// \param __b 432/// A 128-bit vector of [4 x float] containing the second source operand. 433/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 434/// one's complement of the first operand and the values in the second 435/// operand. 436static __inline__ __m128 __DEFAULT_FN_ATTRS 437_mm_andnot_ps(__m128 __a, __m128 __b) 438{ 439 return (__m128)(~(__v4si)__a & (__v4si)__b); 440} 441 442/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float]. 443/// 444/// \headerfile <x86intrin.h> 445/// 446/// This intrinsic corresponds to the \c VORPS / ORPS instructions. 447/// 448/// \param __a 449/// A 128-bit vector of [4 x float] containing one of the source operands. 450/// \param __b 451/// A 128-bit vector of [4 x float] containing one of the source operands. 452/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the 453/// values between both operands. 
454static __inline__ __m128 __DEFAULT_FN_ATTRS 455_mm_or_ps(__m128 __a, __m128 __b) 456{ 457 return (__m128)((__v4si)__a | (__v4si)__b); 458} 459 460/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of 461/// [4 x float]. 462/// 463/// \headerfile <x86intrin.h> 464/// 465/// This intrinsic corresponds to the \c VXORPS / XORPS instructions. 466/// 467/// \param __a 468/// A 128-bit vector of [4 x float] containing one of the source operands. 469/// \param __b 470/// A 128-bit vector of [4 x float] containing one of the source operands. 471/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR 472/// of the values between both operands. 473static __inline__ __m128 __DEFAULT_FN_ATTRS 474_mm_xor_ps(__m128 __a, __m128 __b) 475{ 476 return (__m128)((__v4si)__a ^ (__v4si)__b); 477} 478 479/// \brief Compares two 32-bit float values in the low-order bits of both 480/// operands for equality and returns the result of the comparison in the 481/// low-order bits of a vector [4 x float]. 482/// 483/// \headerfile <x86intrin.h> 484/// 485/// This intrinsic corresponds to the \c VCMPEQSS / CMPEQSS instructions. 486/// 487/// \param __a 488/// A 128-bit vector of [4 x float] containing one of the operands. The lower 489/// 32 bits of this operand are used in the comparison. 490/// \param __b 491/// A 128-bit vector of [4 x float] containing one of the operands. The lower 492/// 32 bits of this operand are used in the comparison. 493/// \returns A 128-bit vector of [4 x float] containing the comparison results 494/// in the low-order bits. 495static __inline__ __m128 __DEFAULT_FN_ATTRS 496_mm_cmpeq_ss(__m128 __a, __m128 __b) 497{ 498 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); 499} 500 501/// \brief Compares each of the corresponding 32-bit float values of the 502/// 128-bit vectors of [4 x float] for equality. 
503/// 504/// \headerfile <x86intrin.h> 505/// 506/// This intrinsic corresponds to the \c VCMPEQPS / CMPEQPS instructions. 507/// 508/// \param __a 509/// A 128-bit vector of [4 x float]. 510/// \param __b 511/// A 128-bit vector of [4 x float]. 512/// \returns A 128-bit vector of [4 x float] containing the comparison results. 513static __inline__ __m128 __DEFAULT_FN_ATTRS 514_mm_cmpeq_ps(__m128 __a, __m128 __b) 515{ 516 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); 517} 518 519/// \brief Compares two 32-bit float values in the low-order bits of both 520/// operands to determine if the value in the first operand is less than the 521/// corresponding value in the second operand and returns the result of the 522/// comparison in the low-order bits of a vector of [4 x float]. 523/// 524/// \headerfile <x86intrin.h> 525/// 526/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions. 527/// 528/// \param __a 529/// A 128-bit vector of [4 x float] containing one of the operands. The lower 530/// 32 bits of this operand are used in the comparison. 531/// \param __b 532/// A 128-bit vector of [4 x float] containing one of the operands. The lower 533/// 32 bits of this operand are used in the comparison. 534/// \returns A 128-bit vector of [4 x float] containing the comparison results 535/// in the low-order bits. 536static __inline__ __m128 __DEFAULT_FN_ATTRS 537_mm_cmplt_ss(__m128 __a, __m128 __b) 538{ 539 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); 540} 541 542/// \brief Compares each of the corresponding 32-bit float values of the 543/// 128-bit vectors of [4 x float] to determine if the values in the first 544/// operand are less than those in the second operand. 545/// 546/// \headerfile <x86intrin.h> 547/// 548/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions. 549/// 550/// \param __a 551/// A 128-bit vector of [4 x float]. 552/// \param __b 553/// A 128-bit vector of [4 x float]. 
554/// \returns A 128-bit vector of [4 x float] containing the comparison results. 555static __inline__ __m128 __DEFAULT_FN_ATTRS 556_mm_cmplt_ps(__m128 __a, __m128 __b) 557{ 558 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); 559} 560 561/// \brief Compares two 32-bit float values in the low-order bits of both 562/// operands to determine if the value in the first operand is less than or 563/// equal to the corresponding value in the second operand and returns the 564/// result of the comparison in the low-order bits of a vector of 565/// [4 x float]. 566/// 567/// \headerfile <x86intrin.h> 568/// 569/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions. 570/// 571/// \param __a 572/// A 128-bit vector of [4 x float] containing one of the operands. The lower 573/// 32 bits of this operand are used in the comparison. 574/// \param __b 575/// A 128-bit vector of [4 x float] containing one of the operands. The lower 576/// 32 bits of this operand are used in the comparison. 577/// \returns A 128-bit vector of [4 x float] containing the comparison results 578/// in the low-order bits. 579static __inline__ __m128 __DEFAULT_FN_ATTRS 580_mm_cmple_ss(__m128 __a, __m128 __b) 581{ 582 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); 583} 584 585/// \brief Compares each of the corresponding 32-bit float values of the 586/// 128-bit vectors of [4 x float] to determine if the values in the first 587/// operand are less than or equal to those in the second operand. 588/// 589/// \headerfile <x86intrin.h> 590/// 591/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions. 592/// 593/// \param __a 594/// A 128-bit vector of [4 x float]. 595/// \param __b 596/// A 128-bit vector of [4 x float]. 597/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
598static __inline__ __m128 __DEFAULT_FN_ATTRS 599_mm_cmple_ps(__m128 __a, __m128 __b) 600{ 601 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); 602} 603 604/// \brief Compares two 32-bit float values in the low-order bits of both 605/// operands to determine if the value in the first operand is greater than 606/// the corresponding value in the second operand and returns the result of 607/// the comparison in the low-order bits of a vector of [4 x float]. 608/// 609/// \headerfile <x86intrin.h> 610/// 611/// This intrinsic corresponds to the \c VCMPLTSS / CMPLTSS instructions. 612/// 613/// \param __a 614/// A 128-bit vector of [4 x float] containing one of the operands. The lower 615/// 32 bits of this operand are used in the comparison. 616/// \param __b 617/// A 128-bit vector of [4 x float] containing one of the operands. The lower 618/// 32 bits of this operand are used in the comparison. 619/// \returns A 128-bit vector of [4 x float] containing the comparison results 620/// in the low-order bits. 621static __inline__ __m128 __DEFAULT_FN_ATTRS 622_mm_cmpgt_ss(__m128 __a, __m128 __b) 623{ 624 return (__m128)__builtin_shufflevector((__v4sf)__a, 625 (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), 626 4, 1, 2, 3); 627} 628 629/// \brief Compares each of the corresponding 32-bit float values of the 630/// 128-bit vectors of [4 x float] to determine if the values in the first 631/// operand are greater than those in the second operand. 632/// 633/// \headerfile <x86intrin.h> 634/// 635/// This intrinsic corresponds to the \c VCMPLTPS / CMPLTPS instructions. 636/// 637/// \param __a 638/// A 128-bit vector of [4 x float]. 639/// \param __b 640/// A 128-bit vector of [4 x float]. 641/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
642static __inline__ __m128 __DEFAULT_FN_ATTRS 643_mm_cmpgt_ps(__m128 __a, __m128 __b) 644{ 645 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); 646} 647 648/// \brief Compares two 32-bit float values in the low-order bits of both 649/// operands to determine if the value in the first operand is greater than 650/// or equal to the corresponding value in the second operand and returns 651/// the result of the comparison in the low-order bits of a vector of 652/// [4 x float]. 653/// 654/// \headerfile <x86intrin.h> 655/// 656/// This intrinsic corresponds to the \c VCMPLESS / CMPLESS instructions. 657/// 658/// \param __a 659/// A 128-bit vector of [4 x float] containing one of the operands. The lower 660/// 32 bits of this operand are used in the comparison. 661/// \param __b 662/// A 128-bit vector of [4 x float] containing one of the operands. The lower 663/// 32 bits of this operand are used in the comparison. 664/// \returns A 128-bit vector of [4 x float] containing the comparison results 665/// in the low-order bits. 666static __inline__ __m128 __DEFAULT_FN_ATTRS 667_mm_cmpge_ss(__m128 __a, __m128 __b) 668{ 669 return (__m128)__builtin_shufflevector((__v4sf)__a, 670 (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), 671 4, 1, 2, 3); 672} 673 674/// \brief Compares each of the corresponding 32-bit float values of the 675/// 128-bit vectors of [4 x float] to determine if the values in the first 676/// operand are greater than or equal to those in the second operand. 677/// 678/// \headerfile <x86intrin.h> 679/// 680/// This intrinsic corresponds to the \c VCMPLEPS / CMPLEPS instructions. 681/// 682/// \param __a 683/// A 128-bit vector of [4 x float]. 684/// \param __b 685/// A 128-bit vector of [4 x float]. 686/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
687static __inline__ __m128 __DEFAULT_FN_ATTRS 688_mm_cmpge_ps(__m128 __a, __m128 __b) 689{ 690 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); 691} 692 693/// \brief Compares two 32-bit float values in the low-order bits of both 694/// operands for inequality and returns the result of the comparison in the 695/// low-order bits of a vector of [4 x float]. 696/// 697/// \headerfile <x86intrin.h> 698/// 699/// This intrinsic corresponds to the \c VCMPNEQSS / CMPNEQSS instructions. 700/// 701/// \param __a 702/// A 128-bit vector of [4 x float] containing one of the operands. The lower 703/// 32 bits of this operand are used in the comparison. 704/// \param __b 705/// A 128-bit vector of [4 x float] containing one of the operands. The lower 706/// 32 bits of this operand are used in the comparison. 707/// \returns A 128-bit vector of [4 x float] containing the comparison results 708/// in the low-order bits. 709static __inline__ __m128 __DEFAULT_FN_ATTRS 710_mm_cmpneq_ss(__m128 __a, __m128 __b) 711{ 712 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); 713} 714 715/// \brief Compares each of the corresponding 32-bit float values of the 716/// 128-bit vectors of [4 x float] for inequality. 717/// 718/// \headerfile <x86intrin.h> 719/// 720/// This intrinsic corresponds to the \c VCMPNEQPS / CMPNEQPS instructions. 721/// 722/// \param __a 723/// A 128-bit vector of [4 x float]. 724/// \param __b 725/// A 128-bit vector of [4 x float]. 726/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
727static __inline__ __m128 __DEFAULT_FN_ATTRS 728_mm_cmpneq_ps(__m128 __a, __m128 __b) 729{ 730 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); 731} 732 733/// \brief Compares two 32-bit float values in the low-order bits of both 734/// operands to determine if the value in the first operand is not less than 735/// the corresponding value in the second operand and returns the result of 736/// the comparison in the low-order bits of a vector of [4 x float]. 737/// 738/// \headerfile <x86intrin.h> 739/// 740/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions. 741/// 742/// \param __a 743/// A 128-bit vector of [4 x float] containing one of the operands. The lower 744/// 32 bits of this operand are used in the comparison. 745/// \param __b 746/// A 128-bit vector of [4 x float] containing one of the operands. The lower 747/// 32 bits of this operand are used in the comparison. 748/// \returns A 128-bit vector of [4 x float] containing the comparison results 749/// in the low-order bits. 750static __inline__ __m128 __DEFAULT_FN_ATTRS 751_mm_cmpnlt_ss(__m128 __a, __m128 __b) 752{ 753 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); 754} 755 756/// \brief Compares each of the corresponding 32-bit float values of the 757/// 128-bit vectors of [4 x float] to determine if the values in the first 758/// operand are not less than those in the second operand. 759/// 760/// \headerfile <x86intrin.h> 761/// 762/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions. 763/// 764/// \param __a 765/// A 128-bit vector of [4 x float]. 766/// \param __b 767/// A 128-bit vector of [4 x float]. 768/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
769static __inline__ __m128 __DEFAULT_FN_ATTRS 770_mm_cmpnlt_ps(__m128 __a, __m128 __b) 771{ 772 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); 773} 774 775/// \brief Compares two 32-bit float values in the low-order bits of both 776/// operands to determine if the value in the first operand is not less than 777/// or equal to the corresponding value in the second operand and returns 778/// the result of the comparison in the low-order bits of a vector of 779/// [4 x float]. 780/// 781/// \headerfile <x86intrin.h> 782/// 783/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions. 784/// 785/// \param __a 786/// A 128-bit vector of [4 x float] containing one of the operands. The lower 787/// 32 bits of this operand are used in the comparison. 788/// \param __b 789/// A 128-bit vector of [4 x float] containing one of the operands. The lower 790/// 32 bits of this operand are used in the comparison. 791/// \returns A 128-bit vector of [4 x float] containing the comparison results 792/// in the low-order bits. 793static __inline__ __m128 __DEFAULT_FN_ATTRS 794_mm_cmpnle_ss(__m128 __a, __m128 __b) 795{ 796 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); 797} 798 799/// \brief Compares each of the corresponding 32-bit float values of the 800/// 128-bit vectors of [4 x float] to determine if the values in the first 801/// operand are not less than or equal to those in the second operand. 802/// 803/// \headerfile <x86intrin.h> 804/// 805/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions. 806/// 807/// \param __a 808/// A 128-bit vector of [4 x float]. 809/// \param __b 810/// A 128-bit vector of [4 x float]. 811/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
812static __inline__ __m128 __DEFAULT_FN_ATTRS 813_mm_cmpnle_ps(__m128 __a, __m128 __b) 814{ 815 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); 816} 817 818/// \brief Compares two 32-bit float values in the low-order bits of both 819/// operands to determine if the value in the first operand is not greater 820/// than the corresponding value in the second operand and returns the 821/// result of the comparison in the low-order bits of a vector of 822/// [4 x float]. 823/// 824/// \headerfile <x86intrin.h> 825/// 826/// This intrinsic corresponds to the \c VCMPNLTSS / CMPNLTSS instructions. 827/// 828/// \param __a 829/// A 128-bit vector of [4 x float] containing one of the operands. The lower 830/// 32 bits of this operand are used in the comparison. 831/// \param __b 832/// A 128-bit vector of [4 x float] containing one of the operands. The lower 833/// 32 bits of this operand are used in the comparison. 834/// \returns A 128-bit vector of [4 x float] containing the comparison results 835/// in the low-order bits. 836static __inline__ __m128 __DEFAULT_FN_ATTRS 837_mm_cmpngt_ss(__m128 __a, __m128 __b) 838{ 839 return (__m128)__builtin_shufflevector((__v4sf)__a, 840 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), 841 4, 1, 2, 3); 842} 843 844/// \brief Compares each of the corresponding 32-bit float values of the 845/// 128-bit vectors of [4 x float] to determine if the values in the first 846/// operand are not greater than those in the second operand. 847/// 848/// \headerfile <x86intrin.h> 849/// 850/// This intrinsic corresponds to the \c VCMPNLTPS / CMPNLTPS instructions. 851/// 852/// \param __a 853/// A 128-bit vector of [4 x float]. 854/// \param __b 855/// A 128-bit vector of [4 x float]. 856/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
857static __inline__ __m128 __DEFAULT_FN_ATTRS 858_mm_cmpngt_ps(__m128 __a, __m128 __b) 859{ 860 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); 861} 862 863/// \brief Compares two 32-bit float values in the low-order bits of both 864/// operands to determine if the value in the first operand is not greater 865/// than or equal to the corresponding value in the second operand and 866/// returns the result of the comparison in the low-order bits of a vector 867/// of [4 x float]. 868/// 869/// \headerfile <x86intrin.h> 870/// 871/// This intrinsic corresponds to the \c VCMPNLESS / CMPNLESS instructions. 872/// 873/// \param __a 874/// A 128-bit vector of [4 x float] containing one of the operands. The lower 875/// 32 bits of this operand are used in the comparison. 876/// \param __b 877/// A 128-bit vector of [4 x float] containing one of the operands. The lower 878/// 32 bits of this operand are used in the comparison. 879/// \returns A 128-bit vector of [4 x float] containing the comparison results 880/// in the low-order bits. 881static __inline__ __m128 __DEFAULT_FN_ATTRS 882_mm_cmpnge_ss(__m128 __a, __m128 __b) 883{ 884 return (__m128)__builtin_shufflevector((__v4sf)__a, 885 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), 886 4, 1, 2, 3); 887} 888 889/// \brief Compares each of the corresponding 32-bit float values of the 890/// 128-bit vectors of [4 x float] to determine if the values in the first 891/// operand are not greater than or equal to those in the second operand. 892/// 893/// \headerfile <x86intrin.h> 894/// 895/// This intrinsic corresponds to the \c VCMPNLEPS / CMPNLEPS instructions. 896/// 897/// \param __a 898/// A 128-bit vector of [4 x float]. 899/// \param __b 900/// A 128-bit vector of [4 x float]. 901/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
902static __inline__ __m128 __DEFAULT_FN_ATTRS 903_mm_cmpnge_ps(__m128 __a, __m128 __b) 904{ 905 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); 906} 907 908/// \brief Compares two 32-bit float values in the low-order bits of both 909/// operands to determine if the value in the first operand is ordered with 910/// respect to the corresponding value in the second operand and returns the 911/// result of the comparison in the low-order bits of a vector of 912/// [4 x float]. 913/// 914/// \headerfile <x86intrin.h> 915/// 916/// This intrinsic corresponds to the \c VCMPORDSS / CMPORDSS instructions. 917/// 918/// \param __a 919/// A 128-bit vector of [4 x float] containing one of the operands. The lower 920/// 32 bits of this operand are used in the comparison. 921/// \param __b 922/// A 128-bit vector of [4 x float] containing one of the operands. The lower 923/// 32 bits of this operand are used in the comparison. 924/// \returns A 128-bit vector of [4 x float] containing the comparison results 925/// in the low-order bits. 926static __inline__ __m128 __DEFAULT_FN_ATTRS 927_mm_cmpord_ss(__m128 __a, __m128 __b) 928{ 929 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); 930} 931 932/// \brief Compares each of the corresponding 32-bit float values of the 933/// 128-bit vectors of [4 x float] to determine if the values in the first 934/// operand are ordered with respect to those in the second operand. 935/// 936/// \headerfile <x86intrin.h> 937/// 938/// This intrinsic corresponds to the \c VCMPORDPS / CMPORDPS instructions. 939/// 940/// \param __a 941/// A 128-bit vector of [4 x float]. 942/// \param __b 943/// A 128-bit vector of [4 x float]. 944/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
945static __inline__ __m128 __DEFAULT_FN_ATTRS 946_mm_cmpord_ps(__m128 __a, __m128 __b) 947{ 948 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); 949} 950 951/// \brief Compares two 32-bit float values in the low-order bits of both 952/// operands to determine if the value in the first operand is unordered 953/// with respect to the corresponding value in the second operand and 954/// returns the result of the comparison in the low-order bits of a vector 955/// of [4 x float]. 956/// 957/// \headerfile <x86intrin.h> 958/// 959/// This intrinsic corresponds to the \c VCMPUNORDSS / CMPUNORDSS instructions. 960/// 961/// \param __a 962/// A 128-bit vector of [4 x float] containing one of the operands. The lower 963/// 32 bits of this operand are used in the comparison. 964/// \param __b 965/// A 128-bit vector of [4 x float] containing one of the operands. The lower 966/// 32 bits of this operand are used in the comparison. 967/// \returns A 128-bit vector of [4 x float] containing the comparison results 968/// in the low-order bits. 969static __inline__ __m128 __DEFAULT_FN_ATTRS 970_mm_cmpunord_ss(__m128 __a, __m128 __b) 971{ 972 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); 973} 974 975/// \brief Compares each of the corresponding 32-bit float values of the 976/// 128-bit vectors of [4 x float] to determine if the values in the first 977/// operand are unordered with respect to those in the second operand. 978/// 979/// \headerfile <x86intrin.h> 980/// 981/// This intrinsic corresponds to the \c VCMPUNORDPS / CMPUNORDPS instructions. 982/// 983/// \param __a 984/// A 128-bit vector of [4 x float]. 985/// \param __b 986/// A 128-bit vector of [4 x float]. 987/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
988static __inline__ __m128 __DEFAULT_FN_ATTRS 989_mm_cmpunord_ps(__m128 __a, __m128 __b) 990{ 991 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); 992} 993 994/// \brief Compares two 32-bit float values in the low-order bits of both 995/// operands for equality and returns the result of the comparison. 996/// 997/// \headerfile <x86intrin.h> 998/// 999/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1000/// 1001/// \param __a 1002/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1003/// used in the comparison. 1004/// \param __b 1005/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1006/// used in the comparison. 1007/// \returns An integer containing the comparison results. 1008static __inline__ int __DEFAULT_FN_ATTRS 1009_mm_comieq_ss(__m128 __a, __m128 __b) 1010{ 1011 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); 1012} 1013 1014/// \brief Compares two 32-bit float values in the low-order bits of both 1015/// operands to determine if the first operand is less than the second 1016/// operand and returns the result of the comparison. 1017/// 1018/// \headerfile <x86intrin.h> 1019/// 1020/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1021/// 1022/// \param __a 1023/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1024/// used in the comparison. 1025/// \param __b 1026/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1027/// used in the comparison. 1028/// \returns An integer containing the comparison results. 1029static __inline__ int __DEFAULT_FN_ATTRS 1030_mm_comilt_ss(__m128 __a, __m128 __b) 1031{ 1032 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); 1033} 1034 1035/// \brief Compares two 32-bit float values in the low-order bits of both 1036/// operands to determine if the first operand is less than or equal to the 1037/// second operand and returns the result of the comparison. 
1038/// 1039/// \headerfile <x86intrin.h> 1040/// 1041/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1042/// 1043/// \param __a 1044/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1045/// used in the comparison. 1046/// \param __b 1047/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1048/// used in the comparison. 1049/// \returns An integer containing the comparison results. 1050static __inline__ int __DEFAULT_FN_ATTRS 1051_mm_comile_ss(__m128 __a, __m128 __b) 1052{ 1053 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); 1054} 1055 1056/// \brief Compares two 32-bit float values in the low-order bits of both 1057/// operands to determine if the first operand is greater than the second 1058/// operand and returns the result of the comparison. 1059/// 1060/// \headerfile <x86intrin.h> 1061/// 1062/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1063/// 1064/// \param __a 1065/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1066/// used in the comparison. 1067/// \param __b 1068/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1069/// used in the comparison. 1070/// \returns An integer containing the comparison results. 1071static __inline__ int __DEFAULT_FN_ATTRS 1072_mm_comigt_ss(__m128 __a, __m128 __b) 1073{ 1074 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); 1075} 1076 1077/// \brief Compares two 32-bit float values in the low-order bits of both 1078/// operands to determine if the first operand is greater than or equal to 1079/// the second operand and returns the result of the comparison. 1080/// 1081/// \headerfile <x86intrin.h> 1082/// 1083/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1084/// 1085/// \param __a 1086/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1087/// used in the comparison. 
1088/// \param __b 1089/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1090/// used in the comparison. 1091/// \returns An integer containing the comparison results. 1092static __inline__ int __DEFAULT_FN_ATTRS 1093_mm_comige_ss(__m128 __a, __m128 __b) 1094{ 1095 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); 1096} 1097 1098/// \brief Compares two 32-bit float values in the low-order bits of both 1099/// operands to determine if the first operand is not equal to the second 1100/// operand and returns the result of the comparison. 1101/// 1102/// \headerfile <x86intrin.h> 1103/// 1104/// This intrinsic corresponds to the \c VCOMISS / COMISS instructions. 1105/// 1106/// \param __a 1107/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1108/// used in the comparison. 1109/// \param __b 1110/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1111/// used in the comparison. 1112/// \returns An integer containing the comparison results. 1113static __inline__ int __DEFAULT_FN_ATTRS 1114_mm_comineq_ss(__m128 __a, __m128 __b) 1115{ 1116 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); 1117} 1118 1119/// \brief Performs an unordered comparison of two 32-bit float values using 1120/// the low-order bits of both operands to determine equality and returns 1121/// the result of the comparison. 1122/// 1123/// \headerfile <x86intrin.h> 1124/// 1125/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1126/// 1127/// \param __a 1128/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1129/// used in the comparison. 1130/// \param __b 1131/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1132/// used in the comparison. 1133/// \returns An integer containing the comparison results. 
1134static __inline__ int __DEFAULT_FN_ATTRS 1135_mm_ucomieq_ss(__m128 __a, __m128 __b) 1136{ 1137 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); 1138} 1139 1140/// \brief Performs an unordered comparison of two 32-bit float values using 1141/// the low-order bits of both operands to determine if the first operand is 1142/// less than the second operand and returns the result of the comparison. 1143/// 1144/// \headerfile <x86intrin.h> 1145/// 1146/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1147/// 1148/// \param __a 1149/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1150/// used in the comparison. 1151/// \param __b 1152/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1153/// used in the comparison. 1154/// \returns An integer containing the comparison results. 1155static __inline__ int __DEFAULT_FN_ATTRS 1156_mm_ucomilt_ss(__m128 __a, __m128 __b) 1157{ 1158 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); 1159} 1160 1161/// \brief Performs an unordered comparison of two 32-bit float values using 1162/// the low-order bits of both operands to determine if the first operand 1163/// is less than or equal to the second operand and returns the result of 1164/// the comparison. 1165/// 1166/// \headerfile <x86intrin.h> 1167/// 1168/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1169/// 1170/// \param __a 1171/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1172/// used in the comparison. 1173/// \param __b 1174/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1175/// used in the comparison. 1176/// \returns An integer containing the comparison results. 
1177static __inline__ int __DEFAULT_FN_ATTRS 1178_mm_ucomile_ss(__m128 __a, __m128 __b) 1179{ 1180 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); 1181} 1182 1183/// \brief Performs an unordered comparison of two 32-bit float values using 1184/// the low-order bits of both operands to determine if the first operand 1185/// is greater than the second operand and returns the result of the 1186/// comparison. 1187/// 1188/// \headerfile <x86intrin.h> 1189/// 1190/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1191/// 1192/// \param __a 1193/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1194/// used in the comparison. 1195/// \param __b 1196/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1197/// used in the comparison. 1198/// \returns An integer containing the comparison results. 1199static __inline__ int __DEFAULT_FN_ATTRS 1200_mm_ucomigt_ss(__m128 __a, __m128 __b) 1201{ 1202 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); 1203} 1204 1205/// \brief Performs an unordered comparison of two 32-bit float values using 1206/// the low-order bits of both operands to determine if the first operand is 1207/// greater than or equal to the second operand and returns the result of 1208/// the comparison. 1209/// 1210/// \headerfile <x86intrin.h> 1211/// 1212/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1213/// 1214/// \param __a 1215/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1216/// used in the comparison. 1217/// \param __b 1218/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1219/// used in the comparison. 1220/// \returns An integer containing the comparison results. 
1221static __inline__ int __DEFAULT_FN_ATTRS 1222_mm_ucomige_ss(__m128 __a, __m128 __b) 1223{ 1224 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); 1225} 1226 1227/// \brief Performs an unordered comparison of two 32-bit float values using 1228/// the low-order bits of both operands to determine inequality and returns 1229/// the result of the comparison. 1230/// 1231/// \headerfile <x86intrin.h> 1232/// 1233/// This intrinsic corresponds to the \c VUCOMISS / UCOMISS instructions. 1234/// 1235/// \param __a 1236/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1237/// used in the comparison. 1238/// \param __b 1239/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1240/// used in the comparison. 1241/// \returns An integer containing the comparison results. 1242static __inline__ int __DEFAULT_FN_ATTRS 1243_mm_ucomineq_ss(__m128 __a, __m128 __b) 1244{ 1245 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); 1246} 1247 1248/// \brief Converts a float value contained in the lower 32 bits of a vector of 1249/// [4 x float] into a 32-bit integer. 1250/// 1251/// \headerfile <x86intrin.h> 1252/// 1253/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. 1254/// 1255/// \param __a 1256/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1257/// used in the conversion. 1258/// \returns A 32-bit integer containing the converted value. 1259static __inline__ int __DEFAULT_FN_ATTRS 1260_mm_cvtss_si32(__m128 __a) 1261{ 1262 return __builtin_ia32_cvtss2si((__v4sf)__a); 1263} 1264 1265/// \brief Converts a float value contained in the lower 32 bits of a vector of 1266/// [4 x float] into a 32-bit integer. 1267/// 1268/// \headerfile <x86intrin.h> 1269/// 1270/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. 1271/// 1272/// \param __a 1273/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1274/// used in the conversion. 
1275/// \returns A 32-bit integer containing the converted value. 1276static __inline__ int __DEFAULT_FN_ATTRS 1277_mm_cvt_ss2si(__m128 __a) 1278{ 1279 return _mm_cvtss_si32(__a); 1280} 1281 1282#ifdef __x86_64__ 1283 1284/// \brief Converts a float value contained in the lower 32 bits of a vector of 1285/// [4 x float] into a 64-bit integer. 1286/// 1287/// \headerfile <x86intrin.h> 1288/// 1289/// This intrinsic corresponds to the \c VCVTSS2SI / CVTSS2SI instructions. 1290/// 1291/// \param __a 1292/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1293/// used in the conversion. 1294/// \returns A 64-bit integer containing the converted value. 1295static __inline__ long long __DEFAULT_FN_ATTRS 1296_mm_cvtss_si64(__m128 __a) 1297{ 1298 return __builtin_ia32_cvtss2si64((__v4sf)__a); 1299} 1300 1301#endif 1302 1303/// \brief Converts two low-order float values in a 128-bit vector of 1304/// [4 x float] into a 64-bit vector of [2 x i32]. 1305/// 1306/// \headerfile <x86intrin.h> 1307/// 1308/// This intrinsic corresponds to the \c CVTPS2PI instruction. 1309/// 1310/// \param __a 1311/// A 128-bit vector of [4 x float]. 1312/// \returns A 64-bit integer vector containing the converted values. 1313static __inline__ __m64 __DEFAULT_FN_ATTRS 1314_mm_cvtps_pi32(__m128 __a) 1315{ 1316 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); 1317} 1318 1319/// \brief Converts two low-order float values in a 128-bit vector of 1320/// [4 x float] into a 64-bit vector of [2 x i32]. 1321/// 1322/// \headerfile <x86intrin.h> 1323/// 1324/// This intrinsic corresponds to the \c CVTPS2PI instruction. 1325/// 1326/// \param __a 1327/// A 128-bit vector of [4 x float]. 1328/// \returns A 64-bit integer vector containing the converted values. 
1329static __inline__ __m64 __DEFAULT_FN_ATTRS 1330_mm_cvt_ps2pi(__m128 __a) 1331{ 1332 return _mm_cvtps_pi32(__a); 1333} 1334 1335/// \brief Converts a float value contained in the lower 32 bits of a vector of 1336/// [4 x float] into a 32-bit integer, truncating the result when it is 1337/// inexact. 1338/// 1339/// \headerfile <x86intrin.h> 1340/// 1341/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. 1342/// 1343/// \param __a 1344/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1345/// used in the conversion. 1346/// \returns A 32-bit integer containing the converted value. 1347static __inline__ int __DEFAULT_FN_ATTRS 1348_mm_cvttss_si32(__m128 __a) 1349{ 1350 return __a[0]; 1351} 1352 1353/// \brief Converts a float value contained in the lower 32 bits of a vector of 1354/// [4 x float] into a 32-bit integer, truncating the result when it is 1355/// inexact. 1356/// 1357/// \headerfile <x86intrin.h> 1358/// 1359/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. 1360/// 1361/// \param __a 1362/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1363/// used in the conversion. 1364/// \returns A 32-bit integer containing the converted value. 1365static __inline__ int __DEFAULT_FN_ATTRS 1366_mm_cvtt_ss2si(__m128 __a) 1367{ 1368 return _mm_cvttss_si32(__a); 1369} 1370 1371/// \brief Converts a float value contained in the lower 32 bits of a vector of 1372/// [4 x float] into a 64-bit integer, truncating the result when it is 1373/// inexact. 1374/// 1375/// \headerfile <x86intrin.h> 1376/// 1377/// This intrinsic corresponds to the \c VCVTTSS2SI / CVTTSS2SI instructions. 1378/// 1379/// \param __a 1380/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are 1381/// used in the conversion. 1382/// \returns A 64-bit integer containing the converted value. 
1383static __inline__ long long __DEFAULT_FN_ATTRS 1384_mm_cvttss_si64(__m128 __a) 1385{ 1386 return __a[0]; 1387} 1388 1389/// \brief Converts two low-order float values in a 128-bit vector of 1390/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result 1391/// when it is inexact. 1392/// 1393/// \headerfile <x86intrin.h> 1394/// 1395/// This intrinsic corresponds to the \c CVTTPS2PI / VTTPS2PI instructions. 1396/// 1397/// \param __a 1398/// A 128-bit vector of [4 x float]. 1399/// \returns A 64-bit integer vector containing the converted values. 1400static __inline__ __m64 __DEFAULT_FN_ATTRS 1401_mm_cvttps_pi32(__m128 __a) 1402{ 1403 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); 1404} 1405 1406static __inline__ __m64 __DEFAULT_FN_ATTRS 1407_mm_cvtt_ps2pi(__m128 __a) 1408{ 1409 return _mm_cvttps_pi32(__a); 1410} 1411 1412static __inline__ __m128 __DEFAULT_FN_ATTRS 1413_mm_cvtsi32_ss(__m128 __a, int __b) 1414{ 1415 __a[0] = __b; 1416 return __a; 1417} 1418 1419static __inline__ __m128 __DEFAULT_FN_ATTRS 1420_mm_cvt_si2ss(__m128 __a, int __b) 1421{ 1422 return _mm_cvtsi32_ss(__a, __b); 1423} 1424 1425#ifdef __x86_64__ 1426 1427static __inline__ __m128 __DEFAULT_FN_ATTRS 1428_mm_cvtsi64_ss(__m128 __a, long long __b) 1429{ 1430 __a[0] = __b; 1431 return __a; 1432} 1433 1434#endif 1435 1436static __inline__ __m128 __DEFAULT_FN_ATTRS 1437_mm_cvtpi32_ps(__m128 __a, __m64 __b) 1438{ 1439 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); 1440} 1441 1442static __inline__ __m128 __DEFAULT_FN_ATTRS 1443_mm_cvt_pi2ps(__m128 __a, __m64 __b) 1444{ 1445 return _mm_cvtpi32_ps(__a, __b); 1446} 1447 1448static __inline__ float __DEFAULT_FN_ATTRS 1449_mm_cvtss_f32(__m128 __a) 1450{ 1451 return __a[0]; 1452} 1453 1454static __inline__ __m128 __DEFAULT_FN_ATTRS 1455_mm_loadh_pi(__m128 __a, const __m64 *__p) 1456{ 1457 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); 1458 struct __mm_loadh_pi_struct { 1459 __mm_loadh_pi_v2f32 __u; 
1460 } __attribute__((__packed__, __may_alias__)); 1461 __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u; 1462 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1463 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); 1464} 1465 1466static __inline__ __m128 __DEFAULT_FN_ATTRS 1467_mm_loadl_pi(__m128 __a, const __m64 *__p) 1468{ 1469 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); 1470 struct __mm_loadl_pi_struct { 1471 __mm_loadl_pi_v2f32 __u; 1472 } __attribute__((__packed__, __may_alias__)); 1473 __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u; 1474 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); 1475 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); 1476} 1477 1478static __inline__ __m128 __DEFAULT_FN_ATTRS 1479_mm_load_ss(const float *__p) 1480{ 1481 struct __mm_load_ss_struct { 1482 float __u; 1483 } __attribute__((__packed__, __may_alias__)); 1484 float __u = ((struct __mm_load_ss_struct*)__p)->__u; 1485 return (__m128){ __u, 0, 0, 0 }; 1486} 1487 1488static __inline__ __m128 __DEFAULT_FN_ATTRS 1489_mm_load1_ps(const float *__p) 1490{ 1491 struct __mm_load1_ps_struct { 1492 float __u; 1493 } __attribute__((__packed__, __may_alias__)); 1494 float __u = ((struct __mm_load1_ps_struct*)__p)->__u; 1495 return (__m128){ __u, __u, __u, __u }; 1496} 1497 1498#define _mm_load_ps1(p) _mm_load1_ps(p) 1499 1500static __inline__ __m128 __DEFAULT_FN_ATTRS 1501_mm_load_ps(const float *__p) 1502{ 1503 return *(__m128*)__p; 1504} 1505 1506static __inline__ __m128 __DEFAULT_FN_ATTRS 1507_mm_loadu_ps(const float *__p) 1508{ 1509 struct __loadu_ps { 1510 __m128 __v; 1511 } __attribute__((__packed__, __may_alias__)); 1512 return ((struct __loadu_ps*)__p)->__v; 1513} 1514 1515static __inline__ __m128 __DEFAULT_FN_ATTRS 1516_mm_loadr_ps(const float *__p) 1517{ 1518 __m128 __a = _mm_load_ps(__p); 1519 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 1520} 1521 1522static 
__inline__ __m128 __DEFAULT_FN_ATTRS 1523_mm_undefined_ps() 1524{ 1525 return (__m128)__builtin_ia32_undef128(); 1526} 1527 1528static __inline__ __m128 __DEFAULT_FN_ATTRS 1529_mm_set_ss(float __w) 1530{ 1531 return (__m128){ __w, 0, 0, 0 }; 1532} 1533 1534static __inline__ __m128 __DEFAULT_FN_ATTRS 1535_mm_set1_ps(float __w) 1536{ 1537 return (__m128){ __w, __w, __w, __w }; 1538} 1539 1540/* Microsoft specific. */ 1541static __inline__ __m128 __DEFAULT_FN_ATTRS 1542_mm_set_ps1(float __w) 1543{ 1544 return _mm_set1_ps(__w); 1545} 1546 1547static __inline__ __m128 __DEFAULT_FN_ATTRS 1548_mm_set_ps(float __z, float __y, float __x, float __w) 1549{ 1550 return (__m128){ __w, __x, __y, __z }; 1551} 1552 1553static __inline__ __m128 __DEFAULT_FN_ATTRS 1554_mm_setr_ps(float __z, float __y, float __x, float __w) 1555{ 1556 return (__m128){ __z, __y, __x, __w }; 1557} 1558 1559static __inline__ __m128 __DEFAULT_FN_ATTRS 1560_mm_setzero_ps(void) 1561{ 1562 return (__m128){ 0, 0, 0, 0 }; 1563} 1564 1565static __inline__ void __DEFAULT_FN_ATTRS 1566_mm_storeh_pi(__m64 *__p, __m128 __a) 1567{ 1568 __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); 1569} 1570 1571static __inline__ void __DEFAULT_FN_ATTRS 1572_mm_storel_pi(__m64 *__p, __m128 __a) 1573{ 1574 __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); 1575} 1576 1577static __inline__ void __DEFAULT_FN_ATTRS 1578_mm_store_ss(float *__p, __m128 __a) 1579{ 1580 struct __mm_store_ss_struct { 1581 float __u; 1582 } __attribute__((__packed__, __may_alias__)); 1583 ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; 1584} 1585 1586static __inline__ void __DEFAULT_FN_ATTRS 1587_mm_storeu_ps(float *__p, __m128 __a) 1588{ 1589 struct __storeu_ps { 1590 __m128 __v; 1591 } __attribute__((__packed__, __may_alias__)); 1592 ((struct __storeu_ps*)__p)->__v = __a; 1593} 1594 1595static __inline__ void __DEFAULT_FN_ATTRS 1596_mm_store_ps(float *__p, __m128 __a) 1597{ 1598 *(__m128*)__p = __a; 1599} 1600 1601static __inline__ void 
__DEFAULT_FN_ATTRS 1602_mm_store1_ps(float *__p, __m128 __a) 1603{ 1604 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); 1605 _mm_store_ps(__p, __a); 1606} 1607 1608static __inline__ void __DEFAULT_FN_ATTRS 1609_mm_store_ps1(float *__p, __m128 __a) 1610{ 1611 return _mm_store1_ps(__p, __a); 1612} 1613 1614static __inline__ void __DEFAULT_FN_ATTRS 1615_mm_storer_ps(float *__p, __m128 __a) 1616{ 1617 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); 1618 _mm_store_ps(__p, __a); 1619} 1620 1621#define _MM_HINT_T0 3 1622#define _MM_HINT_T1 2 1623#define _MM_HINT_T2 1 1624#define _MM_HINT_NTA 0 1625 1626#ifndef _MSC_VER 1627/* FIXME: We have to #define this because "sel" must be a constant integer, and 1628 Sema doesn't do any form of constant propagation yet. */ 1629 1630#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) 1631#endif 1632 1633static __inline__ void __DEFAULT_FN_ATTRS 1634_mm_stream_pi(__m64 *__p, __m64 __a) 1635{ 1636 __builtin_ia32_movntq(__p, __a); 1637} 1638 1639static __inline__ void __DEFAULT_FN_ATTRS 1640_mm_stream_ps(float *__p, __m128 __a) 1641{ 1642 __builtin_ia32_movntps(__p, (__v4sf)__a); 1643} 1644 1645static __inline__ void __DEFAULT_FN_ATTRS 1646_mm_sfence(void) 1647{ 1648 __builtin_ia32_sfence(); 1649} 1650 1651static __inline__ int __DEFAULT_FN_ATTRS 1652_mm_extract_pi16(__m64 __a, int __n) 1653{ 1654 __v4hi __b = (__v4hi)__a; 1655 return (unsigned short)__b[__n & 3]; 1656} 1657 1658static __inline__ __m64 __DEFAULT_FN_ATTRS 1659_mm_insert_pi16(__m64 __a, int __d, int __n) 1660{ 1661 __v4hi __b = (__v4hi)__a; 1662 __b[__n & 3] = __d; 1663 return (__m64)__b; 1664} 1665 1666static __inline__ __m64 __DEFAULT_FN_ATTRS 1667_mm_max_pi16(__m64 __a, __m64 __b) 1668{ 1669 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); 1670} 1671 1672static __inline__ __m64 __DEFAULT_FN_ATTRS 1673_mm_max_pu8(__m64 __a, __m64 __b) 1674{ 1675 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, 
(__v8qi)__b); 1676} 1677 1678static __inline__ __m64 __DEFAULT_FN_ATTRS 1679_mm_min_pi16(__m64 __a, __m64 __b) 1680{ 1681 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); 1682} 1683 1684static __inline__ __m64 __DEFAULT_FN_ATTRS 1685_mm_min_pu8(__m64 __a, __m64 __b) 1686{ 1687 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); 1688} 1689 1690static __inline__ int __DEFAULT_FN_ATTRS 1691_mm_movemask_pi8(__m64 __a) 1692{ 1693 return __builtin_ia32_pmovmskb((__v8qi)__a); 1694} 1695 1696static __inline__ __m64 __DEFAULT_FN_ATTRS 1697_mm_mulhi_pu16(__m64 __a, __m64 __b) 1698{ 1699 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); 1700} 1701 1702#define _mm_shuffle_pi16(a, n) __extension__ ({ \ 1703 (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) 1704 1705static __inline__ void __DEFAULT_FN_ATTRS 1706_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) 1707{ 1708 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); 1709} 1710 1711static __inline__ __m64 __DEFAULT_FN_ATTRS 1712_mm_avg_pu8(__m64 __a, __m64 __b) 1713{ 1714 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); 1715} 1716 1717static __inline__ __m64 __DEFAULT_FN_ATTRS 1718_mm_avg_pu16(__m64 __a, __m64 __b) 1719{ 1720 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); 1721} 1722 1723static __inline__ __m64 __DEFAULT_FN_ATTRS 1724_mm_sad_pu8(__m64 __a, __m64 __b) 1725{ 1726 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); 1727} 1728 1729static __inline__ unsigned int __DEFAULT_FN_ATTRS 1730_mm_getcsr(void) 1731{ 1732 return __builtin_ia32_stmxcsr(); 1733} 1734 1735static __inline__ void __DEFAULT_FN_ATTRS 1736_mm_setcsr(unsigned int __i) 1737{ 1738 __builtin_ia32_ldmxcsr(__i); 1739} 1740 1741#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ 1742 (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ 1743 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 1744 (((mask) & 0x30) >> 4) + 4, \ 1745 (((mask) & 0xc0) >> 6) + 
4); }) 1746 1747static __inline__ __m128 __DEFAULT_FN_ATTRS 1748_mm_unpackhi_ps(__m128 __a, __m128 __b) 1749{ 1750 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); 1751} 1752 1753static __inline__ __m128 __DEFAULT_FN_ATTRS 1754_mm_unpacklo_ps(__m128 __a, __m128 __b) 1755{ 1756 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); 1757} 1758 1759static __inline__ __m128 __DEFAULT_FN_ATTRS 1760_mm_move_ss(__m128 __a, __m128 __b) 1761{ 1762 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3); 1763} 1764 1765static __inline__ __m128 __DEFAULT_FN_ATTRS 1766_mm_movehl_ps(__m128 __a, __m128 __b) 1767{ 1768 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); 1769} 1770 1771static __inline__ __m128 __DEFAULT_FN_ATTRS 1772_mm_movelh_ps(__m128 __a, __m128 __b) 1773{ 1774 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); 1775} 1776 1777static __inline__ __m128 __DEFAULT_FN_ATTRS 1778_mm_cvtpi16_ps(__m64 __a) 1779{ 1780 __m64 __b, __c; 1781 __m128 __r; 1782 1783 __b = _mm_setzero_si64(); 1784 __b = _mm_cmpgt_pi16(__b, __a); 1785 __c = _mm_unpackhi_pi16(__a, __b); 1786 __r = _mm_setzero_ps(); 1787 __r = _mm_cvtpi32_ps(__r, __c); 1788 __r = _mm_movelh_ps(__r, __r); 1789 __c = _mm_unpacklo_pi16(__a, __b); 1790 __r = _mm_cvtpi32_ps(__r, __c); 1791 1792 return __r; 1793} 1794 1795static __inline__ __m128 __DEFAULT_FN_ATTRS 1796_mm_cvtpu16_ps(__m64 __a) 1797{ 1798 __m64 __b, __c; 1799 __m128 __r; 1800 1801 __b = _mm_setzero_si64(); 1802 __c = _mm_unpackhi_pi16(__a, __b); 1803 __r = _mm_setzero_ps(); 1804 __r = _mm_cvtpi32_ps(__r, __c); 1805 __r = _mm_movelh_ps(__r, __r); 1806 __c = _mm_unpacklo_pi16(__a, __b); 1807 __r = _mm_cvtpi32_ps(__r, __c); 1808 1809 return __r; 1810} 1811 1812static __inline__ __m128 __DEFAULT_FN_ATTRS 1813_mm_cvtpi8_ps(__m64 __a) 1814{ 1815 __m64 __b; 1816 1817 __b = _mm_setzero_si64(); 1818 __b = _mm_cmpgt_pi8(__b, __a); 1819 __b = _mm_unpacklo_pi8(__a, __b); 1820 
1821 return _mm_cvtpi16_ps(__b); 1822} 1823 1824static __inline__ __m128 __DEFAULT_FN_ATTRS 1825_mm_cvtpu8_ps(__m64 __a) 1826{ 1827 __m64 __b; 1828 1829 __b = _mm_setzero_si64(); 1830 __b = _mm_unpacklo_pi8(__a, __b); 1831 1832 return _mm_cvtpi16_ps(__b); 1833} 1834 1835static __inline__ __m128 __DEFAULT_FN_ATTRS 1836_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) 1837{ 1838 __m128 __c; 1839 1840 __c = _mm_setzero_ps(); 1841 __c = _mm_cvtpi32_ps(__c, __b); 1842 __c = _mm_movelh_ps(__c, __c); 1843 1844 return _mm_cvtpi32_ps(__c, __a); 1845} 1846 1847static __inline__ __m64 __DEFAULT_FN_ATTRS 1848_mm_cvtps_pi16(__m128 __a) 1849{ 1850 __m64 __b, __c; 1851 1852 __b = _mm_cvtps_pi32(__a); 1853 __a = _mm_movehl_ps(__a, __a); 1854 __c = _mm_cvtps_pi32(__a); 1855 1856 return _mm_packs_pi32(__b, __c); 1857} 1858 1859static __inline__ __m64 __DEFAULT_FN_ATTRS 1860_mm_cvtps_pi8(__m128 __a) 1861{ 1862 __m64 __b, __c; 1863 1864 __b = _mm_cvtps_pi16(__a); 1865 __c = _mm_setzero_si64(); 1866 1867 return _mm_packs_pi16(__b, __c); 1868} 1869 1870static __inline__ int __DEFAULT_FN_ATTRS 1871_mm_movemask_ps(__m128 __a) 1872{ 1873 return __builtin_ia32_movmskps((__v4sf)__a); 1874} 1875 1876 1877#ifdef _MSC_VER 1878#define _MM_ALIGN16 __declspec(align(16)) 1879#endif 1880 1881#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 1882 1883#define _MM_EXCEPT_INVALID (0x0001) 1884#define _MM_EXCEPT_DENORM (0x0002) 1885#define _MM_EXCEPT_DIV_ZERO (0x0004) 1886#define _MM_EXCEPT_OVERFLOW (0x0008) 1887#define _MM_EXCEPT_UNDERFLOW (0x0010) 1888#define _MM_EXCEPT_INEXACT (0x0020) 1889#define _MM_EXCEPT_MASK (0x003f) 1890 1891#define _MM_MASK_INVALID (0x0080) 1892#define _MM_MASK_DENORM (0x0100) 1893#define _MM_MASK_DIV_ZERO (0x0200) 1894#define _MM_MASK_OVERFLOW (0x0400) 1895#define _MM_MASK_UNDERFLOW (0x0800) 1896#define _MM_MASK_INEXACT (0x1000) 1897#define _MM_MASK_MASK (0x1f80) 1898 1899#define _MM_ROUND_NEAREST (0x0000) 1900#define _MM_ROUND_DOWN (0x2000) 1901#define 
_MM_ROUND_UP (0x4000) 1902#define _MM_ROUND_TOWARD_ZERO (0x6000) 1903#define _MM_ROUND_MASK (0x6000) 1904 1905#define _MM_FLUSH_ZERO_MASK (0x8000) 1906#define _MM_FLUSH_ZERO_ON (0x8000) 1907#define _MM_FLUSH_ZERO_OFF (0x0000) 1908 1909#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) 1910#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) 1911#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) 1912#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) 1913 1914#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) 1915#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) 1916#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) 1917#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) 1918 1919#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 1920do { \ 1921 __m128 tmp3, tmp2, tmp1, tmp0; \ 1922 tmp0 = _mm_unpacklo_ps((row0), (row1)); \ 1923 tmp2 = _mm_unpacklo_ps((row2), (row3)); \ 1924 tmp1 = _mm_unpackhi_ps((row0), (row1)); \ 1925 tmp3 = _mm_unpackhi_ps((row2), (row3)); \ 1926 (row0) = _mm_movelh_ps(tmp0, tmp2); \ 1927 (row1) = _mm_movehl_ps(tmp2, tmp0); \ 1928 (row2) = _mm_movelh_ps(tmp1, tmp3); \ 1929 (row3) = _mm_movehl_ps(tmp3, tmp1); \ 1930} while (0) 1931 1932/* Aliases for compatibility. 
*/ 1933#define _m_pextrw _mm_extract_pi16 1934#define _m_pinsrw _mm_insert_pi16 1935#define _m_pmaxsw _mm_max_pi16 1936#define _m_pmaxub _mm_max_pu8 1937#define _m_pminsw _mm_min_pi16 1938#define _m_pminub _mm_min_pu8 1939#define _m_pmovmskb _mm_movemask_pi8 1940#define _m_pmulhuw _mm_mulhi_pu16 1941#define _m_pshufw _mm_shuffle_pi16 1942#define _m_maskmovq _mm_maskmove_si64 1943#define _m_pavgb _mm_avg_pu8 1944#define _m_pavgw _mm_avg_pu16 1945#define _m_psadbw _mm_sad_pu8 1946#define _m_ _mm_ 1947#define _m_ _mm_ 1948 1949#undef __DEFAULT_FN_ATTRS 1950 1951/* Ugly hack for backwards-compatibility (compatible with gcc) */ 1952#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) 1953#include <emmintrin.h> 1954#endif 1955 1956#endif /* __XMMINTRIN_H */ 1957