1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#include <xmmintrin.h> 28 29typedef double __m128d __attribute__((__vector_size__(16))); 30typedef long long __m128i __attribute__((__vector_size__(16))); 31 32/* Type defines. 
*/ 33typedef double __v2df __attribute__ ((__vector_size__ (16))); 34typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35typedef short __v8hi __attribute__((__vector_size__(16))); 36typedef char __v16qi __attribute__((__vector_size__(16))); 37 38/* Unsigned types */ 39typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); 40typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 41typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 42 43/* We need an explicitly signed variant for char. Note that this shouldn't 44 * appear in the interface though. */ 45typedef signed char __v16qs __attribute__((__vector_size__(16))); 46 47#include <f16cintrin.h> 48 49/* Define the default attributes for the functions in this file. */ 50#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 51 52/// \brief Adds lower double-precision values in both operands and returns the 53/// sum in the lower 64 bits of the result. The upper 64 bits of the result 54/// are copied from the upper double-precision value of the first operand. 55/// 56/// \headerfile <x86intrin.h> 57/// 58/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 59/// 60/// \param __a 61/// A 128-bit vector of [2 x double] containing one of the source operands. 62/// \param __b 63/// A 128-bit vector of [2 x double] containing one of the source operands. 64/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 65/// sum of the lower 64 bits of both operands. The upper 64 bits are copied 66/// from the upper 64 bits of the first source operand. 67static __inline__ __m128d __DEFAULT_FN_ATTRS 68_mm_add_sd(__m128d __a, __m128d __b) 69{ 70 __a[0] += __b[0]; 71 return __a; 72} 73 74/// \brief Adds two 128-bit vectors of [2 x double]. 75/// 76/// \headerfile <x86intrin.h> 77/// 78/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 
79/// 80/// \param __a 81/// A 128-bit vector of [2 x double] containing one of the source operands. 82/// \param __b 83/// A 128-bit vector of [2 x double] containing one of the source operands. 84/// \returns A 128-bit vector of [2 x double] containing the sums of both 85/// operands. 86static __inline__ __m128d __DEFAULT_FN_ATTRS 87_mm_add_pd(__m128d __a, __m128d __b) 88{ 89 return (__m128d)((__v2df)__a + (__v2df)__b); 90} 91 92/// \brief Subtracts the lower double-precision value of the second operand 93/// from the lower double-precision value of the first operand and returns 94/// the difference in the lower 64 bits of the result. The upper 64 bits of 95/// the result are copied from the upper double-precision value of the first 96/// operand. 97/// 98/// \headerfile <x86intrin.h> 99/// 100/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 101/// 102/// \param __a 103/// A 128-bit vector of [2 x double] containing the minuend. 104/// \param __b 105/// A 128-bit vector of [2 x double] containing the subtrahend. 106/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 107/// difference of the lower 64 bits of both operands. The upper 64 bits are 108/// copied from the upper 64 bits of the first source operand. 109static __inline__ __m128d __DEFAULT_FN_ATTRS 110_mm_sub_sd(__m128d __a, __m128d __b) 111{ 112 __a[0] -= __b[0]; 113 return __a; 114} 115 116/// \brief Subtracts two 128-bit vectors of [2 x double]. 117/// 118/// \headerfile <x86intrin.h> 119/// 120/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 121/// 122/// \param __a 123/// A 128-bit vector of [2 x double] containing the minuend. 124/// \param __b 125/// A 128-bit vector of [2 x double] containing the subtrahend. 126/// \returns A 128-bit vector of [2 x double] containing the differences between 127/// both operands. 
128static __inline__ __m128d __DEFAULT_FN_ATTRS 129_mm_sub_pd(__m128d __a, __m128d __b) 130{ 131 return (__m128d)((__v2df)__a - (__v2df)__b); 132} 133 134/// \brief Multiplies lower double-precision values in both operands and returns 135/// the product in the lower 64 bits of the result. The upper 64 bits of the 136/// result are copied from the upper double-precision value of the first 137/// operand. 138/// 139/// \headerfile <x86intrin.h> 140/// 141/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 142/// 143/// \param __a 144/// A 128-bit vector of [2 x double] containing one of the source operands. 145/// \param __b 146/// A 128-bit vector of [2 x double] containing one of the source operands. 147/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 148/// product of the lower 64 bits of both operands. The upper 64 bits are 149/// copied from the upper 64 bits of the first source operand. 150static __inline__ __m128d __DEFAULT_FN_ATTRS 151_mm_mul_sd(__m128d __a, __m128d __b) 152{ 153 __a[0] *= __b[0]; 154 return __a; 155} 156 157/// \brief Multiplies two 128-bit vectors of [2 x double]. 158/// 159/// \headerfile <x86intrin.h> 160/// 161/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 162/// 163/// \param __a 164/// A 128-bit vector of [2 x double] containing one of the operands. 165/// \param __b 166/// A 128-bit vector of [2 x double] containing one of the operands. 167/// \returns A 128-bit vector of [2 x double] containing the products of both 168/// operands. 169static __inline__ __m128d __DEFAULT_FN_ATTRS 170_mm_mul_pd(__m128d __a, __m128d __b) 171{ 172 return (__m128d)((__v2df)__a * (__v2df)__b); 173} 174 175/// \brief Divides the lower double-precision value of the first operand by the 176/// lower double-precision value of the second operand and returns the 177/// quotient in the lower 64 bits of the result. 
The upper 64 bits of the 178/// result are copied from the upper double-precision value of the first 179/// operand. 180/// 181/// \headerfile <x86intrin.h> 182/// 183/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 184/// 185/// \param __a 186/// A 128-bit vector of [2 x double] containing the dividend. 187/// \param __b 188/// A 128-bit vector of [2 x double] containing divisor. 189/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 190/// quotient of the lower 64 bits of both operands. The upper 64 bits are 191/// copied from the upper 64 bits of the first source operand. 192static __inline__ __m128d __DEFAULT_FN_ATTRS 193_mm_div_sd(__m128d __a, __m128d __b) 194{ 195 __a[0] /= __b[0]; 196 return __a; 197} 198 199/// \brief Performs an element-by-element division of two 128-bit vectors of 200/// [2 x double]. 201/// 202/// \headerfile <x86intrin.h> 203/// 204/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 205/// 206/// \param __a 207/// A 128-bit vector of [2 x double] containing the dividend. 208/// \param __b 209/// A 128-bit vector of [2 x double] containing the divisor. 210/// \returns A 128-bit vector of [2 x double] containing the quotients of both 211/// operands. 212static __inline__ __m128d __DEFAULT_FN_ATTRS 213_mm_div_pd(__m128d __a, __m128d __b) 214{ 215 return (__m128d)((__v2df)__a / (__v2df)__b); 216} 217 218/// \brief Calculates the square root of the lower double-precision value of 219/// the second operand and returns it in the lower 64 bits of the result. 220/// The upper 64 bits of the result are copied from the upper double- 221/// precision value of the first operand. 222/// 223/// \headerfile <x86intrin.h> 224/// 225/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 226/// 227/// \param __a 228/// A 128-bit vector of [2 x double] containing one of the operands. 
The 229/// upper 64 bits of this operand are copied to the upper 64 bits of the 230/// result. 231/// \param __b 232/// A 128-bit vector of [2 x double] containing one of the operands. The 233/// square root is calculated using the lower 64 bits of this operand. 234/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 235/// square root of the lower 64 bits of operand \a __b, and whose upper 64 236/// bits are copied from the upper 64 bits of operand \a __a. 237static __inline__ __m128d __DEFAULT_FN_ATTRS 238_mm_sqrt_sd(__m128d __a, __m128d __b) 239{ 240 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 241 return (__m128d) { __c[0], __a[1] }; 242} 243 244/// \brief Calculates the square root of the each of two values stored in a 245/// 128-bit vector of [2 x double]. 246/// 247/// \headerfile <x86intrin.h> 248/// 249/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 250/// 251/// \param __a 252/// A 128-bit vector of [2 x double]. 253/// \returns A 128-bit vector of [2 x double] containing the square roots of the 254/// values in the operand. 255static __inline__ __m128d __DEFAULT_FN_ATTRS 256_mm_sqrt_pd(__m128d __a) 257{ 258 return __builtin_ia32_sqrtpd((__v2df)__a); 259} 260 261/// \brief Compares lower 64-bit double-precision values of both operands, and 262/// returns the lesser of the pair of values in the lower 64-bits of the 263/// result. The upper 64 bits of the result are copied from the upper double- 264/// precision value of the first operand. 265/// 266/// \headerfile <x86intrin.h> 267/// 268/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 269/// 270/// \param __a 271/// A 128-bit vector of [2 x double] containing one of the operands. The 272/// lower 64 bits of this operand are used in the comparison. 273/// \param __b 274/// A 128-bit vector of [2 x double] containing one of the operands. The 275/// lower 64 bits of this operand are used in the comparison. 
276/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 277/// minimum value between both operands. The upper 64 bits are copied from 278/// the upper 64 bits of the first source operand. 279static __inline__ __m128d __DEFAULT_FN_ATTRS 280_mm_min_sd(__m128d __a, __m128d __b) 281{ 282 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 283} 284 285/// \brief Performs element-by-element comparison of the two 128-bit vectors of 286/// [2 x double] and returns the vector containing the lesser of each pair of 287/// values. 288/// 289/// \headerfile <x86intrin.h> 290/// 291/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 292/// 293/// \param __a 294/// A 128-bit vector of [2 x double] containing one of the operands. 295/// \param __b 296/// A 128-bit vector of [2 x double] containing one of the operands. 297/// \returns A 128-bit vector of [2 x double] containing the minimum values 298/// between both operands. 299static __inline__ __m128d __DEFAULT_FN_ATTRS 300_mm_min_pd(__m128d __a, __m128d __b) 301{ 302 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 303} 304 305/// \brief Compares lower 64-bits double-precision values of both operands, and 306/// returns the greater of the pair of values in the lower 64-bits of the 307/// result. The upper 64 bits of the result are copied from the upper double- 308/// precision value of the first operand. 309/// 310/// \headerfile <x86intrin.h> 311/// 312/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 313/// 314/// \param __a 315/// A 128-bit vector of [2 x double] containing one of the operands. The 316/// lower 64 bits of this operand are used in the comparison. 317/// \param __b 318/// A 128-bit vector of [2 x double] containing one of the operands. The 319/// lower 64 bits of this operand are used in the comparison. 320/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 321/// maximum value between both operands. 
The upper 64 bits are copied from 322/// the upper 64 bits of the first source operand. 323static __inline__ __m128d __DEFAULT_FN_ATTRS 324_mm_max_sd(__m128d __a, __m128d __b) 325{ 326 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 327} 328 329/// \brief Performs element-by-element comparison of the two 128-bit vectors of 330/// [2 x double] and returns the vector containing the greater of each pair 331/// of values. 332/// 333/// \headerfile <x86intrin.h> 334/// 335/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 336/// 337/// \param __a 338/// A 128-bit vector of [2 x double] containing one of the operands. 339/// \param __b 340/// A 128-bit vector of [2 x double] containing one of the operands. 341/// \returns A 128-bit vector of [2 x double] containing the maximum values 342/// between both operands. 343static __inline__ __m128d __DEFAULT_FN_ATTRS 344_mm_max_pd(__m128d __a, __m128d __b) 345{ 346 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 347} 348 349/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double]. 350/// 351/// \headerfile <x86intrin.h> 352/// 353/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 354/// 355/// \param __a 356/// A 128-bit vector of [2 x double] containing one of the source operands. 357/// \param __b 358/// A 128-bit vector of [2 x double] containing one of the source operands. 359/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 360/// values between both operands. 361static __inline__ __m128d __DEFAULT_FN_ATTRS 362_mm_and_pd(__m128d __a, __m128d __b) 363{ 364 return (__m128d)((__v2du)__a & (__v2du)__b); 365} 366 367/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using 368/// the one's complement of the values contained in the first source operand. 369/// 370/// \headerfile <x86intrin.h> 371/// 372/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 
373/// 374/// \param __a 375/// A 128-bit vector of [2 x double] containing the left source operand. The 376/// one's complement of this value is used in the bitwise AND. 377/// \param __b 378/// A 128-bit vector of [2 x double] containing the right source operand. 379/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 380/// values in the second operand and the one's complement of the first 381/// operand. 382static __inline__ __m128d __DEFAULT_FN_ATTRS 383_mm_andnot_pd(__m128d __a, __m128d __b) 384{ 385 return (__m128d)(~(__v2du)__a & (__v2du)__b); 386} 387 388/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double]. 389/// 390/// \headerfile <x86intrin.h> 391/// 392/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 393/// 394/// \param __a 395/// A 128-bit vector of [2 x double] containing one of the source operands. 396/// \param __b 397/// A 128-bit vector of [2 x double] containing one of the source operands. 398/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 399/// values between both operands. 400static __inline__ __m128d __DEFAULT_FN_ATTRS 401_mm_or_pd(__m128d __a, __m128d __b) 402{ 403 return (__m128d)((__v2du)__a | (__v2du)__b); 404} 405 406/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 407/// 408/// \headerfile <x86intrin.h> 409/// 410/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 411/// 412/// \param __a 413/// A 128-bit vector of [2 x double] containing one of the source operands. 414/// \param __b 415/// A 128-bit vector of [2 x double] containing one of the source operands. 416/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 417/// values between both operands. 
418static __inline__ __m128d __DEFAULT_FN_ATTRS 419_mm_xor_pd(__m128d __a, __m128d __b) 420{ 421 return (__m128d)((__v2du)__a ^ (__v2du)__b); 422} 423 424/// \brief Compares each of the corresponding double-precision values of the 425/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h 426/// for false, FFFFFFFFFFFFFFFFh for true. 427/// 428/// \headerfile <x86intrin.h> 429/// 430/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 431/// 432/// \param __a 433/// A 128-bit vector of [2 x double]. 434/// \param __b 435/// A 128-bit vector of [2 x double]. 436/// \returns A 128-bit vector containing the comparison results. 437static __inline__ __m128d __DEFAULT_FN_ATTRS 438_mm_cmpeq_pd(__m128d __a, __m128d __b) 439{ 440 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 441} 442 443/// \brief Compares each of the corresponding double-precision values of the 444/// 128-bit vectors of [2 x double] to determine if the values in the first 445/// operand are less than those in the second operand. Each comparison 446/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 447/// 448/// \headerfile <x86intrin.h> 449/// 450/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 451/// 452/// \param __a 453/// A 128-bit vector of [2 x double]. 454/// \param __b 455/// A 128-bit vector of [2 x double]. 456/// \returns A 128-bit vector containing the comparison results. 457static __inline__ __m128d __DEFAULT_FN_ATTRS 458_mm_cmplt_pd(__m128d __a, __m128d __b) 459{ 460 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 461} 462 463/// \brief Compares each of the corresponding double-precision values of the 464/// 128-bit vectors of [2 x double] to determine if the values in the first 465/// operand are less than or equal to those in the second operand. Each 466/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 
467/// 468/// \headerfile <x86intrin.h> 469/// 470/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 471/// 472/// \param __a 473/// A 128-bit vector of [2 x double]. 474/// \param __b 475/// A 128-bit vector of [2 x double]. 476/// \returns A 128-bit vector containing the comparison results. 477static __inline__ __m128d __DEFAULT_FN_ATTRS 478_mm_cmple_pd(__m128d __a, __m128d __b) 479{ 480 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 481} 482 483/// \brief Compares each of the corresponding double-precision values of the 484/// 128-bit vectors of [2 x double] to determine if the values in the first 485/// operand are greater than those in the second operand. Each comparison 486/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 487/// 488/// \headerfile <x86intrin.h> 489/// 490/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 491/// 492/// \param __a 493/// A 128-bit vector of [2 x double]. 494/// \param __b 495/// A 128-bit vector of [2 x double]. 496/// \returns A 128-bit vector containing the comparison results. 497static __inline__ __m128d __DEFAULT_FN_ATTRS 498_mm_cmpgt_pd(__m128d __a, __m128d __b) 499{ 500 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 501} 502 503/// \brief Compares each of the corresponding double-precision values of the 504/// 128-bit vectors of [2 x double] to determine if the values in the first 505/// operand are greater than or equal to those in the second operand. Each 506/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 507/// 508/// \headerfile <x86intrin.h> 509/// 510/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 511/// 512/// \param __a 513/// A 128-bit vector of [2 x double]. 514/// \param __b 515/// A 128-bit vector of [2 x double]. 516/// \returns A 128-bit vector containing the comparison results. 
517static __inline__ __m128d __DEFAULT_FN_ATTRS 518_mm_cmpge_pd(__m128d __a, __m128d __b) 519{ 520 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 521} 522 523/// \brief Compares each of the corresponding double-precision values of the 524/// 128-bit vectors of [2 x double] to determine if the values in the first 525/// operand are ordered with respect to those in the second operand. A pair 526/// of double-precision values are "ordered" with respect to each other if 527/// neither value is a NaN. Each comparison yields 0h for false, 528/// FFFFFFFFFFFFFFFFh for true. 529/// 530/// \headerfile <x86intrin.h> 531/// 532/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 533/// 534/// \param __a 535/// A 128-bit vector of [2 x double]. 536/// \param __b 537/// A 128-bit vector of [2 x double]. 538/// \returns A 128-bit vector containing the comparison results. 539static __inline__ __m128d __DEFAULT_FN_ATTRS 540_mm_cmpord_pd(__m128d __a, __m128d __b) 541{ 542 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 543} 544 545/// \brief Compares each of the corresponding double-precision values of the 546/// 128-bit vectors of [2 x double] to determine if the values in the first 547/// operand are unordered with respect to those in the second operand. A pair 548/// of double-precision values are "unordered" with respect to each other if 549/// one or both values are NaN. Each comparison yields 0h for false, 550/// FFFFFFFFFFFFFFFFh for true. 551/// 552/// \headerfile <x86intrin.h> 553/// 554/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 555/// instruction. 556/// 557/// \param __a 558/// A 128-bit vector of [2 x double]. 559/// \param __b 560/// A 128-bit vector of [2 x double]. 561/// \returns A 128-bit vector containing the comparison results. 
562static __inline__ __m128d __DEFAULT_FN_ATTRS 563_mm_cmpunord_pd(__m128d __a, __m128d __b) 564{ 565 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 566} 567 568/// \brief Compares each of the corresponding double-precision values of the 569/// 128-bit vectors of [2 x double] to determine if the values in the first 570/// operand are unequal to those in the second operand. Each comparison 571/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 572/// 573/// \headerfile <x86intrin.h> 574/// 575/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 576/// 577/// \param __a 578/// A 128-bit vector of [2 x double]. 579/// \param __b 580/// A 128-bit vector of [2 x double]. 581/// \returns A 128-bit vector containing the comparison results. 582static __inline__ __m128d __DEFAULT_FN_ATTRS 583_mm_cmpneq_pd(__m128d __a, __m128d __b) 584{ 585 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 586} 587 588/// \brief Compares each of the corresponding double-precision values of the 589/// 128-bit vectors of [2 x double] to determine if the values in the first 590/// operand are not less than those in the second operand. Each comparison 591/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 592/// 593/// \headerfile <x86intrin.h> 594/// 595/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 596/// 597/// \param __a 598/// A 128-bit vector of [2 x double]. 599/// \param __b 600/// A 128-bit vector of [2 x double]. 601/// \returns A 128-bit vector containing the comparison results. 602static __inline__ __m128d __DEFAULT_FN_ATTRS 603_mm_cmpnlt_pd(__m128d __a, __m128d __b) 604{ 605 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 606} 607 608/// \brief Compares each of the corresponding double-precision values of the 609/// 128-bit vectors of [2 x double] to determine if the values in the first 610/// operand are not less than or equal to those in the second operand. 
Each 611/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 612/// 613/// \headerfile <x86intrin.h> 614/// 615/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 616/// 617/// \param __a 618/// A 128-bit vector of [2 x double]. 619/// \param __b 620/// A 128-bit vector of [2 x double]. 621/// \returns A 128-bit vector containing the comparison results. 622static __inline__ __m128d __DEFAULT_FN_ATTRS 623_mm_cmpnle_pd(__m128d __a, __m128d __b) 624{ 625 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 626} 627 628/// \brief Compares each of the corresponding double-precision values of the 629/// 128-bit vectors of [2 x double] to determine if the values in the first 630/// operand are not greater than those in the second operand. Each 631/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 632/// 633/// \headerfile <x86intrin.h> 634/// 635/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 636/// 637/// \param __a 638/// A 128-bit vector of [2 x double]. 639/// \param __b 640/// A 128-bit vector of [2 x double]. 641/// \returns A 128-bit vector containing the comparison results. 642static __inline__ __m128d __DEFAULT_FN_ATTRS 643_mm_cmpngt_pd(__m128d __a, __m128d __b) 644{ 645 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 646} 647 648/// \brief Compares each of the corresponding double-precision values of the 649/// 128-bit vectors of [2 x double] to determine if the values in the first 650/// operand are not greater than or equal to those in the second operand. 651/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 652/// 653/// \headerfile <x86intrin.h> 654/// 655/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 656/// 657/// \param __a 658/// A 128-bit vector of [2 x double]. 659/// \param __b 660/// A 128-bit vector of [2 x double]. 661/// \returns A 128-bit vector containing the comparison results. 
662static __inline__ __m128d __DEFAULT_FN_ATTRS 663_mm_cmpnge_pd(__m128d __a, __m128d __b) 664{ 665 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 666} 667 668/// \brief Compares the lower double-precision floating-point values in each of 669/// the two 128-bit floating-point vectors of [2 x double] for equality. The 670/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 671/// 672/// \headerfile <x86intrin.h> 673/// 674/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 675/// 676/// \param __a 677/// A 128-bit vector of [2 x double]. The lower double-precision value is 678/// compared to the lower double-precision value of \a __b. 679/// \param __b 680/// A 128-bit vector of [2 x double]. The lower double-precision value is 681/// compared to the lower double-precision value of \a __a. 682/// \returns A 128-bit vector. The lower 64 bits contains the comparison 683/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 684static __inline__ __m128d __DEFAULT_FN_ATTRS 685_mm_cmpeq_sd(__m128d __a, __m128d __b) 686{ 687 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 688} 689 690/// \brief Compares the lower double-precision floating-point values in each of 691/// the two 128-bit floating-point vectors of [2 x double] to determine if 692/// the value in the first parameter is less than the corresponding value in 693/// the second parameter. The comparison yields 0h for false, 694/// FFFFFFFFFFFFFFFFh for true. 695/// 696/// \headerfile <x86intrin.h> 697/// 698/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 699/// 700/// \param __a 701/// A 128-bit vector of [2 x double]. The lower double-precision value is 702/// compared to the lower double-precision value of \a __b. 703/// \param __b 704/// A 128-bit vector of [2 x double]. The lower double-precision value is 705/// compared to the lower double-precision value of \a __a. 
706/// \returns A 128-bit vector. The lower 64 bits contains the comparison 707/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 708static __inline__ __m128d __DEFAULT_FN_ATTRS 709_mm_cmplt_sd(__m128d __a, __m128d __b) 710{ 711 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 712} 713 714/// \brief Compares the lower double-precision floating-point values in each of 715/// the two 128-bit floating-point vectors of [2 x double] to determine if 716/// the value in the first parameter is less than or equal to the 717/// corresponding value in the second parameter. The comparison yields 0h for 718/// false, FFFFFFFFFFFFFFFFh for true. 719/// 720/// \headerfile <x86intrin.h> 721/// 722/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 723/// 724/// \param __a 725/// A 128-bit vector of [2 x double]. The lower double-precision value is 726/// compared to the lower double-precision value of \a __b. 727/// \param __b 728/// A 128-bit vector of [2 x double]. The lower double-precision value is 729/// compared to the lower double-precision value of \a __a. 730/// \returns A 128-bit vector. The lower 64 bits contains the comparison 731/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 732static __inline__ __m128d __DEFAULT_FN_ATTRS 733_mm_cmple_sd(__m128d __a, __m128d __b) 734{ 735 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 736} 737 738/// \brief Compares the lower double-precision floating-point values in each of 739/// the two 128-bit floating-point vectors of [2 x double] to determine if 740/// the value in the first parameter is greater than the corresponding value 741/// in the second parameter. The comparison yields 0h for false, 742/// FFFFFFFFFFFFFFFFh for true. 743/// 744/// \headerfile <x86intrin.h> 745/// 746/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 
747/// 748/// \param __a 749/// A 128-bit vector of [2 x double]. The lower double-precision value is 750/// compared to the lower double-precision value of \a __b. 751/// \param __b 752/// A 128-bit vector of [2 x double]. The lower double-precision value is 753/// compared to the lower double-precision value of \a __a. 754/// \returns A 128-bit vector. The lower 64 bits contains the comparison 755/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 756static __inline__ __m128d __DEFAULT_FN_ATTRS 757_mm_cmpgt_sd(__m128d __a, __m128d __b) 758{ 759 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 760 return (__m128d) { __c[0], __a[1] }; 761} 762 763/// \brief Compares the lower double-precision floating-point values in each of 764/// the two 128-bit floating-point vectors of [2 x double] to determine if 765/// the value in the first parameter is greater than or equal to the 766/// corresponding value in the second parameter. The comparison yields 0h for 767/// false, FFFFFFFFFFFFFFFFh for true. 768/// 769/// \headerfile <x86intrin.h> 770/// 771/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 772/// 773/// \param __a 774/// A 128-bit vector of [2 x double]. The lower double-precision value is 775/// compared to the lower double-precision value of \a __b. 776/// \param __b 777/// A 128-bit vector of [2 x double]. The lower double-precision value is 778/// compared to the lower double-precision value of \a __a. 779/// \returns A 128-bit vector. The lower 64 bits contains the comparison 780/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
781static __inline__ __m128d __DEFAULT_FN_ATTRS 782_mm_cmpge_sd(__m128d __a, __m128d __b) 783{ 784 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 785 return (__m128d) { __c[0], __a[1] }; 786} 787 788/// \brief Compares the lower double-precision floating-point values in each of 789/// the two 128-bit floating-point vectors of [2 x double] to determine if 790/// the value in the first parameter is "ordered" with respect to the 791/// corresponding value in the second parameter. The comparison yields 0h for 792/// false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are 793/// "ordered" with respect to each other if neither value is a NaN. 794/// 795/// \headerfile <x86intrin.h> 796/// 797/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 798/// 799/// \param __a 800/// A 128-bit vector of [2 x double]. The lower double-precision value is 801/// compared to the lower double-precision value of \a __b. 802/// \param __b 803/// A 128-bit vector of [2 x double]. The lower double-precision value is 804/// compared to the lower double-precision value of \a __a. 805/// \returns A 128-bit vector. The lower 64 bits contains the comparison 806/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 807static __inline__ __m128d __DEFAULT_FN_ATTRS 808_mm_cmpord_sd(__m128d __a, __m128d __b) 809{ 810 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 811} 812 813/// \brief Compares the lower double-precision floating-point values in each of 814/// the two 128-bit floating-point vectors of [2 x double] to determine if 815/// the value in the first parameter is "unordered" with respect to the 816/// corresponding value in the second parameter. The comparison yields 0h 817/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values 818/// are "unordered" with respect to each other if one or both values are NaN. 
819/// 820/// \headerfile <x86intrin.h> 821/// 822/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 823/// instruction. 824/// 825/// \param __a 826/// A 128-bit vector of [2 x double]. The lower double-precision value is 827/// compared to the lower double-precision value of \a __b. 828/// \param __b 829/// A 128-bit vector of [2 x double]. The lower double-precision value is 830/// compared to the lower double-precision value of \a __a. 831/// \returns A 128-bit vector. The lower 64 bits contains the comparison 832/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 833static __inline__ __m128d __DEFAULT_FN_ATTRS 834_mm_cmpunord_sd(__m128d __a, __m128d __b) 835{ 836 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 837} 838 839/// \brief Compares the lower double-precision floating-point values in each of 840/// the two 128-bit floating-point vectors of [2 x double] to determine if 841/// the value in the first parameter is unequal to the corresponding value in 842/// the second parameter. The comparison yields 0h for false, 843/// FFFFFFFFFFFFFFFFh for true. 844/// 845/// \headerfile <x86intrin.h> 846/// 847/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 848/// 849/// \param __a 850/// A 128-bit vector of [2 x double]. The lower double-precision value is 851/// compared to the lower double-precision value of \a __b. 852/// \param __b 853/// A 128-bit vector of [2 x double]. The lower double-precision value is 854/// compared to the lower double-precision value of \a __a. 855/// \returns A 128-bit vector. The lower 64 bits contains the comparison 856/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
857static __inline__ __m128d __DEFAULT_FN_ATTRS 858_mm_cmpneq_sd(__m128d __a, __m128d __b) 859{ 860 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 861} 862 863/// \brief Compares the lower double-precision floating-point values in each of 864/// the two 128-bit floating-point vectors of [2 x double] to determine if 865/// the value in the first parameter is not less than the corresponding 866/// value in the second parameter. The comparison yields 0h for false, 867/// FFFFFFFFFFFFFFFFh for true. 868/// 869/// \headerfile <x86intrin.h> 870/// 871/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 872/// 873/// \param __a 874/// A 128-bit vector of [2 x double]. The lower double-precision value is 875/// compared to the lower double-precision value of \a __b. 876/// \param __b 877/// A 128-bit vector of [2 x double]. The lower double-precision value is 878/// compared to the lower double-precision value of \a __a. 879/// \returns A 128-bit vector. The lower 64 bits contains the comparison 880/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 881static __inline__ __m128d __DEFAULT_FN_ATTRS 882_mm_cmpnlt_sd(__m128d __a, __m128d __b) 883{ 884 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 885} 886 887/// \brief Compares the lower double-precision floating-point values in each of 888/// the two 128-bit floating-point vectors of [2 x double] to determine if 889/// the value in the first parameter is not less than or equal to the 890/// corresponding value in the second parameter. The comparison yields 0h 891/// for false, FFFFFFFFFFFFFFFFh for true. 892/// 893/// \headerfile <x86intrin.h> 894/// 895/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 896/// 897/// \param __a 898/// A 128-bit vector of [2 x double]. The lower double-precision value is 899/// compared to the lower double-precision value of \a __b. 
900/// \param __b 901/// A 128-bit vector of [2 x double]. The lower double-precision value is 902/// compared to the lower double-precision value of \a __a. 903/// \returns A 128-bit vector. The lower 64 bits contains the comparison 904/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 905static __inline__ __m128d __DEFAULT_FN_ATTRS 906_mm_cmpnle_sd(__m128d __a, __m128d __b) 907{ 908 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 909} 910 911/// \brief Compares the lower double-precision floating-point values in each of 912/// the two 128-bit floating-point vectors of [2 x double] to determine if 913/// the value in the first parameter is not greater than the corresponding 914/// value in the second parameter. The comparison yields 0h for false, 915/// FFFFFFFFFFFFFFFFh for true. 916/// 917/// \headerfile <x86intrin.h> 918/// 919/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 920/// 921/// \param __a 922/// A 128-bit vector of [2 x double]. The lower double-precision value is 923/// compared to the lower double-precision value of \a __b. 924/// \param __b 925/// A 128-bit vector of [2 x double]. The lower double-precision value is 926/// compared to the lower double-precision value of \a __a. 927/// \returns A 128-bit vector. The lower 64 bits contains the comparison 928/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 929static __inline__ __m128d __DEFAULT_FN_ATTRS 930_mm_cmpngt_sd(__m128d __a, __m128d __b) 931{ 932 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 933 return (__m128d) { __c[0], __a[1] }; 934} 935 936/// \brief Compares the lower double-precision floating-point values in each of 937/// the two 128-bit floating-point vectors of [2 x double] to determine if 938/// the value in the first parameter is not greater than or equal to the 939/// corresponding value in the second parameter. 
The comparison yields 0h 940/// for false, FFFFFFFFFFFFFFFFh for true. 941/// 942/// \headerfile <x86intrin.h> 943/// 944/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 945/// 946/// \param __a 947/// A 128-bit vector of [2 x double]. The lower double-precision value is 948/// compared to the lower double-precision value of \a __b. 949/// \param __b 950/// A 128-bit vector of [2 x double]. The lower double-precision value is 951/// compared to the lower double-precision value of \a __a. 952/// \returns A 128-bit vector. The lower 64 bits contains the comparison 953/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 954static __inline__ __m128d __DEFAULT_FN_ATTRS 955_mm_cmpnge_sd(__m128d __a, __m128d __b) 956{ 957 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 958 return (__m128d) { __c[0], __a[1] }; 959} 960 961/// \brief Compares the lower double-precision floating-point values in each of 962/// the two 128-bit floating-point vectors of [2 x double] for equality. The 963/// comparison yields 0 for false, 1 for true. 964/// 965/// \headerfile <x86intrin.h> 966/// 967/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 968/// 969/// \param __a 970/// A 128-bit vector of [2 x double]. The lower double-precision value is 971/// compared to the lower double-precision value of \a __b. 972/// \param __b 973/// A 128-bit vector of [2 x double]. The lower double-precision value is 974/// compared to the lower double-precision value of \a __a. 975/// \returns An integer containing the comparison results. 
976static __inline__ int __DEFAULT_FN_ATTRS 977_mm_comieq_sd(__m128d __a, __m128d __b) 978{ 979 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 980} 981 982/// \brief Compares the lower double-precision floating-point values in each of 983/// the two 128-bit floating-point vectors of [2 x double] to determine if 984/// the value in the first parameter is less than the corresponding value in 985/// the second parameter. The comparison yields 0 for false, 1 for true. 986/// 987/// \headerfile <x86intrin.h> 988/// 989/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 990/// 991/// \param __a 992/// A 128-bit vector of [2 x double]. The lower double-precision value is 993/// compared to the lower double-precision value of \a __b. 994/// \param __b 995/// A 128-bit vector of [2 x double]. The lower double-precision value is 996/// compared to the lower double-precision value of \a __a. 997/// \returns An integer containing the comparison results. 998static __inline__ int __DEFAULT_FN_ATTRS 999_mm_comilt_sd(__m128d __a, __m128d __b) 1000{ 1001 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1002} 1003 1004/// \brief Compares the lower double-precision floating-point values in each of 1005/// the two 128-bit floating-point vectors of [2 x double] to determine if 1006/// the value in the first parameter is less than or equal to the 1007/// corresponding value in the second parameter. The comparison yields 0 for 1008/// false, 1 for true. 1009/// 1010/// \headerfile <x86intrin.h> 1011/// 1012/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1013/// 1014/// \param __a 1015/// A 128-bit vector of [2 x double]. The lower double-precision value is 1016/// compared to the lower double-precision value of \a __b. 1017/// \param __b 1018/// A 128-bit vector of [2 x double]. The lower double-precision value is 1019/// compared to the lower double-precision value of \a __a. 
1020/// \returns An integer containing the comparison results. 1021static __inline__ int __DEFAULT_FN_ATTRS 1022_mm_comile_sd(__m128d __a, __m128d __b) 1023{ 1024 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1025} 1026 1027/// \brief Compares the lower double-precision floating-point values in each of 1028/// the two 128-bit floating-point vectors of [2 x double] to determine if 1029/// the value in the first parameter is greater than the corresponding value 1030/// in the second parameter. The comparison yields 0 for false, 1 for true. 1031/// 1032/// \headerfile <x86intrin.h> 1033/// 1034/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1035/// 1036/// \param __a 1037/// A 128-bit vector of [2 x double]. The lower double-precision value is 1038/// compared to the lower double-precision value of \a __b. 1039/// \param __b 1040/// A 128-bit vector of [2 x double]. The lower double-precision value is 1041/// compared to the lower double-precision value of \a __a. 1042/// \returns An integer containing the comparison results. 1043static __inline__ int __DEFAULT_FN_ATTRS 1044_mm_comigt_sd(__m128d __a, __m128d __b) 1045{ 1046 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1047} 1048 1049/// \brief Compares the lower double-precision floating-point values in each of 1050/// the two 128-bit floating-point vectors of [2 x double] to determine if 1051/// the value in the first parameter is greater than or equal to the 1052/// corresponding value in the second parameter. The comparison yields 0 for 1053/// false, 1 for true. 1054/// 1055/// \headerfile <x86intrin.h> 1056/// 1057/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1058/// 1059/// \param __a 1060/// A 128-bit vector of [2 x double]. The lower double-precision value is 1061/// compared to the lower double-precision value of \a __b. 1062/// \param __b 1063/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1064/// compared to the lower double-precision value of \a __a. 1065/// \returns An integer containing the comparison results. 1066static __inline__ int __DEFAULT_FN_ATTRS 1067_mm_comige_sd(__m128d __a, __m128d __b) 1068{ 1069 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1070} 1071 1072/// \brief Compares the lower double-precision floating-point values in each of 1073/// the two 128-bit floating-point vectors of [2 x double] to determine if 1074/// the value in the first parameter is unequal to the corresponding value in 1075/// the second parameter. The comparison yields 0 for false, 1 for true. 1076/// 1077/// \headerfile <x86intrin.h> 1078/// 1079/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1080/// 1081/// \param __a 1082/// A 128-bit vector of [2 x double]. The lower double-precision value is 1083/// compared to the lower double-precision value of \a __b. 1084/// \param __b 1085/// A 128-bit vector of [2 x double]. The lower double-precision value is 1086/// compared to the lower double-precision value of \a __a. 1087/// \returns An integer containing the comparison results. 1088static __inline__ int __DEFAULT_FN_ATTRS 1089_mm_comineq_sd(__m128d __a, __m128d __b) 1090{ 1091 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1092} 1093 1094/// \brief Compares the lower double-precision floating-point values in each of 1095/// the two 128-bit floating-point vectors of [2 x double] for equality. The 1096/// comparison yields 0 for false, 1 for true. If either of the two lower 1097/// double-precision values is NaN, 1 is returned. 1098/// 1099/// \headerfile <x86intrin.h> 1100/// 1101/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1102/// 1103/// \param __a 1104/// A 128-bit vector of [2 x double]. The lower double-precision value is 1105/// compared to the lower double-precision value of \a __b. 
1106/// \param __b 1107/// A 128-bit vector of [2 x double]. The lower double-precision value is 1108/// compared to the lower double-precision value of \a __a. 1109/// \returns An integer containing the comparison results. If either of the two 1110/// lower double-precision values is NaN, 1 is returned. 1111static __inline__ int __DEFAULT_FN_ATTRS 1112_mm_ucomieq_sd(__m128d __a, __m128d __b) 1113{ 1114 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1115} 1116 1117/// \brief Compares the lower double-precision floating-point values in each of 1118/// the two 128-bit floating-point vectors of [2 x double] to determine if 1119/// the value in the first parameter is less than the corresponding value in 1120/// the second parameter. The comparison yields 0 for false, 1 for true. If 1121/// either of the two lower double-precision values is NaN, 1 is returned. 1122/// 1123/// \headerfile <x86intrin.h> 1124/// 1125/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1126/// 1127/// \param __a 1128/// A 128-bit vector of [2 x double]. The lower double-precision value is 1129/// compared to the lower double-precision value of \a __b. 1130/// \param __b 1131/// A 128-bit vector of [2 x double]. The lower double-precision value is 1132/// compared to the lower double-precision value of \a __a. 1133/// \returns An integer containing the comparison results. If either of the two 1134/// lower double-precision values is NaN, 1 is returned. 1135static __inline__ int __DEFAULT_FN_ATTRS 1136_mm_ucomilt_sd(__m128d __a, __m128d __b) 1137{ 1138 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1139} 1140 1141/// \brief Compares the lower double-precision floating-point values in each of 1142/// the two 128-bit floating-point vectors of [2 x double] to determine if 1143/// the value in the first parameter is less than or equal to the 1144/// corresponding value in the second parameter. The comparison yields 0 for 1145/// false, 1 for true. 
If either of the two lower double-precision values is 1146/// NaN, 1 is returned. 1147/// 1148/// \headerfile <x86intrin.h> 1149/// 1150/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1151/// 1152/// \param __a 1153/// A 128-bit vector of [2 x double]. The lower double-precision value is 1154/// compared to the lower double-precision value of \a __b. 1155/// \param __b 1156/// A 128-bit vector of [2 x double]. The lower double-precision value is 1157/// compared to the lower double-precision value of \a __a. 1158/// \returns An integer containing the comparison results. If either of the two 1159/// lower double-precision values is NaN, 1 is returned. 1160static __inline__ int __DEFAULT_FN_ATTRS 1161_mm_ucomile_sd(__m128d __a, __m128d __b) 1162{ 1163 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1164} 1165 1166/// \brief Compares the lower double-precision floating-point values in each of 1167/// the two 128-bit floating-point vectors of [2 x double] to determine if 1168/// the value in the first parameter is greater than the corresponding value 1169/// in the second parameter. The comparison yields 0 for false, 1 for true. 1170/// If either of the two lower double-precision values is NaN, 0 is returned. 1171/// 1172/// \headerfile <x86intrin.h> 1173/// 1174/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1175/// 1176/// \param __a 1177/// A 128-bit vector of [2 x double]. The lower double-precision value is 1178/// compared to the lower double-precision value of \a __b. 1179/// \param __b 1180/// A 128-bit vector of [2 x double]. The lower double-precision value is 1181/// compared to the lower double-precision value of \a __a. 1182/// \returns An integer containing the comparison results. If either of the two 1183/// lower double-precision values is NaN, 0 is returned. 
1184static __inline__ int __DEFAULT_FN_ATTRS 1185_mm_ucomigt_sd(__m128d __a, __m128d __b) 1186{ 1187 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1188} 1189 1190/// \brief Compares the lower double-precision floating-point values in each of 1191/// the two 128-bit floating-point vectors of [2 x double] to determine if 1192/// the value in the first parameter is greater than or equal to the 1193/// corresponding value in the second parameter. The comparison yields 0 for 1194/// false, 1 for true. If either of the two lower double-precision values 1195/// is NaN, 0 is returned. 1196/// 1197/// \headerfile <x86intrin.h> 1198/// 1199/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1200/// 1201/// \param __a 1202/// A 128-bit vector of [2 x double]. The lower double-precision value is 1203/// compared to the lower double-precision value of \a __b. 1204/// \param __b 1205/// A 128-bit vector of [2 x double]. The lower double-precision value is 1206/// compared to the lower double-precision value of \a __a. 1207/// \returns An integer containing the comparison results. If either of the two 1208/// lower double-precision values is NaN, 0 is returned. 1209static __inline__ int __DEFAULT_FN_ATTRS 1210_mm_ucomige_sd(__m128d __a, __m128d __b) 1211{ 1212 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1213} 1214 1215/// \brief Compares the lower double-precision floating-point values in each of 1216/// the two 128-bit floating-point vectors of [2 x double] to determine if 1217/// the value in the first parameter is unequal to the corresponding value in 1218/// the second parameter. The comparison yields 0 for false, 1 for true. If 1219/// either of the two lower double-precision values is NaN, 0 is returned. 1220/// 1221/// \headerfile <x86intrin.h> 1222/// 1223/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1224/// 1225/// \param __a 1226/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1227/// compared to the lower double-precision value of \a __b. 1228/// \param __b 1229/// A 128-bit vector of [2 x double]. The lower double-precision value is 1230/// compared to the lower double-precision value of \a __a. 1231/// \returns An integer containing the comparison result. If either of the two 1232/// lower double-precision values is NaN, 0 is returned. 1233static __inline__ int __DEFAULT_FN_ATTRS 1234_mm_ucomineq_sd(__m128d __a, __m128d __b) 1235{ 1236 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1237} 1238 1239/// \brief Converts the two double-precision floating-point elements of a 1240/// 128-bit vector of [2 x double] into two single-precision floating-point 1241/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1242/// The upper 64 bits of the result vector are set to zero. 1243/// 1244/// \headerfile <x86intrin.h> 1245/// 1246/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1247/// 1248/// \param __a 1249/// A 128-bit vector of [2 x double]. 1250/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1251/// converted values. The upper 64 bits are set to zero. 1252static __inline__ __m128 __DEFAULT_FN_ATTRS 1253_mm_cvtpd_ps(__m128d __a) 1254{ 1255 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1256} 1257 1258/// \brief Converts the lower two single-precision floating-point elements of a 1259/// 128-bit vector of [4 x float] into two double-precision floating-point 1260/// values, returned in a 128-bit vector of [2 x double]. The upper two 1261/// elements of the input vector are unused. 1262/// 1263/// \headerfile <x86intrin.h> 1264/// 1265/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1266/// 1267/// \param __a 1268/// A 128-bit vector of [4 x float]. The lower two single-precision 1269/// floating-point elements are converted to double-precision values. 
The 1270/// upper two elements are unused. 1271/// \returns A 128-bit vector of [2 x double] containing the converted values. 1272static __inline__ __m128d __DEFAULT_FN_ATTRS 1273_mm_cvtps_pd(__m128 __a) 1274{ 1275 return (__m128d) __builtin_convertvector( 1276 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1277} 1278 1279/// \brief Converts the lower two integer elements of a 128-bit vector of 1280/// [4 x i32] into two double-precision floating-point values, returned in a 1281/// 128-bit vector of [2 x double]. The upper two elements of the input 1282/// vector are unused. 1283/// 1284/// \headerfile <x86intrin.h> 1285/// 1286/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1287/// 1288/// \param __a 1289/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1290/// converted to double-precision values. The upper two elements are unused. 1291/// \returns A 128-bit vector of [2 x double] containing the converted values. 1292static __inline__ __m128d __DEFAULT_FN_ATTRS 1293_mm_cvtepi32_pd(__m128i __a) 1294{ 1295 return (__m128d) __builtin_convertvector( 1296 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1297} 1298 1299/// \brief Converts the two double-precision floating-point elements of a 1300/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1301/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1302/// 64 bits of the result vector are set to zero. 1303/// 1304/// \headerfile <x86intrin.h> 1305/// 1306/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1307/// 1308/// \param __a 1309/// A 128-bit vector of [2 x double]. 1310/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1311/// converted values. The upper 64 bits are set to zero. 
1312static __inline__ __m128i __DEFAULT_FN_ATTRS 1313_mm_cvtpd_epi32(__m128d __a) 1314{ 1315 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1316} 1317 1318/// \brief Converts the low-order element of a 128-bit vector of [2 x double] 1319/// into a 32-bit signed integer value. 1320/// 1321/// \headerfile <x86intrin.h> 1322/// 1323/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1324/// 1325/// \param __a 1326/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1327/// conversion. 1328/// \returns A 32-bit signed integer containing the converted value. 1329static __inline__ int __DEFAULT_FN_ATTRS 1330_mm_cvtsd_si32(__m128d __a) 1331{ 1332 return __builtin_ia32_cvtsd2si((__v2df)__a); 1333} 1334 1335/// \brief Converts the lower double-precision floating-point element of a 1336/// 128-bit vector of [2 x double], in the second parameter, into a 1337/// single-precision floating-point value, returned in the lower 32 bits of a 1338/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1339/// copied from the upper 96 bits of the first parameter. 1340/// 1341/// \headerfile <x86intrin.h> 1342/// 1343/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1344/// 1345/// \param __a 1346/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1347/// copied to the upper 96 bits of the result. 1348/// \param __b 1349/// A 128-bit vector of [2 x double]. The lower double-precision 1350/// floating-point element is used in the conversion. 1351/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1352/// converted value from the second parameter. The upper 96 bits are copied 1353/// from the upper 96 bits of the first parameter. 
1354static __inline__ __m128 __DEFAULT_FN_ATTRS 1355_mm_cvtsd_ss(__m128 __a, __m128d __b) 1356{ 1357 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1358} 1359 1360/// \brief Converts a 32-bit signed integer value, in the second parameter, into 1361/// a double-precision floating-point value, returned in the lower 64 bits of 1362/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1363/// are copied from the upper 64 bits of the first parameter. 1364/// 1365/// \headerfile <x86intrin.h> 1366/// 1367/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1368/// 1369/// \param __a 1370/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1371/// copied to the upper 64 bits of the result. 1372/// \param __b 1373/// A 32-bit signed integer containing the value to be converted. 1374/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1375/// converted value from the second parameter. The upper 64 bits are copied 1376/// from the upper 64 bits of the first parameter. 1377static __inline__ __m128d __DEFAULT_FN_ATTRS 1378_mm_cvtsi32_sd(__m128d __a, int __b) 1379{ 1380 __a[0] = __b; 1381 return __a; 1382} 1383 1384/// \brief Converts the lower single-precision floating-point element of a 1385/// 128-bit vector of [4 x float], in the second parameter, into a 1386/// double-precision floating-point value, returned in the lower 64 bits of 1387/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1388/// are copied from the upper 64 bits of the first parameter. 1389/// 1390/// \headerfile <x86intrin.h> 1391/// 1392/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1393/// 1394/// \param __a 1395/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1396/// copied to the upper 64 bits of the result. 1397/// \param __b 1398/// A 128-bit vector of [4 x float]. 
The lower single-precision 1399/// floating-point element is used in the conversion. 1400/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1401/// converted value from the second parameter. The upper 64 bits are copied 1402/// from the upper 64 bits of the first parameter. 1403static __inline__ __m128d __DEFAULT_FN_ATTRS 1404_mm_cvtss_sd(__m128d __a, __m128 __b) 1405{ 1406 __a[0] = __b[0]; 1407 return __a; 1408} 1409 1410/// \brief Converts the two double-precision floating-point elements of a 1411/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1412/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the 1413/// result of either conversion is inexact, the result is truncated (rounded 1414/// towards zero) regardless of the current MXCSR setting. The upper 64 bits 1415/// of the result vector are set to zero. 1416/// 1417/// \headerfile <x86intrin.h> 1418/// 1419/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1420/// instruction. 1421/// 1422/// \param __a 1423/// A 128-bit vector of [2 x double]. 1424/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1425/// converted values. The upper 64 bits are set to zero. 1426static __inline__ __m128i __DEFAULT_FN_ATTRS 1427_mm_cvttpd_epi32(__m128d __a) 1428{ 1429 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1430} 1431 1432/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit 1433/// signed integer value, truncating the result when it is inexact. 1434/// 1435/// \headerfile <x86intrin.h> 1436/// 1437/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1438/// instruction. 1439/// 1440/// \param __a 1441/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1442/// conversion. 1443/// \returns A 32-bit signed integer containing the converted value. 
1444static __inline__ int __DEFAULT_FN_ATTRS 1445_mm_cvttsd_si32(__m128d __a) 1446{ 1447 return __builtin_ia32_cvttsd2si((__v2df)__a); 1448} 1449 1450/// \brief Converts the two double-precision floating-point elements of a 1451/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1452/// returned in a 64-bit vector of [2 x i32]. 1453/// 1454/// \headerfile <x86intrin.h> 1455/// 1456/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1457/// 1458/// \param __a 1459/// A 128-bit vector of [2 x double]. 1460/// \returns A 64-bit vector of [2 x i32] containing the converted values. 1461static __inline__ __m64 __DEFAULT_FN_ATTRS 1462_mm_cvtpd_pi32(__m128d __a) 1463{ 1464 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1465} 1466 1467/// \brief Converts the two double-precision floating-point elements of a 1468/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1469/// returned in a 64-bit vector of [2 x i32]. If the result of either 1470/// conversion is inexact, the result is truncated (rounded towards zero) 1471/// regardless of the current MXCSR setting. 1472/// 1473/// \headerfile <x86intrin.h> 1474/// 1475/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1476/// 1477/// \param __a 1478/// A 128-bit vector of [2 x double]. 1479/// \returns A 64-bit vector of [2 x i32] containing the converted values. 1480static __inline__ __m64 __DEFAULT_FN_ATTRS 1481_mm_cvttpd_pi32(__m128d __a) 1482{ 1483 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1484} 1485 1486/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of 1487/// [2 x i32] into two double-precision floating-point values, returned in a 1488/// 128-bit vector of [2 x double]. 1489/// 1490/// \headerfile <x86intrin.h> 1491/// 1492/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1493/// 1494/// \param __a 1495/// A 64-bit vector of [2 x i32]. 
1496/// \returns A 128-bit vector of [2 x double] containing the converted values. 1497static __inline__ __m128d __DEFAULT_FN_ATTRS 1498_mm_cvtpi32_pd(__m64 __a) 1499{ 1500 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1501} 1502 1503/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as 1504/// a double-precision floating-point value. 1505/// 1506/// \headerfile <x86intrin.h> 1507/// 1508/// This intrinsic has no corresponding instruction. 1509/// 1510/// \param __a 1511/// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1512/// \returns A double-precision floating-point value copied from the lower 64 1513/// bits of \a __a. 1514static __inline__ double __DEFAULT_FN_ATTRS 1515_mm_cvtsd_f64(__m128d __a) 1516{ 1517 return __a[0]; 1518} 1519 1520/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned 1521/// memory location. 1522/// 1523/// \headerfile <x86intrin.h> 1524/// 1525/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1526/// 1527/// \param __dp 1528/// A pointer to a 128-bit memory location. The address of the memory 1529/// location has to be 16-byte aligned. 1530/// \returns A 128-bit vector of [2 x double] containing the loaded values. 1531static __inline__ __m128d __DEFAULT_FN_ATTRS 1532_mm_load_pd(double const *__dp) 1533{ 1534 return *(__m128d*)__dp; 1535} 1536 1537/// \brief Loads a double-precision floating-point value from a specified memory 1538/// location and duplicates it to both vector elements of a 128-bit vector of 1539/// [2 x double]. 1540/// 1541/// \headerfile <x86intrin.h> 1542/// 1543/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1544/// 1545/// \param __dp 1546/// A pointer to a memory location containing a double-precision value. 1547/// \returns A 128-bit vector of [2 x double] containing the loaded and 1548/// duplicated values. 
1549static __inline__ __m128d __DEFAULT_FN_ATTRS 1550_mm_load1_pd(double const *__dp) 1551{ 1552 struct __mm_load1_pd_struct { 1553 double __u; 1554 } __attribute__((__packed__, __may_alias__)); 1555 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 1556 return (__m128d){ __u, __u }; 1557} 1558 1559#define _mm_load_pd1(dp) _mm_load1_pd(dp) 1560 1561/// \brief Loads two double-precision values, in reverse order, from an aligned 1562/// memory location into a 128-bit vector of [2 x double]. 1563/// 1564/// \headerfile <x86intrin.h> 1565/// 1566/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1567/// needed shuffling instructions. In AVX mode, the shuffling may be combined 1568/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1569/// 1570/// \param __dp 1571/// A 16-byte aligned pointer to an array of double-precision values to be 1572/// loaded in reverse order. 1573/// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1574/// values. 1575static __inline__ __m128d __DEFAULT_FN_ATTRS 1576_mm_loadr_pd(double const *__dp) 1577{ 1578 __m128d __u = *(__m128d*)__dp; 1579 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1580} 1581 1582/// \brief Loads a 128-bit floating-point vector of [2 x double] from an 1583/// unaligned memory location. 1584/// 1585/// \headerfile <x86intrin.h> 1586/// 1587/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1588/// 1589/// \param __dp 1590/// A pointer to a 128-bit memory location. The address of the memory 1591/// location does not have to be aligned. 1592/// \returns A 128-bit vector of [2 x double] containing the loaded values. 
1593static __inline__ __m128d __DEFAULT_FN_ATTRS 1594_mm_loadu_pd(double const *__dp) 1595{ 1596 struct __loadu_pd { 1597 __m128d __v; 1598 } __attribute__((__packed__, __may_alias__)); 1599 return ((struct __loadu_pd*)__dp)->__v; 1600} 1601 1602/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer 1603/// vector and clears the upper element. 1604/// 1605/// \headerfile <x86intrin.h> 1606/// 1607/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1608/// 1609/// \param __dp 1610/// A pointer to a 64-bit memory location. The address of the memory 1611/// location does not have to be aligned. 1612/// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1613static __inline__ __m128i __DEFAULT_FN_ATTRS 1614_mm_loadu_si64(void const *__a) 1615{ 1616 struct __loadu_si64 { 1617 long long __v; 1618 } __attribute__((__packed__, __may_alias__)); 1619 long long __u = ((struct __loadu_si64*)__a)->__v; 1620 return (__m128i){__u, 0L}; 1621} 1622 1623/// \brief Loads a 64-bit double-precision value to the low element of a 1624/// 128-bit integer vector and clears the upper element. 1625/// 1626/// \headerfile <x86intrin.h> 1627/// 1628/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1629/// 1630/// \param __dp 1631/// An pointer to a memory location containing a double-precision value. 1632/// The address of the memory location does not have to be aligned. 1633/// \returns A 128-bit vector of [2 x double] containing the loaded value. 1634static __inline__ __m128d __DEFAULT_FN_ATTRS 1635_mm_load_sd(double const *__dp) 1636{ 1637 struct __mm_load_sd_struct { 1638 double __u; 1639 } __attribute__((__packed__, __may_alias__)); 1640 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 1641 return (__m128d){ __u, 0 }; 1642} 1643 1644/// \brief Loads a double-precision value into the high-order bits of a 128-bit 1645/// vector of [2 x double]. 
The low-order bits are copied from the low-order 1646/// bits of the first operand. 1647/// 1648/// \headerfile <x86intrin.h> 1649/// 1650/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1651/// 1652/// \param __a 1653/// A 128-bit vector of [2 x double]. \n 1654/// Bits [63:0] are written to bits [63:0] of the result. 1655/// \param __dp 1656/// A pointer to a 64-bit memory location containing a double-precision 1657/// floating-point value that is loaded. The loaded value is written to bits 1658/// [127:64] of the result. The address of the memory location does not have 1659/// to be aligned. 1660/// \returns A 128-bit vector of [2 x double] containing the moved values. 1661static __inline__ __m128d __DEFAULT_FN_ATTRS 1662_mm_loadh_pd(__m128d __a, double const *__dp) 1663{ 1664 struct __mm_loadh_pd_struct { 1665 double __u; 1666 } __attribute__((__packed__, __may_alias__)); 1667 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 1668 return (__m128d){ __a[0], __u }; 1669} 1670 1671/// \brief Loads a double-precision value into the low-order bits of a 128-bit 1672/// vector of [2 x double]. The high-order bits are copied from the 1673/// high-order bits of the first operand. 1674/// 1675/// \headerfile <x86intrin.h> 1676/// 1677/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1678/// 1679/// \param __a 1680/// A 128-bit vector of [2 x double]. \n 1681/// Bits [127:64] are written to bits [127:64] of the result. 1682/// \param __dp 1683/// A pointer to a 64-bit memory location containing a double-precision 1684/// floating-point value that is loaded. The loaded value is written to bits 1685/// [63:0] of the result. The address of the memory location does not have to 1686/// be aligned. 1687/// \returns A 128-bit vector of [2 x double] containing the moved values. 
1688static __inline__ __m128d __DEFAULT_FN_ATTRS 1689_mm_loadl_pd(__m128d __a, double const *__dp) 1690{ 1691 struct __mm_loadl_pd_struct { 1692 double __u; 1693 } __attribute__((__packed__, __may_alias__)); 1694 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 1695 return (__m128d){ __u, __a[1] }; 1696} 1697 1698/// \brief Constructs a 128-bit floating-point vector of [2 x double] with 1699/// unspecified content. This could be used as an argument to another 1700/// intrinsic function where the argument is required but the value is not 1701/// actually used. 1702/// 1703/// \headerfile <x86intrin.h> 1704/// 1705/// This intrinsic has no corresponding instruction. 1706/// 1707/// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1708/// content. 1709static __inline__ __m128d __DEFAULT_FN_ATTRS 1710_mm_undefined_pd(void) 1711{ 1712 return (__m128d)__builtin_ia32_undef128(); 1713} 1714 1715/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower 1716/// 64 bits of the vector are initialized with the specified double-precision 1717/// floating-point value. The upper 64 bits are set to zero. 1718/// 1719/// \headerfile <x86intrin.h> 1720/// 1721/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1722/// 1723/// \param __w 1724/// A double-precision floating-point value used to initialize the lower 64 1725/// bits of the result. 1726/// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1727/// lower 64 bits contain the value of the parameter. The upper 64 bits are 1728/// set to zero. 1729static __inline__ __m128d __DEFAULT_FN_ATTRS 1730_mm_set_sd(double __w) 1731{ 1732 return (__m128d){ __w, 0 }; 1733} 1734 1735/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each 1736/// of the two double-precision floating-point vector elements set to the 1737/// specified double-precision floating-point value. 
1738/// 1739/// \headerfile <x86intrin.h> 1740/// 1741/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1742/// 1743/// \param __w 1744/// A double-precision floating-point value used to initialize each vector 1745/// element of the result. 1746/// \returns An initialized 128-bit floating-point vector of [2 x double]. 1747static __inline__ __m128d __DEFAULT_FN_ATTRS 1748_mm_set1_pd(double __w) 1749{ 1750 return (__m128d){ __w, __w }; 1751} 1752 1753/// \brief Constructs a 128-bit floating-point vector of [2 x double] 1754/// initialized with the specified double-precision floating-point values. 1755/// 1756/// \headerfile <x86intrin.h> 1757/// 1758/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1759/// 1760/// \param __w 1761/// A double-precision floating-point value used to initialize the upper 64 1762/// bits of the result. 1763/// \param __x 1764/// A double-precision floating-point value used to initialize the lower 64 1765/// bits of the result. 1766/// \returns An initialized 128-bit floating-point vector of [2 x double]. 1767static __inline__ __m128d __DEFAULT_FN_ATTRS 1768_mm_set_pd(double __w, double __x) 1769{ 1770 return (__m128d){ __x, __w }; 1771} 1772 1773/// \brief Constructs a 128-bit floating-point vector of [2 x double], 1774/// initialized in reverse order with the specified double-precision 1775/// floating-point values. 1776/// 1777/// \headerfile <x86intrin.h> 1778/// 1779/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1780/// 1781/// \param __w 1782/// A double-precision floating-point value used to initialize the lower 64 1783/// bits of the result. 1784/// \param __x 1785/// A double-precision floating-point value used to initialize the upper 64 1786/// bits of the result. 1787/// \returns An initialized 128-bit floating-point vector of [2 x double]. 
1788static __inline__ __m128d __DEFAULT_FN_ATTRS 1789_mm_setr_pd(double __w, double __x) 1790{ 1791 return (__m128d){ __w, __x }; 1792} 1793 1794/// \brief Constructs a 128-bit floating-point vector of [2 x double] 1795/// initialized to zero. 1796/// 1797/// \headerfile <x86intrin.h> 1798/// 1799/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1800/// 1801/// \returns An initialized 128-bit floating-point vector of [2 x double] with 1802/// all elements set to zero. 1803static __inline__ __m128d __DEFAULT_FN_ATTRS 1804_mm_setzero_pd(void) 1805{ 1806 return (__m128d){ 0, 0 }; 1807} 1808 1809/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower 1810/// 64 bits are set to the lower 64 bits of the second parameter. The upper 1811/// 64 bits are set to the upper 64 bits of the first parameter. 1812/// 1813/// \headerfile <x86intrin.h> 1814/// 1815/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1816/// 1817/// \param __a 1818/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1819/// upper 64 bits of the result. 1820/// \param __b 1821/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1822/// lower 64 bits of the result. 1823/// \returns A 128-bit vector of [2 x double] containing the moved values. 1824static __inline__ __m128d __DEFAULT_FN_ATTRS 1825_mm_move_sd(__m128d __a, __m128d __b) 1826{ 1827 return (__m128d){ __b[0], __a[1] }; 1828} 1829 1830/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1831/// memory location. 1832/// 1833/// \headerfile <x86intrin.h> 1834/// 1835/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1836/// 1837/// \param __dp 1838/// A pointer to a 64-bit memory location. 1839/// \param __a 1840/// A 128-bit vector of [2 x double] containing the value to be stored. 
1841static __inline__ void __DEFAULT_FN_ATTRS 1842_mm_store_sd(double *__dp, __m128d __a) 1843{ 1844 struct __mm_store_sd_struct { 1845 double __u; 1846 } __attribute__((__packed__, __may_alias__)); 1847 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 1848} 1849 1850static __inline__ void __DEFAULT_FN_ATTRS 1851_mm_store_pd(double *__dp, __m128d __a) 1852{ 1853 *(__m128d*)__dp = __a; 1854} 1855 1856static __inline__ void __DEFAULT_FN_ATTRS 1857_mm_store1_pd(double *__dp, __m128d __a) 1858{ 1859 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1860 _mm_store_pd(__dp, __a); 1861} 1862 1863/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory 1864/// location. 1865/// 1866/// \headerfile <x86intrin.h> 1867/// 1868/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1869/// 1870/// \param __dp 1871/// A pointer to a 128-bit memory location. The address of the memory 1872/// location has to be 16-byte aligned. 1873/// \param __a 1874/// A 128-bit vector of [2 x double] containing the values to be stored. 1875static __inline__ void __DEFAULT_FN_ATTRS 1876_mm_store_pd1(double *__dp, __m128d __a) 1877{ 1878 return _mm_store1_pd(__dp, __a); 1879} 1880 1881/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory 1882/// location. 1883/// 1884/// \headerfile <x86intrin.h> 1885/// 1886/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1887/// 1888/// \param __dp 1889/// A pointer to a 128-bit memory location. The address of the memory 1890/// location does not have to be aligned. 1891/// \param __a 1892/// A 128-bit vector of [2 x double] containing the values to be stored. 
1893static __inline__ void __DEFAULT_FN_ATTRS 1894_mm_storeu_pd(double *__dp, __m128d __a) 1895{ 1896 struct __storeu_pd { 1897 __m128d __v; 1898 } __attribute__((__packed__, __may_alias__)); 1899 ((struct __storeu_pd*)__dp)->__v = __a; 1900} 1901 1902/// \brief Stores two double-precision values, in reverse order, from a 128-bit 1903/// vector of [2 x double] to a 16-byte aligned memory location. 1904/// 1905/// \headerfile <x86intrin.h> 1906/// 1907/// This intrinsic corresponds to a shuffling instruction followed by a 1908/// <c> VMOVAPD / MOVAPD </c> instruction. 1909/// 1910/// \param __dp 1911/// A pointer to a 16-byte aligned memory location that can store two 1912/// double-precision values. 1913/// \param __a 1914/// A 128-bit vector of [2 x double] containing the values to be reversed and 1915/// stored. 1916static __inline__ void __DEFAULT_FN_ATTRS 1917_mm_storer_pd(double *__dp, __m128d __a) 1918{ 1919 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 1920 *(__m128d *)__dp = __a; 1921} 1922 1923/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 1924/// memory location. 1925/// 1926/// \headerfile <x86intrin.h> 1927/// 1928/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1929/// 1930/// \param __dp 1931/// A pointer to a 64-bit memory location. 1932/// \param __a 1933/// A 128-bit vector of [2 x double] containing the value to be stored. 1934static __inline__ void __DEFAULT_FN_ATTRS 1935_mm_storeh_pd(double *__dp, __m128d __a) 1936{ 1937 struct __mm_storeh_pd_struct { 1938 double __u; 1939 } __attribute__((__packed__, __may_alias__)); 1940 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 1941} 1942 1943/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1944/// memory location. 1945/// 1946/// \headerfile <x86intrin.h> 1947/// 1948/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 
1949/// 1950/// \param __dp 1951/// A pointer to a 64-bit memory location. 1952/// \param __a 1953/// A 128-bit vector of [2 x double] containing the value to be stored. 1954static __inline__ void __DEFAULT_FN_ATTRS 1955_mm_storel_pd(double *__dp, __m128d __a) 1956{ 1957 struct __mm_storeh_pd_struct { 1958 double __u; 1959 } __attribute__((__packed__, __may_alias__)); 1960 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 1961} 1962 1963/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8], 1964/// saving the lower 8 bits of each sum in the corresponding element of a 1965/// 128-bit result vector of [16 x i8]. The integer elements of both 1966/// parameters can be either signed or unsigned. 1967/// 1968/// \headerfile <x86intrin.h> 1969/// 1970/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 1971/// 1972/// \param __a 1973/// A 128-bit vector of [16 x i8]. 1974/// \param __b 1975/// A 128-bit vector of [16 x i8]. 1976/// \returns A 128-bit vector of [16 x i8] containing the sums of both 1977/// parameters. 1978static __inline__ __m128i __DEFAULT_FN_ATTRS 1979_mm_add_epi8(__m128i __a, __m128i __b) 1980{ 1981 return (__m128i)((__v16qu)__a + (__v16qu)__b); 1982} 1983 1984/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16], 1985/// saving the lower 16 bits of each sum in the corresponding element of a 1986/// 128-bit result vector of [8 x i16]. The integer elements of both 1987/// parameters can be either signed or unsigned. 1988/// 1989/// \headerfile <x86intrin.h> 1990/// 1991/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 1992/// 1993/// \param __a 1994/// A 128-bit vector of [8 x i16]. 1995/// \param __b 1996/// A 128-bit vector of [8 x i16]. 1997/// \returns A 128-bit vector of [8 x i16] containing the sums of both 1998/// parameters. 
1999static __inline__ __m128i __DEFAULT_FN_ATTRS 2000_mm_add_epi16(__m128i __a, __m128i __b) 2001{ 2002 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2003} 2004 2005/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2006/// saving the lower 32 bits of each sum in the corresponding element of a 2007/// 128-bit result vector of [4 x i32]. The integer elements of both 2008/// parameters can be either signed or unsigned. 2009/// 2010/// \headerfile <x86intrin.h> 2011/// 2012/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2013/// 2014/// \param __a 2015/// A 128-bit vector of [4 x i32]. 2016/// \param __b 2017/// A 128-bit vector of [4 x i32]. 2018/// \returns A 128-bit vector of [4 x i32] containing the sums of both 2019/// parameters. 2020static __inline__ __m128i __DEFAULT_FN_ATTRS 2021_mm_add_epi32(__m128i __a, __m128i __b) 2022{ 2023 return (__m128i)((__v4su)__a + (__v4su)__b); 2024} 2025 2026/// \brief Adds two signed or unsigned 64-bit integer values, returning the 2027/// lower 64 bits of the sum. 2028/// 2029/// \headerfile <x86intrin.h> 2030/// 2031/// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2032/// 2033/// \param __a 2034/// A 64-bit integer. 2035/// \param __b 2036/// A 64-bit integer. 2037/// \returns A 64-bit integer containing the sum of both parameters. 2038static __inline__ __m64 __DEFAULT_FN_ATTRS 2039_mm_add_si64(__m64 __a, __m64 __b) 2040{ 2041 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2042} 2043 2044/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2045/// saving the lower 64 bits of each sum in the corresponding element of a 2046/// 128-bit result vector of [2 x i64]. The integer elements of both 2047/// parameters can be either signed or unsigned. 2048/// 2049/// \headerfile <x86intrin.h> 2050/// 2051/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 
2052/// 2053/// \param __a 2054/// A 128-bit vector of [2 x i64]. 2055/// \param __b 2056/// A 128-bit vector of [2 x i64]. 2057/// \returns A 128-bit vector of [2 x i64] containing the sums of both 2058/// parameters. 2059static __inline__ __m128i __DEFAULT_FN_ATTRS 2060_mm_add_epi64(__m128i __a, __m128i __b) 2061{ 2062 return (__m128i)((__v2du)__a + (__v2du)__b); 2063} 2064 2065/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2066/// signed [16 x i8] vectors, saving each sum in the corresponding element of 2067/// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are 2068/// saturated to 7Fh. Negative sums less than 80h are saturated to 80h. 2069/// 2070/// \headerfile <x86intrin.h> 2071/// 2072/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2073/// 2074/// \param __a 2075/// A 128-bit signed [16 x i8] vector. 2076/// \param __b 2077/// A 128-bit signed [16 x i8] vector. 2078/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2079/// both parameters. 2080static __inline__ __m128i __DEFAULT_FN_ATTRS 2081_mm_adds_epi8(__m128i __a, __m128i __b) 2082{ 2083 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 2084} 2085 2086/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2087/// signed [8 x i16] vectors, saving each sum in the corresponding element of 2088/// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh 2089/// are saturated to 7FFFh. Negative sums less than 8000h are saturated to 2090/// 8000h. 2091/// 2092/// \headerfile <x86intrin.h> 2093/// 2094/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2095/// 2096/// \param __a 2097/// A 128-bit signed [8 x i16] vector. 2098/// \param __b 2099/// A 128-bit signed [8 x i16] vector. 2100/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2101/// both parameters. 
2102static __inline__ __m128i __DEFAULT_FN_ATTRS 2103_mm_adds_epi16(__m128i __a, __m128i __b) 2104{ 2105 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 2106} 2107 2108/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2109/// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2110/// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh 2111/// are saturated to FFh. Negative sums are saturated to 00h. 2112/// 2113/// \headerfile <x86intrin.h> 2114/// 2115/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2116/// 2117/// \param __a 2118/// A 128-bit unsigned [16 x i8] vector. 2119/// \param __b 2120/// A 128-bit unsigned [16 x i8] vector. 2121/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2122/// of both parameters. 2123static __inline__ __m128i __DEFAULT_FN_ATTRS 2124_mm_adds_epu8(__m128i __a, __m128i __b) 2125{ 2126 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 2127} 2128 2129/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2130/// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2131/// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh 2132/// are saturated to FFFFh. Negative sums are saturated to 0000h. 2133/// 2134/// \headerfile <x86intrin.h> 2135/// 2136/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2137/// 2138/// \param __a 2139/// A 128-bit unsigned [8 x i16] vector. 2140/// \param __b 2141/// A 128-bit unsigned [8 x i16] vector. 2142/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2143/// of both parameters. 
2144static __inline__ __m128i __DEFAULT_FN_ATTRS 2145_mm_adds_epu16(__m128i __a, __m128i __b) 2146{ 2147 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 2148} 2149 2150/// \brief Computes the rounded avarages of corresponding elements of two 2151/// 128-bit unsigned [16 x i8] vectors, saving each result in the 2152/// corresponding element of a 128-bit result vector of [16 x i8]. 2153/// 2154/// \headerfile <x86intrin.h> 2155/// 2156/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2157/// 2158/// \param __a 2159/// A 128-bit unsigned [16 x i8] vector. 2160/// \param __b 2161/// A 128-bit unsigned [16 x i8] vector. 2162/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2163/// averages of both parameters. 2164static __inline__ __m128i __DEFAULT_FN_ATTRS 2165_mm_avg_epu8(__m128i __a, __m128i __b) 2166{ 2167 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2168} 2169 2170/// \brief Computes the rounded avarages of corresponding elements of two 2171/// 128-bit unsigned [8 x i16] vectors, saving each result in the 2172/// corresponding element of a 128-bit result vector of [8 x i16]. 2173/// 2174/// \headerfile <x86intrin.h> 2175/// 2176/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2177/// 2178/// \param __a 2179/// A 128-bit unsigned [8 x i16] vector. 2180/// \param __b 2181/// A 128-bit unsigned [8 x i16] vector. 2182/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2183/// averages of both parameters. 
2184static __inline__ __m128i __DEFAULT_FN_ATTRS 2185_mm_avg_epu16(__m128i __a, __m128i __b) 2186{ 2187 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2188} 2189 2190/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2191/// vectors, producing eight intermediate 32-bit signed integer products, and 2192/// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2193/// [4 x i32] vector. For example, bits [15:0] of both parameters are 2194/// multiplied producing a 32-bit product, bits [31:16] of both parameters 2195/// are multiplied producing a 32-bit product, and the sum of those two 2196/// products becomes bits [31:0] of the result. 2197/// 2198/// \headerfile <x86intrin.h> 2199/// 2200/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2201/// 2202/// \param __a 2203/// A 128-bit signed [8 x i16] vector. 2204/// \param __b 2205/// A 128-bit signed [8 x i16] vector. 2206/// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2207/// of both parameters. 2208static __inline__ __m128i __DEFAULT_FN_ATTRS 2209_mm_madd_epi16(__m128i __a, __m128i __b) 2210{ 2211 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2212} 2213 2214/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] 2215/// vectors, saving the greater value from each comparison in the 2216/// corresponding element of a 128-bit result vector of [8 x i16]. 2217/// 2218/// \headerfile <x86intrin.h> 2219/// 2220/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2221/// 2222/// \param __a 2223/// A 128-bit signed [8 x i16] vector. 2224/// \param __b 2225/// A 128-bit signed [8 x i16] vector. 2226/// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2227/// each comparison. 
2228static __inline__ __m128i __DEFAULT_FN_ATTRS 2229_mm_max_epi16(__m128i __a, __m128i __b) 2230{ 2231 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 2232} 2233 2234/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] 2235/// vectors, saving the greater value from each comparison in the 2236/// corresponding element of a 128-bit result vector of [16 x i8]. 2237/// 2238/// \headerfile <x86intrin.h> 2239/// 2240/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2241/// 2242/// \param __a 2243/// A 128-bit unsigned [16 x i8] vector. 2244/// \param __b 2245/// A 128-bit unsigned [16 x i8] vector. 2246/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2247/// each comparison. 2248static __inline__ __m128i __DEFAULT_FN_ATTRS 2249_mm_max_epu8(__m128i __a, __m128i __b) 2250{ 2251 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 2252} 2253 2254/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] 2255/// vectors, saving the smaller value from each comparison in the 2256/// corresponding element of a 128-bit result vector of [8 x i16]. 2257/// 2258/// \headerfile <x86intrin.h> 2259/// 2260/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2261/// 2262/// \param __a 2263/// A 128-bit signed [8 x i16] vector. 2264/// \param __b 2265/// A 128-bit signed [8 x i16] vector. 2266/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2267/// each comparison. 2268static __inline__ __m128i __DEFAULT_FN_ATTRS 2269_mm_min_epi16(__m128i __a, __m128i __b) 2270{ 2271 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 2272} 2273 2274/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] 2275/// vectors, saving the smaller value from each comparison in the 2276/// corresponding element of a 128-bit result vector of [16 x i8]. 
2277/// 2278/// \headerfile <x86intrin.h> 2279/// 2280/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2281/// 2282/// \param __a 2283/// A 128-bit unsigned [16 x i8] vector. 2284/// \param __b 2285/// A 128-bit unsigned [16 x i8] vector. 2286/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2287/// each comparison. 2288static __inline__ __m128i __DEFAULT_FN_ATTRS 2289_mm_min_epu8(__m128i __a, __m128i __b) 2290{ 2291 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 2292} 2293 2294/// \brief Multiplies the corresponding elements of two signed [8 x i16] 2295/// vectors, saving the upper 16 bits of each 32-bit product in the 2296/// corresponding element of a 128-bit signed [8 x i16] result vector. 2297/// 2298/// \headerfile <x86intrin.h> 2299/// 2300/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2301/// 2302/// \param __a 2303/// A 128-bit signed [8 x i16] vector. 2304/// \param __b 2305/// A 128-bit signed [8 x i16] vector. 2306/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2307/// each of the eight 32-bit products. 2308static __inline__ __m128i __DEFAULT_FN_ATTRS 2309_mm_mulhi_epi16(__m128i __a, __m128i __b) 2310{ 2311 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2312} 2313 2314/// \brief Multiplies the corresponding elements of two unsigned [8 x i16] 2315/// vectors, saving the upper 16 bits of each 32-bit product in the 2316/// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2317/// 2318/// \headerfile <x86intrin.h> 2319/// 2320/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2321/// 2322/// \param __a 2323/// A 128-bit unsigned [8 x i16] vector. 2324/// \param __b 2325/// A 128-bit unsigned [8 x i16] vector. 2326/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2327/// of each of the eight 32-bit products. 
2328static __inline__ __m128i __DEFAULT_FN_ATTRS 2329_mm_mulhi_epu16(__m128i __a, __m128i __b) 2330{ 2331 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2332} 2333 2334/// \brief Multiplies the corresponding elements of two signed [8 x i16] 2335/// vectors, saving the lower 16 bits of each 32-bit product in the 2336/// corresponding element of a 128-bit signed [8 x i16] result vector. 2337/// 2338/// \headerfile <x86intrin.h> 2339/// 2340/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2341/// 2342/// \param __a 2343/// A 128-bit signed [8 x i16] vector. 2344/// \param __b 2345/// A 128-bit signed [8 x i16] vector. 2346/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2347/// each of the eight 32-bit products. 2348static __inline__ __m128i __DEFAULT_FN_ATTRS 2349_mm_mullo_epi16(__m128i __a, __m128i __b) 2350{ 2351 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2352} 2353 2354/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits 2355/// of the two 64-bit integer vectors and returns the 64-bit unsigned 2356/// product. 2357/// 2358/// \headerfile <x86intrin.h> 2359/// 2360/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2361/// 2362/// \param __a 2363/// A 64-bit integer containing one of the source operands. 2364/// \param __b 2365/// A 64-bit integer containing one of the source operands. 2366/// \returns A 64-bit integer vector containing the product of both operands. 2367static __inline__ __m64 __DEFAULT_FN_ATTRS 2368_mm_mul_su32(__m64 __a, __m64 __b) 2369{ 2370 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2371} 2372 2373/// \brief Multiplies 32-bit unsigned integer values contained in the lower 2374/// bits of the corresponding elements of two [2 x i64] vectors, and returns 2375/// the 64-bit products in the corresponding elements of a [2 x i64] vector. 
2376/// 2377/// \headerfile <x86intrin.h> 2378/// 2379/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2380/// 2381/// \param __a 2382/// A [2 x i64] vector containing one of the source operands. 2383/// \param __b 2384/// A [2 x i64] vector containing one of the source operands. 2385/// \returns A [2 x i64] vector containing the product of both operands. 2386static __inline__ __m128i __DEFAULT_FN_ATTRS 2387_mm_mul_epu32(__m128i __a, __m128i __b) 2388{ 2389 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2390} 2391 2392/// \brief Computes the absolute differences of corresponding 8-bit integer 2393/// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2394/// separately sums the second 8 absolute differences. Packs these two 2395/// unsigned 16-bit integer sums into the upper and lower elements of a 2396/// [2 x i64] vector. 2397/// 2398/// \headerfile <x86intrin.h> 2399/// 2400/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2401/// 2402/// \param __a 2403/// A 128-bit integer vector containing one of the source operands. 2404/// \param __b 2405/// A 128-bit integer vector containing one of the source operands. 2406/// \returns A [2 x i64] vector containing the sums of the sets of absolute 2407/// differences between both operands. 2408static __inline__ __m128i __DEFAULT_FN_ATTRS 2409_mm_sad_epu8(__m128i __a, __m128i __b) 2410{ 2411 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2412} 2413 2414/// \brief Subtracts the corresponding 8-bit integer values in the operands. 2415/// 2416/// \headerfile <x86intrin.h> 2417/// 2418/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2419/// 2420/// \param __a 2421/// A 128-bit integer vector containing the minuends. 2422/// \param __b 2423/// A 128-bit integer vector containing the subtrahends. 
2424/// \returns A 128-bit integer vector containing the differences of the values 2425/// in the operands. 2426static __inline__ __m128i __DEFAULT_FN_ATTRS 2427_mm_sub_epi8(__m128i __a, __m128i __b) 2428{ 2429 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2430} 2431 2432/// \brief Subtracts the corresponding 16-bit integer values in the operands. 2433/// 2434/// \headerfile <x86intrin.h> 2435/// 2436/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2437/// 2438/// \param __a 2439/// A 128-bit integer vector containing the minuends. 2440/// \param __b 2441/// A 128-bit integer vector containing the subtrahends. 2442/// \returns A 128-bit integer vector containing the differences of the values 2443/// in the operands. 2444static __inline__ __m128i __DEFAULT_FN_ATTRS 2445_mm_sub_epi16(__m128i __a, __m128i __b) 2446{ 2447 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2448} 2449 2450/// \brief Subtracts the corresponding 32-bit integer values in the operands. 2451/// 2452/// \headerfile <x86intrin.h> 2453/// 2454/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2455/// 2456/// \param __a 2457/// A 128-bit integer vector containing the minuends. 2458/// \param __b 2459/// A 128-bit integer vector containing the subtrahends. 2460/// \returns A 128-bit integer vector containing the differences of the values 2461/// in the operands. 2462static __inline__ __m128i __DEFAULT_FN_ATTRS 2463_mm_sub_epi32(__m128i __a, __m128i __b) 2464{ 2465 return (__m128i)((__v4su)__a - (__v4su)__b); 2466} 2467 2468/// \brief Subtracts signed or unsigned 64-bit integer values and writes the 2469/// difference to the corresponding bits in the destination. 2470/// 2471/// \headerfile <x86intrin.h> 2472/// 2473/// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2474/// 2475/// \param __a 2476/// A 64-bit integer vector containing the minuend. 2477/// \param __b 2478/// A 64-bit integer vector containing the subtrahend. 
2479/// \returns A 64-bit integer vector containing the difference of the values in 2480/// the operands. 2481static __inline__ __m64 __DEFAULT_FN_ATTRS 2482_mm_sub_si64(__m64 __a, __m64 __b) 2483{ 2484 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2485} 2486 2487/// \brief Subtracts the corresponding elements of two [2 x i64] vectors. 2488/// 2489/// \headerfile <x86intrin.h> 2490/// 2491/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2492/// 2493/// \param __a 2494/// A 128-bit integer vector containing the minuends. 2495/// \param __b 2496/// A 128-bit integer vector containing the subtrahends. 2497/// \returns A 128-bit integer vector containing the differences of the values 2498/// in the operands. 2499static __inline__ __m128i __DEFAULT_FN_ATTRS 2500_mm_sub_epi64(__m128i __a, __m128i __b) 2501{ 2502 return (__m128i)((__v2du)__a - (__v2du)__b); 2503} 2504 2505/// \brief Subtracts corresponding 8-bit signed integer values in the input and 2506/// returns the differences in the corresponding bytes in the destination. 2507/// Differences greater than 7Fh are saturated to 7Fh, and differences less 2508/// than 80h are saturated to 80h. 2509/// 2510/// \headerfile <x86intrin.h> 2511/// 2512/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 2513/// 2514/// \param __a 2515/// A 128-bit integer vector containing the minuends. 2516/// \param __b 2517/// A 128-bit integer vector containing the subtrahends. 2518/// \returns A 128-bit integer vector containing the differences of the values 2519/// in the operands. 2520static __inline__ __m128i __DEFAULT_FN_ATTRS 2521_mm_subs_epi8(__m128i __a, __m128i __b) 2522{ 2523 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 2524} 2525 2526/// \brief Subtracts corresponding 16-bit signed integer values in the input and 2527/// returns the differences in the corresponding bytes in the destination. 
2528/// Differences greater than 7FFFh are saturated to 7FFFh, and values less 2529/// than 8000h are saturated to 8000h. 2530/// 2531/// \headerfile <x86intrin.h> 2532/// 2533/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2534/// 2535/// \param __a 2536/// A 128-bit integer vector containing the minuends. 2537/// \param __b 2538/// A 128-bit integer vector containing the subtrahends. 2539/// \returns A 128-bit integer vector containing the differences of the values 2540/// in the operands. 2541static __inline__ __m128i __DEFAULT_FN_ATTRS 2542_mm_subs_epi16(__m128i __a, __m128i __b) 2543{ 2544 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 2545} 2546 2547/// \brief Subtracts corresponding 8-bit unsigned integer values in the input 2548/// and returns the differences in the corresponding bytes in the 2549/// destination. Differences less than 00h are saturated to 00h. 2550/// 2551/// \headerfile <x86intrin.h> 2552/// 2553/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2554/// 2555/// \param __a 2556/// A 128-bit integer vector containing the minuends. 2557/// \param __b 2558/// A 128-bit integer vector containing the subtrahends. 2559/// \returns A 128-bit integer vector containing the unsigned integer 2560/// differences of the values in the operands. 2561static __inline__ __m128i __DEFAULT_FN_ATTRS 2562_mm_subs_epu8(__m128i __a, __m128i __b) 2563{ 2564 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 2565} 2566 2567/// \brief Subtracts corresponding 16-bit unsigned integer values in the input 2568/// and returns the differences in the corresponding bytes in the 2569/// destination. Differences less than 0000h are saturated to 0000h. 2570/// 2571/// \headerfile <x86intrin.h> 2572/// 2573/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2574/// 2575/// \param __a 2576/// A 128-bit integer vector containing the minuends. 
2577/// \param __b 2578/// A 128-bit integer vector containing the subtrahends. 2579/// \returns A 128-bit integer vector containing the unsigned integer 2580/// differences of the values in the operands. 2581static __inline__ __m128i __DEFAULT_FN_ATTRS 2582_mm_subs_epu16(__m128i __a, __m128i __b) 2583{ 2584 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 2585} 2586 2587/// \brief Performs a bitwise AND of two 128-bit integer vectors. 2588/// 2589/// \headerfile <x86intrin.h> 2590/// 2591/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2592/// 2593/// \param __a 2594/// A 128-bit integer vector containing one of the source operands. 2595/// \param __b 2596/// A 128-bit integer vector containing one of the source operands. 2597/// \returns A 128-bit integer vector containing the bitwise AND of the values 2598/// in both operands. 2599static __inline__ __m128i __DEFAULT_FN_ATTRS 2600_mm_and_si128(__m128i __a, __m128i __b) 2601{ 2602 return (__m128i)((__v2du)__a & (__v2du)__b); 2603} 2604 2605/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the 2606/// one's complement of the values contained in the first source operand. 2607/// 2608/// \headerfile <x86intrin.h> 2609/// 2610/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2611/// 2612/// \param __a 2613/// A 128-bit vector containing the left source operand. The one's complement 2614/// of this value is used in the bitwise AND. 2615/// \param __b 2616/// A 128-bit vector containing the right source operand. 2617/// \returns A 128-bit integer vector containing the bitwise AND of the one's 2618/// complement of the first operand and the values in the second operand. 2619static __inline__ __m128i __DEFAULT_FN_ATTRS 2620_mm_andnot_si128(__m128i __a, __m128i __b) 2621{ 2622 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2623} 2624/// \brief Performs a bitwise OR of two 128-bit integer vectors. 
2625/// 2626/// \headerfile <x86intrin.h> 2627/// 2628/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2629/// 2630/// \param __a 2631/// A 128-bit integer vector containing one of the source operands. 2632/// \param __b 2633/// A 128-bit integer vector containing one of the source operands. 2634/// \returns A 128-bit integer vector containing the bitwise OR of the values 2635/// in both operands. 2636static __inline__ __m128i __DEFAULT_FN_ATTRS 2637_mm_or_si128(__m128i __a, __m128i __b) 2638{ 2639 return (__m128i)((__v2du)__a | (__v2du)__b); 2640} 2641 2642/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. 2643/// 2644/// \headerfile <x86intrin.h> 2645/// 2646/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2647/// 2648/// \param __a 2649/// A 128-bit integer vector containing one of the source operands. 2650/// \param __b 2651/// A 128-bit integer vector containing one of the source operands. 2652/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2653/// values in both operands. 2654static __inline__ __m128i __DEFAULT_FN_ATTRS 2655_mm_xor_si128(__m128i __a, __m128i __b) 2656{ 2657 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2658} 2659 2660/// \brief Left-shifts the 128-bit integer vector operand by the specified 2661/// number of bytes. Low-order bits are cleared. 2662/// 2663/// \headerfile <x86intrin.h> 2664/// 2665/// \code 2666/// __m128i _mm_slli_si128(__m128i a, const int imm); 2667/// \endcode 2668/// 2669/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2670/// 2671/// \param a 2672/// A 128-bit integer vector containing the source operand. 2673/// \param imm 2674/// An immediate value specifying the number of bytes to left-shift operand 2675/// \a a. 2676/// \returns A 128-bit integer vector containing the left-shifted value. 
2677#define _mm_slli_si128(a, imm) __extension__ ({ \ 2678 (__m128i)__builtin_shufflevector( \ 2679 (__v16qi)_mm_setzero_si128(), \ 2680 (__v16qi)(__m128i)(a), \ 2681 ((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \ 2682 ((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \ 2683 ((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \ 2684 ((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \ 2685 ((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \ 2686 ((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \ 2687 ((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \ 2688 ((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \ 2689 ((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \ 2690 ((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \ 2691 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \ 2692 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \ 2693 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \ 2694 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \ 2695 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \ 2696 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); }) 2697 2698#define _mm_bslli_si128(a, imm) \ 2699 _mm_slli_si128((a), (imm)) 2700 2701/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 2702/// by the specified number of bits. Low-order bits are cleared. 2703/// 2704/// \headerfile <x86intrin.h> 2705/// 2706/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2707/// 2708/// \param __a 2709/// A 128-bit integer vector containing the source operand. 2710/// \param __count 2711/// An integer value specifying the number of bits to left-shift each value 2712/// in operand \a __a. 2713/// \returns A 128-bit integer vector containing the left-shifted values. 2714static __inline__ __m128i __DEFAULT_FN_ATTRS 2715_mm_slli_epi16(__m128i __a, int __count) 2716{ 2717 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2718} 2719 2720/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 2721/// by the specified number of bits. Low-order bits are cleared. 
2722/// 2723/// \headerfile <x86intrin.h> 2724/// 2725/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2726/// 2727/// \param __a 2728/// A 128-bit integer vector containing the source operand. 2729/// \param __count 2730/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2731/// to left-shift each value in operand \a __a. 2732/// \returns A 128-bit integer vector containing the left-shifted values. 2733static __inline__ __m128i __DEFAULT_FN_ATTRS 2734_mm_sll_epi16(__m128i __a, __m128i __count) 2735{ 2736 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2737} 2738 2739/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 2740/// by the specified number of bits. Low-order bits are cleared. 2741/// 2742/// \headerfile <x86intrin.h> 2743/// 2744/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2745/// 2746/// \param __a 2747/// A 128-bit integer vector containing the source operand. 2748/// \param __count 2749/// An integer value specifying the number of bits to left-shift each value 2750/// in operand \a __a. 2751/// \returns A 128-bit integer vector containing the left-shifted values. 2752static __inline__ __m128i __DEFAULT_FN_ATTRS 2753_mm_slli_epi32(__m128i __a, int __count) 2754{ 2755 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2756} 2757 2758/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 2759/// by the specified number of bits. Low-order bits are cleared. 2760/// 2761/// \headerfile <x86intrin.h> 2762/// 2763/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2764/// 2765/// \param __a 2766/// A 128-bit integer vector containing the source operand. 2767/// \param __count 2768/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2769/// to left-shift each value in operand \a __a. 
2770/// \returns A 128-bit integer vector containing the left-shifted values. 2771static __inline__ __m128i __DEFAULT_FN_ATTRS 2772_mm_sll_epi32(__m128i __a, __m128i __count) 2773{ 2774 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2775} 2776 2777/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 2778/// by the specified number of bits. Low-order bits are cleared. 2779/// 2780/// \headerfile <x86intrin.h> 2781/// 2782/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2783/// 2784/// \param __a 2785/// A 128-bit integer vector containing the source operand. 2786/// \param __count 2787/// An integer value specifying the number of bits to left-shift each value 2788/// in operand \a __a. 2789/// \returns A 128-bit integer vector containing the left-shifted values. 2790static __inline__ __m128i __DEFAULT_FN_ATTRS 2791_mm_slli_epi64(__m128i __a, int __count) 2792{ 2793 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2794} 2795 2796/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 2797/// by the specified number of bits. Low-order bits are cleared. 2798/// 2799/// \headerfile <x86intrin.h> 2800/// 2801/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2802/// 2803/// \param __a 2804/// A 128-bit integer vector containing the source operand. 2805/// \param __count 2806/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2807/// to left-shift each value in operand \a __a. 2808/// \returns A 128-bit integer vector containing the left-shifted values. 2809static __inline__ __m128i __DEFAULT_FN_ATTRS 2810_mm_sll_epi64(__m128i __a, __m128i __count) 2811{ 2812 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2813} 2814 2815/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 2816/// by the specified number of bits. 
High-order bits are filled with the sign 2817/// bit of the initial value. 2818/// 2819/// \headerfile <x86intrin.h> 2820/// 2821/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2822/// 2823/// \param __a 2824/// A 128-bit integer vector containing the source operand. 2825/// \param __count 2826/// An integer value specifying the number of bits to right-shift each value 2827/// in operand \a __a. 2828/// \returns A 128-bit integer vector containing the right-shifted values. 2829static __inline__ __m128i __DEFAULT_FN_ATTRS 2830_mm_srai_epi16(__m128i __a, int __count) 2831{ 2832 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2833} 2834 2835/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 2836/// by the specified number of bits. High-order bits are filled with the sign 2837/// bit of the initial value. 2838/// 2839/// \headerfile <x86intrin.h> 2840/// 2841/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2842/// 2843/// \param __a 2844/// A 128-bit integer vector containing the source operand. 2845/// \param __count 2846/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2847/// to right-shift each value in operand \a __a. 2848/// \returns A 128-bit integer vector containing the right-shifted values. 2849static __inline__ __m128i __DEFAULT_FN_ATTRS 2850_mm_sra_epi16(__m128i __a, __m128i __count) 2851{ 2852 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2853} 2854 2855/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 2856/// by the specified number of bits. High-order bits are filled with the sign 2857/// bit of the initial value. 2858/// 2859/// \headerfile <x86intrin.h> 2860/// 2861/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2862/// 2863/// \param __a 2864/// A 128-bit integer vector containing the source operand. 
2865/// \param __count 2866/// An integer value specifying the number of bits to right-shift each value 2867/// in operand \a __a. 2868/// \returns A 128-bit integer vector containing the right-shifted values. 2869static __inline__ __m128i __DEFAULT_FN_ATTRS 2870_mm_srai_epi32(__m128i __a, int __count) 2871{ 2872 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2873} 2874 2875/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 2876/// by the specified number of bits. High-order bits are filled with the sign 2877/// bit of the initial value. 2878/// 2879/// \headerfile <x86intrin.h> 2880/// 2881/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2882/// 2883/// \param __a 2884/// A 128-bit integer vector containing the source operand. 2885/// \param __count 2886/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2887/// to right-shift each value in operand \a __a. 2888/// \returns A 128-bit integer vector containing the right-shifted values. 2889static __inline__ __m128i __DEFAULT_FN_ATTRS 2890_mm_sra_epi32(__m128i __a, __m128i __count) 2891{ 2892 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 2893} 2894 2895/// \brief Right-shifts the 128-bit integer vector operand by the specified 2896/// number of bytes. High-order bits are cleared. 2897/// 2898/// \headerfile <x86intrin.h> 2899/// 2900/// \code 2901/// __m128i _mm_srli_si128(__m128i a, const int imm); 2902/// \endcode 2903/// 2904/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 2905/// 2906/// \param a 2907/// A 128-bit integer vector containing the source operand. 2908/// \param imm 2909/// An immediate value specifying the number of bytes to right-shift operand 2910/// \a a. 2911/// \returns A 128-bit integer vector containing the right-shifted value. 
2912#define _mm_srli_si128(a, imm) __extension__ ({ \ 2913 (__m128i)__builtin_shufflevector( \ 2914 (__v16qi)(__m128i)(a), \ 2915 (__v16qi)_mm_setzero_si128(), \ 2916 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \ 2917 ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \ 2918 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \ 2919 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \ 2920 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \ 2921 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \ 2922 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \ 2923 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \ 2924 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \ 2925 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \ 2926 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \ 2927 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \ 2928 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \ 2929 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \ 2930 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \ 2931 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); }) 2932 2933#define _mm_bsrli_si128(a, imm) \ 2934 _mm_srli_si128((a), (imm)) 2935 2936/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 2937/// operand by the specified number of bits. High-order bits are cleared. 2938/// 2939/// \headerfile <x86intrin.h> 2940/// 2941/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2942/// 2943/// \param __a 2944/// A 128-bit integer vector containing the source operand. 2945/// \param __count 2946/// An integer value specifying the number of bits to right-shift each value 2947/// in operand \a __a. 2948/// \returns A 128-bit integer vector containing the right-shifted values. 2949static __inline__ __m128i __DEFAULT_FN_ATTRS 2950_mm_srli_epi16(__m128i __a, int __count) 2951{ 2952 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 2953} 2954 2955/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 2956/// operand by the specified number of bits. High-order bits are cleared. 
2957/// 2958/// \headerfile <x86intrin.h> 2959/// 2960/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2961/// 2962/// \param __a 2963/// A 128-bit integer vector containing the source operand. 2964/// \param __count 2965/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2966/// to right-shift each value in operand \a __a. 2967/// \returns A 128-bit integer vector containing the right-shifted values. 2968static __inline__ __m128i __DEFAULT_FN_ATTRS 2969_mm_srl_epi16(__m128i __a, __m128i __count) 2970{ 2971 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 2972} 2973 2974/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 2975/// operand by the specified number of bits. High-order bits are cleared. 2976/// 2977/// \headerfile <x86intrin.h> 2978/// 2979/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 2980/// 2981/// \param __a 2982/// A 128-bit integer vector containing the source operand. 2983/// \param __count 2984/// An integer value specifying the number of bits to right-shift each value 2985/// in operand \a __a. 2986/// \returns A 128-bit integer vector containing the right-shifted values. 2987static __inline__ __m128i __DEFAULT_FN_ATTRS 2988_mm_srli_epi32(__m128i __a, int __count) 2989{ 2990 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 2991} 2992 2993/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 2994/// operand by the specified number of bits. High-order bits are cleared. 2995/// 2996/// \headerfile <x86intrin.h> 2997/// 2998/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 2999/// 3000/// \param __a 3001/// A 128-bit integer vector containing the source operand. 3002/// \param __count 3003/// A 128-bit integer vector in which bits [63:0] specify the number of bits 3004/// to right-shift each value in operand \a __a. 
3005/// \returns A 128-bit integer vector containing the right-shifted values. 3006static __inline__ __m128i __DEFAULT_FN_ATTRS 3007_mm_srl_epi32(__m128i __a, __m128i __count) 3008{ 3009 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 3010} 3011 3012/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 3013/// operand by the specified number of bits. High-order bits are cleared. 3014/// 3015/// \headerfile <x86intrin.h> 3016/// 3017/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3018/// 3019/// \param __a 3020/// A 128-bit integer vector containing the source operand. 3021/// \param __count 3022/// An integer value specifying the number of bits to right-shift each value 3023/// in operand \a __a. 3024/// \returns A 128-bit integer vector containing the right-shifted values. 3025static __inline__ __m128i __DEFAULT_FN_ATTRS 3026_mm_srli_epi64(__m128i __a, int __count) 3027{ 3028 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3029} 3030 3031/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 3032/// operand by the specified number of bits. High-order bits are cleared. 3033/// 3034/// \headerfile <x86intrin.h> 3035/// 3036/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3037/// 3038/// \param __a 3039/// A 128-bit integer vector containing the source operand. 3040/// \param __count 3041/// A 128-bit integer vector in which bits [63:0] specify the number of bits 3042/// to right-shift each value in operand \a __a. 3043/// \returns A 128-bit integer vector containing the right-shifted values. 3044static __inline__ __m128i __DEFAULT_FN_ATTRS 3045_mm_srl_epi64(__m128i __a, __m128i __count) 3046{ 3047 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3048} 3049 3050/// \brief Compares each of the corresponding 8-bit values of the 128-bit 3051/// integer vectors for equality. Each comparison yields 0h for false, FFh 3052/// for true. 
3053/// 3054/// \headerfile <x86intrin.h> 3055/// 3056/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3057/// 3058/// \param __a 3059/// A 128-bit integer vector. 3060/// \param __b 3061/// A 128-bit integer vector. 3062/// \returns A 128-bit integer vector containing the comparison results. 3063static __inline__ __m128i __DEFAULT_FN_ATTRS 3064_mm_cmpeq_epi8(__m128i __a, __m128i __b) 3065{ 3066 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3067} 3068 3069/// \brief Compares each of the corresponding 16-bit values of the 128-bit 3070/// integer vectors for equality. Each comparison yields 0h for false, FFFFh 3071/// for true. 3072/// 3073/// \headerfile <x86intrin.h> 3074/// 3075/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3076/// 3077/// \param __a 3078/// A 128-bit integer vector. 3079/// \param __b 3080/// A 128-bit integer vector. 3081/// \returns A 128-bit integer vector containing the comparison results. 3082static __inline__ __m128i __DEFAULT_FN_ATTRS 3083_mm_cmpeq_epi16(__m128i __a, __m128i __b) 3084{ 3085 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3086} 3087 3088/// \brief Compares each of the corresponding 32-bit values of the 128-bit 3089/// integer vectors for equality. Each comparison yields 0h for false, 3090/// FFFFFFFFh for true. 3091/// 3092/// \headerfile <x86intrin.h> 3093/// 3094/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3095/// 3096/// \param __a 3097/// A 128-bit integer vector. 3098/// \param __b 3099/// A 128-bit integer vector. 3100/// \returns A 128-bit integer vector containing the comparison results. 
3101static __inline__ __m128i __DEFAULT_FN_ATTRS 3102_mm_cmpeq_epi32(__m128i __a, __m128i __b) 3103{ 3104 return (__m128i)((__v4si)__a == (__v4si)__b); 3105} 3106 3107/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 3108/// integer vectors to determine if the values in the first operand are 3109/// greater than those in the second operand. Each comparison yields 0h for 3110/// false, FFh for true. 3111/// 3112/// \headerfile <x86intrin.h> 3113/// 3114/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3115/// 3116/// \param __a 3117/// A 128-bit integer vector. 3118/// \param __b 3119/// A 128-bit integer vector. 3120/// \returns A 128-bit integer vector containing the comparison results. 3121static __inline__ __m128i __DEFAULT_FN_ATTRS 3122_mm_cmpgt_epi8(__m128i __a, __m128i __b) 3123{ 3124 /* This function always performs a signed comparison, but __v16qi is a char 3125 which may be signed or unsigned, so use __v16qs. */ 3126 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3127} 3128 3129/// \brief Compares each of the corresponding signed 16-bit values of the 3130/// 128-bit integer vectors to determine if the values in the first operand 3131/// are greater than those in the second operand. Each comparison yields 0h 3132/// for false, FFFFh for true. 3133/// 3134/// \headerfile <x86intrin.h> 3135/// 3136/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3137/// 3138/// \param __a 3139/// A 128-bit integer vector. 3140/// \param __b 3141/// A 128-bit integer vector. 3142/// \returns A 128-bit integer vector containing the comparison results. 
3143static __inline__ __m128i __DEFAULT_FN_ATTRS 3144_mm_cmpgt_epi16(__m128i __a, __m128i __b) 3145{ 3146 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3147} 3148 3149/// \brief Compares each of the corresponding signed 32-bit values of the 3150/// 128-bit integer vectors to determine if the values in the first operand 3151/// are greater than those in the second operand. Each comparison yields 0h 3152/// for false, FFFFFFFFh for true. 3153/// 3154/// \headerfile <x86intrin.h> 3155/// 3156/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3157/// 3158/// \param __a 3159/// A 128-bit integer vector. 3160/// \param __b 3161/// A 128-bit integer vector. 3162/// \returns A 128-bit integer vector containing the comparison results. 3163static __inline__ __m128i __DEFAULT_FN_ATTRS 3164_mm_cmpgt_epi32(__m128i __a, __m128i __b) 3165{ 3166 return (__m128i)((__v4si)__a > (__v4si)__b); 3167} 3168 3169/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 3170/// integer vectors to determine if the values in the first operand are less 3171/// than those in the second operand. Each comparison yields 0h for false, 3172/// FFh for true. 3173/// 3174/// \headerfile <x86intrin.h> 3175/// 3176/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3177/// 3178/// \param __a 3179/// A 128-bit integer vector. 3180/// \param __b 3181/// A 128-bit integer vector. 3182/// \returns A 128-bit integer vector containing the comparison results. 3183static __inline__ __m128i __DEFAULT_FN_ATTRS 3184_mm_cmplt_epi8(__m128i __a, __m128i __b) 3185{ 3186 return _mm_cmpgt_epi8(__b, __a); 3187} 3188 3189/// \brief Compares each of the corresponding signed 16-bit values of the 3190/// 128-bit integer vectors to determine if the values in the first operand 3191/// are less than those in the second operand. Each comparison yields 0h for 3192/// false, FFFFh for true. 
3193/// 3194/// \headerfile <x86intrin.h> 3195/// 3196/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3197/// 3198/// \param __a 3199/// A 128-bit integer vector. 3200/// \param __b 3201/// A 128-bit integer vector. 3202/// \returns A 128-bit integer vector containing the comparison results. 3203static __inline__ __m128i __DEFAULT_FN_ATTRS 3204_mm_cmplt_epi16(__m128i __a, __m128i __b) 3205{ 3206 return _mm_cmpgt_epi16(__b, __a); 3207} 3208 3209/// \brief Compares each of the corresponding signed 32-bit values of the 3210/// 128-bit integer vectors to determine if the values in the first operand 3211/// are less than those in the second operand. Each comparison yields 0h for 3212/// false, FFFFFFFFh for true. 3213/// 3214/// \headerfile <x86intrin.h> 3215/// 3216/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3217/// 3218/// \param __a 3219/// A 128-bit integer vector. 3220/// \param __b 3221/// A 128-bit integer vector. 3222/// \returns A 128-bit integer vector containing the comparison results. 3223static __inline__ __m128i __DEFAULT_FN_ATTRS 3224_mm_cmplt_epi32(__m128i __a, __m128i __b) 3225{ 3226 return _mm_cmpgt_epi32(__b, __a); 3227} 3228 3229#ifdef __x86_64__ 3230/// \brief Converts a 64-bit signed integer value from the second operand into a 3231/// double-precision value and returns it in the lower element of a [2 x 3232/// double] vector; the upper element of the returned vector is copied from 3233/// the upper element of the first operand. 3234/// 3235/// \headerfile <x86intrin.h> 3236/// 3237/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3238/// 3239/// \param __a 3240/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3241/// copied to the upper 64 bits of the destination. 3242/// \param __b 3243/// A 64-bit signed integer operand containing the value to be converted. 
3244/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3245/// converted value of the second operand. The upper 64 bits are copied from 3246/// the upper 64 bits of the first operand. 3247static __inline__ __m128d __DEFAULT_FN_ATTRS 3248_mm_cvtsi64_sd(__m128d __a, long long __b) 3249{ 3250 __a[0] = __b; 3251 return __a; 3252} 3253 3254/// \brief Converts the first (lower) element of a vector of [2 x double] into a 3255/// 64-bit signed integer value, according to the current rounding mode. 3256/// 3257/// \headerfile <x86intrin.h> 3258/// 3259/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3260/// 3261/// \param __a 3262/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3263/// conversion. 3264/// \returns A 64-bit signed integer containing the converted value. 3265static __inline__ long long __DEFAULT_FN_ATTRS 3266_mm_cvtsd_si64(__m128d __a) 3267{ 3268 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3269} 3270 3271/// \brief Converts the first (lower) element of a vector of [2 x double] into a 3272/// 64-bit signed integer value, truncating the result when it is inexact. 3273/// 3274/// \headerfile <x86intrin.h> 3275/// 3276/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3277/// instruction. 3278/// 3279/// \param __a 3280/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3281/// conversion. 3282/// \returns A 64-bit signed integer containing the converted value. 3283static __inline__ long long __DEFAULT_FN_ATTRS 3284_mm_cvttsd_si64(__m128d __a) 3285{ 3286 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3287} 3288#endif 3289 3290/// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. 3291/// 3292/// \headerfile <x86intrin.h> 3293/// 3294/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3295/// 3296/// \param __a 3297/// A 128-bit integer vector. 
3298/// \returns A 128-bit vector of [4 x float] containing the converted values. 3299static __inline__ __m128 __DEFAULT_FN_ATTRS 3300_mm_cvtepi32_ps(__m128i __a) 3301{ 3302 return __builtin_ia32_cvtdq2ps((__v4si)__a); 3303} 3304 3305/// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. 3306/// 3307/// \headerfile <x86intrin.h> 3308/// 3309/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3310/// 3311/// \param __a 3312/// A 128-bit vector of [4 x float]. 3313/// \returns A 128-bit integer vector of [4 x i32] containing the converted 3314/// values. 3315static __inline__ __m128i __DEFAULT_FN_ATTRS 3316_mm_cvtps_epi32(__m128 __a) 3317{ 3318 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3319} 3320 3321/// \brief Converts a vector of [4 x float] into a vector of [4 x i32], 3322/// truncating the result when it is inexact. 3323/// 3324/// \headerfile <x86intrin.h> 3325/// 3326/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3327/// instruction. 3328/// 3329/// \param __a 3330/// A 128-bit vector of [4 x float]. 3331/// \returns A 128-bit vector of [4 x i32] containing the converted values. 3332static __inline__ __m128i __DEFAULT_FN_ATTRS 3333_mm_cvttps_epi32(__m128 __a) 3334{ 3335 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3336} 3337 3338/// \brief Returns a vector of [4 x i32] where the lowest element is the input 3339/// operand and the remaining elements are zero. 3340/// 3341/// \headerfile <x86intrin.h> 3342/// 3343/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3344/// 3345/// \param __a 3346/// A 32-bit signed integer operand. 3347/// \returns A 128-bit vector of [4 x i32]. 
3348static __inline__ __m128i __DEFAULT_FN_ATTRS 3349_mm_cvtsi32_si128(int __a) 3350{ 3351 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 3352} 3353 3354#ifdef __x86_64__ 3355/// \brief Returns a vector of [2 x i64] where the lower element is the input 3356/// operand and the upper element is zero. 3357/// 3358/// \headerfile <x86intrin.h> 3359/// 3360/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3361/// 3362/// \param __a 3363/// A 64-bit signed integer operand containing the value to be converted. 3364/// \returns A 128-bit vector of [2 x i64] containing the converted value. 3365static __inline__ __m128i __DEFAULT_FN_ATTRS 3366_mm_cvtsi64_si128(long long __a) 3367{ 3368 return (__m128i){ __a, 0 }; 3369} 3370#endif 3371 3372/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a 3373/// 32-bit signed integer value. 3374/// 3375/// \headerfile <x86intrin.h> 3376/// 3377/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3378/// 3379/// \param __a 3380/// A vector of [4 x i32]. The least significant 32 bits are moved to the 3381/// destination. 3382/// \returns A 32-bit signed integer containing the moved value. 3383static __inline__ int __DEFAULT_FN_ATTRS 3384_mm_cvtsi128_si32(__m128i __a) 3385{ 3386 __v4si __b = (__v4si)__a; 3387 return __b[0]; 3388} 3389 3390#ifdef __x86_64__ 3391/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a 3392/// 64-bit signed integer value. 3393/// 3394/// \headerfile <x86intrin.h> 3395/// 3396/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3397/// 3398/// \param __a 3399/// A vector of [2 x i64]. The least significant 64 bits are moved to the 3400/// destination. 3401/// \returns A 64-bit signed integer containing the moved value. 
3402static __inline__ long long __DEFAULT_FN_ATTRS 3403_mm_cvtsi128_si64(__m128i __a) 3404{ 3405 return __a[0]; 3406} 3407#endif 3408 3409/// \brief Moves packed integer values from an aligned 128-bit memory location 3410/// to elements in a 128-bit integer vector. 3411/// 3412/// \headerfile <x86intrin.h> 3413/// 3414/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3415/// 3416/// \param __p 3417/// An aligned pointer to a memory location containing integer values. 3418/// \returns A 128-bit integer vector containing the moved values. 3419static __inline__ __m128i __DEFAULT_FN_ATTRS 3420_mm_load_si128(__m128i const *__p) 3421{ 3422 return *__p; 3423} 3424 3425/// \brief Moves packed integer values from an unaligned 128-bit memory location 3426/// to elements in a 128-bit integer vector. 3427/// 3428/// \headerfile <x86intrin.h> 3429/// 3430/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3431/// 3432/// \param __p 3433/// A pointer to a memory location containing integer values. 3434/// \returns A 128-bit integer vector containing the moved values. 3435static __inline__ __m128i __DEFAULT_FN_ATTRS 3436_mm_loadu_si128(__m128i const *__p) 3437{ 3438 struct __loadu_si128 { 3439 __m128i __v; 3440 } __attribute__((__packed__, __may_alias__)); 3441 return ((struct __loadu_si128*)__p)->__v; 3442} 3443 3444/// \brief Returns a vector of [2 x i64] where the lower element is taken from 3445/// the lower element of the operand, and the upper element is zero. 3446/// 3447/// \headerfile <x86intrin.h> 3448/// 3449/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3450/// 3451/// \param __p 3452/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3453/// the destination. 3454/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3455/// moved value. The higher order bits are cleared. 
3456static __inline__ __m128i __DEFAULT_FN_ATTRS 3457_mm_loadl_epi64(__m128i const *__p) 3458{ 3459 struct __mm_loadl_epi64_struct { 3460 long long __u; 3461 } __attribute__((__packed__, __may_alias__)); 3462 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 3463} 3464 3465/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. 3466/// This could be used as an argument to another intrinsic function where the 3467/// argument is required but the value is not actually used. 3468/// 3469/// \headerfile <x86intrin.h> 3470/// 3471/// This intrinsic has no corresponding instruction. 3472/// 3473/// \returns A 128-bit vector of [4 x i32] with unspecified content. 3474static __inline__ __m128i __DEFAULT_FN_ATTRS 3475_mm_undefined_si128(void) 3476{ 3477 return (__m128i)__builtin_ia32_undef128(); 3478} 3479 3480/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3481/// the specified 64-bit integer values. 3482/// 3483/// \headerfile <x86intrin.h> 3484/// 3485/// This intrinsic is a utility function and does not correspond to a specific 3486/// instruction. 3487/// 3488/// \param __q1 3489/// A 64-bit integer value used to initialize the upper 64 bits of the 3490/// destination vector of [2 x i64]. 3491/// \param __q0 3492/// A 64-bit integer value used to initialize the lower 64 bits of the 3493/// destination vector of [2 x i64]. 3494/// \returns An initialized 128-bit vector of [2 x i64] containing the values 3495/// provided in the operands. 3496static __inline__ __m128i __DEFAULT_FN_ATTRS 3497_mm_set_epi64x(long long __q1, long long __q0) 3498{ 3499 return (__m128i){ __q0, __q1 }; 3500} 3501 3502/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3503/// the specified 64-bit integer values. 3504/// 3505/// \headerfile <x86intrin.h> 3506/// 3507/// This intrinsic is a utility function and does not correspond to a specific 3508/// instruction. 
3509/// 3510/// \param __q1 3511/// A 64-bit integer value used to initialize the upper 64 bits of the 3512/// destination vector of [2 x i64]. 3513/// \param __q0 3514/// A 64-bit integer value used to initialize the lower 64 bits of the 3515/// destination vector of [2 x i64]. 3516/// \returns An initialized 128-bit vector of [2 x i64] containing the values 3517/// provided in the operands. 3518static __inline__ __m128i __DEFAULT_FN_ATTRS 3519_mm_set_epi64(__m64 __q1, __m64 __q0) 3520{ 3521 return (__m128i){ (long long)__q0, (long long)__q1 }; 3522} 3523 3524/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3525/// the specified 32-bit integer values. 3526/// 3527/// \headerfile <x86intrin.h> 3528/// 3529/// This intrinsic is a utility function and does not correspond to a specific 3530/// instruction. 3531/// 3532/// \param __i3 3533/// A 32-bit integer value used to initialize bits [127:96] of the 3534/// destination vector. 3535/// \param __i2 3536/// A 32-bit integer value used to initialize bits [95:64] of the destination 3537/// vector. 3538/// \param __i1 3539/// A 32-bit integer value used to initialize bits [63:32] of the destination 3540/// vector. 3541/// \param __i0 3542/// A 32-bit integer value used to initialize bits [31:0] of the destination 3543/// vector. 3544/// \returns An initialized 128-bit vector of [4 x i32] containing the values 3545/// provided in the operands. 3546static __inline__ __m128i __DEFAULT_FN_ATTRS 3547_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 3548{ 3549 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3550} 3551 3552/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3553/// the specified 16-bit integer values. 3554/// 3555/// \headerfile <x86intrin.h> 3556/// 3557/// This intrinsic is a utility function and does not correspond to a specific 3558/// instruction. 
3559/// 3560/// \param __w7 3561/// A 16-bit integer value used to initialize bits [127:112] of the 3562/// destination vector. 3563/// \param __w6 3564/// A 16-bit integer value used to initialize bits [111:96] of the 3565/// destination vector. 3566/// \param __w5 3567/// A 16-bit integer value used to initialize bits [95:80] of the destination 3568/// vector. 3569/// \param __w4 3570/// A 16-bit integer value used to initialize bits [79:64] of the destination 3571/// vector. 3572/// \param __w3 3573/// A 16-bit integer value used to initialize bits [63:48] of the destination 3574/// vector. 3575/// \param __w2 3576/// A 16-bit integer value used to initialize bits [47:32] of the destination 3577/// vector. 3578/// \param __w1 3579/// A 16-bit integer value used to initialize bits [31:16] of the destination 3580/// vector. 3581/// \param __w0 3582/// A 16-bit integer value used to initialize bits [15:0] of the destination 3583/// vector. 3584/// \returns An initialized 128-bit vector of [8 x i16] containing the values 3585/// provided in the operands. 3586static __inline__ __m128i __DEFAULT_FN_ATTRS 3587_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 3588{ 3589 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3590} 3591 3592/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3593/// the specified 8-bit integer values. 3594/// 3595/// \headerfile <x86intrin.h> 3596/// 3597/// This intrinsic is a utility function and does not correspond to a specific 3598/// instruction. 3599/// 3600/// \param __b15 3601/// Initializes bits [127:120] of the destination vector. 3602/// \param __b14 3603/// Initializes bits [119:112] of the destination vector. 3604/// \param __b13 3605/// Initializes bits [111:104] of the destination vector. 3606/// \param __b12 3607/// Initializes bits [103:96] of the destination vector. 
3608/// \param __b11 3609/// Initializes bits [95:88] of the destination vector. 3610/// \param __b10 3611/// Initializes bits [87:80] of the destination vector. 3612/// \param __b9 3613/// Initializes bits [79:72] of the destination vector. 3614/// \param __b8 3615/// Initializes bits [71:64] of the destination vector. 3616/// \param __b7 3617/// Initializes bits [63:56] of the destination vector. 3618/// \param __b6 3619/// Initializes bits [55:48] of the destination vector. 3620/// \param __b5 3621/// Initializes bits [47:40] of the destination vector. 3622/// \param __b4 3623/// Initializes bits [39:32] of the destination vector. 3624/// \param __b3 3625/// Initializes bits [31:24] of the destination vector. 3626/// \param __b2 3627/// Initializes bits [23:16] of the destination vector. 3628/// \param __b1 3629/// Initializes bits [15:8] of the destination vector. 3630/// \param __b0 3631/// Initializes bits [7:0] of the destination vector. 3632/// \returns An initialized 128-bit vector of [16 x i8] containing the values 3633/// provided in the operands. 3634static __inline__ __m128i __DEFAULT_FN_ATTRS 3635_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 3636{ 3637 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3638} 3639 3640/// \brief Initializes both values in a 128-bit integer vector with the 3641/// specified 64-bit integer value. 3642/// 3643/// \headerfile <x86intrin.h> 3644/// 3645/// This intrinsic is a utility function and does not correspond to a specific 3646/// instruction. 3647/// 3648/// \param __q 3649/// Integer value used to initialize the elements of the destination integer 3650/// vector. 
3651/// \returns An initialized 128-bit integer vector of [2 x i64] with both 3652/// elements containing the value provided in the operand. 3653static __inline__ __m128i __DEFAULT_FN_ATTRS 3654_mm_set1_epi64x(long long __q) 3655{ 3656 return (__m128i){ __q, __q }; 3657} 3658 3659/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the 3660/// specified 64-bit value. 3661/// 3662/// \headerfile <x86intrin.h> 3663/// 3664/// This intrinsic is a utility function and does not correspond to a specific 3665/// instruction. 3666/// 3667/// \param __q 3668/// A 64-bit value used to initialize the elements of the destination integer 3669/// vector. 3670/// \returns An initialized 128-bit vector of [2 x i64] with all elements 3671/// containing the value provided in the operand. 3672static __inline__ __m128i __DEFAULT_FN_ATTRS 3673_mm_set1_epi64(__m64 __q) 3674{ 3675 return (__m128i){ (long long)__q, (long long)__q }; 3676} 3677 3678/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the 3679/// specified 32-bit value. 3680/// 3681/// \headerfile <x86intrin.h> 3682/// 3683/// This intrinsic is a utility function and does not correspond to a specific 3684/// instruction. 3685/// 3686/// \param __i 3687/// A 32-bit value used to initialize the elements of the destination integer 3688/// vector. 3689/// \returns An initialized 128-bit vector of [4 x i32] with all elements 3690/// containing the value provided in the operand. 3691static __inline__ __m128i __DEFAULT_FN_ATTRS 3692_mm_set1_epi32(int __i) 3693{ 3694 return (__m128i)(__v4si){ __i, __i, __i, __i }; 3695} 3696 3697/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the 3698/// specified 16-bit value. 3699/// 3700/// \headerfile <x86intrin.h> 3701/// 3702/// This intrinsic is a utility function and does not correspond to a specific 3703/// instruction. 
3704/// 3705/// \param __w 3706/// A 16-bit value used to initialize the elements of the destination integer 3707/// vector. 3708/// \returns An initialized 128-bit vector of [8 x i16] with all elements 3709/// containing the value provided in the operand. 3710static __inline__ __m128i __DEFAULT_FN_ATTRS 3711_mm_set1_epi16(short __w) 3712{ 3713 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 3714} 3715 3716/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the 3717/// specified 8-bit value. 3718/// 3719/// \headerfile <x86intrin.h> 3720/// 3721/// This intrinsic is a utility function and does not correspond to a specific 3722/// instruction. 3723/// 3724/// \param __b 3725/// An 8-bit value used to initialize the elements of the destination integer 3726/// vector. 3727/// \returns An initialized 128-bit vector of [16 x i8] with all elements 3728/// containing the value provided in the operand. 3729static __inline__ __m128i __DEFAULT_FN_ATTRS 3730_mm_set1_epi8(char __b) 3731{ 3732 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 3733} 3734 3735/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3736/// with the specified 64-bit integral values. 3737/// 3738/// \headerfile <x86intrin.h> 3739/// 3740/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 3741/// instruction. 3742/// 3743/// \param __q0 3744/// A 64-bit integral value used to initialize the lower 64 bits of the 3745/// result. 3746/// \param __q1 3747/// A 64-bit integral value used to initialize the upper 64 bits of the 3748/// result. 3749/// \returns An initialized 128-bit integer vector. 
3750static __inline__ __m128i __DEFAULT_FN_ATTRS 3751_mm_setr_epi64(__m64 __q0, __m64 __q1) 3752{ 3753 return (__m128i){ (long long)__q0, (long long)__q1 }; 3754} 3755 3756/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3757/// with the specified 32-bit integral values. 3758/// 3759/// \headerfile <x86intrin.h> 3760/// 3761/// This intrinsic is a utility function and does not correspond to a specific 3762/// instruction. 3763/// 3764/// \param __i0 3765/// A 32-bit integral value used to initialize bits [31:0] of the result. 3766/// \param __i1 3767/// A 32-bit integral value used to initialize bits [63:32] of the result. 3768/// \param __i2 3769/// A 32-bit integral value used to initialize bits [95:64] of the result. 3770/// \param __i3 3771/// A 32-bit integral value used to initialize bits [127:96] of the result. 3772/// \returns An initialized 128-bit integer vector. 3773static __inline__ __m128i __DEFAULT_FN_ATTRS 3774_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 3775{ 3776 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3777} 3778 3779/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3780/// with the specified 16-bit integral values. 3781/// 3782/// \headerfile <x86intrin.h> 3783/// 3784/// This intrinsic is a utility function and does not correspond to a specific 3785/// instruction. 3786/// 3787/// \param __w0 3788/// A 16-bit integral value used to initialize bits [15:0] of the result. 3789/// \param __w1 3790/// A 16-bit integral value used to initialize bits [31:16] of the result. 3791/// \param __w2 3792/// A 16-bit integral value used to initialize bits [47:32] of the result. 3793/// \param __w3 3794/// A 16-bit integral value used to initialize bits [63:48] of the result. 3795/// \param __w4 3796/// A 16-bit integral value used to initialize bits [79:64] of the result. 3797/// \param __w5 3798/// A 16-bit integral value used to initialize bits [95:80] of the result. 
3799/// \param __w6 3800/// A 16-bit integral value used to initialize bits [111:96] of the result. 3801/// \param __w7 3802/// A 16-bit integral value used to initialize bits [127:112] of the result. 3803/// \returns An initialized 128-bit integer vector. 3804static __inline__ __m128i __DEFAULT_FN_ATTRS 3805_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 3806{ 3807 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3808} 3809 3810/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3811/// with the specified 8-bit integral values. 3812/// 3813/// \headerfile <x86intrin.h> 3814/// 3815/// This intrinsic is a utility function and does not correspond to a specific 3816/// instruction. 3817/// 3818/// \param __b0 3819/// An 8-bit integral value used to initialize bits [7:0] of the result. 3820/// \param __b1 3821/// An 8-bit integral value used to initialize bits [15:8] of the result. 3822/// \param __b2 3823/// An 8-bit integral value used to initialize bits [23:16] of the result. 3824/// \param __b3 3825/// An 8-bit integral value used to initialize bits [31:24] of the result. 3826/// \param __b4 3827/// An 8-bit integral value used to initialize bits [39:32] of the result. 3828/// \param __b5 3829/// An 8-bit integral value used to initialize bits [47:40] of the result. 3830/// \param __b6 3831/// An 8-bit integral value used to initialize bits [55:48] of the result. 3832/// \param __b7 3833/// An 8-bit integral value used to initialize bits [63:56] of the result. 3834/// \param __b8 3835/// An 8-bit integral value used to initialize bits [71:64] of the result. 3836/// \param __b9 3837/// An 8-bit integral value used to initialize bits [79:72] of the result. 3838/// \param __b10 3839/// An 8-bit integral value used to initialize bits [87:80] of the result. 3840/// \param __b11 3841/// An 8-bit integral value used to initialize bits [95:88] of the result. 
3842/// \param __b12 3843/// An 8-bit integral value used to initialize bits [103:96] of the result. 3844/// \param __b13 3845/// An 8-bit integral value used to initialize bits [111:104] of the result. 3846/// \param __b14 3847/// An 8-bit integral value used to initialize bits [119:112] of the result. 3848/// \param __b15 3849/// An 8-bit integral value used to initialize bits [127:120] of the result. 3850/// \returns An initialized 128-bit integer vector. 3851static __inline__ __m128i __DEFAULT_FN_ATTRS 3852_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 3853{ 3854 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3855} 3856 3857/// \brief Creates a 128-bit integer vector initialized to zero. 3858/// 3859/// \headerfile <x86intrin.h> 3860/// 3861/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3862/// 3863/// \returns An initialized 128-bit integer vector with all elements set to 3864/// zero. 3865static __inline__ __m128i __DEFAULT_FN_ATTRS 3866_mm_setzero_si128(void) 3867{ 3868 return (__m128i){ 0LL, 0LL }; 3869} 3870 3871/// \brief Stores a 128-bit integer vector to a memory location aligned on a 3872/// 128-bit boundary. 3873/// 3874/// \headerfile <x86intrin.h> 3875/// 3876/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3877/// 3878/// \param __p 3879/// A pointer to an aligned memory location that will receive the integer 3880/// values. 3881/// \param __b 3882/// A 128-bit integer vector containing the values to be moved. 3883static __inline__ void __DEFAULT_FN_ATTRS 3884_mm_store_si128(__m128i *__p, __m128i __b) 3885{ 3886 *__p = __b; 3887} 3888 3889/// \brief Stores a 128-bit integer vector to an unaligned memory location. 
3890/// 3891/// \headerfile <x86intrin.h> 3892/// 3893/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 3894/// 3895/// \param __p 3896/// A pointer to a memory location that will receive the integer values. 3897/// \param __b 3898/// A 128-bit integer vector containing the values to be moved. 3899static __inline__ void __DEFAULT_FN_ATTRS 3900_mm_storeu_si128(__m128i *__p, __m128i __b) 3901{ 3902 struct __storeu_si128 { 3903 __m128i __v; 3904 } __attribute__((__packed__, __may_alias__)); 3905 ((struct __storeu_si128*)__p)->__v = __b; 3906} 3907 3908/// \brief Moves bytes selected by the mask from the first operand to the 3909/// specified unaligned memory location. When a mask bit is 1, the 3910/// corresponding byte is written, otherwise it is not written. To minimize 3911/// caching, the date is flagged as non-temporal (unlikely to be used again 3912/// soon). Exception and trap behavior for elements not selected for storage 3913/// to memory are implementation dependent. 3914/// 3915/// \headerfile <x86intrin.h> 3916/// 3917/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 3918/// instruction. 3919/// 3920/// \param __d 3921/// A 128-bit integer vector containing the values to be moved. 3922/// \param __n 3923/// A 128-bit integer vector containing the mask. The most significant bit of 3924/// each byte represents the mask bits. 3925/// \param __p 3926/// A pointer to an unaligned 128-bit memory location where the specified 3927/// values are moved. 3928static __inline__ void __DEFAULT_FN_ATTRS 3929_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 3930{ 3931 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 3932} 3933 3934/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 3935/// a memory location. 3936/// 3937/// \headerfile <x86intrin.h> 3938/// 3939/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 
3940/// 3941/// \param __p 3942/// A pointer to a 64-bit memory location that will receive the lower 64 bits 3943/// of the integer vector parameter. 3944/// \param __a 3945/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 3946/// value to be stored. 3947static __inline__ void __DEFAULT_FN_ATTRS 3948_mm_storel_epi64(__m128i *__p, __m128i __a) 3949{ 3950 struct __mm_storel_epi64_struct { 3951 long long __u; 3952 } __attribute__((__packed__, __may_alias__)); 3953 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 3954} 3955 3956/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit 3957/// aligned memory location. To minimize caching, the data is flagged as 3958/// non-temporal (unlikely to be used again soon). 3959/// 3960/// \headerfile <x86intrin.h> 3961/// 3962/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 3963/// 3964/// \param __p 3965/// A pointer to the 128-bit aligned memory location used to store the value. 3966/// \param __a 3967/// A vector of [2 x double] containing the 64-bit values to be stored. 3968static __inline__ void __DEFAULT_FN_ATTRS 3969_mm_stream_pd(double *__p, __m128d __a) 3970{ 3971 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); 3972} 3973 3974/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location. 3975/// To minimize caching, the data is flagged as non-temporal (unlikely to be 3976/// used again soon). 3977/// 3978/// \headerfile <x86intrin.h> 3979/// 3980/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 3981/// 3982/// \param __p 3983/// A pointer to the 128-bit aligned memory location used to store the value. 3984/// \param __a 3985/// A 128-bit integer vector containing the values to be stored. 
3986static __inline__ void __DEFAULT_FN_ATTRS 3987_mm_stream_si128(__m128i *__p, __m128i __a) 3988{ 3989 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); 3990} 3991 3992/// \brief Stores a 32-bit integer value in the specified memory location. To 3993/// minimize caching, the data is flagged as non-temporal (unlikely to be 3994/// used again soon). 3995/// 3996/// \headerfile <x86intrin.h> 3997/// 3998/// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 3999/// 4000/// \param __p 4001/// A pointer to the 32-bit memory location used to store the value. 4002/// \param __a 4003/// A 32-bit integer containing the value to be stored. 4004static __inline__ void __DEFAULT_FN_ATTRS 4005_mm_stream_si32(int *__p, int __a) 4006{ 4007 __builtin_ia32_movnti(__p, __a); 4008} 4009 4010#ifdef __x86_64__ 4011/// \brief Stores a 64-bit integer value in the specified memory location. To 4012/// minimize caching, the data is flagged as non-temporal (unlikely to be 4013/// used again soon). 4014/// 4015/// \headerfile <x86intrin.h> 4016/// 4017/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 4018/// 4019/// \param __p 4020/// A pointer to the 64-bit memory location used to store the value. 4021/// \param __a 4022/// A 64-bit integer containing the value to be stored. 4023static __inline__ void __DEFAULT_FN_ATTRS 4024_mm_stream_si64(long long *__p, long long __a) 4025{ 4026 __builtin_ia32_movnti64(__p, __a); 4027} 4028#endif 4029 4030#if defined(__cplusplus) 4031extern "C" { 4032#endif 4033 4034/// \brief The cache line containing \a __p is flushed and invalidated from all 4035/// caches in the coherency domain. 4036/// 4037/// \headerfile <x86intrin.h> 4038/// 4039/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 4040/// 4041/// \param __p 4042/// A pointer to the memory location used to identify the cache line to be 4043/// flushed. 
4044void _mm_clflush(void const * __p); 4045 4046/// \brief Forces strong memory ordering (serialization) between load 4047/// instructions preceding this instruction and load instructions following 4048/// this instruction, ensuring the system completes all previous loads before 4049/// executing subsequent loads. 4050/// 4051/// \headerfile <x86intrin.h> 4052/// 4053/// This intrinsic corresponds to the <c> LFENCE </c> instruction. 4054/// 4055void _mm_lfence(void); 4056 4057/// \brief Forces strong memory ordering (serialization) between load and store 4058/// instructions preceding this instruction and load and store instructions 4059/// following this instruction, ensuring that the system completes all 4060/// previous memory accesses before executing subsequent memory accesses. 4061/// 4062/// \headerfile <x86intrin.h> 4063/// 4064/// This intrinsic corresponds to the <c> MFENCE </c> instruction. 4065/// 4066void _mm_mfence(void); 4067 4068#if defined(__cplusplus) 4069} // extern "C" 4070#endif 4071 4072/// \brief Converts 16-bit signed integers from both 128-bit integer vector 4073/// operands into 8-bit signed integers, and packs the results into the 4074/// destination. Positive values greater than 0x7F are saturated to 0x7F. 4075/// Negative values less than 0x80 are saturated to 0x80. 4076/// 4077/// \headerfile <x86intrin.h> 4078/// 4079/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction. 4080/// 4081/// \param __a 4082/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4083/// a signed integer and is converted to a 8-bit signed integer with 4084/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4085/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4086/// written to the lower 64 bits of the result. 4087/// \param __b 4088/// A 128-bit integer vector of [8 x i16]. 
Each 16-bit element is treated as 4089/// a signed integer and is converted to a 8-bit signed integer with 4090/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4091/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4092/// written to the higher 64 bits of the result. 4093/// \returns A 128-bit vector of [16 x i8] containing the converted values. 4094static __inline__ __m128i __DEFAULT_FN_ATTRS 4095_mm_packs_epi16(__m128i __a, __m128i __b) 4096{ 4097 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4098} 4099 4100/// \brief Converts 32-bit signed integers from both 128-bit integer vector 4101/// operands into 16-bit signed integers, and packs the results into the 4102/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4103/// Negative values less than 0x8000 are saturated to 0x8000. 4104/// 4105/// \headerfile <x86intrin.h> 4106/// 4107/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4108/// 4109/// \param __a 4110/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4111/// a signed integer and is converted to a 16-bit signed integer with 4112/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4113/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4114/// are written to the lower 64 bits of the result. 4115/// \param __b 4116/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4117/// a signed integer and is converted to a 16-bit signed integer with 4118/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4119/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4120/// are written to the higher 64 bits of the result. 4121/// \returns A 128-bit vector of [8 x i16] containing the converted values. 
4122static __inline__ __m128i __DEFAULT_FN_ATTRS 4123_mm_packs_epi32(__m128i __a, __m128i __b) 4124{ 4125 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4126} 4127 4128/// \brief Converts 16-bit signed integers from both 128-bit integer vector 4129/// operands into 8-bit unsigned integers, and packs the results into the 4130/// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4131/// than 0x00 are saturated to 0x00. 4132/// 4133/// \headerfile <x86intrin.h> 4134/// 4135/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4136/// 4137/// \param __a 4138/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4139/// a signed integer and is converted to an 8-bit unsigned integer with 4140/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4141/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4142/// written to the lower 64 bits of the result. 4143/// \param __b 4144/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4145/// a signed integer and is converted to an 8-bit unsigned integer with 4146/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4147/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4148/// written to the higher 64 bits of the result. 4149/// \returns A 128-bit vector of [16 x i8] containing the converted values. 4150static __inline__ __m128i __DEFAULT_FN_ATTRS 4151_mm_packus_epi16(__m128i __a, __m128i __b) 4152{ 4153 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4154} 4155 4156/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4157/// the immediate-value parameter as a selector. 4158/// 4159/// \headerfile <x86intrin.h> 4160/// 4161/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 4162/// 4163/// \param __a 4164/// A 128-bit integer vector. 
4165/// \param __imm 4166/// An immediate value. Bits [2:0] selects values from \a __a to be assigned 4167/// to bits[15:0] of the result. \n 4168/// 000: assign values from bits [15:0] of \a __a. \n 4169/// 001: assign values from bits [31:16] of \a __a. \n 4170/// 010: assign values from bits [47:32] of \a __a. \n 4171/// 011: assign values from bits [63:48] of \a __a. \n 4172/// 100: assign values from bits [79:64] of \a __a. \n 4173/// 101: assign values from bits [95:80] of \a __a. \n 4174/// 110: assign values from bits [111:96] of \a __a. \n 4175/// 111: assign values from bits [127:112] of \a __a. 4176/// \returns An integer, whose lower 16 bits are selected from the 128-bit 4177/// integer vector parameter and the remaining bits are assigned zeros. 4178static __inline__ int __DEFAULT_FN_ATTRS 4179_mm_extract_epi16(__m128i __a, int __imm) 4180{ 4181 __v8hi __b = (__v8hi)__a; 4182 return (unsigned short)__b[__imm & 7]; 4183} 4184 4185/// \brief Constructs a 128-bit integer vector by first making a copy of the 4186/// 128-bit integer vector parameter, and then inserting the lower 16 bits 4187/// of an integer parameter into an offset specified by the immediate-value 4188/// parameter. 4189/// 4190/// \headerfile <x86intrin.h> 4191/// 4192/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. 4193/// 4194/// \param __a 4195/// A 128-bit integer vector of [8 x i16]. This vector is copied to the 4196/// result and then one of the eight elements in the result is replaced by 4197/// the lower 16 bits of \a __b. 4198/// \param __b 4199/// An integer. The lower 16 bits of this parameter are written to the 4200/// result beginning at an offset specified by \a __imm. 4201/// \param __imm 4202/// An immediate value specifying the bit offset in the result at which the 4203/// lower 16 bits of \a __b are written. 4204/// \returns A 128-bit integer vector containing the constructed values. 
4205static __inline__ __m128i __DEFAULT_FN_ATTRS 4206_mm_insert_epi16(__m128i __a, int __b, int __imm) 4207{ 4208 __v8hi __c = (__v8hi)__a; 4209 __c[__imm & 7] = __b; 4210 return (__m128i)__c; 4211} 4212 4213/// \brief Copies the values of the most significant bits from each 8-bit 4214/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4215/// value, zero-extends the value, and writes it to the destination. 4216/// 4217/// \headerfile <x86intrin.h> 4218/// 4219/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4220/// 4221/// \param __a 4222/// A 128-bit integer vector containing the values with bits to be extracted. 4223/// \returns The most significant bits from each 8-bit element in \a __a, 4224/// written to bits [15:0]. The other bits are assigned zeros. 4225static __inline__ int __DEFAULT_FN_ATTRS 4226_mm_movemask_epi8(__m128i __a) 4227{ 4228 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4229} 4230 4231/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit 4232/// elements of a 128-bit integer vector parameter, using the immediate-value 4233/// parameter as a specifier. 4234/// 4235/// \headerfile <x86intrin.h> 4236/// 4237/// \code 4238/// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4239/// \endcode 4240/// 4241/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4242/// 4243/// \param a 4244/// A 128-bit integer vector containing the values to be copied. 4245/// \param imm 4246/// An immediate value containing an 8-bit value specifying which elements to 4247/// copy from a. The destinations within the 128-bit destination are assigned 4248/// values as follows: \n 4249/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n 4250/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n 4251/// Bits [5:4] are used to assign values to bits [95:64] of the result. 
\n 4252/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n 4253/// Bit value assignments: \n 4254/// 00: assign values from bits [31:0] of \a a. \n 4255/// 01: assign values from bits [63:32] of \a a. \n 4256/// 10: assign values from bits [95:64] of \a a. \n 4257/// 11: assign values from bits [127:96] of \a a. 4258/// \returns A 128-bit integer vector containing the shuffled values. 4259#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ 4260 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ 4261 (__v4si)_mm_undefined_si128(), \ 4262 ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ 4263 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); }) 4264 4265/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit 4266/// elements of a 128-bit integer vector of [8 x i16], using the immediate 4267/// value parameter as a specifier. 4268/// 4269/// \headerfile <x86intrin.h> 4270/// 4271/// \code 4272/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm); 4273/// \endcode 4274/// 4275/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction. 4276/// 4277/// \param a 4278/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits 4279/// [127:64] of the result. 4280/// \param imm 4281/// An 8-bit immediate value specifying which elements to copy from \a a. \n 4282/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n 4283/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n 4284/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n 4285/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n 4286/// Bit value assignments: \n 4287/// 00: assign values from bits [15:0] of \a a. \n 4288/// 01: assign values from bits [31:16] of \a a. \n 4289/// 10: assign values from bits [47:32] of \a a. \n 4290/// 11: assign values from bits [63:48] of \a a. \n 4291/// \returns A 128-bit integer vector containing the shuffled values. 
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   (imm) & 0x3, \
                                   ((imm) >> 2) & 0x3, \
                                   ((imm) >> 4) & 0x3, \
                                   ((imm) >> 6) & 0x3, \
                                   4, 5, 6, 7); })

/// \brief Builds a 128-bit integer vector by shuffling the four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16]; the immediate-value
///    parameter specifies the shuffle.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied unchanged
///    to bits [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a.
/// \returns A 128-bit integer vector holding the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_undefined_si128(), \
                                   0, 1, 2, 3, \
                                   4 + ((imm) & 0x3), \
                                   4 + (((imm) >> 2) & 0x3), \
                                   4 + (((imm) >> 4) & 0x3), \
                                   4 + (((imm) >> 6) & 0x3)); })

/// \brief Takes the high-order (index 8-15) values of two 128-bit vectors of
///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
///    instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [7:0] of the result. \n
///    Bits [79:72] are written to bits [23:16] of the result. \n
///    Bits [87:80] are written to bits [39:32] of the result. \n
///    Bits [95:88] are written to bits [55:48] of the result. \n
///    Bits [103:96] are written to bits [71:64] of the result. \n
///    Bits [111:104] are written to bits [87:80] of the result. \n
///    Bits [119:112] are written to bits [103:96] of the result. \n
///    Bits [127:120] are written to bits [119:112] of the result.
/// \param __b
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [15:8] of the result. \n
///    Bits [79:72] are written to bits [31:24] of the result. \n
///    Bits [87:80] are written to bits [47:40] of the result. \n
///    Bits [95:88] are written to bits [63:56] of the result. \n
///    Bits [103:96] are written to bits [79:72] of the result. \n
///    Bits [111:104] are written to bits [95:88] of the result. \n
///    Bits [119:112] are written to bits [111:104] of the result. \n
///    Bits [127:120] are written to bits [127:120] of the result.
/// \returns A 128-bit vector of [16 x i8] holding the interleaved values.
4364static __inline__ __m128i __DEFAULT_FN_ATTRS 4365_mm_unpackhi_epi8(__m128i __a, __m128i __b) 4366{ 4367 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 4368} 4369 4370/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4371/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4372/// 4373/// \headerfile <x86intrin.h> 4374/// 4375/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4376/// instruction. 4377/// 4378/// \param __a 4379/// A 128-bit vector of [8 x i16]. 4380/// Bits [79:64] are written to bits [15:0] of the result. \n 4381/// Bits [95:80] are written to bits [47:32] of the result. \n 4382/// Bits [111:96] are written to bits [79:64] of the result. \n 4383/// Bits [127:112] are written to bits [111:96] of the result. 4384/// \param __b 4385/// A 128-bit vector of [8 x i16]. 4386/// Bits [79:64] are written to bits [31:16] of the result. \n 4387/// Bits [95:80] are written to bits [63:48] of the result. \n 4388/// Bits [111:96] are written to bits [95:80] of the result. \n 4389/// Bits [127:112] are written to bits [127:112] of the result. 4390/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4391static __inline__ __m128i __DEFAULT_FN_ATTRS 4392_mm_unpackhi_epi16(__m128i __a, __m128i __b) 4393{ 4394 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 4395} 4396 4397/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4398/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4399/// 4400/// \headerfile <x86intrin.h> 4401/// 4402/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4403/// instruction. 4404/// 4405/// \param __a 4406/// A 128-bit vector of [4 x i32]. \n 4407/// Bits [95:64] are written to bits [31:0] of the destination. 
\n 4408/// Bits [127:96] are written to bits [95:64] of the destination. 4409/// \param __b 4410/// A 128-bit vector of [4 x i32]. \n 4411/// Bits [95:64] are written to bits [64:32] of the destination. \n 4412/// Bits [127:96] are written to bits [127:96] of the destination. 4413/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4414static __inline__ __m128i __DEFAULT_FN_ATTRS 4415_mm_unpackhi_epi32(__m128i __a, __m128i __b) 4416{ 4417 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 4418} 4419 4420/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors 4421/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4422/// 4423/// \headerfile <x86intrin.h> 4424/// 4425/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4426/// instruction. 4427/// 4428/// \param __a 4429/// A 128-bit vector of [2 x i64]. \n 4430/// Bits [127:64] are written to bits [63:0] of the destination. 4431/// \param __b 4432/// A 128-bit vector of [2 x i64]. \n 4433/// Bits [127:64] are written to bits [127:64] of the destination. 4434/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4435static __inline__ __m128i __DEFAULT_FN_ATTRS 4436_mm_unpackhi_epi64(__m128i __a, __m128i __b) 4437{ 4438 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 4439} 4440 4441/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4442/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4443/// 4444/// \headerfile <x86intrin.h> 4445/// 4446/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4447/// instruction. 4448/// 4449/// \param __a 4450/// A 128-bit vector of [16 x i8]. \n 4451/// Bits [7:0] are written to bits [7:0] of the result. \n 4452/// Bits [15:8] are written to bits [23:16] of the result. \n 4453/// Bits [23:16] are written to bits [39:32] of the result. 
\n 4454/// Bits [31:24] are written to bits [55:48] of the result. \n 4455/// Bits [39:32] are written to bits [71:64] of the result. \n 4456/// Bits [47:40] are written to bits [87:80] of the result. \n 4457/// Bits [55:48] are written to bits [103:96] of the result. \n 4458/// Bits [63:56] are written to bits [119:112] of the result. 4459/// \param __b 4460/// A 128-bit vector of [16 x i8]. 4461/// Bits [7:0] are written to bits [15:8] of the result. \n 4462/// Bits [15:8] are written to bits [31:24] of the result. \n 4463/// Bits [23:16] are written to bits [47:40] of the result. \n 4464/// Bits [31:24] are written to bits [63:56] of the result. \n 4465/// Bits [39:32] are written to bits [79:72] of the result. \n 4466/// Bits [47:40] are written to bits [95:88] of the result. \n 4467/// Bits [55:48] are written to bits [111:104] of the result. \n 4468/// Bits [63:56] are written to bits [127:120] of the result. 4469/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4470static __inline__ __m128i __DEFAULT_FN_ATTRS 4471_mm_unpacklo_epi8(__m128i __a, __m128i __b) 4472{ 4473 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 4474} 4475 4476/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit 4477/// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4478/// [8 x i16]. 4479/// 4480/// \headerfile <x86intrin.h> 4481/// 4482/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4483/// instruction. 4484/// 4485/// \param __a 4486/// A 128-bit vector of [8 x i16]. 4487/// Bits [15:0] are written to bits [15:0] of the result. \n 4488/// Bits [31:16] are written to bits [47:32] of the result. \n 4489/// Bits [47:32] are written to bits [79:64] of the result. \n 4490/// Bits [63:48] are written to bits [111:96] of the result. 4491/// \param __b 4492/// A 128-bit vector of [8 x i16]. 
4493/// Bits [15:0] are written to bits [31:16] of the result. \n 4494/// Bits [31:16] are written to bits [63:48] of the result. \n 4495/// Bits [47:32] are written to bits [95:80] of the result. \n 4496/// Bits [63:48] are written to bits [127:112] of the result. 4497/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4498static __inline__ __m128i __DEFAULT_FN_ATTRS 4499_mm_unpacklo_epi16(__m128i __a, __m128i __b) 4500{ 4501 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 4502} 4503 4504/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4505/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4506/// 4507/// \headerfile <x86intrin.h> 4508/// 4509/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4510/// instruction. 4511/// 4512/// \param __a 4513/// A 128-bit vector of [4 x i32]. \n 4514/// Bits [31:0] are written to bits [31:0] of the destination. \n 4515/// Bits [63:32] are written to bits [95:64] of the destination. 4516/// \param __b 4517/// A 128-bit vector of [4 x i32]. \n 4518/// Bits [31:0] are written to bits [64:32] of the destination. \n 4519/// Bits [63:32] are written to bits [127:96] of the destination. 4520/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4521static __inline__ __m128i __DEFAULT_FN_ATTRS 4522_mm_unpacklo_epi32(__m128i __a, __m128i __b) 4523{ 4524 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 4525} 4526 4527/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of 4528/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4529/// 4530/// \headerfile <x86intrin.h> 4531/// 4532/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4533/// instruction. 4534/// 4535/// \param __a 4536/// A 128-bit vector of [2 x i64]. 
\n 4537/// Bits [63:0] are written to bits [63:0] of the destination. \n 4538/// \param __b 4539/// A 128-bit vector of [2 x i64]. \n 4540/// Bits [63:0] are written to bits [127:64] of the destination. \n 4541/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4542static __inline__ __m128i __DEFAULT_FN_ATTRS 4543_mm_unpacklo_epi64(__m128i __a, __m128i __b) 4544{ 4545 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); 4546} 4547 4548/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4549/// integer. 4550/// 4551/// \headerfile <x86intrin.h> 4552/// 4553/// This intrinsic has no corresponding instruction. 4554/// 4555/// \param __a 4556/// A 128-bit integer vector operand. The lower 64 bits are moved to the 4557/// destination. 4558/// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4559static __inline__ __m64 __DEFAULT_FN_ATTRS 4560_mm_movepi64_pi64(__m128i __a) 4561{ 4562 return (__m64)__a[0]; 4563} 4564 4565/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4566/// upper bits. 4567/// 4568/// \headerfile <x86intrin.h> 4569/// 4570/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction. 4571/// 4572/// \param __a 4573/// A 64-bit value. 4574/// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4575/// the operand. The upper 64 bits are assigned zeros. 4576static __inline__ __m128i __DEFAULT_FN_ATTRS 4577_mm_movpi64_epi64(__m64 __a) 4578{ 4579 return (__m128i){ (long long)__a, 0 }; 4580} 4581 4582/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4583/// integer vector, zeroing the upper bits. 4584/// 4585/// \headerfile <x86intrin.h> 4586/// 4587/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4588/// 4589/// \param __a 4590/// A 128-bit integer vector operand. The lower 64 bits are moved to the 4591/// destination. 
4592/// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4593/// the operand. The upper 64 bits are assigned zeros. 4594static __inline__ __m128i __DEFAULT_FN_ATTRS 4595_mm_move_epi64(__m128i __a) 4596{ 4597 return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); 4598} 4599 4600/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors 4601/// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4602/// double]. 4603/// 4604/// \headerfile <x86intrin.h> 4605/// 4606/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4607/// 4608/// \param __a 4609/// A 128-bit vector of [2 x double]. \n 4610/// Bits [127:64] are written to bits [63:0] of the destination. 4611/// \param __b 4612/// A 128-bit vector of [2 x double]. \n 4613/// Bits [127:64] are written to bits [127:64] of the destination. 4614/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4615static __inline__ __m128d __DEFAULT_FN_ATTRS 4616_mm_unpackhi_pd(__m128d __a, __m128d __b) 4617{ 4618 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); 4619} 4620 4621/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors 4622/// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4623/// double]. 4624/// 4625/// \headerfile <x86intrin.h> 4626/// 4627/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4628/// 4629/// \param __a 4630/// A 128-bit vector of [2 x double]. \n 4631/// Bits [63:0] are written to bits [63:0] of the destination. 4632/// \param __b 4633/// A 128-bit vector of [2 x double]. \n 4634/// Bits [63:0] are written to bits [127:64] of the destination. 4635/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 
4636static __inline__ __m128d __DEFAULT_FN_ATTRS 4637_mm_unpacklo_pd(__m128d __a, __m128d __b) 4638{ 4639 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); 4640} 4641 4642/// \brief Extracts the sign bits of the double-precision values in the 128-bit 4643/// vector of [2 x double], zero-extends the value, and writes it to the 4644/// low-order bits of the destination. 4645/// 4646/// \headerfile <x86intrin.h> 4647/// 4648/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4649/// 4650/// \param __a 4651/// A 128-bit vector of [2 x double] containing the values with sign bits to 4652/// be extracted. 4653/// \returns The sign bits from each of the double-precision elements in \a __a, 4654/// written to bits [1:0]. The remaining bits are assigned values of zero. 4655static __inline__ int __DEFAULT_FN_ATTRS 4656_mm_movemask_pd(__m128d __a) 4657{ 4658 return __builtin_ia32_movmskpd((__v2df)__a); 4659} 4660 4661 4662/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two 4663/// 128-bit vector parameters of [2 x double], using the immediate-value 4664/// parameter as a specifier. 4665/// 4666/// \headerfile <x86intrin.h> 4667/// 4668/// \code 4669/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4670/// \endcode 4671/// 4672/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4673/// 4674/// \param a 4675/// A 128-bit vector of [2 x double]. 4676/// \param b 4677/// A 128-bit vector of [2 x double]. 4678/// \param i 4679/// An 8-bit immediate value. The least significant two bits specify which 4680/// elements to copy from a and b: \n 4681/// Bit[0] = 0: lower element of a copied to lower element of result. \n 4682/// Bit[0] = 1: upper element of a copied to lower element of result. \n 4683/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n 4684/// Bit[1] = 1: upper element of \a b copied to upper element of result. 
\n 4685/// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4686#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 4687 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4688 0 + (((i) >> 0) & 0x1), \ 4689 2 + (((i) >> 1) & 0x1)); }) 4690 4691/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4692/// floating-point vector of [4 x float]. 4693/// 4694/// \headerfile <x86intrin.h> 4695/// 4696/// This intrinsic has no corresponding instruction. 4697/// 4698/// \param __a 4699/// A 128-bit floating-point vector of [2 x double]. 4700/// \returns A 128-bit floating-point vector of [4 x float] containing the same 4701/// bitwise pattern as the parameter. 4702static __inline__ __m128 __DEFAULT_FN_ATTRS 4703_mm_castpd_ps(__m128d __a) 4704{ 4705 return (__m128)__a; 4706} 4707 4708/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4709/// integer vector. 4710/// 4711/// \headerfile <x86intrin.h> 4712/// 4713/// This intrinsic has no corresponding instruction. 4714/// 4715/// \param __a 4716/// A 128-bit floating-point vector of [2 x double]. 4717/// \returns A 128-bit integer vector containing the same bitwise pattern as the 4718/// parameter. 4719static __inline__ __m128i __DEFAULT_FN_ATTRS 4720_mm_castpd_si128(__m128d __a) 4721{ 4722 return (__m128i)__a; 4723} 4724 4725/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4726/// floating-point vector of [2 x double]. 4727/// 4728/// \headerfile <x86intrin.h> 4729/// 4730/// This intrinsic has no corresponding instruction. 4731/// 4732/// \param __a 4733/// A 128-bit floating-point vector of [4 x float]. 4734/// \returns A 128-bit floating-point vector of [2 x double] containing the same 4735/// bitwise pattern as the parameter. 
4736static __inline__ __m128d __DEFAULT_FN_ATTRS 4737_mm_castps_pd(__m128 __a) 4738{ 4739 return (__m128d)__a; 4740} 4741 4742/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4743/// integer vector. 4744/// 4745/// \headerfile <x86intrin.h> 4746/// 4747/// This intrinsic has no corresponding instruction. 4748/// 4749/// \param __a 4750/// A 128-bit floating-point vector of [4 x float]. 4751/// \returns A 128-bit integer vector containing the same bitwise pattern as the 4752/// parameter. 4753static __inline__ __m128i __DEFAULT_FN_ATTRS 4754_mm_castps_si128(__m128 __a) 4755{ 4756 return (__m128i)__a; 4757} 4758 4759/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector 4760/// of [4 x float]. 4761/// 4762/// \headerfile <x86intrin.h> 4763/// 4764/// This intrinsic has no corresponding instruction. 4765/// 4766/// \param __a 4767/// A 128-bit integer vector. 4768/// \returns A 128-bit floating-point vector of [4 x float] containing the same 4769/// bitwise pattern as the parameter. 4770static __inline__ __m128 __DEFAULT_FN_ATTRS 4771_mm_castsi128_ps(__m128i __a) 4772{ 4773 return (__m128)__a; 4774} 4775 4776/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector 4777/// of [2 x double]. 4778/// 4779/// \headerfile <x86intrin.h> 4780/// 4781/// This intrinsic has no corresponding instruction. 4782/// 4783/// \param __a 4784/// A 128-bit integer vector. 4785/// \returns A 128-bit floating-point vector of [2 x double] containing the same 4786/// bitwise pattern as the parameter. 4787static __inline__ __m128d __DEFAULT_FN_ATTRS 4788_mm_castsi128_pd(__m128i __a) 4789{ 4790 return (__m128d)__a; 4791} 4792 4793#if defined(__cplusplus) 4794extern "C" { 4795#endif 4796 4797/// \brief Indicates that a spin loop is being executed for the purposes of 4798/// optimizing power consumption during the loop. 
4799/// 4800/// \headerfile <x86intrin.h> 4801/// 4802/// This intrinsic corresponds to the <c> PAUSE </c> instruction. 4803/// 4804void _mm_pause(void); 4805 4806#if defined(__cplusplus) 4807} // extern "C" 4808#endif 4809#undef __DEFAULT_FN_ATTRS 4810 4811#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4812 4813#define _MM_DENORMALS_ZERO_ON (0x0040) 4814#define _MM_DENORMALS_ZERO_OFF (0x0000) 4815 4816#define _MM_DENORMALS_ZERO_MASK (0x0040) 4817 4818#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4819#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4820 4821#endif /* __EMMINTRIN_H */ 4822