1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __EMMINTRIN_H 25#define __EMMINTRIN_H 26 27#include <xmmintrin.h> 28 29typedef double __m128d __attribute__((__vector_size__(16))); 30typedef long long __m128i __attribute__((__vector_size__(16))); 31 32/* Type defines. 
*/ 33typedef double __v2df __attribute__ ((__vector_size__ (16))); 34typedef long long __v2di __attribute__ ((__vector_size__ (16))); 35typedef short __v8hi __attribute__((__vector_size__(16))); 36typedef char __v16qi __attribute__((__vector_size__(16))); 37 38/* Unsigned types */ 39typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); 40typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 41typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 42 43/* We need an explicitly signed variant for char. Note that this shouldn't 44 * appear in the interface though. */ 45typedef signed char __v16qs __attribute__((__vector_size__(16))); 46 47#include <f16cintrin.h> 48 49/* Define the default attributes for the functions in this file. */ 50#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 51 52/// \brief Adds lower double-precision values in both operands and returns the 53/// sum in the lower 64 bits of the result. The upper 64 bits of the result 54/// are copied from the upper double-precision value of the first operand. 55/// 56/// \headerfile <x86intrin.h> 57/// 58/// This intrinsic corresponds to the \c VADDSD / ADDSD instruction. 59/// 60/// \param __a 61/// A 128-bit vector of [2 x double] containing one of the source operands. 62/// \param __b 63/// A 128-bit vector of [2 x double] containing one of the source operands. 64/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 65/// sum of the lower 64 bits of both operands. The upper 64 bits are copied 66/// from the upper 64 bits of the first source operand. 67static __inline__ __m128d __DEFAULT_FN_ATTRS 68_mm_add_sd(__m128d __a, __m128d __b) 69{ 70 __a[0] += __b[0]; 71 return __a; 72} 73 74/// \brief Adds two 128-bit vectors of [2 x double]. 75/// 76/// \headerfile <x86intrin.h> 77/// 78/// This intrinsic corresponds to the \c VADDPD / ADDPD instruction. 
79/// 80/// \param __a 81/// A 128-bit vector of [2 x double] containing one of the source operands. 82/// \param __b 83/// A 128-bit vector of [2 x double] containing one of the source operands. 84/// \returns A 128-bit vector of [2 x double] containing the sums of both 85/// operands. 86static __inline__ __m128d __DEFAULT_FN_ATTRS 87_mm_add_pd(__m128d __a, __m128d __b) 88{ 89 return (__m128d)((__v2df)__a + (__v2df)__b); 90} 91 92/// \brief Subtracts the lower double-precision value of the second operand 93/// from the lower double-precision value of the first operand and returns 94/// the difference in the lower 64 bits of the result. The upper 64 bits of 95/// the result are copied from the upper double-precision value of the first 96/// operand. 97/// 98/// \headerfile <x86intrin.h> 99/// 100/// This intrinsic corresponds to the \c VSUBSD / SUBSD instruction. 101/// 102/// \param __a 103/// A 128-bit vector of [2 x double] containing the minuend. 104/// \param __b 105/// A 128-bit vector of [2 x double] containing the subtrahend. 106/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 107/// difference of the lower 64 bits of both operands. The upper 64 bits are 108/// copied from the upper 64 bits of the first source operand. 109static __inline__ __m128d __DEFAULT_FN_ATTRS 110_mm_sub_sd(__m128d __a, __m128d __b) 111{ 112 __a[0] -= __b[0]; 113 return __a; 114} 115 116/// \brief Subtracts two 128-bit vectors of [2 x double]. 117/// 118/// \headerfile <x86intrin.h> 119/// 120/// This intrinsic corresponds to the \c VSUBPD / SUBPD instruction. 121/// 122/// \param __a 123/// A 128-bit vector of [2 x double] containing the minuend. 124/// \param __b 125/// A 128-bit vector of [2 x double] containing the subtrahend. 126/// \returns A 128-bit vector of [2 x double] containing the differences between 127/// both operands. 
128static __inline__ __m128d __DEFAULT_FN_ATTRS 129_mm_sub_pd(__m128d __a, __m128d __b) 130{ 131 return (__m128d)((__v2df)__a - (__v2df)__b); 132} 133 134/// \brief Multiplies lower double-precision values in both operands and returns 135/// the product in the lower 64 bits of the result. The upper 64 bits of the 136/// result are copied from the upper double-precision value of the first 137/// operand. 138/// 139/// \headerfile <x86intrin.h> 140/// 141/// This intrinsic corresponds to the \c VMULSD / MULSD instruction. 142/// 143/// \param __a 144/// A 128-bit vector of [2 x double] containing one of the source operands. 145/// \param __b 146/// A 128-bit vector of [2 x double] containing one of the source operands. 147/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 148/// product of the lower 64 bits of both operands. The upper 64 bits are 149/// copied from the upper 64 bits of the first source operand. 150static __inline__ __m128d __DEFAULT_FN_ATTRS 151_mm_mul_sd(__m128d __a, __m128d __b) 152{ 153 __a[0] *= __b[0]; 154 return __a; 155} 156 157/// \brief Multiplies two 128-bit vectors of [2 x double]. 158/// 159/// \headerfile <x86intrin.h> 160/// 161/// This intrinsic corresponds to the \c VMULPD / MULPD instruction. 162/// 163/// \param __a 164/// A 128-bit vector of [2 x double] containing one of the operands. 165/// \param __b 166/// A 128-bit vector of [2 x double] containing one of the operands. 167/// \returns A 128-bit vector of [2 x double] containing the products of both 168/// operands. 169static __inline__ __m128d __DEFAULT_FN_ATTRS 170_mm_mul_pd(__m128d __a, __m128d __b) 171{ 172 return (__m128d)((__v2df)__a * (__v2df)__b); 173} 174 175/// \brief Divides the lower double-precision value of the first operand by the 176/// lower double-precision value of the second operand and returns the 177/// quotient in the lower 64 bits of the result. 
The upper 64 bits of the 178/// result are copied from the upper double-precision value of the first 179/// operand. 180/// 181/// \headerfile <x86intrin.h> 182/// 183/// This intrinsic corresponds to the \c VDIVSD / DIVSD instruction. 184/// 185/// \param __a 186/// A 128-bit vector of [2 x double] containing the dividend. 187/// \param __b 188/// A 128-bit vector of [2 x double] containing divisor. 189/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 190/// quotient of the lower 64 bits of both operands. The upper 64 bits are 191/// copied from the upper 64 bits of the first source operand. 192static __inline__ __m128d __DEFAULT_FN_ATTRS 193_mm_div_sd(__m128d __a, __m128d __b) 194{ 195 __a[0] /= __b[0]; 196 return __a; 197} 198 199/// \brief Performs an element-by-element division of two 128-bit vectors of 200/// [2 x double]. 201/// 202/// \headerfile <x86intrin.h> 203/// 204/// This intrinsic corresponds to the \c VDIVPD / DIVPD instruction. 205/// 206/// \param __a 207/// A 128-bit vector of [2 x double] containing the dividend. 208/// \param __b 209/// A 128-bit vector of [2 x double] containing the divisor. 210/// \returns A 128-bit vector of [2 x double] containing the quotients of both 211/// operands. 212static __inline__ __m128d __DEFAULT_FN_ATTRS 213_mm_div_pd(__m128d __a, __m128d __b) 214{ 215 return (__m128d)((__v2df)__a / (__v2df)__b); 216} 217 218/// \brief Calculates the square root of the lower double-precision value of 219/// the second operand and returns it in the lower 64 bits of the result. 220/// The upper 64 bits of the result are copied from the upper double- 221/// precision value of the first operand. 222/// 223/// \headerfile <x86intrin.h> 224/// 225/// This intrinsic corresponds to the \c VSQRTSD / SQRTSD instruction. 226/// 227/// \param __a 228/// A 128-bit vector of [2 x double] containing one of the operands. The 229/// upper 64 bits of this operand are copied to the upper 64 bits of the 230/// result. 
231/// \param __b 232/// A 128-bit vector of [2 x double] containing one of the operands. The 233/// square root is calculated using the lower 64 bits of this operand. 234/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 235/// square root of the lower 64 bits of operand __b, and whose upper 64 bits 236/// are copied from the upper 64 bits of operand __a. 237static __inline__ __m128d __DEFAULT_FN_ATTRS 238_mm_sqrt_sd(__m128d __a, __m128d __b) 239{ 240 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 241 return (__m128d) { __c[0], __a[1] }; 242} 243 244/// \brief Calculates the square root of the each of two values stored in a 245/// 128-bit vector of [2 x double]. 246/// 247/// \headerfile <x86intrin.h> 248/// 249/// This intrinsic corresponds to the \c VSQRTPD / SQRTPD instruction. 250/// 251/// \param __a 252/// A 128-bit vector of [2 x double]. 253/// \returns A 128-bit vector of [2 x double] containing the square roots of the 254/// values in the operand. 255static __inline__ __m128d __DEFAULT_FN_ATTRS 256_mm_sqrt_pd(__m128d __a) 257{ 258 return __builtin_ia32_sqrtpd((__v2df)__a); 259} 260 261/// \brief Compares lower 64-bit double-precision values of both operands, and 262/// returns the lesser of the pair of values in the lower 64-bits of the 263/// result. The upper 64 bits of the result are copied from the upper double- 264/// precision value of the first operand. 265/// 266/// \headerfile <x86intrin.h> 267/// 268/// This intrinsic corresponds to the \c VMINSD / MINSD instruction. 269/// 270/// \param __a 271/// A 128-bit vector of [2 x double] containing one of the operands. The 272/// lower 64 bits of this operand are used in the comparison. 273/// \param __b 274/// A 128-bit vector of [2 x double] containing one of the operands. The 275/// lower 64 bits of this operand are used in the comparison. 276/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 277/// minimum value between both operands. 
The upper 64 bits are copied from 278/// the upper 64 bits of the first source operand. 279static __inline__ __m128d __DEFAULT_FN_ATTRS 280_mm_min_sd(__m128d __a, __m128d __b) 281{ 282 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 283} 284 285/// \brief Performs element-by-element comparison of the two 128-bit vectors of 286/// [2 x double] and returns the vector containing the lesser of each pair of 287/// values. 288/// 289/// \headerfile <x86intrin.h> 290/// 291/// This intrinsic corresponds to the \c VMINPD / MINPD instruction. 292/// 293/// \param __a 294/// A 128-bit vector of [2 x double] containing one of the operands. 295/// \param __b 296/// A 128-bit vector of [2 x double] containing one of the operands. 297/// \returns A 128-bit vector of [2 x double] containing the minimum values 298/// between both operands. 299static __inline__ __m128d __DEFAULT_FN_ATTRS 300_mm_min_pd(__m128d __a, __m128d __b) 301{ 302 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 303} 304 305/// \brief Compares lower 64-bits double-precision values of both operands, and 306/// returns the greater of the pair of values in the lower 64-bits of the 307/// result. The upper 64 bits of the result are copied from the upper double- 308/// precision value of the first operand. 309/// 310/// \headerfile <x86intrin.h> 311/// 312/// This intrinsic corresponds to the \c VMAXSD / MAXSD instruction. 313/// 314/// \param __a 315/// A 128-bit vector of [2 x double] containing one of the operands. The 316/// lower 64 bits of this operand are used in the comparison. 317/// \param __b 318/// A 128-bit vector of [2 x double] containing one of the operands. The 319/// lower 64 bits of this operand are used in the comparison. 320/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 321/// maximum value between both operands. The upper 64 bits are copied from 322/// the upper 64 bits of the first source operand. 
323static __inline__ __m128d __DEFAULT_FN_ATTRS 324_mm_max_sd(__m128d __a, __m128d __b) 325{ 326 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 327} 328 329/// \brief Performs element-by-element comparison of the two 128-bit vectors of 330/// [2 x double] and returns the vector containing the greater of each pair 331/// of values. 332/// 333/// \headerfile <x86intrin.h> 334/// 335/// This intrinsic corresponds to the \c VMAXPD / MAXPD instruction. 336/// 337/// \param __a 338/// A 128-bit vector of [2 x double] containing one of the operands. 339/// \param __b 340/// A 128-bit vector of [2 x double] containing one of the operands. 341/// \returns A 128-bit vector of [2 x double] containing the maximum values 342/// between both operands. 343static __inline__ __m128d __DEFAULT_FN_ATTRS 344_mm_max_pd(__m128d __a, __m128d __b) 345{ 346 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 347} 348 349/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double]. 350/// 351/// \headerfile <x86intrin.h> 352/// 353/// This intrinsic corresponds to the \c VPAND / PAND instruction. 354/// 355/// \param __a 356/// A 128-bit vector of [2 x double] containing one of the source operands. 357/// \param __b 358/// A 128-bit vector of [2 x double] containing one of the source operands. 359/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 360/// values between both operands. 361static __inline__ __m128d __DEFAULT_FN_ATTRS 362_mm_and_pd(__m128d __a, __m128d __b) 363{ 364 return (__m128d)((__v2du)__a & (__v2du)__b); 365} 366 367/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using 368/// the one's complement of the values contained in the first source operand. 369/// 370/// \headerfile <x86intrin.h> 371/// 372/// This intrinsic corresponds to the \c VPANDN / PANDN instruction. 373/// 374/// \param __a 375/// A 128-bit vector of [2 x double] containing the left source operand. 
The 376/// one's complement of this value is used in the bitwise AND. 377/// \param __b 378/// A 128-bit vector of [2 x double] containing the right source operand. 379/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 380/// values in the second operand and the one's complement of the first 381/// operand. 382static __inline__ __m128d __DEFAULT_FN_ATTRS 383_mm_andnot_pd(__m128d __a, __m128d __b) 384{ 385 return (__m128d)(~(__v2du)__a & (__v2du)__b); 386} 387 388/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double]. 389/// 390/// \headerfile <x86intrin.h> 391/// 392/// This intrinsic corresponds to the \c VPOR / POR instruction. 393/// 394/// \param __a 395/// A 128-bit vector of [2 x double] containing one of the source operands. 396/// \param __b 397/// A 128-bit vector of [2 x double] containing one of the source operands. 398/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 399/// values between both operands. 400static __inline__ __m128d __DEFAULT_FN_ATTRS 401_mm_or_pd(__m128d __a, __m128d __b) 402{ 403 return (__m128d)((__v2du)__a | (__v2du)__b); 404} 405 406/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 407/// 408/// \headerfile <x86intrin.h> 409/// 410/// This intrinsic corresponds to the \c VPXOR / PXOR instruction. 411/// 412/// \param __a 413/// A 128-bit vector of [2 x double] containing one of the source operands. 414/// \param __b 415/// A 128-bit vector of [2 x double] containing one of the source operands. 416/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 417/// values between both operands. 418static __inline__ __m128d __DEFAULT_FN_ATTRS 419_mm_xor_pd(__m128d __a, __m128d __b) 420{ 421 return (__m128d)((__v2du)__a ^ (__v2du)__b); 422} 423 424/// \brief Compares each of the corresponding double-precision values of the 425/// 128-bit vectors of [2 x double] for equality. 
Each comparison yields 0h 426/// for false, FFFFFFFFFFFFFFFFh for true. 427/// 428/// \headerfile <x86intrin.h> 429/// 430/// This intrinsic corresponds to the \c VCMPEQPD / CMPEQPD instruction. 431/// 432/// \param __a 433/// A 128-bit vector of [2 x double]. 434/// \param __b 435/// A 128-bit vector of [2 x double]. 436/// \returns A 128-bit vector containing the comparison results. 437static __inline__ __m128d __DEFAULT_FN_ATTRS 438_mm_cmpeq_pd(__m128d __a, __m128d __b) 439{ 440 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 441} 442 443/// \brief Compares each of the corresponding double-precision values of the 444/// 128-bit vectors of [2 x double] to determine if the values in the first 445/// operand are less than those in the second operand. Each comparison 446/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 447/// 448/// \headerfile <x86intrin.h> 449/// 450/// This intrinsic corresponds to the \c VCMPLTPD / CMPLTPD instruction. 451/// 452/// \param __a 453/// A 128-bit vector of [2 x double]. 454/// \param __b 455/// A 128-bit vector of [2 x double]. 456/// \returns A 128-bit vector containing the comparison results. 457static __inline__ __m128d __DEFAULT_FN_ATTRS 458_mm_cmplt_pd(__m128d __a, __m128d __b) 459{ 460 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 461} 462 463/// \brief Compares each of the corresponding double-precision values of the 464/// 128-bit vectors of [2 x double] to determine if the values in the first 465/// operand are less than or equal to those in the second operand. Each 466/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 467/// 468/// \headerfile <x86intrin.h> 469/// 470/// This intrinsic corresponds to the \c VCMPLEPD / CMPLEPD instruction. 471/// 472/// \param __a 473/// A 128-bit vector of [2 x double]. 474/// \param __b 475/// A 128-bit vector of [2 x double]. 476/// \returns A 128-bit vector containing the comparison results. 
477static __inline__ __m128d __DEFAULT_FN_ATTRS 478_mm_cmple_pd(__m128d __a, __m128d __b) 479{ 480 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 481} 482 483/// \brief Compares each of the corresponding double-precision values of the 484/// 128-bit vectors of [2 x double] to determine if the values in the first 485/// operand are greater than those in the second operand. Each comparison 486/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 487/// 488/// \headerfile <x86intrin.h> 489/// 490/// This intrinsic corresponds to the \c VCMPLTPD / CMPLTPD instruction. 491/// 492/// \param __a 493/// A 128-bit vector of [2 x double]. 494/// \param __b 495/// A 128-bit vector of [2 x double]. 496/// \returns A 128-bit vector containing the comparison results. 497static __inline__ __m128d __DEFAULT_FN_ATTRS 498_mm_cmpgt_pd(__m128d __a, __m128d __b) 499{ 500 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 501} 502 503/// \brief Compares each of the corresponding double-precision values of the 504/// 128-bit vectors of [2 x double] to determine if the values in the first 505/// operand are greater than or equal to those in the second operand. Each 506/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 507/// 508/// \headerfile <x86intrin.h> 509/// 510/// This intrinsic corresponds to the \c VCMPLEPD / CMPLEPD instruction. 511/// 512/// \param __a 513/// A 128-bit vector of [2 x double]. 514/// \param __b 515/// A 128-bit vector of [2 x double]. 516/// \returns A 128-bit vector containing the comparison results. 517static __inline__ __m128d __DEFAULT_FN_ATTRS 518_mm_cmpge_pd(__m128d __a, __m128d __b) 519{ 520 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 521} 522 523/// \brief Compares each of the corresponding double-precision values of the 524/// 128-bit vectors of [2 x double] to determine if the values in the first 525/// operand are ordered with respect to those in the second operand. 
A pair 526/// of double-precision values are "ordered" with respect to each other if 527/// neither value is a NaN. Each comparison yields 0h for false, 528/// FFFFFFFFFFFFFFFFh for true. 529/// 530/// \headerfile <x86intrin.h> 531/// 532/// This intrinsic corresponds to the \c VCMPORDPD / CMPORDPD instruction. 533/// 534/// \param __a 535/// A 128-bit vector of [2 x double]. 536/// \param __b 537/// A 128-bit vector of [2 x double]. 538/// \returns A 128-bit vector containing the comparison results. 539static __inline__ __m128d __DEFAULT_FN_ATTRS 540_mm_cmpord_pd(__m128d __a, __m128d __b) 541{ 542 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 543} 544 545/// \brief Compares each of the corresponding double-precision values of the 546/// 128-bit vectors of [2 x double] to determine if the values in the first 547/// operand are unordered with respect to those in the second operand. A pair 548/// of double-precision values are "unordered" with respect to each other if 549/// one or both values are NaN. Each comparison yields 0h for false, 550/// FFFFFFFFFFFFFFFFh for true. 551/// 552/// \headerfile <x86intrin.h> 553/// 554/// This intrinsic corresponds to the \c VCMPUNORDPD / CMPUNORDPD instruction. 555/// 556/// \param __a 557/// A 128-bit vector of [2 x double]. 558/// \param __b 559/// A 128-bit vector of [2 x double]. 560/// \returns A 128-bit vector containing the comparison results. 561static __inline__ __m128d __DEFAULT_FN_ATTRS 562_mm_cmpunord_pd(__m128d __a, __m128d __b) 563{ 564 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 565} 566 567/// \brief Compares each of the corresponding double-precision values of the 568/// 128-bit vectors of [2 x double] to determine if the values in the first 569/// operand are unequal to those in the second operand. Each comparison 570/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 
571/// 572/// \headerfile <x86intrin.h> 573/// 574/// This intrinsic corresponds to the \c VCMPNEQPD / CMPNEQPD instruction. 575/// 576/// \param __a 577/// A 128-bit vector of [2 x double]. 578/// \param __b 579/// A 128-bit vector of [2 x double]. 580/// \returns A 128-bit vector containing the comparison results. 581static __inline__ __m128d __DEFAULT_FN_ATTRS 582_mm_cmpneq_pd(__m128d __a, __m128d __b) 583{ 584 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 585} 586 587/// \brief Compares each of the corresponding double-precision values of the 588/// 128-bit vectors of [2 x double] to determine if the values in the first 589/// operand are not less than those in the second operand. Each comparison 590/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. 591/// 592/// \headerfile <x86intrin.h> 593/// 594/// This intrinsic corresponds to the \c VCMPNLTPD / CMPNLTPD instruction. 595/// 596/// \param __a 597/// A 128-bit vector of [2 x double]. 598/// \param __b 599/// A 128-bit vector of [2 x double]. 600/// \returns A 128-bit vector containing the comparison results. 601static __inline__ __m128d __DEFAULT_FN_ATTRS 602_mm_cmpnlt_pd(__m128d __a, __m128d __b) 603{ 604 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 605} 606 607/// \brief Compares each of the corresponding double-precision values of the 608/// 128-bit vectors of [2 x double] to determine if the values in the first 609/// operand are not less than or equal to those in the second operand. Each 610/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 611/// 612/// \headerfile <x86intrin.h> 613/// 614/// This intrinsic corresponds to the \c VCMPNLEPD / CMPNLEPD instruction. 615/// 616/// \param __a 617/// A 128-bit vector of [2 x double]. 618/// \param __b 619/// A 128-bit vector of [2 x double]. 620/// \returns A 128-bit vector containing the comparison results. 
621static __inline__ __m128d __DEFAULT_FN_ATTRS 622_mm_cmpnle_pd(__m128d __a, __m128d __b) 623{ 624 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 625} 626 627/// \brief Compares each of the corresponding double-precision values of the 628/// 128-bit vectors of [2 x double] to determine if the values in the first 629/// operand are not greater than those in the second operand. Each 630/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 631/// 632/// \headerfile <x86intrin.h> 633/// 634/// This intrinsic corresponds to the \c VCMPNLTPD / CMPNLTPD instruction. 635/// 636/// \param __a 637/// A 128-bit vector of [2 x double]. 638/// \param __b 639/// A 128-bit vector of [2 x double]. 640/// \returns A 128-bit vector containing the comparison results. 641static __inline__ __m128d __DEFAULT_FN_ATTRS 642_mm_cmpngt_pd(__m128d __a, __m128d __b) 643{ 644 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 645} 646 647/// \brief Compares each of the corresponding double-precision values of the 648/// 128-bit vectors of [2 x double] to determine if the values in the first 649/// operand are not greater than or equal to those in the second operand. 650/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 651/// 652/// \headerfile <x86intrin.h> 653/// 654/// This intrinsic corresponds to the \c VCMPNLEPD / CMPNLEPD instruction. 655/// 656/// \param __a 657/// A 128-bit vector of [2 x double]. 658/// \param __b 659/// A 128-bit vector of [2 x double]. 660/// \returns A 128-bit vector containing the comparison results. 661static __inline__ __m128d __DEFAULT_FN_ATTRS 662_mm_cmpnge_pd(__m128d __a, __m128d __b) 663{ 664 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 665} 666 667/// \brief Compares the lower double-precision floating-point values in each of 668/// the two 128-bit floating-point vectors of [2 x double] for equality. The 669/// comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. 
670/// 671/// \headerfile <x86intrin.h> 672/// 673/// This intrinsic corresponds to the \c VCMPEQSD / CMPEQSD instruction. 674/// 675/// \param __a 676/// A 128-bit vector of [2 x double]. The lower double-precision value is 677/// compared to the lower double-precision value of __b. 678/// \param __b 679/// A 128-bit vector of [2 x double]. The lower double-precision value is 680/// compared to the lower double-precision value of __a. 681/// \returns A 128-bit vector. The lower 64 bits contains the comparison 682/// results. The upper 64 bits are copied from the upper 64 bits of __a. 683static __inline__ __m128d __DEFAULT_FN_ATTRS 684_mm_cmpeq_sd(__m128d __a, __m128d __b) 685{ 686 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 687} 688 689/// \brief Compares the lower double-precision floating-point values in each of 690/// the two 128-bit floating-point vectors of [2 x double] to determine if 691/// the value in the first parameter is less than the corresponding value in 692/// the second parameter. The comparison yields 0h for false, 693/// FFFFFFFFFFFFFFFFh for true. 694/// 695/// \headerfile <x86intrin.h> 696/// 697/// This intrinsic corresponds to the \c VCMPLTSD / CMPLTSD instruction. 698/// 699/// \param __a 700/// A 128-bit vector of [2 x double]. The lower double-precision value is 701/// compared to the lower double-precision value of __b. 702/// \param __b 703/// A 128-bit vector of [2 x double]. The lower double-precision value is 704/// compared to the lower double-precision value of __a. 705/// \returns A 128-bit vector. The lower 64 bits contains the comparison 706/// results. The upper 64 bits are copied from the upper 64 bits of __a. 
707static __inline__ __m128d __DEFAULT_FN_ATTRS 708_mm_cmplt_sd(__m128d __a, __m128d __b) 709{ 710 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 711} 712 713/// \brief Compares the lower double-precision floating-point values in each of 714/// the two 128-bit floating-point vectors of [2 x double] to determine if 715/// the value in the first parameter is less than or equal to the 716/// corresponding value in the second parameter. The comparison yields 0h for 717/// false, FFFFFFFFFFFFFFFFh for true. 718/// 719/// \headerfile <x86intrin.h> 720/// 721/// This intrinsic corresponds to the \c VCMPLESD / CMPLESD instruction. 722/// 723/// \param __a 724/// A 128-bit vector of [2 x double]. The lower double-precision value is 725/// compared to the lower double-precision value of __b. 726/// \param __b 727/// A 128-bit vector of [2 x double]. The lower double-precision value is 728/// compared to the lower double-precision value of __a. 729/// \returns A 128-bit vector. The lower 64 bits contains the comparison 730/// results. The upper 64 bits are copied from the upper 64 bits of __a. 731static __inline__ __m128d __DEFAULT_FN_ATTRS 732_mm_cmple_sd(__m128d __a, __m128d __b) 733{ 734 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 735} 736 737/// \brief Compares the lower double-precision floating-point values in each 738/// of the two 128-bit floating-point vectors of [2 x double] to determine 739/// if the value in the first parameter is greater than the corresponding 740/// value in the second parameter. The comparison yields 0h for false, 741/// FFFFFFFFFFFFFFFFh for true. 742/// 743/// \headerfile <x86intrin.h> 744/// 745/// This intrinsic corresponds to the \c VCMPLTSD / CMPLTSD instruction. 746/// 747/// \param __a 748/// A 128-bit vector of [2 x double]. The lower double-precision value is 749/// compared to the lower double-precision value of __b. 750/// \param __b 751/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 752/// compared to the lower double-precision value of __a. 753/// \returns A 128-bit vector. The lower 64 bits contains the comparison 754/// results. The upper 64 bits are copied from the upper 64 bits of __a. 755static __inline__ __m128d __DEFAULT_FN_ATTRS 756_mm_cmpgt_sd(__m128d __a, __m128d __b) 757{ 758 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 759 return (__m128d) { __c[0], __a[1] }; 760} 761 762/// \brief Compares the lower double-precision floating-point values in each of 763/// the two 128-bit floating-point vectors of [2 x double] to determine if 764/// the value in the first parameter is greater than or equal to the 765/// corresponding value in the second parameter. The comparison yields 0h for 766/// false, FFFFFFFFFFFFFFFFh for true. 767/// 768/// \headerfile <x86intrin.h> 769/// 770/// This intrinsic corresponds to the \c VCMPLESD / CMPLESD instruction. 771/// 772/// \param __a 773/// A 128-bit vector of [2 x double]. The lower double-precision value is 774/// compared to the lower double-precision value of __b. 775/// \param __b 776/// A 128-bit vector of [2 x double]. The lower double-precision value is 777/// compared to the lower double-precision value of __a. 778/// \returns A 128-bit vector. The lower 64 bits contains the comparison 779/// results. The upper 64 bits are copied from the upper 64 bits of __a. 780static __inline__ __m128d __DEFAULT_FN_ATTRS 781_mm_cmpge_sd(__m128d __a, __m128d __b) 782{ 783 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 784 return (__m128d) { __c[0], __a[1] }; 785} 786 787/// \brief Compares the lower double-precision floating-point values in each 788/// of the two 128-bit floating-point vectors of [2 x double] to determine 789/// if the value in the first parameter is "ordered" with respect to the 790/// corresponding value in the second parameter. The comparison yields 0h for 791/// false, FFFFFFFFFFFFFFFFh for true. 
A pair of double-precision values are 792/// "ordered" with respect to each other if neither value is a NaN. 793/// 794/// \headerfile <x86intrin.h> 795/// 796/// This intrinsic corresponds to the \c VCMPORDSD / CMPORDSD instruction. 797/// 798/// \param __a 799/// A 128-bit vector of [2 x double]. The lower double-precision value is 800/// compared to the lower double-precision value of __b. 801/// \param __b 802/// A 128-bit vector of [2 x double]. The lower double-precision value is 803/// compared to the lower double-precision value of __a. 804/// \returns A 128-bit vector. The lower 64 bits contains the comparison 805/// results. The upper 64 bits are copied from the upper 64 bits of __a. 806static __inline__ __m128d __DEFAULT_FN_ATTRS 807_mm_cmpord_sd(__m128d __a, __m128d __b) 808{ 809 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 810} 811 812/// \brief Compares the lower double-precision floating-point values in each 813/// of the two 128-bit floating-point vectors of [2 x double] to determine 814/// if the value in the first parameter is "unordered" with respect to the 815/// corresponding value in the second parameter. The comparison yields 0h 816/// for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values 817/// are "unordered" with respect to each other if one or both values are NaN. 818/// 819/// \headerfile <x86intrin.h> 820/// 821/// This intrinsic corresponds to the \c VCMPUNORDSD / CMPUNORDSD instruction. 822/// 823/// \param __a 824/// A 128-bit vector of [2 x double]. The lower double-precision value is 825/// compared to the lower double-precision value of __b. 826/// \param __b 827/// A 128-bit vector of [2 x double]. The lower double-precision value is 828/// compared to the lower double-precision value of __a. 829/// \returns A 128-bit vector. The lower 64 bits contains the comparison 830/// results. The upper 64 bits are copied from the upper 64 bits of __a. 
831static __inline__ __m128d __DEFAULT_FN_ATTRS 832_mm_cmpunord_sd(__m128d __a, __m128d __b) 833{ 834 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 835} 836 837/// \brief Compares the lower double-precision floating-point values in each of 838/// the two 128-bit floating-point vectors of [2 x double] to determine if 839/// the value in the first parameter is unequal to the corresponding value in 840/// the second parameter. The comparison yields 0h for false, 841/// FFFFFFFFFFFFFFFFh for true. 842/// 843/// \headerfile <x86intrin.h> 844/// 845/// This intrinsic corresponds to the \c VCMPNEQSD / CMPNEQSD instruction. 846/// 847/// \param __a 848/// A 128-bit vector of [2 x double]. The lower double-precision value is 849/// compared to the lower double-precision value of __b. 850/// \param __b 851/// A 128-bit vector of [2 x double]. The lower double-precision value is 852/// compared to the lower double-precision value of __a. 853/// \returns A 128-bit vector. The lower 64 bits contains the comparison 854/// results. The upper 64 bits are copied from the upper 64 bits of __a. 855static __inline__ __m128d __DEFAULT_FN_ATTRS 856_mm_cmpneq_sd(__m128d __a, __m128d __b) 857{ 858 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 859} 860 861/// \brief Compares the lower double-precision floating-point values in each of 862/// the two 128-bit floating-point vectors of [2 x double] to determine if 863/// the value in the first parameter is not less than the corresponding 864/// value in the second parameter. The comparison yields 0h for false, 865/// FFFFFFFFFFFFFFFFh for true. 866/// 867/// \headerfile <x86intrin.h> 868/// 869/// This intrinsic corresponds to the \c VCMPNLTSD / CMPNLTSD instruction. 870/// 871/// \param __a 872/// A 128-bit vector of [2 x double]. The lower double-precision value is 873/// compared to the lower double-precision value of __b. 874/// \param __b 875/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 876/// compared to the lower double-precision value of __a. 877/// \returns A 128-bit vector. The lower 64 bits contains the comparison 878/// results. The upper 64 bits are copied from the upper 64 bits of __a. 879static __inline__ __m128d __DEFAULT_FN_ATTRS 880_mm_cmpnlt_sd(__m128d __a, __m128d __b) 881{ 882 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 883} 884 885/// \brief Compares the lower double-precision floating-point values in each of 886/// the two 128-bit floating-point vectors of [2 x double] to determine if 887/// the value in the first parameter is not less than or equal to the 888/// corresponding value in the second parameter. The comparison yields 0h 889/// for false, FFFFFFFFFFFFFFFFh for true. 890/// 891/// \headerfile <x86intrin.h> 892/// 893/// This intrinsic corresponds to the \c VCMPNLESD / CMPNLESD instruction. 894/// 895/// \param __a 896/// A 128-bit vector of [2 x double]. The lower double-precision value is 897/// compared to the lower double-precision value of __b. 898/// \param __b 899/// A 128-bit vector of [2 x double]. The lower double-precision value is 900/// compared to the lower double-precision value of __a. 901/// \returns A 128-bit vector. The lower 64 bits contains the comparison 902/// results. The upper 64 bits are copied from the upper 64 bits of __a. 903static __inline__ __m128d __DEFAULT_FN_ATTRS 904_mm_cmpnle_sd(__m128d __a, __m128d __b) 905{ 906 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 907} 908 909/// \brief Compares the lower double-precision floating-point values in each of 910/// the two 128-bit floating-point vectors of [2 x double] to determine if 911/// the value in the first parameter is not greater than the corresponding 912/// value in the second parameter. The comparison yields 0h for false, 913/// FFFFFFFFFFFFFFFFh for true. 
914/// 915/// \headerfile <x86intrin.h> 916/// 917/// This intrinsic corresponds to the \c VCMPNLTSD / CMPNLTSD instruction. 918/// 919/// \param __a 920/// A 128-bit vector of [2 x double]. The lower double-precision value is 921/// compared to the lower double-precision value of __b. 922/// \param __b 923/// A 128-bit vector of [2 x double]. The lower double-precision value is 924/// compared to the lower double-precision value of __a. 925/// \returns A 128-bit vector. The lower 64 bits contains the comparison 926/// results. The upper 64 bits are copied from the upper 64 bits of __a. 927static __inline__ __m128d __DEFAULT_FN_ATTRS 928_mm_cmpngt_sd(__m128d __a, __m128d __b) 929{ 930 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 931 return (__m128d) { __c[0], __a[1] }; 932} 933 934/// \brief Compares the lower double-precision floating-point values in each of 935/// the two 128-bit floating-point vectors of [2 x double] to determine if 936/// the value in the first parameter is not greater than or equal to the 937/// corresponding value in the second parameter. The comparison yields 0h 938/// for false, FFFFFFFFFFFFFFFFh for true. 939/// 940/// \headerfile <x86intrin.h> 941/// 942/// This intrinsic corresponds to the \c VCMPNLESD / CMPNLESD instruction. 943/// 944/// \param __a 945/// A 128-bit vector of [2 x double]. The lower double-precision value is 946/// compared to the lower double-precision value of __b. 947/// \param __b 948/// A 128-bit vector of [2 x double]. The lower double-precision value is 949/// compared to the lower double-precision value of __a. 950/// \returns A 128-bit vector. The lower 64 bits contains the comparison 951/// results. The upper 64 bits are copied from the upper 64 bits of __a. 
952static __inline__ __m128d __DEFAULT_FN_ATTRS 953_mm_cmpnge_sd(__m128d __a, __m128d __b) 954{ 955 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 956 return (__m128d) { __c[0], __a[1] }; 957} 958 959/// \brief Compares the lower double-precision floating-point values in each of 960/// the two 128-bit floating-point vectors of [2 x double] for equality. The 961/// comparison yields 0 for false, 1 for true. 962/// 963/// \headerfile <x86intrin.h> 964/// 965/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction. 966/// 967/// \param __a 968/// A 128-bit vector of [2 x double]. The lower double-precision value is 969/// compared to the lower double-precision value of __b. 970/// \param __b 971/// A 128-bit vector of [2 x double]. The lower double-precision value is 972/// compared to the lower double-precision value of __a. 973/// \returns An integer containing the comparison results. 974static __inline__ int __DEFAULT_FN_ATTRS 975_mm_comieq_sd(__m128d __a, __m128d __b) 976{ 977 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 978} 979 980/// \brief Compares the lower double-precision floating-point values in each of 981/// the two 128-bit floating-point vectors of [2 x double] to determine if 982/// the value in the first parameter is less than the corresponding value in 983/// the second parameter. The comparison yields 0 for false, 1 for true. 984/// 985/// \headerfile <x86intrin.h> 986/// 987/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction. 988/// 989/// \param __a 990/// A 128-bit vector of [2 x double]. The lower double-precision value is 991/// compared to the lower double-precision value of __b. 992/// \param __b 993/// A 128-bit vector of [2 x double]. The lower double-precision value is 994/// compared to the lower double-precision value of __a. 995/// \returns An integer containing the comparison results. 
996static __inline__ int __DEFAULT_FN_ATTRS 997_mm_comilt_sd(__m128d __a, __m128d __b) 998{ 999 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1000} 1001 1002/// \brief Compares the lower double-precision floating-point values in each of 1003/// the two 128-bit floating-point vectors of [2 x double] to determine if 1004/// the value in the first parameter is less than or equal to the 1005/// corresponding value in the second parameter. The comparison yields 0 for 1006/// false, 1 for true. 1007/// 1008/// \headerfile <x86intrin.h> 1009/// 1010/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction. 1011/// 1012/// \param __a 1013/// A 128-bit vector of [2 x double]. The lower double-precision value is 1014/// compared to the lower double-precision value of __b. 1015/// \param __b 1016/// A 128-bit vector of [2 x double]. The lower double-precision value is 1017/// compared to the lower double-precision value of __a. 1018/// \returns An integer containing the comparison results. 1019static __inline__ int __DEFAULT_FN_ATTRS 1020_mm_comile_sd(__m128d __a, __m128d __b) 1021{ 1022 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1023} 1024 1025/// \brief Compares the lower double-precision floating-point values in each of 1026/// the two 128-bit floating-point vectors of [2 x double] to determine if 1027/// the value in the first parameter is greater than the corresponding value 1028/// in the second parameter. The comparison yields 0 for false, 1 for true. 1029/// 1030/// \headerfile <x86intrin.h> 1031/// 1032/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction. 1033/// 1034/// \param __a 1035/// A 128-bit vector of [2 x double]. The lower double-precision value is 1036/// compared to the lower double-precision value of __b. 1037/// \param __b 1038/// A 128-bit vector of [2 x double]. The lower double-precision value is 1039/// compared to the lower double-precision value of __a. 
1040/// \returns An integer containing the comparison results. 1041static __inline__ int __DEFAULT_FN_ATTRS 1042_mm_comigt_sd(__m128d __a, __m128d __b) 1043{ 1044 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1045} 1046 1047/// \brief Compares the lower double-precision floating-point values in each of 1048/// the two 128-bit floating-point vectors of [2 x double] to determine if 1049/// the value in the first parameter is greater than or equal to the 1050/// corresponding value in the second parameter. The comparison yields 0 for 1051/// false, 1 for true. 1052/// 1053/// \headerfile <x86intrin.h> 1054/// 1055/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction. 1056/// 1057/// \param __a 1058/// A 128-bit vector of [2 x double]. The lower double-precision value is 1059/// compared to the lower double-precision value of __b. 1060/// \param __b 1061/// A 128-bit vector of [2 x double]. The lower double-precision value is 1062/// compared to the lower double-precision value of __a. 1063/// \returns An integer containing the comparison results. 1064static __inline__ int __DEFAULT_FN_ATTRS 1065_mm_comige_sd(__m128d __a, __m128d __b) 1066{ 1067 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1068} 1069 1070/// \brief Compares the lower double-precision floating-point values in each of 1071/// the two 128-bit floating-point vectors of [2 x double] to determine if 1072/// the value in the first parameter is unequal to the corresponding value in 1073/// the second parameter. The comparison yields 0 for false, 1 for true. 1074/// 1075/// \headerfile <x86intrin.h> 1076/// 1077/// This intrinsic corresponds to the \c VCOMISD / COMISD instruction. 1078/// 1079/// \param __a 1080/// A 128-bit vector of [2 x double]. The lower double-precision value is 1081/// compared to the lower double-precision value of __b. 1082/// \param __b 1083/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1084/// compared to the lower double-precision value of __a. 1085/// \returns An integer containing the comparison results. 1086static __inline__ int __DEFAULT_FN_ATTRS 1087_mm_comineq_sd(__m128d __a, __m128d __b) 1088{ 1089 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1090} 1091 1092/// \brief Compares the lower double-precision floating-point values in each of 1093/// the two 128-bit floating-point vectors of [2 x double] for equality. The 1094/// comparison yields 0 for false, 1 for true. If either of the two lower 1095/// double-precision values is NaN, 1 is returned. 1096/// 1097/// \headerfile <x86intrin.h> 1098/// 1099/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction. 1100/// 1101/// \param __a 1102/// A 128-bit vector of [2 x double]. The lower double-precision value is 1103/// compared to the lower double-precision value of __b. 1104/// \param __b 1105/// A 128-bit vector of [2 x double]. The lower double-precision value is 1106/// compared to the lower double-precision value of __a. 1107/// \returns An integer containing the comparison results. If either of the two 1108/// lower double-precision values is NaN, 1 is returned. 1109static __inline__ int __DEFAULT_FN_ATTRS 1110_mm_ucomieq_sd(__m128d __a, __m128d __b) 1111{ 1112 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1113} 1114 1115/// \brief Compares the lower double-precision floating-point values in each of 1116/// the two 128-bit floating-point vectors of [2 x double] to determine if 1117/// the value in the first parameter is less than the corresponding value in 1118/// the second parameter. The comparison yields 0 for false, 1 for true. 1119/// If either of the two lower double-precision values is NaN, 1 is returned. 1120/// 1121/// \headerfile <x86intrin.h> 1122/// 1123/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction. 1124/// 1125/// \param __a 1126/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1127/// compared to the lower double-precision value of __b. 1128/// \param __b 1129/// A 128-bit vector of [2 x double]. The lower double-precision value is 1130/// compared to the lower double-precision value of __a. 1131/// \returns An integer containing the comparison results. If either of the two 1132/// lower double-precision values is NaN, 1 is returned. 1133static __inline__ int __DEFAULT_FN_ATTRS 1134_mm_ucomilt_sd(__m128d __a, __m128d __b) 1135{ 1136 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1137} 1138 1139/// \brief Compares the lower double-precision floating-point values in each of 1140/// the two 128-bit floating-point vectors of [2 x double] to determine if 1141/// the value in the first parameter is less than or equal to the 1142/// corresponding value in the second parameter. The comparison yields 0 for 1143/// false, 1 for true. If either of the two lower double-precision values is 1144/// NaN, 1 is returned. 1145/// 1146/// \headerfile <x86intrin.h> 1147/// 1148/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction. 1149/// 1150/// \param __a 1151/// A 128-bit vector of [2 x double]. The lower double-precision value is 1152/// compared to the lower double-precision value of __b. 1153/// \param __b 1154/// A 128-bit vector of [2 x double]. The lower double-precision value is 1155/// compared to the lower double-precision value of __a. 1156/// \returns An integer containing the comparison results. If either of the two 1157/// lower double-precision values is NaN, 1 is returned. 
1158static __inline__ int __DEFAULT_FN_ATTRS 1159_mm_ucomile_sd(__m128d __a, __m128d __b) 1160{ 1161 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1162} 1163 1164/// \brief Compares the lower double-precision floating-point values in each of 1165/// the two 128-bit floating-point vectors of [2 x double] to determine if 1166/// the value in the first parameter is greater than the corresponding value 1167/// in the second parameter. The comparison yields 0 for false, 1 for true. 1168/// If either of the two lower double-precision values is NaN, 0 is returned. 1169/// 1170/// \headerfile <x86intrin.h> 1171/// 1172/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction. 1173/// 1174/// \param __a 1175/// A 128-bit vector of [2 x double]. The lower double-precision value is 1176/// compared to the lower double-precision value of __b. 1177/// \param __b 1178/// A 128-bit vector of [2 x double]. The lower double-precision value is 1179/// compared to the lower double-precision value of __a. 1180/// \returns An integer containing the comparison results. If either of the two 1181/// lower double-precision values is NaN, 0 is returned. 1182static __inline__ int __DEFAULT_FN_ATTRS 1183_mm_ucomigt_sd(__m128d __a, __m128d __b) 1184{ 1185 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1186} 1187 1188/// \brief Compares the lower double-precision floating-point values in each of 1189/// the two 128-bit floating-point vectors of [2 x double] to determine if 1190/// the value in the first parameter is greater than or equal to the 1191/// corresponding value in the second parameter. The comparison yields 0 for 1192/// false, 1 for true. If either of the two lower double-precision values 1193/// is NaN, 0 is returned. 1194/// 1195/// \headerfile <x86intrin.h> 1196/// 1197/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction. 1198/// 1199/// \param __a 1200/// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1201/// compared to the lower double-precision value of __b. 1202/// \param __b 1203/// A 128-bit vector of [2 x double]. The lower double-precision value is 1204/// compared to the lower double-precision value of __a. 1205/// \returns An integer containing the comparison results. If either of the two 1206/// lower double-precision values is NaN, 0 is returned. 1207static __inline__ int __DEFAULT_FN_ATTRS 1208_mm_ucomige_sd(__m128d __a, __m128d __b) 1209{ 1210 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1211} 1212 1213/// \brief Compares the lower double-precision floating-point values in each of 1214/// the two 128-bit floating-point vectors of [2 x double] to determine if 1215/// the value in the first parameter is unequal to the corresponding value in 1216/// the second parameter. The comparison yields 0 for false, 1 for true. If 1217/// either of the two lower double-precision values is NaN, 0 is returned. 1218/// 1219/// \headerfile <x86intrin.h> 1220/// 1221/// This intrinsic corresponds to the \c VUCOMISD / UCOMISD instruction. 1222/// 1223/// \param __a 1224/// A 128-bit vector of [2 x double]. The lower double-precision value is 1225/// compared to the lower double-precision value of __b. 1226/// \param __b 1227/// A 128-bit vector of [2 x double]. The lower double-precision value is 1228/// compared to the lower double-precision value of __a. 1229/// \returns An integer containing the comparison result. If either of the two 1230/// lower double-precision values is NaN, 0 is returned. 1231static __inline__ int __DEFAULT_FN_ATTRS 1232_mm_ucomineq_sd(__m128d __a, __m128d __b) 1233{ 1234 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1235} 1236 1237/// \brief Converts the two double-precision floating-point elements of a 1238/// 128-bit vector of [2 x double] into two single-precision floating-point 1239/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 
1240/// The upper 64 bits of the result vector are set to zero. 1241/// 1242/// \headerfile <x86intrin.h> 1243/// 1244/// This intrinsic corresponds to the \c VCVTPD2PS / CVTPD2PS instruction. 1245/// 1246/// \param __a 1247/// A 128-bit vector of [2 x double]. 1248/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1249/// converted values. The upper 64 bits are set to zero. 1250static __inline__ __m128 __DEFAULT_FN_ATTRS 1251_mm_cvtpd_ps(__m128d __a) 1252{ 1253 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1254} 1255 1256/// \brief Converts the lower two single-precision floating-point elements of a 1257/// 128-bit vector of [4 x float] into two double-precision floating-point 1258/// values, returned in a 128-bit vector of [2 x double]. The upper two 1259/// elements of the input vector are unused. 1260/// 1261/// \headerfile <x86intrin.h> 1262/// 1263/// This intrinsic corresponds to the \c VCVTPS2PD / CVTPS2PD instruction. 1264/// 1265/// \param __a 1266/// A 128-bit vector of [4 x float]. The lower two single-precision 1267/// floating-point elements are converted to double-precision values. The 1268/// upper two elements are unused. 1269/// \returns A 128-bit vector of [2 x double] containing the converted values. 1270static __inline__ __m128d __DEFAULT_FN_ATTRS 1271_mm_cvtps_pd(__m128 __a) 1272{ 1273 return (__m128d) __builtin_convertvector( 1274 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1275} 1276 1277/// \brief Converts the lower two integer elements of a 128-bit vector of 1278/// [4 x i32] into two double-precision floating-point values, returned in a 1279/// 128-bit vector of [2 x double]. The upper two elements of the input 1280/// vector are unused. 1281/// 1282/// \headerfile <x86intrin.h> 1283/// 1284/// This intrinsic corresponds to the \c VCVTDQ2PD / CVTDQ2PD instruction. 1285/// 1286/// \param __a 1287/// A 128-bit integer vector of [4 x i32]. 
The lower two integer elements are 1288/// converted to double-precision values. The upper two elements are unused. 1289/// \returns A 128-bit vector of [2 x double] containing the converted values. 1290static __inline__ __m128d __DEFAULT_FN_ATTRS 1291_mm_cvtepi32_pd(__m128i __a) 1292{ 1293 return (__m128d) __builtin_convertvector( 1294 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1295} 1296 1297/// \brief Converts the two double-precision floating-point elements of a 1298/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1299/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1300/// 64 bits of the result vector are set to zero. 1301/// 1302/// \headerfile <x86intrin.h> 1303/// 1304/// This intrinsic corresponds to the \c VCVTPD2DQ / CVTPD2DQ instruction. 1305/// 1306/// \param __a 1307/// A 128-bit vector of [2 x double]. 1308/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1309/// converted values. The upper 64 bits are set to zero. 1310static __inline__ __m128i __DEFAULT_FN_ATTRS 1311_mm_cvtpd_epi32(__m128d __a) 1312{ 1313 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1314} 1315 1316/// \brief Converts the low-order element of a 128-bit vector of [2 x double] 1317/// into a 32-bit signed integer value. 1318/// 1319/// \headerfile <x86intrin.h> 1320/// 1321/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction. 1322/// 1323/// \param __a 1324/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1325/// conversion. 1326/// \returns A 32-bit signed integer containing the converted value. 
1327static __inline__ int __DEFAULT_FN_ATTRS 1328_mm_cvtsd_si32(__m128d __a) 1329{ 1330 return __builtin_ia32_cvtsd2si((__v2df)__a); 1331} 1332 1333/// \brief Converts the lower double-precision floating-point element of a 1334/// 128-bit vector of [2 x double], in the second parameter, into a 1335/// single-precision floating-point value, returned in the lower 32 bits of a 1336/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1337/// copied from the upper 96 bits of the first parameter. 1338/// 1339/// \headerfile <x86intrin.h> 1340/// 1341/// This intrinsic corresponds to the \c VCVTSD2SS / CVTSD2SS instruction. 1342/// 1343/// \param __a 1344/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1345/// copied to the upper 96 bits of the result. 1346/// \param __b 1347/// A 128-bit vector of [2 x double]. The lower double-precision 1348/// floating-point element is used in the conversion. 1349/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1350/// converted value from the second parameter. The upper 96 bits are copied 1351/// from the upper 96 bits of the first parameter. 1352static __inline__ __m128 __DEFAULT_FN_ATTRS 1353_mm_cvtsd_ss(__m128 __a, __m128d __b) 1354{ 1355 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1356} 1357 1358/// \brief Converts a 32-bit signed integer value, in the second parameter, into 1359/// a double-precision floating-point value, returned in the lower 64 bits of 1360/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1361/// are copied from the upper 64 bits of the first parameter. 1362/// 1363/// \headerfile <x86intrin.h> 1364/// 1365/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction. 1366/// 1367/// \param __a 1368/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1369/// copied to the upper 64 bits of the result. 
1370/// \param __b 1371/// A 32-bit signed integer containing the value to be converted. 1372/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1373/// converted value from the second parameter. The upper 64 bits are copied 1374/// from the upper 64 bits of the first parameter. 1375static __inline__ __m128d __DEFAULT_FN_ATTRS 1376_mm_cvtsi32_sd(__m128d __a, int __b) 1377{ 1378 __a[0] = __b; 1379 return __a; 1380} 1381 1382/// \brief Converts the lower single-precision floating-point element of a 1383/// 128-bit vector of [4 x float], in the second parameter, into a 1384/// double-precision floating-point value, returned in the lower 64 bits of 1385/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1386/// are copied from the upper 64 bits of the first parameter. 1387/// 1388/// \headerfile <x86intrin.h> 1389/// 1390/// This intrinsic corresponds to the \c VCVTSS2SD / CVTSS2SD instruction. 1391/// 1392/// \param __a 1393/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1394/// copied to the upper 64 bits of the result. 1395/// \param __b 1396/// A 128-bit vector of [4 x float]. The lower single-precision 1397/// floating-point element is used in the conversion. 1398/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1399/// converted value from the second parameter. The upper 64 bits are copied 1400/// from the upper 64 bits of the first parameter. 1401static __inline__ __m128d __DEFAULT_FN_ATTRS 1402_mm_cvtss_sd(__m128d __a, __m128 __b) 1403{ 1404 __a[0] = __b[0]; 1405 return __a; 1406} 1407 1408/// \brief Converts the two double-precision floating-point elements of a 1409/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1410/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. 
If the 1411/// result of either conversion is inexact, the result is truncated (rounded 1412/// towards zero) regardless of the current MXCSR setting. The upper 64 bits 1413/// of the result vector are set to zero. 1414/// 1415/// \headerfile <x86intrin.h> 1416/// 1417/// This intrinsic corresponds to the \c VCVTTPD2DQ / CVTTPD2DQ instruction. 1418/// 1419/// \param __a 1420/// A 128-bit vector of [2 x double]. 1421/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1422/// converted values. The upper 64 bits are set to zero. 1423static __inline__ __m128i __DEFAULT_FN_ATTRS 1424_mm_cvttpd_epi32(__m128d __a) 1425{ 1426 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1427} 1428 1429/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit 1430/// signed integer value, truncating the result when it is inexact. 1431/// 1432/// \headerfile <x86intrin.h> 1433/// 1434/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction. 1435/// 1436/// \param __a 1437/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1438/// conversion. 1439/// \returns A 32-bit signed integer containing the converted value. 1440static __inline__ int __DEFAULT_FN_ATTRS 1441_mm_cvttsd_si32(__m128d __a) 1442{ 1443 return __builtin_ia32_cvttsd2si((__v2df)__a); 1444} 1445 1446/// \brief Converts the two double-precision floating-point elements of a 1447/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1448/// returned in a 64-bit vector of [2 x i32]. 1449/// 1450/// \headerfile <x86intrin.h> 1451/// 1452/// This intrinsic corresponds to the \c CVTPD2PI instruction. 1453/// 1454/// \param __a 1455/// A 128-bit vector of [2 x double]. 1456/// \returns A 64-bit vector of [2 x i32] containing the converted values. 
1457static __inline__ __m64 __DEFAULT_FN_ATTRS 1458_mm_cvtpd_pi32(__m128d __a) 1459{ 1460 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1461} 1462 1463/// \brief Converts the two double-precision floating-point elements of a 1464/// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1465/// returned in a 64-bit vector of [2 x i32]. If the result of either 1466/// conversion is inexact, the result is truncated (rounded towards zero) 1467/// regardless of the current MXCSR setting. 1468/// 1469/// \headerfile <x86intrin.h> 1470/// 1471/// This intrinsic corresponds to the \c CVTTPD2PI instruction. 1472/// 1473/// \param __a 1474/// A 128-bit vector of [2 x double]. 1475/// \returns A 64-bit vector of [2 x i32] containing the converted values. 1476static __inline__ __m64 __DEFAULT_FN_ATTRS 1477_mm_cvttpd_pi32(__m128d __a) 1478{ 1479 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1480} 1481 1482/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of 1483/// [2 x i32] into two double-precision floating-point values, returned in a 1484/// 128-bit vector of [2 x double]. 1485/// 1486/// \headerfile <x86intrin.h> 1487/// 1488/// This intrinsic corresponds to the \c CVTPI2PD instruction. 1489/// 1490/// \param __a 1491/// A 64-bit vector of [2 x i32]. 1492/// \returns A 128-bit vector of [2 x double] containing the converted values. 1493static __inline__ __m128d __DEFAULT_FN_ATTRS 1494_mm_cvtpi32_pd(__m64 __a) 1495{ 1496 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1497} 1498 1499/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as 1500/// a double-precision floating-point value. 1501/// 1502/// \headerfile <x86intrin.h> 1503/// 1504/// This intrinsic has no corresponding instruction. 1505/// 1506/// \param __a 1507/// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1508/// \returns A double-precision floating-point value copied from the lower 64 1509/// bits of __a. 
1510static __inline__ double __DEFAULT_FN_ATTRS 1511_mm_cvtsd_f64(__m128d __a) 1512{ 1513 return __a[0]; 1514} 1515 1516/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned 1517/// memory location. 1518/// 1519/// \headerfile <x86intrin.h> 1520/// 1521/// This intrinsic corresponds to the \c VMOVAPD / MOVAPD instruction. 1522/// 1523/// \param __dp 1524/// A pointer to a 128-bit memory location. The address of the memory 1525/// location has to be 16-byte aligned. 1526/// \returns A 128-bit vector of [2 x double] containing the loaded values. 1527static __inline__ __m128d __DEFAULT_FN_ATTRS 1528_mm_load_pd(double const *__dp) 1529{ 1530 return *(__m128d*)__dp; 1531} 1532 1533/// \brief Loads a double-precision floating-point value from a specified memory 1534/// location and duplicates it to both vector elements of a 128-bit vector of 1535/// [2 x double]. 1536/// 1537/// \headerfile <x86intrin.h> 1538/// 1539/// This intrinsic corresponds to the \c VMOVDDUP / MOVDDUP instruction. 1540/// 1541/// \param __dp 1542/// A pointer to a memory location containing a double-precision value. 1543/// \returns A 128-bit vector of [2 x double] containing the loaded and 1544/// duplicated values. 1545static __inline__ __m128d __DEFAULT_FN_ATTRS 1546_mm_load1_pd(double const *__dp) 1547{ 1548 struct __mm_load1_pd_struct { 1549 double __u; 1550 } __attribute__((__packed__, __may_alias__)); 1551 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 1552 return (__m128d){ __u, __u }; 1553} 1554 1555#define _mm_load_pd1(dp) _mm_load1_pd(dp) 1556 1557/// \brief Loads two double-precision values, in reverse order, from an aligned 1558/// memory location into a 128-bit vector of [2 x double]. 1559/// 1560/// \headerfile <x86intrin.h> 1561/// 1562/// This intrinsic corresponds to the \c VMOVAPD / MOVAPD instruction + needed 1563/// shuffling instructions. 
In AVX mode, the shuffling may be combined with the 1564/// \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1565/// 1566/// \param __dp 1567/// A 16-byte aligned pointer to an array of double-precision values to be 1568/// loaded in reverse order. 1569/// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1570/// values. 1571static __inline__ __m128d __DEFAULT_FN_ATTRS 1572_mm_loadr_pd(double const *__dp) 1573{ 1574 __m128d __u = *(__m128d*)__dp; 1575 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1576} 1577 1578/// \brief Loads a 128-bit floating-point vector of [2 x double] from an 1579/// unaligned memory location. 1580/// 1581/// \headerfile <x86intrin.h> 1582/// 1583/// This intrinsic corresponds to the \c VMOVUPD / MOVUPD instruction. 1584/// 1585/// \param __dp 1586/// A pointer to a 128-bit memory location. The address of the memory 1587/// location does not have to be aligned. 1588/// \returns A 128-bit vector of [2 x double] containing the loaded values. 1589static __inline__ __m128d __DEFAULT_FN_ATTRS 1590_mm_loadu_pd(double const *__dp) 1591{ 1592 struct __loadu_pd { 1593 __m128d __v; 1594 } __attribute__((__packed__, __may_alias__)); 1595 return ((struct __loadu_pd*)__dp)->__v; 1596} 1597 1598static __inline__ __m128i __DEFAULT_FN_ATTRS 1599_mm_loadu_si64(void const *__a) 1600{ 1601 struct __loadu_si64 { 1602 long long __v; 1603 } __attribute__((__packed__, __may_alias__)); 1604 long long __u = ((struct __loadu_si64*)__a)->__v; 1605 return (__m128i){__u, 0L}; 1606} 1607 1608static __inline__ __m128d __DEFAULT_FN_ATTRS 1609_mm_load_sd(double const *__dp) 1610{ 1611 struct __mm_load_sd_struct { 1612 double __u; 1613 } __attribute__((__packed__, __may_alias__)); 1614 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 1615 return (__m128d){ __u, 0 }; 1616} 1617 1618/// \brief Loads a double-precision value into the high-order bits of a 128-bit 1619/// vector of [2 x double]. 
The low-order bits are copied from the low-order 1620/// bits of the first operand. 1621/// 1622/// \headerfile <x86intrin.h> 1623/// 1624/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction. 1625/// 1626/// \param __a 1627/// A 128-bit vector of [2 x double]. 1628/// Bits [63:0] are written to bits [63:0] of the result. 1629/// \param __dp 1630/// A pointer to a 64-bit memory location containing a double-precision 1631/// floating-point value that is loaded. The loaded value is written to bits 1632/// [127:64] of the result. The address of the memory location does not have 1633/// to be aligned. 1634/// \returns A 128-bit vector of [2 x double] containing the moved values. 1635static __inline__ __m128d __DEFAULT_FN_ATTRS 1636_mm_loadh_pd(__m128d __a, double const *__dp) 1637{ 1638 struct __mm_loadh_pd_struct { 1639 double __u; 1640 } __attribute__((__packed__, __may_alias__)); 1641 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 1642 return (__m128d){ __a[0], __u }; 1643} 1644 1645/// \brief Loads a double-precision value into the low-order bits of a 128-bit 1646/// vector of [2 x double]. The high-order bits are copied from the 1647/// high-order bits of the first operand. 1648/// 1649/// \headerfile <x86intrin.h> 1650/// 1651/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction. 1652/// 1653/// \param __a 1654/// A 128-bit vector of [2 x double]. 1655/// Bits [127:64] are written to bits [127:64] of the result. 1656/// \param __dp 1657/// A pointer to a 64-bit memory location containing a double-precision 1658/// floating-point value that is loaded. The loaded value is written to bits 1659/// [63:0] of the result. The address of the memory location does not have to 1660/// be aligned. 1661/// \returns A 128-bit vector of [2 x double] containing the moved values. 
1662static __inline__ __m128d __DEFAULT_FN_ATTRS 1663_mm_loadl_pd(__m128d __a, double const *__dp) 1664{ 1665 struct __mm_loadl_pd_struct { 1666 double __u; 1667 } __attribute__((__packed__, __may_alias__)); 1668 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 1669 return (__m128d){ __u, __a[1] }; 1670} 1671 1672/// \brief Constructs a 128-bit floating-point vector of [2 x double] with 1673/// unspecified content. This could be used as an argument to another 1674/// intrinsic function where the argument is required but the value is not 1675/// actually used. 1676/// 1677/// \headerfile <x86intrin.h> 1678/// 1679/// This intrinsic has no corresponding instruction. 1680/// 1681/// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1682/// content. 1683static __inline__ __m128d __DEFAULT_FN_ATTRS 1684_mm_undefined_pd(void) 1685{ 1686 return (__m128d)__builtin_ia32_undef128(); 1687} 1688 1689/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower 1690/// 64 bits of the vector are initialized with the specified double-precision 1691/// floating-point value. The upper 64 bits are set to zero. 1692/// 1693/// \headerfile <x86intrin.h> 1694/// 1695/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 1696/// 1697/// \param __w 1698/// A double-precision floating-point value used to initialize the lower 64 1699/// bits of the result. 1700/// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1701/// lower 64 bits contain the value of the parameter. The upper 64 bits are 1702/// set to zero. 1703static __inline__ __m128d __DEFAULT_FN_ATTRS 1704_mm_set_sd(double __w) 1705{ 1706 return (__m128d){ __w, 0 }; 1707} 1708 1709/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each 1710/// of the two double-precision floating-point vector elements set to the 1711/// specified double-precision floating-point value. 
1712/// 1713/// \headerfile <x86intrin.h> 1714/// 1715/// This intrinsic corresponds to the \c VMOVDDUP / MOVLHPS instruction. 1716/// 1717/// \param __w 1718/// A double-precision floating-point value used to initialize each vector 1719/// element of the result. 1720/// \returns An initialized 128-bit floating-point vector of [2 x double]. 1721static __inline__ __m128d __DEFAULT_FN_ATTRS 1722_mm_set1_pd(double __w) 1723{ 1724 return (__m128d){ __w, __w }; 1725} 1726 1727/// \brief Constructs a 128-bit floating-point vector of [2 x double] 1728/// initialized with the specified double-precision floating-point values. 1729/// 1730/// \headerfile <x86intrin.h> 1731/// 1732/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction. 1733/// 1734/// \param __w 1735/// A double-precision floating-point value used to initialize the upper 64 1736/// bits of the result. 1737/// \param __x 1738/// A double-precision floating-point value used to initialize the lower 64 1739/// bits of the result. 1740/// \returns An initialized 128-bit floating-point vector of [2 x double]. 1741static __inline__ __m128d __DEFAULT_FN_ATTRS 1742_mm_set_pd(double __w, double __x) 1743{ 1744 return (__m128d){ __x, __w }; 1745} 1746 1747/// \brief Constructs a 128-bit floating-point vector of [2 x double], 1748/// initialized in reverse order with the specified double-precision 1749/// floating-point values. 1750/// 1751/// \headerfile <x86intrin.h> 1752/// 1753/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction. 1754/// 1755/// \param __w 1756/// A double-precision floating-point value used to initialize the lower 64 1757/// bits of the result. 1758/// \param __x 1759/// A double-precision floating-point value used to initialize the upper 64 1760/// bits of the result. 1761/// \returns An initialized 128-bit floating-point vector of [2 x double]. 
1762static __inline__ __m128d __DEFAULT_FN_ATTRS 1763_mm_setr_pd(double __w, double __x) 1764{ 1765 return (__m128d){ __w, __x }; 1766} 1767 1768/// \brief Constructs a 128-bit floating-point vector of [2 x double] 1769/// initialized to zero. 1770/// 1771/// \headerfile <x86intrin.h> 1772/// 1773/// This intrinsic corresponds to the \c VXORPS / XORPS instruction. 1774/// 1775/// \returns An initialized 128-bit floating-point vector of [2 x double] with 1776/// all elements set to zero. 1777static __inline__ __m128d __DEFAULT_FN_ATTRS 1778_mm_setzero_pd(void) 1779{ 1780 return (__m128d){ 0, 0 }; 1781} 1782 1783/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower 1784/// 64 bits are set to the lower 64 bits of the second parameter. The upper 1785/// 64 bits are set to the upper 64 bits of the first parameter. 1786// 1787/// \headerfile <x86intrin.h> 1788/// 1789/// This intrinsic corresponds to the \c VBLENDPD / BLENDPD instruction. 1790/// 1791/// \param __a 1792/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1793/// upper 64 bits of the result. 1794/// \param __b 1795/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1796/// lower 64 bits of the result. 1797/// \returns A 128-bit vector of [2 x double] containing the moved values. 1798static __inline__ __m128d __DEFAULT_FN_ATTRS 1799_mm_move_sd(__m128d __a, __m128d __b) 1800{ 1801 return (__m128d){ __b[0], __a[1] }; 1802} 1803 1804/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1805/// memory location. 1806/// 1807/// \headerfile <x86intrin.h> 1808/// 1809/// This intrinsic corresponds to the \c VMOVSD / MOVSD instruction. 1810/// 1811/// \param __dp 1812/// A pointer to a 64-bit memory location. 1813/// \param __a 1814/// A 128-bit vector of [2 x double] containing the value to be stored. 
1815static __inline__ void __DEFAULT_FN_ATTRS 1816_mm_store_sd(double *__dp, __m128d __a) 1817{ 1818 struct __mm_store_sd_struct { 1819 double __u; 1820 } __attribute__((__packed__, __may_alias__)); 1821 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 1822} 1823 1824static __inline__ void __DEFAULT_FN_ATTRS 1825_mm_store_pd(double *__dp, __m128d __a) 1826{ 1827 *(__m128d*)__dp = __a; 1828} 1829 1830static __inline__ void __DEFAULT_FN_ATTRS 1831_mm_store1_pd(double *__dp, __m128d __a) 1832{ 1833 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1834 _mm_store_pd(__dp, __a); 1835} 1836 1837/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory 1838/// location. 1839/// 1840/// \headerfile <x86intrin.h> 1841/// 1842/// This intrinsic corresponds to the \c VMOVAPD / MOVAPD instruction. 1843/// 1844/// \param __dp 1845/// A pointer to a 128-bit memory location. The address of the memory 1846/// location has to be 16-byte aligned. 1847/// \param __a 1848/// A 128-bit vector of [2 x double] containing the values to be stored. 1849static __inline__ void __DEFAULT_FN_ATTRS 1850_mm_store_pd1(double *__dp, __m128d __a) 1851{ 1852 return _mm_store1_pd(__dp, __a); 1853} 1854 1855/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory 1856/// location. 1857/// 1858/// \headerfile <x86intrin.h> 1859/// 1860/// This intrinsic corresponds to the \c VMOVUPD / MOVUPD instruction. 1861/// 1862/// \param __dp 1863/// A pointer to a 128-bit memory location. The address of the memory 1864/// location does not have to be aligned. 1865/// \param __a 1866/// A 128-bit vector of [2 x double] containing the values to be stored. 
1867static __inline__ void __DEFAULT_FN_ATTRS 1868_mm_storeu_pd(double *__dp, __m128d __a) 1869{ 1870 struct __storeu_pd { 1871 __m128d __v; 1872 } __attribute__((__packed__, __may_alias__)); 1873 ((struct __storeu_pd*)__dp)->__v = __a; 1874} 1875 1876/// \brief Stores two double-precision values, in reverse order, from a 128-bit 1877/// vector of [2 x double] to a 16-byte aligned memory location. 1878/// 1879/// \headerfile <x86intrin.h> 1880/// 1881/// This intrinsic corresponds to a shuffling instruction followed by a 1882/// \c VMOVAPD / MOVAPD instruction. 1883/// 1884/// \param __dp 1885/// A pointer to a 16-byte aligned memory location that can store two 1886/// double-precision values. 1887/// \param __a 1888/// A 128-bit vector of [2 x double] containing the values to be reversed and 1889/// stored. 1890static __inline__ void __DEFAULT_FN_ATTRS 1891_mm_storer_pd(double *__dp, __m128d __a) 1892{ 1893 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 1894 *(__m128d *)__dp = __a; 1895} 1896 1897/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 1898/// memory location. 1899/// 1900/// \headerfile <x86intrin.h> 1901/// 1902/// This intrinsic corresponds to the \c VMOVHPD / MOVHPD instruction. 1903/// 1904/// \param __dp 1905/// A pointer to a 64-bit memory location. 1906/// \param __a 1907/// A 128-bit vector of [2 x double] containing the value to be stored. 1908static __inline__ void __DEFAULT_FN_ATTRS 1909_mm_storeh_pd(double *__dp, __m128d __a) 1910{ 1911 struct __mm_storeh_pd_struct { 1912 double __u; 1913 } __attribute__((__packed__, __may_alias__)); 1914 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 1915} 1916 1917/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1918/// memory location. 1919/// 1920/// \headerfile <x86intrin.h> 1921/// 1922/// This intrinsic corresponds to the \c VMOVLPD / MOVLPD instruction. 
1923/// 1924/// \param __dp 1925/// A pointer to a 64-bit memory location. 1926/// \param __a 1927/// A 128-bit vector of [2 x double] containing the value to be stored. 1928static __inline__ void __DEFAULT_FN_ATTRS 1929_mm_storel_pd(double *__dp, __m128d __a) 1930{ 1931 struct __mm_storeh_pd_struct { 1932 double __u; 1933 } __attribute__((__packed__, __may_alias__)); 1934 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 1935} 1936 1937/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8], 1938/// saving the lower 8 bits of each sum in the corresponding element of a 1939/// 128-bit result vector of [16 x i8]. The integer elements of both 1940/// parameters can be either signed or unsigned. 1941/// 1942/// \headerfile <x86intrin.h> 1943/// 1944/// This intrinsic corresponds to the \c VPADDB / PADDB instruction. 1945/// 1946/// \param __a 1947/// A 128-bit vector of [16 x i8]. 1948/// \param __b 1949/// A 128-bit vector of [16 x i8]. 1950/// \returns A 128-bit vector of [16 x i8] containing the sums of both 1951/// parameters. 1952static __inline__ __m128i __DEFAULT_FN_ATTRS 1953_mm_add_epi8(__m128i __a, __m128i __b) 1954{ 1955 return (__m128i)((__v16qu)__a + (__v16qu)__b); 1956} 1957 1958/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16], 1959/// saving the lower 16 bits of each sum in the corresponding element of a 1960/// 128-bit result vector of [8 x i16]. The integer elements of both 1961/// parameters can be either signed or unsigned. 1962/// 1963/// \headerfile <x86intrin.h> 1964/// 1965/// This intrinsic corresponds to the \c VPADDW / PADDW instruction. 1966/// 1967/// \param __a 1968/// A 128-bit vector of [8 x i16]. 1969/// \param __b 1970/// A 128-bit vector of [8 x i16]. 1971/// \returns A 128-bit vector of [8 x i16] containing the sums of both 1972/// parameters. 
1973static __inline__ __m128i __DEFAULT_FN_ATTRS 1974_mm_add_epi16(__m128i __a, __m128i __b) 1975{ 1976 return (__m128i)((__v8hu)__a + (__v8hu)__b); 1977} 1978 1979/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32], 1980/// saving the lower 32 bits of each sum in the corresponding element of a 1981/// 128-bit result vector of [4 x i32]. The integer elements of both 1982/// parameters can be either signed or unsigned. 1983/// 1984/// \headerfile <x86intrin.h> 1985/// 1986/// This intrinsic corresponds to the \c VPADDD / PADDD instruction. 1987/// 1988/// \param __a 1989/// A 128-bit vector of [4 x i32]. 1990/// \param __b 1991/// A 128-bit vector of [4 x i32]. 1992/// \returns A 128-bit vector of [4 x i32] containing the sums of both 1993/// parameters. 1994static __inline__ __m128i __DEFAULT_FN_ATTRS 1995_mm_add_epi32(__m128i __a, __m128i __b) 1996{ 1997 return (__m128i)((__v4su)__a + (__v4su)__b); 1998} 1999 2000/// \brief Adds two signed or unsigned 64-bit integer values, returning the 2001/// lower 64 bits of the sum. 2002/// 2003/// \headerfile <x86intrin.h> 2004/// 2005/// This intrinsic corresponds to the \c PADDQ instruction. 2006/// 2007/// \param __a 2008/// A 64-bit integer. 2009/// \param __b 2010/// A 64-bit integer. 2011/// \returns A 64-bit integer containing the sum of both parameters. 2012static __inline__ __m64 __DEFAULT_FN_ATTRS 2013_mm_add_si64(__m64 __a, __m64 __b) 2014{ 2015 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2016} 2017 2018/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2019/// saving the lower 64 bits of each sum in the corresponding element of a 2020/// 128-bit result vector of [2 x i64]. The integer elements of both 2021/// parameters can be either signed or unsigned. 2022/// 2023/// \headerfile <x86intrin.h> 2024/// 2025/// This intrinsic corresponds to the \c VPADDQ / PADDQ instruction. 2026/// 2027/// \param __a 2028/// A 128-bit vector of [2 x i64]. 
2029/// \param __b 2030/// A 128-bit vector of [2 x i64]. 2031/// \returns A 128-bit vector of [2 x i64] containing the sums of both 2032/// parameters. 2033static __inline__ __m128i __DEFAULT_FN_ATTRS 2034_mm_add_epi64(__m128i __a, __m128i __b) 2035{ 2036 return (__m128i)((__v2du)__a + (__v2du)__b); 2037} 2038 2039/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2040/// signed [16 x i8] vectors, saving each sum in the corresponding element of 2041/// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are 2042/// saturated to 7Fh. Negative sums less than 80h are saturated to 80h. 2043/// 2044/// \headerfile <x86intrin.h> 2045/// 2046/// This intrinsic corresponds to the \c VPADDSB / PADDSB instruction. 2047/// 2048/// \param __a 2049/// A 128-bit signed [16 x i8] vector. 2050/// \param __b 2051/// A 128-bit signed [16 x i8] vector. 2052/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2053/// both parameters. 2054static __inline__ __m128i __DEFAULT_FN_ATTRS 2055_mm_adds_epi8(__m128i __a, __m128i __b) 2056{ 2057 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 2058} 2059 2060/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2061/// signed [8 x i16] vectors, saving each sum in the corresponding element of 2062/// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh 2063/// are saturated to 7FFFh. Negative sums less than 8000h are saturated to 2064/// 8000h. 2065/// 2066/// \headerfile <x86intrin.h> 2067/// 2068/// This intrinsic corresponds to the \c VPADDSW / PADDSW instruction. 2069/// 2070/// \param __a 2071/// A 128-bit signed [8 x i16] vector. 2072/// \param __b 2073/// A 128-bit signed [8 x i16] vector. 2074/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2075/// both parameters. 
2076static __inline__ __m128i __DEFAULT_FN_ATTRS 2077_mm_adds_epi16(__m128i __a, __m128i __b) 2078{ 2079 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 2080} 2081 2082/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2083/// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2084/// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh 2085/// are saturated to FFh. Negative sums are saturated to 00h. 2086/// 2087/// \headerfile <x86intrin.h> 2088/// 2089/// This intrinsic corresponds to the \c VPADDUSB / PADDUSB instruction. 2090/// 2091/// \param __a 2092/// A 128-bit unsigned [16 x i8] vector. 2093/// \param __b 2094/// A 128-bit unsigned [16 x i8] vector. 2095/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2096/// of both parameters. 2097static __inline__ __m128i __DEFAULT_FN_ATTRS 2098_mm_adds_epu8(__m128i __a, __m128i __b) 2099{ 2100 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 2101} 2102 2103/// \brief Adds, with saturation, the corresponding elements of two 128-bit 2104/// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2105/// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh 2106/// are saturated to FFFFh. Negative sums are saturated to 0000h. 2107/// 2108/// \headerfile <x86intrin.h> 2109/// 2110/// This intrinsic corresponds to the \c VPADDUSB / PADDUSB instruction. 2111/// 2112/// \param __a 2113/// A 128-bit unsigned [8 x i16] vector. 2114/// \param __b 2115/// A 128-bit unsigned [8 x i16] vector. 2116/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2117/// of both parameters. 
2118static __inline__ __m128i __DEFAULT_FN_ATTRS 2119_mm_adds_epu16(__m128i __a, __m128i __b) 2120{ 2121 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 2122} 2123 2124/// \brief Computes the rounded avarages of corresponding elements of two 2125/// 128-bit unsigned [16 x i8] vectors, saving each result in the 2126/// corresponding element of a 128-bit result vector of [16 x i8]. 2127/// 2128/// \headerfile <x86intrin.h> 2129/// 2130/// This intrinsic corresponds to the \c VPAVGB / PAVGB instruction. 2131/// 2132/// \param __a 2133/// A 128-bit unsigned [16 x i8] vector. 2134/// \param __b 2135/// A 128-bit unsigned [16 x i8] vector. 2136/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2137/// averages of both parameters. 2138static __inline__ __m128i __DEFAULT_FN_ATTRS 2139_mm_avg_epu8(__m128i __a, __m128i __b) 2140{ 2141 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2142} 2143 2144/// \brief Computes the rounded avarages of corresponding elements of two 2145/// 128-bit unsigned [8 x i16] vectors, saving each result in the 2146/// corresponding element of a 128-bit result vector of [8 x i16]. 2147/// 2148/// \headerfile <x86intrin.h> 2149/// 2150/// This intrinsic corresponds to the \c VPAVGW / PAVGW instruction. 2151/// 2152/// \param __a 2153/// A 128-bit unsigned [8 x i16] vector. 2154/// \param __b 2155/// A 128-bit unsigned [8 x i16] vector. 2156/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2157/// averages of both parameters. 
2158static __inline__ __m128i __DEFAULT_FN_ATTRS 2159_mm_avg_epu16(__m128i __a, __m128i __b) 2160{ 2161 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2162} 2163 2164/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2165/// vectors, producing eight intermediate 32-bit signed integer products, and 2166/// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2167/// [4 x i32] vector. For example, bits [15:0] of both parameters are 2168/// multiplied producing a 32-bit product, bits [31:16] of both parameters 2169/// are multiplied producing a 32-bit product, and the sum of those two 2170/// products becomes bits [31:0] of the result. 2171/// 2172/// \headerfile <x86intrin.h> 2173/// 2174/// This intrinsic corresponds to the \c VPMADDWD / PMADDWD instruction. 2175/// 2176/// \param __a 2177/// A 128-bit signed [8 x i16] vector. 2178/// \param __b 2179/// A 128-bit signed [8 x i16] vector. 2180/// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2181/// of both parameters. 2182static __inline__ __m128i __DEFAULT_FN_ATTRS 2183_mm_madd_epi16(__m128i __a, __m128i __b) 2184{ 2185 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2186} 2187 2188/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] 2189/// vectors, saving the greater value from each comparison in the 2190/// corresponding element of a 128-bit result vector of [8 x i16]. 2191/// 2192/// \headerfile <x86intrin.h> 2193/// 2194/// This intrinsic corresponds to the \c VPMAXSW / PMAXSW instruction. 2195/// 2196/// \param __a 2197/// A 128-bit signed [8 x i16] vector. 2198/// \param __b 2199/// A 128-bit signed [8 x i16] vector. 2200/// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2201/// each comparison. 
2202static __inline__ __m128i __DEFAULT_FN_ATTRS 2203_mm_max_epi16(__m128i __a, __m128i __b) 2204{ 2205 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 2206} 2207 2208/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] 2209/// vectors, saving the greater value from each comparison in the 2210/// corresponding element of a 128-bit result vector of [16 x i8]. 2211/// 2212/// \headerfile <x86intrin.h> 2213/// 2214/// This intrinsic corresponds to the \c VPMAXUB / PMAXUB instruction. 2215/// 2216/// \param __a 2217/// A 128-bit unsigned [16 x i8] vector. 2218/// \param __b 2219/// A 128-bit unsigned [16 x i8] vector. 2220/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2221/// each comparison. 2222static __inline__ __m128i __DEFAULT_FN_ATTRS 2223_mm_max_epu8(__m128i __a, __m128i __b) 2224{ 2225 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 2226} 2227 2228/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] 2229/// vectors, saving the smaller value from each comparison in the 2230/// corresponding element of a 128-bit result vector of [8 x i16]. 2231/// 2232/// \headerfile <x86intrin.h> 2233/// 2234/// This intrinsic corresponds to the \c VPMINSW / PMINSW instruction. 2235/// 2236/// \param __a 2237/// A 128-bit signed [8 x i16] vector. 2238/// \param __b 2239/// A 128-bit signed [8 x i16] vector. 2240/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2241/// each comparison. 2242static __inline__ __m128i __DEFAULT_FN_ATTRS 2243_mm_min_epi16(__m128i __a, __m128i __b) 2244{ 2245 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 2246} 2247 2248/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] 2249/// vectors, saving the smaller value from each comparison in the 2250/// corresponding element of a 128-bit result vector of [16 x i8]. 
2251/// 2252/// \headerfile <x86intrin.h> 2253/// 2254/// This intrinsic corresponds to the \c VPMINUB / PMINUB instruction. 2255/// 2256/// \param __a 2257/// A 128-bit unsigned [16 x i8] vector. 2258/// \param __b 2259/// A 128-bit unsigned [16 x i8] vector. 2260/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2261/// each comparison. 2262static __inline__ __m128i __DEFAULT_FN_ATTRS 2263_mm_min_epu8(__m128i __a, __m128i __b) 2264{ 2265 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 2266} 2267 2268/// \brief Multiplies the corresponding elements of two signed [8 x i16] 2269/// vectors, saving the upper 16 bits of each 32-bit product in the 2270/// corresponding element of a 128-bit signed [8 x i16] result vector. 2271/// 2272/// \headerfile <x86intrin.h> 2273/// 2274/// This intrinsic corresponds to the \c VPMULHW / PMULHW instruction. 2275/// 2276/// \param __a 2277/// A 128-bit signed [8 x i16] vector. 2278/// \param __b 2279/// A 128-bit signed [8 x i16] vector. 2280/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2281/// each of the eight 32-bit products. 2282static __inline__ __m128i __DEFAULT_FN_ATTRS 2283_mm_mulhi_epi16(__m128i __a, __m128i __b) 2284{ 2285 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2286} 2287 2288/// \brief Multiplies the corresponding elements of two unsigned [8 x i16] 2289/// vectors, saving the upper 16 bits of each 32-bit product in the 2290/// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2291/// 2292/// \headerfile <x86intrin.h> 2293/// 2294/// This intrinsic corresponds to the \c VPMULHUW / PMULHUW instruction. 2295/// 2296/// \param __a 2297/// A 128-bit unsigned [8 x i16] vector. 2298/// \param __b 2299/// A 128-bit unsigned [8 x i16] vector. 2300/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2301/// of each of the eight 32-bit products. 
2302static __inline__ __m128i __DEFAULT_FN_ATTRS 2303_mm_mulhi_epu16(__m128i __a, __m128i __b) 2304{ 2305 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2306} 2307 2308/// \brief Multiplies the corresponding elements of two signed [8 x i16] 2309/// vectors, saving the lower 16 bits of each 32-bit product in the 2310/// corresponding element of a 128-bit signed [8 x i16] result vector. 2311/// 2312/// \headerfile <x86intrin.h> 2313/// 2314/// This intrinsic corresponds to the \c VPMULLW / PMULLW instruction. 2315/// 2316/// \param __a 2317/// A 128-bit signed [8 x i16] vector. 2318/// \param __b 2319/// A 128-bit signed [8 x i16] vector. 2320/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2321/// each of the eight 32-bit products. 2322static __inline__ __m128i __DEFAULT_FN_ATTRS 2323_mm_mullo_epi16(__m128i __a, __m128i __b) 2324{ 2325 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2326} 2327 2328/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits 2329/// of the two 64-bit integer vectors and returns the 64-bit unsigned 2330/// product. 2331/// 2332/// \headerfile <x86intrin.h> 2333/// 2334/// This intrinsic corresponds to the \c PMULUDQ instruction. 2335/// 2336/// \param __a 2337/// A 64-bit integer containing one of the source operands. 2338/// \param __b 2339/// A 64-bit integer containing one of the source operands. 2340/// \returns A 64-bit integer vector containing the product of both operands. 2341static __inline__ __m64 __DEFAULT_FN_ATTRS 2342_mm_mul_su32(__m64 __a, __m64 __b) 2343{ 2344 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2345} 2346 2347/// \brief Multiplies 32-bit unsigned integer values contained in the lower 2348/// bits of the corresponding elements of two [2 x i64] vectors, and returns 2349/// the 64-bit products in the corresponding elements of a [2 x i64] vector. 
2350/// 2351/// \headerfile <x86intrin.h> 2352/// 2353/// This intrinsic corresponds to the \c VPMULUDQ / PMULUDQ instruction. 2354/// 2355/// \param __a 2356/// A [2 x i64] vector containing one of the source operands. 2357/// \param __b 2358/// A [2 x i64] vector containing one of the source operands. 2359/// \returns A [2 x i64] vector containing the product of both operands. 2360static __inline__ __m128i __DEFAULT_FN_ATTRS 2361_mm_mul_epu32(__m128i __a, __m128i __b) 2362{ 2363 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2364} 2365 2366/// \brief Computes the absolute differences of corresponding 8-bit integer 2367/// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2368/// separately sums the second 8 absolute differences. Packss these two 2369/// unsigned 16-bit integer sums into the upper and lower elements of a 2370/// [2 x i64] vector. 2371/// 2372/// \headerfile <x86intrin.h> 2373/// 2374/// This intrinsic corresponds to the \c VPSADBW / PSADBW instruction. 2375/// 2376/// \param __a 2377/// A 128-bit integer vector containing one of the source operands. 2378/// \param __b 2379/// A 128-bit integer vector containing one of the source operands. 2380/// \returns A [2 x i64] vector containing the sums of the sets of absolute 2381/// differences between both operands. 2382static __inline__ __m128i __DEFAULT_FN_ATTRS 2383_mm_sad_epu8(__m128i __a, __m128i __b) 2384{ 2385 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2386} 2387 2388/// \brief Subtracts the corresponding 8-bit integer values in the operands. 2389/// 2390/// \headerfile <x86intrin.h> 2391/// 2392/// This intrinsic corresponds to the \c VPSUBB / PSUBB instruction. 2393/// 2394/// \param __a 2395/// A 128-bit integer vector containing the minuends. 2396/// \param __b 2397/// A 128-bit integer vector containing the subtrahends. 2398/// \returns A 128-bit integer vector containing the differences of the values 2399/// in the operands. 
2400static __inline__ __m128i __DEFAULT_FN_ATTRS 2401_mm_sub_epi8(__m128i __a, __m128i __b) 2402{ 2403 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2404} 2405 2406/// \brief Subtracts the corresponding 16-bit integer values in the operands. 2407/// 2408/// \headerfile <x86intrin.h> 2409/// 2410/// This intrinsic corresponds to the \c VPSUBW / PSUBW instruction. 2411/// 2412/// \param __a 2413/// A 128-bit integer vector containing the minuends. 2414/// \param __b 2415/// A 128-bit integer vector containing the subtrahends. 2416/// \returns A 128-bit integer vector containing the differences of the values 2417/// in the operands. 2418static __inline__ __m128i __DEFAULT_FN_ATTRS 2419_mm_sub_epi16(__m128i __a, __m128i __b) 2420{ 2421 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2422} 2423 2424/// \brief Subtracts the corresponding 32-bit integer values in the operands. 2425/// 2426/// \headerfile <x86intrin.h> 2427/// 2428/// This intrinsic corresponds to the \c VPSUBD / PSUBD instruction. 2429/// 2430/// \param __a 2431/// A 128-bit integer vector containing the minuends. 2432/// \param __b 2433/// A 128-bit integer vector containing the subtrahends. 2434/// \returns A 128-bit integer vector containing the differences of the values 2435/// in the operands. 2436static __inline__ __m128i __DEFAULT_FN_ATTRS 2437_mm_sub_epi32(__m128i __a, __m128i __b) 2438{ 2439 return (__m128i)((__v4su)__a - (__v4su)__b); 2440} 2441 2442/// \brief Subtracts signed or unsigned 64-bit integer values and writes the 2443/// difference to the corresponding bits in the destination. 2444/// 2445/// \headerfile <x86intrin.h> 2446/// 2447/// This intrinsic corresponds to the \c PSUBQ instruction. 2448/// 2449/// \param __a 2450/// A 64-bit integer vector containing the minuend. 2451/// \param __b 2452/// A 64-bit integer vector containing the subtrahend. 2453/// \returns A 64-bit integer vector containing the difference of the values in 2454/// the operands. 
2455static __inline__ __m64 __DEFAULT_FN_ATTRS 2456_mm_sub_si64(__m64 __a, __m64 __b) 2457{ 2458 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2459} 2460 2461/// \brief Subtracts the corresponding elements of two [2 x i64] vectors. 2462/// 2463/// \headerfile <x86intrin.h> 2464/// 2465/// This intrinsic corresponds to the \c VPSUBQ / PSUBQ instruction. 2466/// 2467/// \param __a 2468/// A 128-bit integer vector containing the minuends. 2469/// \param __b 2470/// A 128-bit integer vector containing the subtrahends. 2471/// \returns A 128-bit integer vector containing the differences of the values 2472/// in the operands. 2473static __inline__ __m128i __DEFAULT_FN_ATTRS 2474_mm_sub_epi64(__m128i __a, __m128i __b) 2475{ 2476 return (__m128i)((__v2du)__a - (__v2du)__b); 2477} 2478 2479/// \brief Subtracts corresponding 8-bit signed integer values in the input and 2480/// returns the differences in the corresponding bytes in the destination. 2481/// Differences greater than 7Fh are saturated to 7Fh, and differences less 2482/// than 80h are saturated to 80h. 2483/// 2484/// \headerfile <x86intrin.h> 2485/// 2486/// This intrinsic corresponds to the \c VPSUBSB / PSUBSB instruction. 2487/// 2488/// \param __a 2489/// A 128-bit integer vector containing the minuends. 2490/// \param __b 2491/// A 128-bit integer vector containing the subtrahends. 2492/// \returns A 128-bit integer vector containing the differences of the values 2493/// in the operands. 2494static __inline__ __m128i __DEFAULT_FN_ATTRS 2495_mm_subs_epi8(__m128i __a, __m128i __b) 2496{ 2497 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 2498} 2499 2500/// \brief Subtracts corresponding 16-bit signed integer values in the input and 2501/// returns the differences in the corresponding bytes in the destination. 2502/// Differences greater than 7FFFh are saturated to 7FFFh, and values less 2503/// than 8000h are saturated to 8000h. 
2504/// 2505/// \headerfile <x86intrin.h> 2506/// 2507/// This intrinsic corresponds to the \c VPSUBSW / PSUBSW instruction. 2508/// 2509/// \param __a 2510/// A 128-bit integer vector containing the minuends. 2511/// \param __b 2512/// A 128-bit integer vector containing the subtrahends. 2513/// \returns A 128-bit integer vector containing the differences of the values 2514/// in the operands. 2515static __inline__ __m128i __DEFAULT_FN_ATTRS 2516_mm_subs_epi16(__m128i __a, __m128i __b) 2517{ 2518 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 2519} 2520 2521/// \brief Subtracts corresponding 8-bit unsigned integer values in the input 2522/// and returns the differences in the corresponding bytes in the 2523/// destination. Differences less than 00h are saturated to 00h. 2524/// 2525/// \headerfile <x86intrin.h> 2526/// 2527/// This intrinsic corresponds to the \c VPSUBUSB / PSUBUSB instruction. 2528/// 2529/// \param __a 2530/// A 128-bit integer vector containing the minuends. 2531/// \param __b 2532/// A 128-bit integer vector containing the subtrahends. 2533/// \returns A 128-bit integer vector containing the unsigned integer 2534/// differences of the values in the operands. 2535static __inline__ __m128i __DEFAULT_FN_ATTRS 2536_mm_subs_epu8(__m128i __a, __m128i __b) 2537{ 2538 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 2539} 2540 2541/// \brief Subtracts corresponding 16-bit unsigned integer values in the input 2542/// and returns the differences in the corresponding bytes in the 2543/// destination. Differences less than 0000h are saturated to 0000h. 2544/// 2545/// \headerfile <x86intrin.h> 2546/// 2547/// This intrinsic corresponds to the \c VPSUBUSW / PSUBUSW instruction. 2548/// 2549/// \param __a 2550/// A 128-bit integer vector containing the minuends. 2551/// \param __b 2552/// A 128-bit integer vector containing the subtrahends. 
2553/// \returns A 128-bit integer vector containing the unsigned integer 2554/// differences of the values in the operands. 2555static __inline__ __m128i __DEFAULT_FN_ATTRS 2556_mm_subs_epu16(__m128i __a, __m128i __b) 2557{ 2558 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 2559} 2560 2561/// \brief Performs a bitwise AND of two 128-bit integer vectors. 2562/// 2563/// \headerfile <x86intrin.h> 2564/// 2565/// This intrinsic corresponds to the \c VPAND / PAND instruction. 2566/// 2567/// \param __a 2568/// A 128-bit integer vector containing one of the source operands. 2569/// \param __b 2570/// A 128-bit integer vector containing one of the source operands. 2571/// \returns A 128-bit integer vector containing the bitwise AND of the values 2572/// in both operands. 2573static __inline__ __m128i __DEFAULT_FN_ATTRS 2574_mm_and_si128(__m128i __a, __m128i __b) 2575{ 2576 return (__m128i)((__v2du)__a & (__v2du)__b); 2577} 2578 2579/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the 2580/// one's complement of the values contained in the first source operand. 2581/// 2582/// \headerfile <x86intrin.h> 2583/// 2584/// This intrinsic corresponds to the \c VPANDN / PANDN instruction. 2585/// 2586/// \param __a 2587/// A 128-bit vector containing the left source operand. The one's complement 2588/// of this value is used in the bitwise AND. 2589/// \param __b 2590/// A 128-bit vector containing the right source operand. 2591/// \returns A 128-bit integer vector containing the bitwise AND of the one's 2592/// complement of the first operand and the values in the second operand. 2593static __inline__ __m128i __DEFAULT_FN_ATTRS 2594_mm_andnot_si128(__m128i __a, __m128i __b) 2595{ 2596 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2597} 2598/// \brief Performs a bitwise OR of two 128-bit integer vectors. 2599/// 2600/// \headerfile <x86intrin.h> 2601/// 2602/// This intrinsic corresponds to the \c VPOR / POR instruction. 
2603/// 2604/// \param __a 2605/// A 128-bit integer vector containing one of the source operands. 2606/// \param __b 2607/// A 128-bit integer vector containing one of the source operands. 2608/// \returns A 128-bit integer vector containing the bitwise OR of the values 2609/// in both operands. 2610static __inline__ __m128i __DEFAULT_FN_ATTRS 2611_mm_or_si128(__m128i __a, __m128i __b) 2612{ 2613 return (__m128i)((__v2du)__a | (__v2du)__b); 2614} 2615 2616/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. 2617/// 2618/// \headerfile <x86intrin.h> 2619/// 2620/// This intrinsic corresponds to the \c VPXOR / PXOR instruction. 2621/// 2622/// \param __a 2623/// A 128-bit integer vector containing one of the source operands. 2624/// \param __b 2625/// A 128-bit integer vector containing one of the source operands. 2626/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2627/// values in both operands. 2628static __inline__ __m128i __DEFAULT_FN_ATTRS 2629_mm_xor_si128(__m128i __a, __m128i __b) 2630{ 2631 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2632} 2633 2634/// \brief Left-shifts the 128-bit integer vector operand by the specified 2635/// number of bytes. Low-order bits are cleared. 2636/// 2637/// \headerfile <x86intrin.h> 2638/// 2639/// \code 2640/// __m128i _mm_slli_si128(__m128i a, const int imm); 2641/// \endcode 2642/// 2643/// This intrinsic corresponds to the \c VPSLLDQ / PSLLDQ instruction. 2644/// 2645/// \param a 2646/// A 128-bit integer vector containing the source operand. 2647/// \param imm 2648/// An immediate value specifying the number of bytes to left-shift 2649/// operand a. 2650/// \returns A 128-bit integer vector containing the left-shifted value. 2651#define _mm_slli_si128(a, imm) __extension__ ({ \ 2652 (__m128i)__builtin_shufflevector( \ 2653 (__v16qi)_mm_setzero_si128(), \ 2654 (__v16qi)(__m128i)(a), \ 2655 ((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \ 2656 ((char)(imm)&0xF0) ? 
1 : 17 - (char)(imm), \ 2657 ((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \ 2658 ((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \ 2659 ((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \ 2660 ((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \ 2661 ((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \ 2662 ((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \ 2663 ((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \ 2664 ((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \ 2665 ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \ 2666 ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \ 2667 ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \ 2668 ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \ 2669 ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \ 2670 ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); }) 2671 2672#define _mm_bslli_si128(a, imm) \ 2673 _mm_slli_si128((a), (imm)) 2674 2675/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 2676/// by the specified number of bits. Low-order bits are cleared. 2677/// 2678/// \headerfile <x86intrin.h> 2679/// 2680/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 2681/// 2682/// \param __a 2683/// A 128-bit integer vector containing the source operand. 2684/// \param __count 2685/// An integer value specifying the number of bits to left-shift each value 2686/// in operand __a. 2687/// \returns A 128-bit integer vector containing the left-shifted values. 2688static __inline__ __m128i __DEFAULT_FN_ATTRS 2689_mm_slli_epi16(__m128i __a, int __count) 2690{ 2691 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2692} 2693 2694/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 2695/// by the specified number of bits. Low-order bits are cleared. 2696/// 2697/// \headerfile <x86intrin.h> 2698/// 2699/// This intrinsic corresponds to the \c VPSLLW / PSLLW instruction. 2700/// 2701/// \param __a 2702/// A 128-bit integer vector containing the source operand. 
2703/// \param __count 2704/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2705/// to left-shift each value in operand __a. 2706/// \returns A 128-bit integer vector containing the left-shifted values. 2707static __inline__ __m128i __DEFAULT_FN_ATTRS 2708_mm_sll_epi16(__m128i __a, __m128i __count) 2709{ 2710 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2711} 2712 2713/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 2714/// by the specified number of bits. Low-order bits are cleared. 2715/// 2716/// \headerfile <x86intrin.h> 2717/// 2718/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 2719/// 2720/// \param __a 2721/// A 128-bit integer vector containing the source operand. 2722/// \param __count 2723/// An integer value specifying the number of bits to left-shift each value 2724/// in operand __a. 2725/// \returns A 128-bit integer vector containing the left-shifted values. 2726static __inline__ __m128i __DEFAULT_FN_ATTRS 2727_mm_slli_epi32(__m128i __a, int __count) 2728{ 2729 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2730} 2731 2732/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 2733/// by the specified number of bits. Low-order bits are cleared. 2734/// 2735/// \headerfile <x86intrin.h> 2736/// 2737/// This intrinsic corresponds to the \c VPSLLD / PSLLD instruction. 2738/// 2739/// \param __a 2740/// A 128-bit integer vector containing the source operand. 2741/// \param __count 2742/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2743/// to left-shift each value in operand __a. 2744/// \returns A 128-bit integer vector containing the left-shifted values. 
2745static __inline__ __m128i __DEFAULT_FN_ATTRS 2746_mm_sll_epi32(__m128i __a, __m128i __count) 2747{ 2748 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2749} 2750 2751/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 2752/// by the specified number of bits. Low-order bits are cleared. 2753/// 2754/// \headerfile <x86intrin.h> 2755/// 2756/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 2757/// 2758/// \param __a 2759/// A 128-bit integer vector containing the source operand. 2760/// \param __count 2761/// An integer value specifying the number of bits to left-shift each value 2762/// in operand __a. 2763/// \returns A 128-bit integer vector containing the left-shifted values. 2764static __inline__ __m128i __DEFAULT_FN_ATTRS 2765_mm_slli_epi64(__m128i __a, int __count) 2766{ 2767 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2768} 2769 2770/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 2771/// by the specified number of bits. Low-order bits are cleared. 2772/// 2773/// \headerfile <x86intrin.h> 2774/// 2775/// This intrinsic corresponds to the \c VPSLLQ / PSLLQ instruction. 2776/// 2777/// \param __a 2778/// A 128-bit integer vector containing the source operand. 2779/// \param __count 2780/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2781/// to left-shift each value in operand __a. 2782/// \returns A 128-bit integer vector containing the left-shifted values. 2783static __inline__ __m128i __DEFAULT_FN_ATTRS 2784_mm_sll_epi64(__m128i __a, __m128i __count) 2785{ 2786 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2787} 2788 2789/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 2790/// by the specified number of bits. High-order bits are filled with the sign 2791/// bit of the initial value. 
2792/// 2793/// \headerfile <x86intrin.h> 2794/// 2795/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 2796/// 2797/// \param __a 2798/// A 128-bit integer vector containing the source operand. 2799/// \param __count 2800/// An integer value specifying the number of bits to right-shift each value 2801/// in operand __a. 2802/// \returns A 128-bit integer vector containing the right-shifted values. 2803static __inline__ __m128i __DEFAULT_FN_ATTRS 2804_mm_srai_epi16(__m128i __a, int __count) 2805{ 2806 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2807} 2808 2809/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand 2810/// by the specified number of bits. High-order bits are filled with the sign 2811/// bit of the initial value. 2812/// 2813/// \headerfile <x86intrin.h> 2814/// 2815/// This intrinsic corresponds to the \c VPSRAW / PSRAW instruction. 2816/// 2817/// \param __a 2818/// A 128-bit integer vector containing the source operand. 2819/// \param __count 2820/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2821/// to right-shift each value in operand __a. 2822/// \returns A 128-bit integer vector containing the right-shifted values. 2823static __inline__ __m128i __DEFAULT_FN_ATTRS 2824_mm_sra_epi16(__m128i __a, __m128i __count) 2825{ 2826 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2827} 2828 2829/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 2830/// by the specified number of bits. High-order bits are filled with the sign 2831/// bit of the initial value. 2832/// 2833/// \headerfile <x86intrin.h> 2834/// 2835/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 2836/// 2837/// \param __a 2838/// A 128-bit integer vector containing the source operand. 2839/// \param __count 2840/// An integer value specifying the number of bits to right-shift each value 2841/// in operand __a. 
2842/// \returns A 128-bit integer vector containing the right-shifted values. 2843static __inline__ __m128i __DEFAULT_FN_ATTRS 2844_mm_srai_epi32(__m128i __a, int __count) 2845{ 2846 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2847} 2848 2849/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand 2850/// by the specified number of bits. High-order bits are filled with the sign 2851/// bit of the initial value. 2852/// 2853/// \headerfile <x86intrin.h> 2854/// 2855/// This intrinsic corresponds to the \c VPSRAD / PSRAD instruction. 2856/// 2857/// \param __a 2858/// A 128-bit integer vector containing the source operand. 2859/// \param __count 2860/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2861/// to right-shift each value in operand __a. 2862/// \returns A 128-bit integer vector containing the right-shifted values. 2863static __inline__ __m128i __DEFAULT_FN_ATTRS 2864_mm_sra_epi32(__m128i __a, __m128i __count) 2865{ 2866 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 2867} 2868 2869/// \brief Right-shifts the 128-bit integer vector operand by the specified 2870/// number of bytes. High-order bits are cleared. 2871/// 2872/// \headerfile <x86intrin.h> 2873/// 2874/// \code 2875/// __m128i _mm_srli_si128(__m128i a, const int imm); 2876/// \endcode 2877/// 2878/// This intrinsic corresponds to the \c VPSRLDQ / PSRLDQ instruction. 2879/// 2880/// \param a 2881/// A 128-bit integer vector containing the source operand. 2882/// \param imm 2883/// An immediate value specifying the number of bytes to right-shift operand 2884/// a. 2885/// \returns A 128-bit integer vector containing the right-shifted value. 2886#define _mm_srli_si128(a, imm) __extension__ ({ \ 2887 (__m128i)__builtin_shufflevector( \ 2888 (__v16qi)(__m128i)(a), \ 2889 (__v16qi)_mm_setzero_si128(), \ 2890 ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \ 2891 ((char)(imm)&0xF0) ? 
17 : (char)(imm) + 1, \ 2892 ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \ 2893 ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \ 2894 ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \ 2895 ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \ 2896 ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \ 2897 ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \ 2898 ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \ 2899 ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \ 2900 ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \ 2901 ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \ 2902 ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \ 2903 ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \ 2904 ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \ 2905 ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); }) 2906 2907#define _mm_bsrli_si128(a, imm) \ 2908 _mm_srli_si128((a), (imm)) 2909 2910/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 2911/// operand by the specified number of bits. High-order bits are cleared. 2912/// 2913/// \headerfile <x86intrin.h> 2914/// 2915/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 2916/// 2917/// \param __a 2918/// A 128-bit integer vector containing the source operand. 2919/// \param __count 2920/// An integer value specifying the number of bits to right-shift each value 2921/// in operand __a. 2922/// \returns A 128-bit integer vector containing the right-shifted values. 2923static __inline__ __m128i __DEFAULT_FN_ATTRS 2924_mm_srli_epi16(__m128i __a, int __count) 2925{ 2926 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 2927} 2928 2929/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector 2930/// operand by the specified number of bits. High-order bits are cleared. 2931/// 2932/// \headerfile <x86intrin.h> 2933/// 2934/// This intrinsic corresponds to the \c VPSRLW / PSRLW instruction. 2935/// 2936/// \param __a 2937/// A 128-bit integer vector containing the source operand. 
2938/// \param __count 2939/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2940/// to right-shift each value in operand __a. 2941/// \returns A 128-bit integer vector containing the right-shifted values. 2942static __inline__ __m128i __DEFAULT_FN_ATTRS 2943_mm_srl_epi16(__m128i __a, __m128i __count) 2944{ 2945 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 2946} 2947 2948/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 2949/// operand by the specified number of bits. High-order bits are cleared. 2950/// 2951/// \headerfile <x86intrin.h> 2952/// 2953/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 2954/// 2955/// \param __a 2956/// A 128-bit integer vector containing the source operand. 2957/// \param __count 2958/// An integer value specifying the number of bits to right-shift each value 2959/// in operand __a. 2960/// \returns A 128-bit integer vector containing the right-shifted values. 2961static __inline__ __m128i __DEFAULT_FN_ATTRS 2962_mm_srli_epi32(__m128i __a, int __count) 2963{ 2964 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 2965} 2966 2967/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector 2968/// operand by the specified number of bits. High-order bits are cleared. 2969/// 2970/// \headerfile <x86intrin.h> 2971/// 2972/// This intrinsic corresponds to the \c VPSRLD / PSRLD instruction. 2973/// 2974/// \param __a 2975/// A 128-bit integer vector containing the source operand. 2976/// \param __count 2977/// A 128-bit integer vector in which bits [63:0] specify the number of bits 2978/// to right-shift each value in operand __a. 2979/// \returns A 128-bit integer vector containing the right-shifted values. 
2980static __inline__ __m128i __DEFAULT_FN_ATTRS 2981_mm_srl_epi32(__m128i __a, __m128i __count) 2982{ 2983 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 2984} 2985 2986/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 2987/// operand by the specified number of bits. High-order bits are cleared. 2988/// 2989/// \headerfile <x86intrin.h> 2990/// 2991/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 2992/// 2993/// \param __a 2994/// A 128-bit integer vector containing the source operand. 2995/// \param __count 2996/// An integer value specifying the number of bits to right-shift each value 2997/// in operand __a. 2998/// \returns A 128-bit integer vector containing the right-shifted values. 2999static __inline__ __m128i __DEFAULT_FN_ATTRS 3000_mm_srli_epi64(__m128i __a, int __count) 3001{ 3002 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3003} 3004 3005/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector 3006/// operand by the specified number of bits. High-order bits are cleared. 3007/// 3008/// \headerfile <x86intrin.h> 3009/// 3010/// This intrinsic corresponds to the \c VPSRLQ / PSRLQ instruction. 3011/// 3012/// \param __a 3013/// A 128-bit integer vector containing the source operand. 3014/// \param __count 3015/// A 128-bit integer vector in which bits [63:0] specify the number of bits 3016/// to right-shift each value in operand __a. 3017/// \returns A 128-bit integer vector containing the right-shifted values. 3018static __inline__ __m128i __DEFAULT_FN_ATTRS 3019_mm_srl_epi64(__m128i __a, __m128i __count) 3020{ 3021 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3022} 3023 3024/// \brief Compares each of the corresponding 8-bit values of the 128-bit 3025/// integer vectors for equality. Each comparison yields 0h for false, FFh 3026/// for true. 
3027/// 3028/// \headerfile <x86intrin.h> 3029/// 3030/// This intrinsic corresponds to the \c VPCMPEQB / PCMPEQB instruction. 3031/// 3032/// \param __a 3033/// A 128-bit integer vector. 3034/// \param __b 3035/// A 128-bit integer vector. 3036/// \returns A 128-bit integer vector containing the comparison results. 3037static __inline__ __m128i __DEFAULT_FN_ATTRS 3038_mm_cmpeq_epi8(__m128i __a, __m128i __b) 3039{ 3040 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3041} 3042 3043/// \brief Compares each of the corresponding 16-bit values of the 128-bit 3044/// integer vectors for equality. Each comparison yields 0h for false, FFFFh 3045/// for true. 3046/// 3047/// \headerfile <x86intrin.h> 3048/// 3049/// This intrinsic corresponds to the \c VPCMPEQW / PCMPEQW instruction. 3050/// 3051/// \param __a 3052/// A 128-bit integer vector. 3053/// \param __b 3054/// A 128-bit integer vector. 3055/// \returns A 128-bit integer vector containing the comparison results. 3056static __inline__ __m128i __DEFAULT_FN_ATTRS 3057_mm_cmpeq_epi16(__m128i __a, __m128i __b) 3058{ 3059 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3060} 3061 3062/// \brief Compares each of the corresponding 32-bit values of the 128-bit 3063/// integer vectors for equality. Each comparison yields 0h for false, 3064/// FFFFFFFFh for true. 3065/// 3066/// \headerfile <x86intrin.h> 3067/// 3068/// This intrinsic corresponds to the \c VPCMPEQD / PCMPEQD instruction. 3069/// 3070/// \param __a 3071/// A 128-bit integer vector. 3072/// \param __b 3073/// A 128-bit integer vector. 3074/// \returns A 128-bit integer vector containing the comparison results. 
3075static __inline__ __m128i __DEFAULT_FN_ATTRS 3076_mm_cmpeq_epi32(__m128i __a, __m128i __b) 3077{ 3078 return (__m128i)((__v4si)__a == (__v4si)__b); 3079} 3080 3081/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 3082/// integer vectors to determine if the values in the first operand are 3083/// greater than those in the second operand. Each comparison yields 0h for 3084/// false, FFh for true. 3085/// 3086/// \headerfile <x86intrin.h> 3087/// 3088/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 3089/// 3090/// \param __a 3091/// A 128-bit integer vector. 3092/// \param __b 3093/// A 128-bit integer vector. 3094/// \returns A 128-bit integer vector containing the comparison results. 3095static __inline__ __m128i __DEFAULT_FN_ATTRS 3096_mm_cmpgt_epi8(__m128i __a, __m128i __b) 3097{ 3098 /* This function always performs a signed comparison, but __v16qi is a char 3099 which may be signed or unsigned, so use __v16qs. */ 3100 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3101} 3102 3103/// \brief Compares each of the corresponding signed 16-bit values of the 3104/// 128-bit integer vectors to determine if the values in the first operand 3105/// are greater than those in the second operand. Each comparison yields 0h 3106/// for false, FFFFh for true. 3107/// 3108/// \headerfile <x86intrin.h> 3109/// 3110/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 3111/// 3112/// \param __a 3113/// A 128-bit integer vector. 3114/// \param __b 3115/// A 128-bit integer vector. 3116/// \returns A 128-bit integer vector containing the comparison results. 
3117static __inline__ __m128i __DEFAULT_FN_ATTRS 3118_mm_cmpgt_epi16(__m128i __a, __m128i __b) 3119{ 3120 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3121} 3122 3123/// \brief Compares each of the corresponding signed 32-bit values of the 3124/// 128-bit integer vectors to determine if the values in the first operand 3125/// are greater than those in the second operand. Each comparison yields 0h 3126/// for false, FFFFFFFFh for true. 3127/// 3128/// \headerfile <x86intrin.h> 3129/// 3130/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 3131/// 3132/// \param __a 3133/// A 128-bit integer vector. 3134/// \param __b 3135/// A 128-bit integer vector. 3136/// \returns A 128-bit integer vector containing the comparison results. 3137static __inline__ __m128i __DEFAULT_FN_ATTRS 3138_mm_cmpgt_epi32(__m128i __a, __m128i __b) 3139{ 3140 return (__m128i)((__v4si)__a > (__v4si)__b); 3141} 3142 3143/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit 3144/// integer vectors to determine if the values in the first operand are less 3145/// than those in the second operand. Each comparison yields 0h for false, 3146/// FFh for true. 3147/// 3148/// \headerfile <x86intrin.h> 3149/// 3150/// This intrinsic corresponds to the \c VPCMPGTB / PCMPGTB instruction. 3151/// 3152/// \param __a 3153/// A 128-bit integer vector. 3154/// \param __b 3155/// A 128-bit integer vector. 3156/// \returns A 128-bit integer vector containing the comparison results. 3157static __inline__ __m128i __DEFAULT_FN_ATTRS 3158_mm_cmplt_epi8(__m128i __a, __m128i __b) 3159{ 3160 return _mm_cmpgt_epi8(__b, __a); 3161} 3162 3163/// \brief Compares each of the corresponding signed 16-bit values of the 3164/// 128-bit integer vectors to determine if the values in the first operand 3165/// are less than those in the second operand. Each comparison yields 0h for 3166/// false, FFFFh for true. 
3167/// 3168/// \headerfile <x86intrin.h> 3169/// 3170/// This intrinsic corresponds to the \c VPCMPGTW / PCMPGTW instruction. 3171/// 3172/// \param __a 3173/// A 128-bit integer vector. 3174/// \param __b 3175/// A 128-bit integer vector. 3176/// \returns A 128-bit integer vector containing the comparison results. 3177static __inline__ __m128i __DEFAULT_FN_ATTRS 3178_mm_cmplt_epi16(__m128i __a, __m128i __b) 3179{ 3180 return _mm_cmpgt_epi16(__b, __a); 3181} 3182 3183/// \brief Compares each of the corresponding signed 32-bit values of the 3184/// 128-bit integer vectors to determine if the values in the first operand 3185/// are less than those in the second operand. Each comparison yields 0h for 3186/// false, FFFFFFFFh for true. 3187/// 3188/// \headerfile <x86intrin.h> 3189/// 3190/// This intrinsic corresponds to the \c VPCMPGTD / PCMPGTD instruction. 3191/// 3192/// \param __a 3193/// A 128-bit integer vector. 3194/// \param __b 3195/// A 128-bit integer vector. 3196/// \returns A 128-bit integer vector containing the comparison results. 3197static __inline__ __m128i __DEFAULT_FN_ATTRS 3198_mm_cmplt_epi32(__m128i __a, __m128i __b) 3199{ 3200 return _mm_cmpgt_epi32(__b, __a); 3201} 3202 3203#ifdef __x86_64__ 3204/// \brief Converts a 64-bit signed integer value from the second operand into a 3205/// double-precision value and returns it in the lower element of a [2 x 3206/// double] vector; the upper element of the returned vector is copied from 3207/// the upper element of the first operand. 3208/// 3209/// \headerfile <x86intrin.h> 3210/// 3211/// This intrinsic corresponds to the \c VCVTSI2SD / CVTSI2SD instruction. 3212/// 3213/// \param __a 3214/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3215/// copied to the upper 64 bits of the destination. 3216/// \param __b 3217/// A 64-bit signed integer operand containing the value to be converted. 
3218/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3219/// converted value of the second operand. The upper 64 bits are copied from 3220/// the upper 64 bits of the first operand. 3221static __inline__ __m128d __DEFAULT_FN_ATTRS 3222_mm_cvtsi64_sd(__m128d __a, long long __b) 3223{ 3224 __a[0] = __b; 3225 return __a; 3226} 3227 3228/// \brief Converts the first (lower) element of a vector of [2 x double] into a 3229/// 64-bit signed integer value, according to the current rounding mode. 3230/// 3231/// \headerfile <x86intrin.h> 3232/// 3233/// This intrinsic corresponds to the \c VCVTSD2SI / CVTSD2SI instruction. 3234/// 3235/// \param __a 3236/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3237/// conversion. 3238/// \returns A 64-bit signed integer containing the converted value. 3239static __inline__ long long __DEFAULT_FN_ATTRS 3240_mm_cvtsd_si64(__m128d __a) 3241{ 3242 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3243} 3244 3245/// \brief Converts the first (lower) element of a vector of [2 x double] into a 3246/// 64-bit signed integer value, truncating the result when it is inexact. 3247/// 3248/// \headerfile <x86intrin.h> 3249/// 3250/// This intrinsic corresponds to the \c VCVTTSD2SI / CVTTSD2SI instruction. 3251/// 3252/// \param __a 3253/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3254/// conversion. 3255/// \returns A 64-bit signed integer containing the converted value. 3256static __inline__ long long __DEFAULT_FN_ATTRS 3257_mm_cvttsd_si64(__m128d __a) 3258{ 3259 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3260} 3261#endif 3262 3263/// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. 3264/// 3265/// \headerfile <x86intrin.h> 3266/// 3267/// This intrinsic corresponds to the \c VCVTDQ2PS / CVTDQ2PS instruction. 3268/// 3269/// \param __a 3270/// A 128-bit integer vector. 
3271/// \returns A 128-bit vector of [4 x float] containing the converted values. 3272static __inline__ __m128 __DEFAULT_FN_ATTRS 3273_mm_cvtepi32_ps(__m128i __a) 3274{ 3275 return __builtin_ia32_cvtdq2ps((__v4si)__a); 3276} 3277 3278/// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. 3279/// 3280/// \headerfile <x86intrin.h> 3281/// 3282/// This intrinsic corresponds to the \c VCVTPS2DQ / CVTPS2DQ instruction. 3283/// 3284/// \param __a 3285/// A 128-bit vector of [4 x float]. 3286/// \returns A 128-bit integer vector of [4 x i32] containing the converted 3287/// values. 3288static __inline__ __m128i __DEFAULT_FN_ATTRS 3289_mm_cvtps_epi32(__m128 __a) 3290{ 3291 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3292} 3293 3294/// \brief Converts a vector of [4 x float] into a vector of [4 x i32], 3295/// truncating the result when it is inexact. 3296/// 3297/// \headerfile <x86intrin.h> 3298/// 3299/// This intrinsic corresponds to the \c VCVTTPS2DQ / CVTTPS2DQ instruction. 3300/// 3301/// \param __a 3302/// A 128-bit vector of [4 x float]. 3303/// \returns A 128-bit vector of [4 x i32] containing the converted values. 3304static __inline__ __m128i __DEFAULT_FN_ATTRS 3305_mm_cvttps_epi32(__m128 __a) 3306{ 3307 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3308} 3309 3310/// \brief Returns a vector of [4 x i32] where the lowest element is the input 3311/// operand and the remaining elements are zero. 3312/// 3313/// \headerfile <x86intrin.h> 3314/// 3315/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 3316/// 3317/// \param __a 3318/// A 32-bit signed integer operand. 3319/// \returns A 128-bit vector of [4 x i32]. 3320static __inline__ __m128i __DEFAULT_FN_ATTRS 3321_mm_cvtsi32_si128(int __a) 3322{ 3323 return (__m128i)(__v4si){ __a, 0, 0, 0 }; 3324} 3325 3326#ifdef __x86_64__ 3327/// \brief Returns a vector of [2 x i64] where the lower element is the input 3328/// operand and the upper element is zero. 
3329/// 3330/// \headerfile <x86intrin.h> 3331/// 3332/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 3333/// 3334/// \param __a 3335/// A 64-bit signed integer operand containing the value to be converted. 3336/// \returns A 128-bit vector of [2 x i64] containing the converted value. 3337static __inline__ __m128i __DEFAULT_FN_ATTRS 3338_mm_cvtsi64_si128(long long __a) 3339{ 3340 return (__m128i){ __a, 0 }; 3341} 3342#endif 3343 3344/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a 3345/// 32-bit signed integer value. 3346/// 3347/// \headerfile <x86intrin.h> 3348/// 3349/// This intrinsic corresponds to the \c VMOVD / MOVD instruction. 3350/// 3351/// \param __a 3352/// A vector of [4 x i32]. The least significant 32 bits are moved to the 3353/// destination. 3354/// \returns A 32-bit signed integer containing the moved value. 3355static __inline__ int __DEFAULT_FN_ATTRS 3356_mm_cvtsi128_si32(__m128i __a) 3357{ 3358 __v4si __b = (__v4si)__a; 3359 return __b[0]; 3360} 3361 3362#ifdef __x86_64__ 3363/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a 3364/// 64-bit signed integer value. 3365/// 3366/// \headerfile <x86intrin.h> 3367/// 3368/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 3369/// 3370/// \param __a 3371/// A vector of [2 x i64]. The least significant 64 bits are moved to the 3372/// destination. 3373/// \returns A 64-bit signed integer containing the moved value. 3374static __inline__ long long __DEFAULT_FN_ATTRS 3375_mm_cvtsi128_si64(__m128i __a) 3376{ 3377 return __a[0]; 3378} 3379#endif 3380 3381/// \brief Moves packed integer values from an aligned 128-bit memory location 3382/// to elements in a 128-bit integer vector. 3383/// 3384/// \headerfile <x86intrin.h> 3385/// 3386/// This intrinsic corresponds to the \c VMOVDQA / MOVDQA instruction. 3387/// 3388/// \param __p 3389/// An aligned pointer to a memory location containing integer values. 
3390/// \returns A 128-bit integer vector containing the moved values. 3391static __inline__ __m128i __DEFAULT_FN_ATTRS 3392_mm_load_si128(__m128i const *__p) 3393{ 3394 return *__p; 3395} 3396 3397/// \brief Moves packed integer values from an unaligned 128-bit memory location 3398/// to elements in a 128-bit integer vector. 3399/// 3400/// \headerfile <x86intrin.h> 3401/// 3402/// This intrinsic corresponds to the \c VMOVDQU / MOVDQU instruction. 3403/// 3404/// \param __p 3405/// A pointer to a memory location containing integer values. 3406/// \returns A 128-bit integer vector containing the moved values. 3407static __inline__ __m128i __DEFAULT_FN_ATTRS 3408_mm_loadu_si128(__m128i const *__p) 3409{ 3410 struct __loadu_si128 { 3411 __m128i __v; 3412 } __attribute__((__packed__, __may_alias__)); 3413 return ((struct __loadu_si128*)__p)->__v; 3414} 3415 3416/// \brief Returns a vector of [2 x i64] where the lower element is taken from 3417/// the lower element of the operand, and the upper element is zero. 3418/// 3419/// \headerfile <x86intrin.h> 3420/// 3421/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 3422/// 3423/// \param __p 3424/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3425/// the destination. 3426/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3427/// moved value. The higher order bits are cleared. 3428static __inline__ __m128i __DEFAULT_FN_ATTRS 3429_mm_loadl_epi64(__m128i const *__p) 3430{ 3431 struct __mm_loadl_epi64_struct { 3432 long long __u; 3433 } __attribute__((__packed__, __may_alias__)); 3434 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 3435} 3436 3437/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. 3438/// This could be used as an argument to another intrinsic function where the 3439/// argument is required but the value is not actually used. 
3440/// 3441/// \headerfile <x86intrin.h> 3442/// 3443/// This intrinsic has no corresponding instruction. 3444/// 3445/// \returns A 128-bit vector of [4 x i32] with unspecified content. 3446static __inline__ __m128i __DEFAULT_FN_ATTRS 3447_mm_undefined_si128(void) 3448{ 3449 return (__m128i)__builtin_ia32_undef128(); 3450} 3451 3452/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3453/// the specified 64-bit integer values. 3454/// 3455/// \headerfile <x86intrin.h> 3456/// 3457/// This intrinsic is a utility function and does not correspond to a specific 3458/// instruction. 3459/// 3460/// \param __q1 3461/// A 64-bit integer value used to initialize the upper 64 bits of the 3462/// destination vector of [2 x i64]. 3463/// \param __q0 3464/// A 64-bit integer value used to initialize the lower 64 bits of the 3465/// destination vector of [2 x i64]. 3466/// \returns An initialized 128-bit vector of [2 x i64] containing the values 3467/// provided in the operands. 3468static __inline__ __m128i __DEFAULT_FN_ATTRS 3469_mm_set_epi64x(long long __q1, long long __q0) 3470{ 3471 return (__m128i){ __q0, __q1 }; 3472} 3473 3474/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3475/// the specified 64-bit integer values. 3476/// 3477/// \headerfile <x86intrin.h> 3478/// 3479/// This intrinsic is a utility function and does not correspond to a specific 3480/// instruction. 3481/// 3482/// \param __q1 3483/// A 64-bit integer value used to initialize the upper 64 bits of the 3484/// destination vector of [2 x i64]. 3485/// \param __q0 3486/// A 64-bit integer value used to initialize the lower 64 bits of the 3487/// destination vector of [2 x i64]. 3488/// \returns An initialized 128-bit vector of [2 x i64] containing the values 3489/// provided in the operands. 
3490static __inline__ __m128i __DEFAULT_FN_ATTRS 3491_mm_set_epi64(__m64 __q1, __m64 __q0) 3492{ 3493 return (__m128i){ (long long)__q0, (long long)__q1 }; 3494} 3495 3496/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3497/// the specified 32-bit integer values. 3498/// 3499/// \headerfile <x86intrin.h> 3500/// 3501/// This intrinsic is a utility function and does not correspond to a specific 3502/// instruction. 3503/// 3504/// \param __i3 3505/// A 32-bit integer value used to initialize bits [127:96] of the 3506/// destination vector. 3507/// \param __i2 3508/// A 32-bit integer value used to initialize bits [95:64] of the destination 3509/// vector. 3510/// \param __i1 3511/// A 32-bit integer value used to initialize bits [63:32] of the destination 3512/// vector. 3513/// \param __i0 3514/// A 32-bit integer value used to initialize bits [31:0] of the destination 3515/// vector. 3516/// \returns An initialized 128-bit vector of [4 x i32] containing the values 3517/// provided in the operands. 3518static __inline__ __m128i __DEFAULT_FN_ATTRS 3519_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 3520{ 3521 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3522} 3523 3524/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3525/// the specified 16-bit integer values. 3526/// 3527/// \headerfile <x86intrin.h> 3528/// 3529/// This intrinsic is a utility function and does not correspond to a specific 3530/// instruction. 3531/// 3532/// \param __w7 3533/// A 16-bit integer value used to initialize bits [127:112] of the 3534/// destination vector. 3535/// \param __w6 3536/// A 16-bit integer value used to initialize bits [111:96] of the 3537/// destination vector. 3538/// \param __w5 3539/// A 16-bit integer value used to initialize bits [95:80] of the destination 3540/// vector. 3541/// \param __w4 3542/// A 16-bit integer value used to initialize bits [79:64] of the destination 3543/// vector. 
3544/// \param __w3 3545/// A 16-bit integer value used to initialize bits [63:48] of the destination 3546/// vector. 3547/// \param __w2 3548/// A 16-bit integer value used to initialize bits [47:32] of the destination 3549/// vector. 3550/// \param __w1 3551/// A 16-bit integer value used to initialize bits [31:16] of the destination 3552/// vector. 3553/// \param __w0 3554/// A 16-bit integer value used to initialize bits [15:0] of the destination 3555/// vector. 3556/// \returns An initialized 128-bit vector of [8 x i16] containing the values 3557/// provided in the operands. 3558static __inline__ __m128i __DEFAULT_FN_ATTRS 3559_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 3560{ 3561 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3562} 3563 3564/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3565/// the specified 8-bit integer values. 3566/// 3567/// \headerfile <x86intrin.h> 3568/// 3569/// This intrinsic is a utility function and does not correspond to a specific 3570/// instruction. 3571/// 3572/// \param __b15 3573/// Initializes bits [127:120] of the destination vector. 3574/// \param __b14 3575/// Initializes bits [119:112] of the destination vector. 3576/// \param __b13 3577/// Initializes bits [111:104] of the destination vector. 3578/// \param __b12 3579/// Initializes bits [103:96] of the destination vector. 3580/// \param __b11 3581/// Initializes bits [95:88] of the destination vector. 3582/// \param __b10 3583/// Initializes bits [87:80] of the destination vector. 3584/// \param __b9 3585/// Initializes bits [79:72] of the destination vector. 3586/// \param __b8 3587/// Initializes bits [71:64] of the destination vector. 3588/// \param __b7 3589/// Initializes bits [63:56] of the destination vector. 3590/// \param __b6 3591/// Initializes bits [55:48] of the destination vector. 
3592/// \param __b5 3593/// Initializes bits [47:40] of the destination vector. 3594/// \param __b4 3595/// Initializes bits [39:32] of the destination vector. 3596/// \param __b3 3597/// Initializes bits [31:24] of the destination vector. 3598/// \param __b2 3599/// Initializes bits [23:16] of the destination vector. 3600/// \param __b1 3601/// Initializes bits [15:8] of the destination vector. 3602/// \param __b0 3603/// Initializes bits [7:0] of the destination vector. 3604/// \returns An initialized 128-bit vector of [16 x i8] containing the values 3605/// provided in the operands. 3606static __inline__ __m128i __DEFAULT_FN_ATTRS 3607_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 3608{ 3609 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3610} 3611 3612/// \brief Initializes both values in a 128-bit integer vector with the 3613/// specified 64-bit integer value. 3614/// 3615/// \headerfile <x86intrin.h> 3616/// 3617/// This intrinsic is a utility function and does not correspond to a specific 3618/// instruction. 3619/// 3620/// \param __q 3621/// Integer value used to initialize the elements of the destination integer 3622/// vector. 3623/// \returns An initialized 128-bit integer vector of [2 x i64] with both 3624/// elements containing the value provided in the operand. 3625static __inline__ __m128i __DEFAULT_FN_ATTRS 3626_mm_set1_epi64x(long long __q) 3627{ 3628 return (__m128i){ __q, __q }; 3629} 3630 3631/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the 3632/// specified 64-bit value. 3633/// 3634/// \headerfile <x86intrin.h> 3635/// 3636/// This intrinsic is a utility function and does not correspond to a specific 3637/// instruction. 
3638/// 3639/// \param __q 3640/// A 64-bit value used to initialize the elements of the destination integer 3641/// vector. 3642/// \returns An initialized 128-bit vector of [2 x i64] with all elements 3643/// containing the value provided in the operand. 3644static __inline__ __m128i __DEFAULT_FN_ATTRS 3645_mm_set1_epi64(__m64 __q) 3646{ 3647 return (__m128i){ (long long)__q, (long long)__q }; 3648} 3649 3650/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the 3651/// specified 32-bit value. 3652/// 3653/// \headerfile <x86intrin.h> 3654/// 3655/// This intrinsic is a utility function and does not correspond to a specific 3656/// instruction. 3657/// 3658/// \param __i 3659/// A 32-bit value used to initialize the elements of the destination integer 3660/// vector. 3661/// \returns An initialized 128-bit vector of [4 x i32] with all elements 3662/// containing the value provided in the operand. 3663static __inline__ __m128i __DEFAULT_FN_ATTRS 3664_mm_set1_epi32(int __i) 3665{ 3666 return (__m128i)(__v4si){ __i, __i, __i, __i }; 3667} 3668 3669/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the 3670/// specified 16-bit value. 3671/// 3672/// \headerfile <x86intrin.h> 3673/// 3674/// This intrinsic is a utility function and does not correspond to a specific 3675/// instruction. 3676/// 3677/// \param __w 3678/// A 16-bit value used to initialize the elements of the destination integer 3679/// vector. 3680/// \returns An initialized 128-bit vector of [8 x i16] with all elements 3681/// containing the value provided in the operand. 3682static __inline__ __m128i __DEFAULT_FN_ATTRS 3683_mm_set1_epi16(short __w) 3684{ 3685 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; 3686} 3687 3688/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the 3689/// specified 8-bit value. 
3690/// 3691/// \headerfile <x86intrin.h> 3692/// 3693/// This intrinsic is a utility function and does not correspond to a specific 3694/// instruction. 3695/// 3696/// \param __b 3697/// An 8-bit value used to initialize the elements of the destination integer 3698/// vector. 3699/// \returns An initialized 128-bit vector of [16 x i8] with all elements 3700/// containing the value provided in the operand. 3701static __inline__ __m128i __DEFAULT_FN_ATTRS 3702_mm_set1_epi8(char __b) 3703{ 3704 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; 3705} 3706 3707/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3708/// with the specified 64-bit integral values. 3709/// 3710/// \headerfile <x86intrin.h> 3711/// 3712/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction. 3713/// 3714/// \param __q0 3715/// A 64-bit integral value used to initialize the lower 64 bits of the 3716/// result. 3717/// \param __q1 3718/// A 64-bit integral value used to initialize the upper 64 bits of the 3719/// result. 3720/// \returns An initialized 128-bit integer vector. 3721static __inline__ __m128i __DEFAULT_FN_ATTRS 3722_mm_setr_epi64(__m64 __q0, __m64 __q1) 3723{ 3724 return (__m128i){ (long long)__q0, (long long)__q1 }; 3725} 3726 3727/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3728/// with the specified 32-bit integral values. 3729/// 3730/// \headerfile <x86intrin.h> 3731/// 3732/// This intrinsic is a utility function and does not correspond to a specific 3733/// instruction. 3734/// 3735/// \param __i0 3736/// A 32-bit integral value used to initialize bits [31:0] of the result. 3737/// \param __i1 3738/// A 32-bit integral value used to initialize bits [63:32] of the result. 3739/// \param __i2 3740/// A 32-bit integral value used to initialize bits [95:64] of the result. 
3741/// \param __i3 3742/// A 32-bit integral value used to initialize bits [127:96] of the result. 3743/// \returns An initialized 128-bit integer vector. 3744static __inline__ __m128i __DEFAULT_FN_ATTRS 3745_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 3746{ 3747 return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3748} 3749 3750/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3751/// with the specified 16-bit integral values. 3752/// 3753/// \headerfile <x86intrin.h> 3754/// 3755/// This intrinsic is a utility function and does not correspond to a specific 3756/// instruction. 3757/// 3758/// \param __w0 3759/// A 16-bit integral value used to initialize bits [15:0] of the result. 3760/// \param __w1 3761/// A 16-bit integral value used to initialize bits [31:16] of the result. 3762/// \param __w2 3763/// A 16-bit integral value used to initialize bits [47:32] of the result. 3764/// \param __w3 3765/// A 16-bit integral value used to initialize bits [63:48] of the result. 3766/// \param __w4 3767/// A 16-bit integral value used to initialize bits [79:64] of the result. 3768/// \param __w5 3769/// A 16-bit integral value used to initialize bits [95:80] of the result. 3770/// \param __w6 3771/// A 16-bit integral value used to initialize bits [111:96] of the result. 3772/// \param __w7 3773/// A 16-bit integral value used to initialize bits [127:112] of the result. 3774/// \returns An initialized 128-bit integer vector. 3775static __inline__ __m128i __DEFAULT_FN_ATTRS 3776_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 3777{ 3778 return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3779} 3780 3781/// \brief Constructs a 128-bit integer vector, initialized in reverse order 3782/// with the specified 8-bit integral values. 
3783/// 3784/// \headerfile <x86intrin.h> 3785/// 3786/// This intrinsic is a utility function and does not correspond to a specific 3787/// instruction. 3788/// 3789/// \param __b0 3790/// An 8-bit integral value used to initialize bits [7:0] of the result. 3791/// \param __b1 3792/// An 8-bit integral value used to initialize bits [15:8] of the result. 3793/// \param __b2 3794/// An 8-bit integral value used to initialize bits [23:16] of the result. 3795/// \param __b3 3796/// An 8-bit integral value used to initialize bits [31:24] of the result. 3797/// \param __b4 3798/// An 8-bit integral value used to initialize bits [39:32] of the result. 3799/// \param __b5 3800/// An 8-bit integral value used to initialize bits [47:40] of the result. 3801/// \param __b6 3802/// An 8-bit integral value used to initialize bits [55:48] of the result. 3803/// \param __b7 3804/// An 8-bit integral value used to initialize bits [63:56] of the result. 3805/// \param __b8 3806/// An 8-bit integral value used to initialize bits [71:64] of the result. 3807/// \param __b9 3808/// An 8-bit integral value used to initialize bits [79:72] of the result. 3809/// \param __b10 3810/// An 8-bit integral value used to initialize bits [87:80] of the result. 3811/// \param __b11 3812/// An 8-bit integral value used to initialize bits [95:88] of the result. 3813/// \param __b12 3814/// An 8-bit integral value used to initialize bits [103:96] of the result. 3815/// \param __b13 3816/// An 8-bit integral value used to initialize bits [111:104] of the result. 3817/// \param __b14 3818/// An 8-bit integral value used to initialize bits [119:112] of the result. 3819/// \param __b15 3820/// An 8-bit integral value used to initialize bits [127:120] of the result. 3821/// \returns An initialized 128-bit integer vector. 
3822static __inline__ __m128i __DEFAULT_FN_ATTRS 3823_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 3824{ 3825 return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3826} 3827 3828/// \brief Creates a 128-bit integer vector initialized to zero. 3829/// 3830/// \headerfile <x86intrin.h> 3831/// 3832/// This intrinsic corresponds to the \c VXORPS / XORPS instruction. 3833/// 3834/// \returns An initialized 128-bit integer vector with all elements set to 3835/// zero. 3836static __inline__ __m128i __DEFAULT_FN_ATTRS 3837_mm_setzero_si128(void) 3838{ 3839 return (__m128i){ 0LL, 0LL }; 3840} 3841 3842/// \brief Stores a 128-bit integer vector to a memory location aligned on a 3843/// 128-bit boundary. 3844/// 3845/// \headerfile <x86intrin.h> 3846/// 3847/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction. 3848/// 3849/// \param __p 3850/// A pointer to an aligned memory location that will receive the integer 3851/// values. 3852/// \param __b 3853/// A 128-bit integer vector containing the values to be moved. 3854static __inline__ void __DEFAULT_FN_ATTRS 3855_mm_store_si128(__m128i *__p, __m128i __b) 3856{ 3857 *__p = __b; 3858} 3859 3860/// \brief Stores a 128-bit integer vector to an unaligned memory location. 3861/// 3862/// \headerfile <x86intrin.h> 3863/// 3864/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction. 3865/// 3866/// \param __p 3867/// A pointer to a memory location that will receive the integer values. 3868/// \param __b 3869/// A 128-bit integer vector containing the values to be moved. 
3870static __inline__ void __DEFAULT_FN_ATTRS 3871_mm_storeu_si128(__m128i *__p, __m128i __b) 3872{ 3873 struct __storeu_si128 { 3874 __m128i __v; 3875 } __attribute__((__packed__, __may_alias__)); 3876 ((struct __storeu_si128*)__p)->__v = __b; 3877} 3878 3879/// \brief Moves bytes selected by the mask from the first operand to the 3880/// specified unaligned memory location. When a mask bit is 1, the 3881/// corresponding byte is written, otherwise it is not written. To minimize 3882/// caching, the date is flagged as non-temporal (unlikely to be used again 3883/// soon). Exception and trap behavior for elements not selected for storage 3884/// to memory are implementation dependent. 3885/// 3886/// \headerfile <x86intrin.h> 3887/// 3888/// This intrinsic corresponds to the \c VMASKMOVDQU / MASKMOVDQU instruction. 3889/// 3890/// \param __d 3891/// A 128-bit integer vector containing the values to be moved. 3892/// \param __n 3893/// A 128-bit integer vector containing the mask. The most significant bit of 3894/// each byte represents the mask bits. 3895/// \param __p 3896/// A pointer to an unaligned 128-bit memory location where the specified 3897/// values are moved. 3898static __inline__ void __DEFAULT_FN_ATTRS 3899_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 3900{ 3901 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 3902} 3903 3904/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 3905/// a memory location. 3906/// 3907/// \headerfile <x86intrin.h> 3908/// 3909/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction. 3910/// 3911/// \param __p 3912/// A pointer to a 64-bit memory location that will receive the lower 64 bits 3913/// of the integer vector parameter. 3914/// \param __a 3915/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 3916/// value to be stored. 
3917static __inline__ void __DEFAULT_FN_ATTRS 3918_mm_storel_epi64(__m128i *__p, __m128i __a) 3919{ 3920 struct __mm_storel_epi64_struct { 3921 long long __u; 3922 } __attribute__((__packed__, __may_alias__)); 3923 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 3924} 3925 3926/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit 3927/// aligned memory location. To minimize caching, the data is flagged as 3928/// non-temporal (unlikely to be used again soon). 3929/// 3930/// \headerfile <x86intrin.h> 3931/// 3932/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction. 3933/// 3934/// \param __p 3935/// A pointer to the 128-bit aligned memory location used to store the value. 3936/// \param __a 3937/// A vector of [2 x double] containing the 64-bit values to be stored. 3938static __inline__ void __DEFAULT_FN_ATTRS 3939_mm_stream_pd(double *__p, __m128d __a) 3940{ 3941 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); 3942} 3943 3944/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location. 3945/// To minimize caching, the data is flagged as non-temporal (unlikely to be 3946/// used again soon). 3947/// 3948/// \headerfile <x86intrin.h> 3949/// 3950/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction. 3951/// 3952/// \param __p 3953/// A pointer to the 128-bit aligned memory location used to store the value. 3954/// \param __a 3955/// A 128-bit integer vector containing the values to be stored. 3956static __inline__ void __DEFAULT_FN_ATTRS 3957_mm_stream_si128(__m128i *__p, __m128i __a) 3958{ 3959 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); 3960} 3961 3962/// \brief Stores a 32-bit integer value in the specified memory location. To 3963/// minimize caching, the data is flagged as non-temporal (unlikely to be 3964/// used again soon). 3965/// 3966/// \headerfile <x86intrin.h> 3967/// 3968/// This intrinsic corresponds to the \c MOVNTI instruction. 
3969/// 3970/// \param __p 3971/// A pointer to the 32-bit memory location used to store the value. 3972/// \param __a 3973/// A 32-bit integer containing the value to be stored. 3974static __inline__ void __DEFAULT_FN_ATTRS 3975_mm_stream_si32(int *__p, int __a) 3976{ 3977 __builtin_ia32_movnti(__p, __a); 3978} 3979 3980#ifdef __x86_64__ 3981/// \brief Stores a 64-bit integer value in the specified memory location. To 3982/// minimize caching, the data is flagged as non-temporal (unlikely to be 3983/// used again soon). 3984/// 3985/// \headerfile <x86intrin.h> 3986/// 3987/// This intrinsic corresponds to the \c MOVNTIQ instruction. 3988/// 3989/// \param __p 3990/// A pointer to the 64-bit memory location used to store the value. 3991/// \param __a 3992/// A 64-bit integer containing the value to be stored. 3993static __inline__ void __DEFAULT_FN_ATTRS 3994_mm_stream_si64(long long *__p, long long __a) 3995{ 3996 __builtin_ia32_movnti64(__p, __a); 3997} 3998#endif 3999 4000#if defined(__cplusplus) 4001extern "C" { 4002#endif 4003 4004/// \brief The cache line containing __p is flushed and invalidated from all 4005/// caches in the coherency domain. 4006/// 4007/// \headerfile <x86intrin.h> 4008/// 4009/// This intrinsic corresponds to the \c CLFLUSH instruction. 4010/// 4011/// \param __p 4012/// A pointer to the memory location used to identify the cache line to be 4013/// flushed. 4014void _mm_clflush(void const *); 4015 4016/// \brief Forces strong memory ordering (serialization) between load 4017/// instructions preceding this instruction and load instructions following 4018/// this instruction, ensuring the system completes all previous loads before 4019/// executing subsequent loads. 4020/// 4021/// \headerfile <x86intrin.h> 4022/// 4023/// This intrinsic corresponds to the \c LFENCE instruction. 
4024/// 4025void _mm_lfence(void); 4026 4027/// \brief Forces strong memory ordering (serialization) between load and store 4028/// instructions preceding this instruction and load and store instructions 4029/// following this instruction, ensuring that the system completes all 4030/// previous memory accesses before executing subsequent memory accesses. 4031/// 4032/// \headerfile <x86intrin.h> 4033/// 4034/// This intrinsic corresponds to the \c MFENCE instruction. 4035/// 4036void _mm_mfence(void); 4037 4038#if defined(__cplusplus) 4039} // extern "C" 4040#endif 4041 4042/// \brief Converts 16-bit signed integers from both 128-bit integer vector 4043/// operands into 8-bit signed integers, and packs the results into the 4044/// destination. Positive values greater than 0x7F are saturated to 0x7F. 4045/// Negative values less than 0x80 are saturated to 0x80. 4046/// 4047/// \headerfile <x86intrin.h> 4048/// 4049/// This intrinsic corresponds to the \c VPACKSSWB / PACKSSWB instruction. 4050/// 4051/// \param __a 4052/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4053/// a signed integer and is converted to a 8-bit signed integer with 4054/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4055/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4056/// written to the lower 64 bits of the result. 4057/// \param __b 4058/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4059/// a signed integer and is converted to a 8-bit signed integer with 4060/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4061/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4062/// written to the higher 64 bits of the result. 4063/// \returns A 128-bit vector of [16 x i8] containing the converted values. 
4064static __inline__ __m128i __DEFAULT_FN_ATTRS 4065_mm_packs_epi16(__m128i __a, __m128i __b) 4066{ 4067 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4068} 4069 4070/// \brief Converts 32-bit signed integers from both 128-bit integer vector 4071/// operands into 16-bit signed integers, and packs the results into the 4072/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4073/// Negative values less than 0x8000 are saturated to 0x8000. 4074/// 4075/// \headerfile <x86intrin.h> 4076/// 4077/// This intrinsic corresponds to the \c VPACKSSDW / PACKSSDW instruction. 4078/// 4079/// \param __a 4080/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4081/// a signed integer and is converted to a 16-bit signed integer with 4082/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4083/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4084/// are written to the lower 64 bits of the result. 4085/// \param __b 4086/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4087/// a signed integer and is converted to a 16-bit signed integer with 4088/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4089/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4090/// are written to the higher 64 bits of the result. 4091/// \returns A 128-bit vector of [8 x i16] containing the converted values. 4092static __inline__ __m128i __DEFAULT_FN_ATTRS 4093_mm_packs_epi32(__m128i __a, __m128i __b) 4094{ 4095 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4096} 4097 4098/// \brief Converts 16-bit signed integers from both 128-bit integer vector 4099/// operands into 8-bit unsigned integers, and packs the results into the 4100/// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4101/// than 0x00 are saturated to 0x00. 
4102/// 4103/// \headerfile <x86intrin.h> 4104/// 4105/// This intrinsic corresponds to the \c VPACKUSWB / PACKUSWB instruction. 4106/// 4107/// \param __a 4108/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4109/// a signed integer and is converted to an 8-bit unsigned integer with 4110/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4111/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4112/// written to the lower 64 bits of the result. 4113/// \param __b 4114/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4115/// a signed integer and is converted to an 8-bit unsigned integer with 4116/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4117/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4118/// written to the higher 64 bits of the result. 4119/// \returns A 128-bit vector of [16 x i8] containing the converted values. 4120static __inline__ __m128i __DEFAULT_FN_ATTRS 4121_mm_packus_epi16(__m128i __a, __m128i __b) 4122{ 4123 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4124} 4125 4126/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4127/// the immediate-value parameter as a selector. 4128/// 4129/// \headerfile <x86intrin.h> 4130/// 4131/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction. 4132/// 4133/// \param __a 4134/// A 128-bit integer vector. 4135/// \param __imm 4136/// An immediate value. Bits [3:0] selects values from __a to be assigned to 4137/// bits[15:0] of the result. 4138/// 000: assign values from bits [15:0] of __a. 4139/// 001: assign values from bits [31:16] of __a. 4140/// 010: assign values from bits [47:32] of __a. 4141/// 011: assign values from bits [63:48] of __a. 4142/// 100: assign values from bits [79:64] of __a. 4143/// 101: assign values from bits [95:80] of __a. 4144/// 110: assign values from bits [111:96] of __a. 
4145/// 111: assign values from bits [127:112] of __a. 4146/// \returns An integer, whose lower 16 bits are selected from the 128-bit 4147/// integer vector parameter and the remaining bits are assigned zeros. 4148static __inline__ int __DEFAULT_FN_ATTRS 4149_mm_extract_epi16(__m128i __a, int __imm) 4150{ 4151 __v8hi __b = (__v8hi)__a; 4152 return (unsigned short)__b[__imm & 7]; 4153} 4154 4155/// \brief Constructs a 128-bit integer vector by first making a copy of the 4156/// 128-bit integer vector parameter, and then inserting the lower 16 bits 4157/// of an integer parameter into an offset specified by the immediate-value 4158/// parameter. 4159/// 4160/// \headerfile <x86intrin.h> 4161/// 4162/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction. 4163/// 4164/// \param __a 4165/// A 128-bit integer vector of [8 x i16]. This vector is copied to the 4166/// result and then one of the eight elements in the result is replaced by 4167/// the lower 16 bits of __b. 4168/// \param __b 4169/// An integer. The lower 16 bits of this parameter are written to the 4170/// result beginning at an offset specified by __imm. 4171/// \param __imm 4172/// An immediate value specifying the bit offset in the result at which the 4173/// lower 16 bits of__b are written. 4174/// \returns A 128-bit integer vector containing the constructed values. 4175static __inline__ __m128i __DEFAULT_FN_ATTRS 4176_mm_insert_epi16(__m128i __a, int __b, int __imm) 4177{ 4178 __v8hi __c = (__v8hi)__a; 4179 __c[__imm & 7] = __b; 4180 return (__m128i)__c; 4181} 4182 4183/// \brief Copies the values of the most significant bits from each 8-bit 4184/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4185/// value, zero-extends the value, and writes it to the destination. 4186/// 4187/// \headerfile <x86intrin.h> 4188/// 4189/// This intrinsic corresponds to the \c VPMOVMSKB / PMOVMSKB instruction. 
4190/// 4191/// \param __a 4192/// A 128-bit integer vector containing the values with bits to be extracted. 4193/// \returns The most significant bits from each 8-bit element in __a, written 4194/// to bits [15:0]. The other bits are assigned zeros. 4195static __inline__ int __DEFAULT_FN_ATTRS 4196_mm_movemask_epi8(__m128i __a) 4197{ 4198 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4199} 4200 4201/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit 4202/// elements of a 128-bit integer vector parameter, using the immediate-value 4203/// parameter as a specifier. 4204/// 4205/// \headerfile <x86intrin.h> 4206/// 4207/// \code 4208/// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4209/// \endcode 4210/// 4211/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction. 4212/// 4213/// \param a 4214/// A 128-bit integer vector containing the values to be copied. 4215/// \param imm 4216/// An immediate value containing an 8-bit value specifying which elements to 4217/// copy from a. The destinations within the 128-bit destination are assigned 4218/// values as follows: 4219/// Bits [1:0] are used to assign values to bits [31:0] of the result. 4220/// Bits [3:2] are used to assign values to bits [63:32] of the result. 4221/// Bits [5:4] are used to assign values to bits [95:64] of the result. 4222/// Bits [7:6] are used to assign values to bits [127:96] of the result. 4223/// Bit value assignments: 4224/// 00: assign values from bits [31:0] of a. 4225/// 01: assign values from bits [63:32] of a. 4226/// 10: assign values from bits [95:64] of a. 4227/// 11: assign values from bits [127:96] of a. 4228/// \returns A 128-bit integer vector containing the shuffled values. 
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v4si)(__m128i)(a), \
      (__v4si)_mm_undefined_si128(), \
      ((imm) >> 0) & 0x3, \
      ((imm) >> 2) & 0x3, \
      ((imm) >> 4) & 0x3, \
      ((imm) >> 6) & 0x3); })

/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from a.
///    Bits [1:0] are used to assign values to bits [15:0] of the result.
///    Bits [3:2] are used to assign values to bits [31:16] of the result.
///    Bits [5:4] are used to assign values to bits [47:32] of the result.
///    Bits [7:6] are used to assign values to bits [63:48] of the result.
///    Bit value assignments:
///    00: assign values from bits [15:0] of a.
///    01: assign values from bits [31:16] of a.
///    10: assign values from bits [47:32] of a.
///    11: assign values from bits [63:48] of a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), \
      (__v8hi)_mm_undefined_si128(), \
      ((imm) >> 0) & 0x3, \
      ((imm) >> 2) & 0x3, \
      ((imm) >> 4) & 0x3, \
      ((imm) >> 6) & 0x3, \
      4, 5, 6, 7); })

/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW / PSHUFHW instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from a.
///    Bits [1:0] are used to assign values to bits [79:64] of the result.
///    Bits [3:2] are used to assign values to bits [95:80] of the result.
///    Bits [5:4] are used to assign values to bits [111:96] of the result.
///    Bits [7:6] are used to assign values to bits [127:112] of the result.
///    Bit value assignments:
///    00: assign values from bits [79:64] of a.
///    01: assign values from bits [95:80] of a.
///    10: assign values from bits [111:96] of a.
///    11: assign values from bits [127:112] of a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector( \
      (__v8hi)(__m128i)(a), \
      (__v8hi)_mm_undefined_si128(), \
      0, 1, 2, 3, \
      4 + (((imm) >> 0) & 0x3), \
      4 + (((imm) >> 2) & 0x3), \
      4 + (((imm) >> 4) & 0x3), \
      4 + (((imm) >> 6) & 0x3)); })

/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPUNPCKHBW / PUNPCKHBW instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8].
///    Bits [71:64] are written to bits [7:0] of the result.
///    Bits [79:72] are written to bits [23:16] of the result.
///    Bits [87:80] are written to bits [39:32] of the result.
///    Bits [95:88] are written to bits [55:48] of the result.
///    Bits [103:96] are written to bits [71:64] of the result.
///    Bits [111:104] are written to bits [87:80] of the result.
///    Bits [119:112] are written to bits [103:96] of the result.
///    Bits [127:120] are written to bits [119:112] of the result.
/// \param __b
///    A 128-bit vector of [16 x i8].
///    Bits [71:64] are written to bits [15:8] of the result.
///    Bits [79:72] are written to bits [31:24] of the result.
///    Bits [87:80] are written to bits [47:40] of the result.
///    Bits [95:88] are written to bits [63:56] of the result.
///    Bits [103:96] are written to bits [79:72] of the result.
///    Bits [111:104] are written to bits [95:88] of the result.
///    Bits [119:112] are written to bits [111:104] of the result.
///    Bits [127:120] are written to bits [127:120] of the result.
/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4333static __inline__ __m128i __DEFAULT_FN_ATTRS 4334_mm_unpackhi_epi8(__m128i __a, __m128i __b) 4335{ 4336 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 4337} 4338 4339/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4340/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4341/// 4342/// \headerfile <x86intrin.h> 4343/// 4344/// This intrinsic corresponds to the \c VPUNPCKHWD / PUNPCKHWD instruction. 4345/// 4346/// \param __a 4347/// A 128-bit vector of [8 x i16]. 4348/// Bits [79:64] are written to bits [15:0] of the result. 4349/// Bits [95:80] are written to bits [47:32] of the result. 4350/// Bits [111:96] are written to bits [79:64] of the result. 4351/// Bits [127:112] are written to bits [111:96] of the result. 4352/// \param __b 4353/// A 128-bit vector of [8 x i16]. 4354/// Bits [79:64] are written to bits [31:16] of the result. 4355/// Bits [95:80] are written to bits [63:48] of the result. 4356/// Bits [111:96] are written to bits [95:80] of the result. 4357/// Bits [127:112] are written to bits [127:112] of the result. 4358/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4359static __inline__ __m128i __DEFAULT_FN_ATTRS 4360_mm_unpackhi_epi16(__m128i __a, __m128i __b) 4361{ 4362 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 4363} 4364 4365/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4366/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4367/// 4368/// \headerfile <x86intrin.h> 4369/// 4370/// This intrinsic corresponds to the \c VPUNPCKHDQ / PUNPCKHDQ instruction. 4371/// 4372/// \param __a 4373/// A 128-bit vector of [4 x i32]. 4374/// Bits [95:64] are written to bits [31:0] of the destination. 
4375/// Bits [127:96] are written to bits [95:64] of the destination. 4376/// \param __b 4377/// A 128-bit vector of [4 x i32]. 4378/// Bits [95:64] are written to bits [64:32] of the destination. 4379/// Bits [127:96] are written to bits [127:96] of the destination. 4380/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4381static __inline__ __m128i __DEFAULT_FN_ATTRS 4382_mm_unpackhi_epi32(__m128i __a, __m128i __b) 4383{ 4384 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 4385} 4386 4387/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors 4388/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4389/// 4390/// \headerfile <x86intrin.h> 4391/// 4392/// This intrinsic corresponds to the \c VPUNPCKHQDQ / PUNPCKHQDQ instruction. 4393/// 4394/// \param __a 4395/// A 128-bit vector of [2 x i64]. 4396/// Bits [127:64] are written to bits [63:0] of the destination. 4397/// \param __b 4398/// A 128-bit vector of [2 x i64]. 4399/// Bits [127:64] are written to bits [127:64] of the destination. 4400/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4401static __inline__ __m128i __DEFAULT_FN_ATTRS 4402_mm_unpackhi_epi64(__m128i __a, __m128i __b) 4403{ 4404 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 4405} 4406 4407/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4408/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4409/// 4410/// \headerfile <x86intrin.h> 4411/// 4412/// This intrinsic corresponds to the \c VPUNPCKLBW / PUNPCKLBW instruction. 4413/// 4414/// \param __a 4415/// A 128-bit vector of [16 x i8]. 4416/// Bits [7:0] are written to bits [7:0] of the result. 4417/// Bits [15:8] are written to bits [23:16] of the result. 4418/// Bits [23:16] are written to bits [39:32] of the result. 4419/// Bits [31:24] are written to bits [55:48] of the result. 
4420/// Bits [39:32] are written to bits [71:64] of the result. 4421/// Bits [47:40] are written to bits [87:80] of the result. 4422/// Bits [55:48] are written to bits [103:96] of the result. 4423/// Bits [63:56] are written to bits [119:112] of the destination. 4424/// \param __b 4425/// A 128-bit vector of [16 x i8]. 4426/// Bits [7:0] are written to bits [15:8] of the result. 4427/// Bits [15:8] are written to bits [31:24] of the result. 4428/// Bits [23:16] are written to bits [47:40] of the result. 4429/// Bits [31:24] are written to bits [63:56] of the result. 4430/// Bits [39:32] are written to bits [79:72] of the result. 4431/// Bits [47:40] are written to bits [95:88] of the result. 4432/// Bits [55:48] are written to bits [111:104] of the result. 4433/// Bits [63:56] are written to bits [127:120] of the result. 4434/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4435static __inline__ __m128i __DEFAULT_FN_ATTRS 4436_mm_unpacklo_epi8(__m128i __a, __m128i __b) 4437{ 4438 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 4439} 4440 4441/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit 4442/// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4443/// [8 x i16]. 4444/// 4445/// \headerfile <x86intrin.h> 4446/// 4447/// This intrinsic corresponds to the \c VPUNPCKLWD / PUNPCKLWD instruction. 4448/// 4449/// \param __a 4450/// A 128-bit vector of [8 x i16]. 4451/// Bits [15:0] are written to bits [15:0] of the result. 4452/// Bits [31:16] are written to bits [47:32] of the result. 4453/// Bits [47:32] are written to bits [79:64] of the result. 4454/// Bits [63:48] are written to bits [111:96] of the result. 4455/// \param __b 4456/// A 128-bit vector of [8 x i16]. 4457/// Bits [15:0] are written to bits [31:16] of the result. 4458/// Bits [31:16] are written to bits [63:48] of the result. 
4459/// Bits [47:32] are written to bits [95:80] of the result. 4460/// Bits [63:48] are written to bits [127:112] of the result. 4461/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4462static __inline__ __m128i __DEFAULT_FN_ATTRS 4463_mm_unpacklo_epi16(__m128i __a, __m128i __b) 4464{ 4465 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 4466} 4467 4468/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4469/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4470/// 4471/// \headerfile <x86intrin.h> 4472/// 4473/// This intrinsic corresponds to the \c VPUNPCKLDQ / PUNPCKLDQ instruction. 4474/// 4475/// \param __a 4476/// A 128-bit vector of [4 x i32]. 4477/// Bits [31:0] are written to bits [31:0] of the destination. 4478/// Bits [63:32] are written to bits [95:64] of the destination. 4479/// \param __b 4480/// A 128-bit vector of [4 x i32]. 4481/// Bits [31:0] are written to bits [64:32] of the destination. 4482/// Bits [63:32] are written to bits [127:96] of the destination. 4483/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4484static __inline__ __m128i __DEFAULT_FN_ATTRS 4485_mm_unpacklo_epi32(__m128i __a, __m128i __b) 4486{ 4487 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 4488} 4489 4490/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of 4491/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4492/// 4493/// \headerfile <x86intrin.h> 4494/// 4495/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction. 4496/// 4497/// \param __a 4498/// A 128-bit vector of [2 x i64]. 4499/// Bits [63:0] are written to bits [63:0] of the destination. 4500/// \param __b 4501/// A 128-bit vector of [2 x i64]. 4502/// Bits [63:0] are written to bits [127:64] of the destination. 
4503/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4504static __inline__ __m128i __DEFAULT_FN_ATTRS 4505_mm_unpacklo_epi64(__m128i __a, __m128i __b) 4506{ 4507 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); 4508} 4509 4510/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4511/// integer. 4512/// 4513/// \headerfile <x86intrin.h> 4514/// 4515/// This intrinsic has no corresponding instruction. 4516/// 4517/// \param __a 4518/// A 128-bit integer vector operand. The lower 64 bits are moved to the 4519/// destination. 4520/// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4521static __inline__ __m64 __DEFAULT_FN_ATTRS 4522_mm_movepi64_pi64(__m128i __a) 4523{ 4524 return (__m64)__a[0]; 4525} 4526 4527/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4528/// upper bits. 4529/// 4530/// \headerfile <x86intrin.h> 4531/// 4532/// This intrinsic corresponds to the \c VMOVQ / MOVQ / MOVD instruction. 4533/// 4534/// \param __a 4535/// A 64-bit value. 4536/// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4537/// the operand. The upper 64 bits are assigned zeros. 4538static __inline__ __m128i __DEFAULT_FN_ATTRS 4539_mm_movpi64_epi64(__m64 __a) 4540{ 4541 return (__m128i){ (long long)__a, 0 }; 4542} 4543 4544/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4545/// integer vector, zeroing the upper bits. 4546/// 4547/// \headerfile <x86intrin.h> 4548/// 4549/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction. 4550/// 4551/// \param __a 4552/// A 128-bit integer vector operand. The lower 64 bits are moved to the 4553/// destination. 4554/// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4555/// the operand. The upper 64 bits are assigned zeros. 
4556static __inline__ __m128i __DEFAULT_FN_ATTRS 4557_mm_move_epi64(__m128i __a) 4558{ 4559 return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); 4560} 4561 4562/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors 4563/// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4564/// double]. 4565/// 4566/// \headerfile <x86intrin.h> 4567/// 4568/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction. 4569/// 4570/// \param __a 4571/// A 128-bit vector of [2 x double]. 4572/// Bits [127:64] are written to bits [63:0] of the destination. 4573/// \param __b 4574/// A 128-bit vector of [2 x double]. 4575/// Bits [127:64] are written to bits [127:64] of the destination. 4576/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4577static __inline__ __m128d __DEFAULT_FN_ATTRS 4578_mm_unpackhi_pd(__m128d __a, __m128d __b) 4579{ 4580 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); 4581} 4582 4583/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors 4584/// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4585/// double]. 4586/// 4587/// \headerfile <x86intrin.h> 4588/// 4589/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction. 4590/// 4591/// \param __a 4592/// A 128-bit vector of [2 x double]. 4593/// Bits [63:0] are written to bits [63:0] of the destination. 4594/// \param __b 4595/// A 128-bit vector of [2 x double]. 4596/// Bits [63:0] are written to bits [127:64] of the destination. 4597/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 
4598static __inline__ __m128d __DEFAULT_FN_ATTRS 4599_mm_unpacklo_pd(__m128d __a, __m128d __b) 4600{ 4601 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); 4602} 4603 4604/// \brief Extracts the sign bits of the double-precision values in the 128-bit 4605/// vector of [2 x double], zero-extends the value, and writes it to the 4606/// low-order bits of the destination. 4607/// 4608/// \headerfile <x86intrin.h> 4609/// 4610/// This intrinsic corresponds to the \c VMOVMSKPD / MOVMSKPD instruction. 4611/// 4612/// \param __a 4613/// A 128-bit vector of [2 x double] containing the values with sign bits to 4614/// be extracted. 4615/// \returns The sign bits from each of the double-precision elements in __a, 4616/// written to bits [1:0]. The remaining bits are assigned values of zero. 4617static __inline__ int __DEFAULT_FN_ATTRS 4618_mm_movemask_pd(__m128d __a) 4619{ 4620 return __builtin_ia32_movmskpd((__v2df)__a); 4621} 4622 4623 4624/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two 4625/// 128-bit vector parameters of [2 x double], using the immediate-value 4626/// parameter as a specifier. 4627/// 4628/// \headerfile <x86intrin.h> 4629/// 4630/// \code 4631/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4632/// \endcode 4633/// 4634/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction. 4635/// 4636/// \param a 4637/// A 128-bit vector of [2 x double]. 4638/// \param b 4639/// A 128-bit vector of [2 x double]. 4640/// \param i 4641/// An 8-bit immediate value. The least significant two bits specify which 4642/// elements to copy from a and b: 4643/// Bit[0] = 0: lower element of a copied to lower element of result. 4644/// Bit[0] = 1: upper element of a copied to lower element of result. 4645/// Bit[1] = 0: lower element of b copied to upper element of result. 4646/// Bit[1] = 1: upper element of b copied to upper element of result. 
4647/// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4648#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ 4649 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4650 0 + (((i) >> 0) & 0x1), \ 4651 2 + (((i) >> 1) & 0x1)); }) 4652 4653/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4654/// floating-point vector of [4 x float]. 4655/// 4656/// \headerfile <x86intrin.h> 4657/// 4658/// This intrinsic has no corresponding instruction. 4659/// 4660/// \param __a 4661/// A 128-bit floating-point vector of [2 x double]. 4662/// \returns A 128-bit floating-point vector of [4 x float] containing the same 4663/// bitwise pattern as the parameter. 4664static __inline__ __m128 __DEFAULT_FN_ATTRS 4665_mm_castpd_ps(__m128d __a) 4666{ 4667 return (__m128)__a; 4668} 4669 4670/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4671/// integer vector. 4672/// 4673/// \headerfile <x86intrin.h> 4674/// 4675/// This intrinsic has no corresponding instruction. 4676/// 4677/// \param __a 4678/// A 128-bit floating-point vector of [2 x double]. 4679/// \returns A 128-bit integer vector containing the same bitwise pattern as the 4680/// parameter. 4681static __inline__ __m128i __DEFAULT_FN_ATTRS 4682_mm_castpd_si128(__m128d __a) 4683{ 4684 return (__m128i)__a; 4685} 4686 4687/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4688/// floating-point vector of [2 x double]. 4689/// 4690/// \headerfile <x86intrin.h> 4691/// 4692/// This intrinsic has no corresponding instruction. 4693/// 4694/// \param __a 4695/// A 128-bit floating-point vector of [4 x float]. 4696/// \returns A 128-bit floating-point vector of [2 x double] containing the same 4697/// bitwise pattern as the parameter. 
4698static __inline__ __m128d __DEFAULT_FN_ATTRS 4699_mm_castps_pd(__m128 __a) 4700{ 4701 return (__m128d)__a; 4702} 4703 4704/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4705/// integer vector. 4706/// 4707/// \headerfile <x86intrin.h> 4708/// 4709/// This intrinsic has no corresponding instruction. 4710/// 4711/// \param __a 4712/// A 128-bit floating-point vector of [4 x float]. 4713/// \returns A 128-bit integer vector containing the same bitwise pattern as the 4714/// parameter. 4715static __inline__ __m128i __DEFAULT_FN_ATTRS 4716_mm_castps_si128(__m128 __a) 4717{ 4718 return (__m128i)__a; 4719} 4720 4721/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector 4722/// of [4 x float]. 4723/// 4724/// \headerfile <x86intrin.h> 4725/// 4726/// This intrinsic has no corresponding instruction. 4727/// 4728/// \param __a 4729/// A 128-bit integer vector. 4730/// \returns A 128-bit floating-point vector of [4 x float] containing the same 4731/// bitwise pattern as the parameter. 4732static __inline__ __m128 __DEFAULT_FN_ATTRS 4733_mm_castsi128_ps(__m128i __a) 4734{ 4735 return (__m128)__a; 4736} 4737 4738/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector 4739/// of [2 x double]. 4740/// 4741/// \headerfile <x86intrin.h> 4742/// 4743/// This intrinsic has no corresponding instruction. 4744/// 4745/// \param __a 4746/// A 128-bit integer vector. 4747/// \returns A 128-bit floating-point vector of [2 x double] containing the same 4748/// bitwise pattern as the parameter. 4749static __inline__ __m128d __DEFAULT_FN_ATTRS 4750_mm_castsi128_pd(__m128i __a) 4751{ 4752 return (__m128d)__a; 4753} 4754 4755/// \brief Indicates that a spin loop is being executed for the purposes of 4756/// optimizing power consumption during the loop. 4757/// 4758/// \headerfile <x86intrin.h> 4759/// 4760/// This intrinsic corresponds to the \c PAUSE instruction. 
4761/// 4762#if defined(__cplusplus) 4763extern "C" 4764#endif 4765void _mm_pause(void); 4766 4767#undef __DEFAULT_FN_ATTRS 4768 4769#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4770 4771#endif /* __EMMINTRIN_H */ 4772