1/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __IMMINTRIN_H 25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 
26#endif 27 28#ifndef __AVXINTRIN_H 29#define __AVXINTRIN_H 30 31typedef double __v4df __attribute__ ((__vector_size__ (32))); 32typedef float __v8sf __attribute__ ((__vector_size__ (32))); 33typedef long long __v4di __attribute__ ((__vector_size__ (32))); 34typedef int __v8si __attribute__ ((__vector_size__ (32))); 35typedef short __v16hi __attribute__ ((__vector_size__ (32))); 36typedef char __v32qi __attribute__ ((__vector_size__ (32))); 37 38/* Unsigned types */ 39typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); 40typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); 41typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); 42typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); 43 44/* We need an explicitly signed variant for char. Note that this shouldn't 45 * appear in the interface though. */ 46typedef signed char __v32qs __attribute__((__vector_size__(32))); 47 48typedef float __m256 __attribute__ ((__vector_size__ (32))); 49typedef double __m256d __attribute__((__vector_size__(32))); 50typedef long long __m256i __attribute__((__vector_size__(32))); 51 52/* Define the default attributes for the functions in this file. */ 53#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) 54 55/* Arithmetic */ 56/// \brief Adds two 256-bit vectors of [4 x double]. 57/// 58/// \headerfile <x86intrin.h> 59/// 60/// This intrinsic corresponds to the <c> VADDPD </c> instruction. 61/// 62/// \param __a 63/// A 256-bit vector of [4 x double] containing one of the source operands. 64/// \param __b 65/// A 256-bit vector of [4 x double] containing one of the source operands. 66/// \returns A 256-bit vector of [4 x double] containing the sums of both 67/// operands. 68static __inline __m256d __DEFAULT_FN_ATTRS 69_mm256_add_pd(__m256d __a, __m256d __b) 70{ 71 return (__m256d)((__v4df)__a+(__v4df)__b); 72} 73 74/// \brief Adds two 256-bit vectors of [8 x float]. 
75/// 76/// \headerfile <x86intrin.h> 77/// 78/// This intrinsic corresponds to the <c> VADDPS </c> instruction. 79/// 80/// \param __a 81/// A 256-bit vector of [8 x float] containing one of the source operands. 82/// \param __b 83/// A 256-bit vector of [8 x float] containing one of the source operands. 84/// \returns A 256-bit vector of [8 x float] containing the sums of both 85/// operands. 86static __inline __m256 __DEFAULT_FN_ATTRS 87_mm256_add_ps(__m256 __a, __m256 __b) 88{ 89 return (__m256)((__v8sf)__a+(__v8sf)__b); 90} 91 92/// \brief Subtracts two 256-bit vectors of [4 x double]. 93/// 94/// \headerfile <x86intrin.h> 95/// 96/// This intrinsic corresponds to the <c> VSUBPD </c> instruction. 97/// 98/// \param __a 99/// A 256-bit vector of [4 x double] containing the minuend. 100/// \param __b 101/// A 256-bit vector of [4 x double] containing the subtrahend. 102/// \returns A 256-bit vector of [4 x double] containing the differences between 103/// both operands. 104static __inline __m256d __DEFAULT_FN_ATTRS 105_mm256_sub_pd(__m256d __a, __m256d __b) 106{ 107 return (__m256d)((__v4df)__a-(__v4df)__b); 108} 109 110/// \brief Subtracts two 256-bit vectors of [8 x float]. 111/// 112/// \headerfile <x86intrin.h> 113/// 114/// This intrinsic corresponds to the <c> VSUBPS </c> instruction. 115/// 116/// \param __a 117/// A 256-bit vector of [8 x float] containing the minuend. 118/// \param __b 119/// A 256-bit vector of [8 x float] containing the subtrahend. 120/// \returns A 256-bit vector of [8 x float] containing the differences between 121/// both operands. 122static __inline __m256 __DEFAULT_FN_ATTRS 123_mm256_sub_ps(__m256 __a, __m256 __b) 124{ 125 return (__m256)((__v8sf)__a-(__v8sf)__b); 126} 127 128/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 129/// two 256-bit vectors of [4 x double]. 130/// 131/// \headerfile <x86intrin.h> 132/// 133/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 
134/// 135/// \param __a 136/// A 256-bit vector of [4 x double] containing the left source operand. 137/// \param __b 138/// A 256-bit vector of [4 x double] containing the right source operand. 139/// \returns A 256-bit vector of [4 x double] containing the alternating sums 140/// and differences between both operands. 141static __inline __m256d __DEFAULT_FN_ATTRS 142_mm256_addsub_pd(__m256d __a, __m256d __b) 143{ 144 return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); 145} 146 147/// \brief Adds the even-indexed values and subtracts the odd-indexed values of 148/// two 256-bit vectors of [8 x float]. 149/// 150/// \headerfile <x86intrin.h> 151/// 152/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 153/// 154/// \param __a 155/// A 256-bit vector of [8 x float] containing the left source operand. 156/// \param __b 157/// A 256-bit vector of [8 x float] containing the right source operand. 158/// \returns A 256-bit vector of [8 x float] containing the alternating sums and 159/// differences between both operands. 160static __inline __m256 __DEFAULT_FN_ATTRS 161_mm256_addsub_ps(__m256 __a, __m256 __b) 162{ 163 return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); 164} 165 166/// \brief Divides two 256-bit vectors of [4 x double]. 167/// 168/// \headerfile <x86intrin.h> 169/// 170/// This intrinsic corresponds to the <c> VDIVPD </c> instruction. 171/// 172/// \param __a 173/// A 256-bit vector of [4 x double] containing the dividend. 174/// \param __b 175/// A 256-bit vector of [4 x double] containing the divisor. 176/// \returns A 256-bit vector of [4 x double] containing the quotients of both 177/// operands. 178static __inline __m256d __DEFAULT_FN_ATTRS 179_mm256_div_pd(__m256d __a, __m256d __b) 180{ 181 return (__m256d)((__v4df)__a/(__v4df)__b); 182} 183 184/// \brief Divides two 256-bit vectors of [8 x float]. 
185/// 186/// \headerfile <x86intrin.h> 187/// 188/// This intrinsic corresponds to the <c> VDIVPS </c> instruction. 189/// 190/// \param __a 191/// A 256-bit vector of [8 x float] containing the dividend. 192/// \param __b 193/// A 256-bit vector of [8 x float] containing the divisor. 194/// \returns A 256-bit vector of [8 x float] containing the quotients of both 195/// operands. 196static __inline __m256 __DEFAULT_FN_ATTRS 197_mm256_div_ps(__m256 __a, __m256 __b) 198{ 199 return (__m256)((__v8sf)__a/(__v8sf)__b); 200} 201 202/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater 203/// of each pair of values. 204/// 205/// \headerfile <x86intrin.h> 206/// 207/// This intrinsic corresponds to the <c> VMAXPD </c> instruction. 208/// 209/// \param __a 210/// A 256-bit vector of [4 x double] containing one of the operands. 211/// \param __b 212/// A 256-bit vector of [4 x double] containing one of the operands. 213/// \returns A 256-bit vector of [4 x double] containing the maximum values 214/// between both operands. 215static __inline __m256d __DEFAULT_FN_ATTRS 216_mm256_max_pd(__m256d __a, __m256d __b) 217{ 218 return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); 219} 220 221/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater 222/// of each pair of values. 223/// 224/// \headerfile <x86intrin.h> 225/// 226/// This intrinsic corresponds to the <c> VMAXPS </c> instruction. 227/// 228/// \param __a 229/// A 256-bit vector of [8 x float] containing one of the operands. 230/// \param __b 231/// A 256-bit vector of [8 x float] containing one of the operands. 232/// \returns A 256-bit vector of [8 x float] containing the maximum values 233/// between both operands. 
234static __inline __m256 __DEFAULT_FN_ATTRS 235_mm256_max_ps(__m256 __a, __m256 __b) 236{ 237 return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); 238} 239 240/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser 241/// of each pair of values. 242/// 243/// \headerfile <x86intrin.h> 244/// 245/// This intrinsic corresponds to the <c> VMINPD </c> instruction. 246/// 247/// \param __a 248/// A 256-bit vector of [4 x double] containing one of the operands. 249/// \param __b 250/// A 256-bit vector of [4 x double] containing one of the operands. 251/// \returns A 256-bit vector of [4 x double] containing the minimum values 252/// between both operands. 253static __inline __m256d __DEFAULT_FN_ATTRS 254_mm256_min_pd(__m256d __a, __m256d __b) 255{ 256 return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); 257} 258 259/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser 260/// of each pair of values. 261/// 262/// \headerfile <x86intrin.h> 263/// 264/// This intrinsic corresponds to the <c> VMINPS </c> instruction. 265/// 266/// \param __a 267/// A 256-bit vector of [8 x float] containing one of the operands. 268/// \param __b 269/// A 256-bit vector of [8 x float] containing one of the operands. 270/// \returns A 256-bit vector of [8 x float] containing the minimum values 271/// between both operands. 272static __inline __m256 __DEFAULT_FN_ATTRS 273_mm256_min_ps(__m256 __a, __m256 __b) 274{ 275 return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); 276} 277 278/// \brief Multiplies two 256-bit vectors of [4 x double]. 279/// 280/// \headerfile <x86intrin.h> 281/// 282/// This intrinsic corresponds to the <c> VMULPD </c> instruction. 283/// 284/// \param __a 285/// A 256-bit vector of [4 x double] containing one of the operands. 286/// \param __b 287/// A 256-bit vector of [4 x double] containing one of the operands. 
288/// \returns A 256-bit vector of [4 x double] containing the products of both 289/// operands. 290static __inline __m256d __DEFAULT_FN_ATTRS 291_mm256_mul_pd(__m256d __a, __m256d __b) 292{ 293 return (__m256d)((__v4df)__a * (__v4df)__b); 294} 295 296/// \brief Multiplies two 256-bit vectors of [8 x float]. 297/// 298/// \headerfile <x86intrin.h> 299/// 300/// This intrinsic corresponds to the <c> VMULPS </c> instruction. 301/// 302/// \param __a 303/// A 256-bit vector of [8 x float] containing one of the operands. 304/// \param __b 305/// A 256-bit vector of [8 x float] containing one of the operands. 306/// \returns A 256-bit vector of [8 x float] containing the products of both 307/// operands. 308static __inline __m256 __DEFAULT_FN_ATTRS 309_mm256_mul_ps(__m256 __a, __m256 __b) 310{ 311 return (__m256)((__v8sf)__a * (__v8sf)__b); 312} 313 314/// \brief Calculates the square roots of the values in a 256-bit vector of 315/// [4 x double]. 316/// 317/// \headerfile <x86intrin.h> 318/// 319/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction. 320/// 321/// \param __a 322/// A 256-bit vector of [4 x double]. 323/// \returns A 256-bit vector of [4 x double] containing the square roots of the 324/// values in the operand. 325static __inline __m256d __DEFAULT_FN_ATTRS 326_mm256_sqrt_pd(__m256d __a) 327{ 328 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); 329} 330 331/// \brief Calculates the square roots of the values in a 256-bit vector of 332/// [8 x float]. 333/// 334/// \headerfile <x86intrin.h> 335/// 336/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction. 337/// 338/// \param __a 339/// A 256-bit vector of [8 x float]. 340/// \returns A 256-bit vector of [8 x float] containing the square roots of the 341/// values in the operand. 
342static __inline __m256 __DEFAULT_FN_ATTRS 343_mm256_sqrt_ps(__m256 __a) 344{ 345 return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); 346} 347 348/// \brief Calculates the reciprocal square roots of the values in a 256-bit 349/// vector of [8 x float]. 350/// 351/// \headerfile <x86intrin.h> 352/// 353/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction. 354/// 355/// \param __a 356/// A 256-bit vector of [8 x float]. 357/// \returns A 256-bit vector of [8 x float] containing the reciprocal square 358/// roots of the values in the operand. 359static __inline __m256 __DEFAULT_FN_ATTRS 360_mm256_rsqrt_ps(__m256 __a) 361{ 362 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); 363} 364 365/// \brief Calculates the reciprocals of the values in a 256-bit vector of 366/// [8 x float]. 367/// 368/// \headerfile <x86intrin.h> 369/// 370/// This intrinsic corresponds to the <c> VRCPPS </c> instruction. 371/// 372/// \param __a 373/// A 256-bit vector of [8 x float]. 374/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the 375/// values in the operand. 376static __inline __m256 __DEFAULT_FN_ATTRS 377_mm256_rcp_ps(__m256 __a) 378{ 379 return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); 380} 381 382/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified 383/// by the byte operand. The source values are rounded to integer values and 384/// returned as 64-bit double-precision floating-point values. 385/// 386/// \headerfile <x86intrin.h> 387/// 388/// \code 389/// __m256d _mm256_round_pd(__m256d V, const int M); 390/// \endcode 391/// 392/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction. 393/// 394/// \param V 395/// A 256-bit vector of [4 x double]. 396/// \param M 397/// An integer value that specifies the rounding operation. \n 398/// Bits [7:4] are reserved. \n 399/// Bit [3] is a precision exception value: \n 400/// 0: A normal PE exception is used. 
\n 401/// 1: The PE field is not updated. \n 402/// Bit [2] is the rounding control source: \n 403/// 0: Use bits [1:0] of \a M. \n 404/// 1: Use the current MXCSR setting. \n 405/// Bits [1:0] contain the rounding control definition: \n 406/// 00: Nearest. \n 407/// 01: Downward (toward negative infinity). \n 408/// 10: Upward (toward positive infinity). \n 409/// 11: Truncated. 410/// \returns A 256-bit vector of [4 x double] containing the rounded values. 411#define _mm256_round_pd(V, M) __extension__ ({ \ 412 (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) 413 414/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as 415/// specified by the byte operand. The source values are rounded to integer 416/// values and returned as floating-point values. 417/// 418/// \headerfile <x86intrin.h> 419/// 420/// \code 421/// __m256 _mm256_round_ps(__m256 V, const int M); 422/// \endcode 423/// 424/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 425/// 426/// \param V 427/// A 256-bit vector of [8 x float]. 428/// \param M 429/// An integer value that specifies the rounding operation. \n 430/// Bits [7:4] are reserved. \n 431/// Bit [3] is a precision exception value: \n 432/// 0: A normal PE exception is used. \n 433/// 1: The PE field is not updated. \n 434/// Bit [2] is the rounding control source: \n 435/// 0: Use bits [1:0] of \a M. \n 436/// 1: Use the current MXCSR setting. \n 437/// Bits [1:0] contain the rounding control definition: \n 438/// 00: Nearest. \n 439/// 01: Downward (toward negative infinity). \n 440/// 10: Upward (toward positive infinity). \n 441/// 11: Truncated. 442/// \returns A 256-bit vector of [8 x float] containing the rounded values. 443#define _mm256_round_ps(V, M) __extension__ ({ \ 444 (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); }) 445 446/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. 
///    The source values are rounded up to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_ceil_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
///    The source values are rounded down to integer values and returned as
///    64-bit double-precision floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_floor_pd(__m256d V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \returns A 256-bit vector of [4 x double] containing the rounded down
///    values.
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)

/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded up to integer values and returned as
///    floating-point values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_ceil_ps(__m256 V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)

/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
///    source values are rounded down to integer values and returned as
///    floating-point values.
501/// 502/// \headerfile <x86intrin.h> 503/// 504/// \code 505/// __m256 _mm256_floor_ps(__m256 V); 506/// \endcode 507/// 508/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction. 509/// 510/// \param V 511/// A 256-bit vector of [8 x float]. 512/// \returns A 256-bit vector of [8 x float] containing the rounded down values. 513#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 514 515/* Logical */ 516/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double]. 517/// 518/// \headerfile <x86intrin.h> 519/// 520/// This intrinsic corresponds to the <c> VANDPD </c> instruction. 521/// 522/// \param __a 523/// A 256-bit vector of [4 x double] containing one of the source operands. 524/// \param __b 525/// A 256-bit vector of [4 x double] containing one of the source operands. 526/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 527/// values between both operands. 528static __inline __m256d __DEFAULT_FN_ATTRS 529_mm256_and_pd(__m256d __a, __m256d __b) 530{ 531 return (__m256d)((__v4du)__a & (__v4du)__b); 532} 533 534/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. 535/// 536/// \headerfile <x86intrin.h> 537/// 538/// This intrinsic corresponds to the <c> VANDPS </c> instruction. 539/// 540/// \param __a 541/// A 256-bit vector of [8 x float] containing one of the source operands. 542/// \param __b 543/// A 256-bit vector of [8 x float] containing one of the source operands. 544/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 545/// values between both operands. 546static __inline __m256 __DEFAULT_FN_ATTRS 547_mm256_and_ps(__m256 __a, __m256 __b) 548{ 549 return (__m256)((__v8su)__a & (__v8su)__b); 550} 551 552/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using 553/// the one's complement of the values contained in the first source operand. 
554/// 555/// \headerfile <x86intrin.h> 556/// 557/// This intrinsic corresponds to the <c> VANDNPD </c> instruction. 558/// 559/// \param __a 560/// A 256-bit vector of [4 x double] containing the left source operand. The 561/// one's complement of this value is used in the bitwise AND. 562/// \param __b 563/// A 256-bit vector of [4 x double] containing the right source operand. 564/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the 565/// values of the second operand and the one's complement of the first 566/// operand. 567static __inline __m256d __DEFAULT_FN_ATTRS 568_mm256_andnot_pd(__m256d __a, __m256d __b) 569{ 570 return (__m256d)(~(__v4du)__a & (__v4du)__b); 571} 572 573/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using 574/// the one's complement of the values contained in the first source operand. 575/// 576/// \headerfile <x86intrin.h> 577/// 578/// This intrinsic corresponds to the <c> VANDNPS </c> instruction. 579/// 580/// \param __a 581/// A 256-bit vector of [8 x float] containing the left source operand. The 582/// one's complement of this value is used in the bitwise AND. 583/// \param __b 584/// A 256-bit vector of [8 x float] containing the right source operand. 585/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the 586/// values of the second operand and the one's complement of the first 587/// operand. 588static __inline __m256 __DEFAULT_FN_ATTRS 589_mm256_andnot_ps(__m256 __a, __m256 __b) 590{ 591 return (__m256)(~(__v8su)__a & (__v8su)__b); 592} 593 594/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. 595/// 596/// \headerfile <x86intrin.h> 597/// 598/// This intrinsic corresponds to the <c> VORPD </c> instruction. 599/// 600/// \param __a 601/// A 256-bit vector of [4 x double] containing one of the source operands. 602/// \param __b 603/// A 256-bit vector of [4 x double] containing one of the source operands. 
604/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the 605/// values between both operands. 606static __inline __m256d __DEFAULT_FN_ATTRS 607_mm256_or_pd(__m256d __a, __m256d __b) 608{ 609 return (__m256d)((__v4du)__a | (__v4du)__b); 610} 611 612/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. 613/// 614/// \headerfile <x86intrin.h> 615/// 616/// This intrinsic corresponds to the <c> VORPS </c> instruction. 617/// 618/// \param __a 619/// A 256-bit vector of [8 x float] containing one of the source operands. 620/// \param __b 621/// A 256-bit vector of [8 x float] containing one of the source operands. 622/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the 623/// values between both operands. 624static __inline __m256 __DEFAULT_FN_ATTRS 625_mm256_or_ps(__m256 __a, __m256 __b) 626{ 627 return (__m256)((__v8su)__a | (__v8su)__b); 628} 629 630/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. 631/// 632/// \headerfile <x86intrin.h> 633/// 634/// This intrinsic corresponds to the <c> VXORPD </c> instruction. 635/// 636/// \param __a 637/// A 256-bit vector of [4 x double] containing one of the source operands. 638/// \param __b 639/// A 256-bit vector of [4 x double] containing one of the source operands. 640/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the 641/// values between both operands. 642static __inline __m256d __DEFAULT_FN_ATTRS 643_mm256_xor_pd(__m256d __a, __m256d __b) 644{ 645 return (__m256d)((__v4du)__a ^ (__v4du)__b); 646} 647 648/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. 649/// 650/// \headerfile <x86intrin.h> 651/// 652/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 653/// 654/// \param __a 655/// A 256-bit vector of [8 x float] containing one of the source operands. 656/// \param __b 657/// A 256-bit vector of [8 x float] containing one of the source operands. 
658/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the 659/// values between both operands. 660static __inline __m256 __DEFAULT_FN_ATTRS 661_mm256_xor_ps(__m256 __a, __m256 __b) 662{ 663 return (__m256)((__v8su)__a ^ (__v8su)__b); 664} 665 666/* Horizontal arithmetic */ 667/// \brief Horizontally adds the adjacent pairs of values contained in two 668/// 256-bit vectors of [4 x double]. 669/// 670/// \headerfile <x86intrin.h> 671/// 672/// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 673/// 674/// \param __a 675/// A 256-bit vector of [4 x double] containing one of the source operands. 676/// The horizontal sums of the values are returned in the even-indexed 677/// elements of a vector of [4 x double]. 678/// \param __b 679/// A 256-bit vector of [4 x double] containing one of the source operands. 680/// The horizontal sums of the values are returned in the odd-indexed 681/// elements of a vector of [4 x double]. 682/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of 683/// both operands. 684static __inline __m256d __DEFAULT_FN_ATTRS 685_mm256_hadd_pd(__m256d __a, __m256d __b) 686{ 687 return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); 688} 689 690/// \brief Horizontally adds the adjacent pairs of values contained in two 691/// 256-bit vectors of [8 x float]. 692/// 693/// \headerfile <x86intrin.h> 694/// 695/// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 696/// 697/// \param __a 698/// A 256-bit vector of [8 x float] containing one of the source operands. 699/// The horizontal sums of the values are returned in the elements with 700/// index 0, 1, 4, 5 of a vector of [8 x float]. 701/// \param __b 702/// A 256-bit vector of [8 x float] containing one of the source operands. 703/// The horizontal sums of the values are returned in the elements with 704/// index 2, 3, 6, 7 of a vector of [8 x float]. 
705/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 706/// both operands. 707static __inline __m256 __DEFAULT_FN_ATTRS 708_mm256_hadd_ps(__m256 __a, __m256 __b) 709{ 710 return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); 711} 712 713/// \brief Horizontally subtracts the adjacent pairs of values contained in two 714/// 256-bit vectors of [4 x double]. 715/// 716/// \headerfile <x86intrin.h> 717/// 718/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 719/// 720/// \param __a 721/// A 256-bit vector of [4 x double] containing one of the source operands. 722/// The horizontal differences between the values are returned in the 723/// even-indexed elements of a vector of [4 x double]. 724/// \param __b 725/// A 256-bit vector of [4 x double] containing one of the source operands. 726/// The horizontal differences between the values are returned in the 727/// odd-indexed elements of a vector of [4 x double]. 728/// \returns A 256-bit vector of [4 x double] containing the horizontal 729/// differences of both operands. 730static __inline __m256d __DEFAULT_FN_ATTRS 731_mm256_hsub_pd(__m256d __a, __m256d __b) 732{ 733 return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); 734} 735 736/// \brief Horizontally subtracts the adjacent pairs of values contained in two 737/// 256-bit vectors of [8 x float]. 738/// 739/// \headerfile <x86intrin.h> 740/// 741/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 742/// 743/// \param __a 744/// A 256-bit vector of [8 x float] containing one of the source operands. 745/// The horizontal differences between the values are returned in the 746/// elements with index 0, 1, 4, 5 of a vector of [8 x float]. 747/// \param __b 748/// A 256-bit vector of [8 x float] containing one of the source operands. 749/// The horizontal differences between the values are returned in the 750/// elements with index 2, 3, 6, 7 of a vector of [8 x float]. 
751/// \returns A 256-bit vector of [8 x float] containing the horizontal 752/// differences of both operands. 753static __inline __m256 __DEFAULT_FN_ATTRS 754_mm256_hsub_ps(__m256 __a, __m256 __b) 755{ 756 return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); 757} 758 759/* Vector permutations */ 760/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 761/// by the 128-bit integer vector operand. 762/// 763/// \headerfile <x86intrin.h> 764/// 765/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 766/// 767/// \param __a 768/// A 128-bit vector of [2 x double]. 769/// \param __c 770/// A 128-bit integer vector operand specifying how the values are to be 771/// copied. \n 772/// Bit [1]: \n 773/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 774/// vector. \n 775/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 776/// returned vector. \n 777/// Bit [65]: \n 778/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 779/// returned vector. \n 780/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 781/// returned vector. 782/// \returns A 128-bit vector of [2 x double] containing the copied values. 783static __inline __m128d __DEFAULT_FN_ATTRS 784_mm_permutevar_pd(__m128d __a, __m128i __c) 785{ 786 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); 787} 788 789/// \brief Copies the values in a 256-bit vector of [4 x double] as specified 790/// by the 256-bit integer vector operand. 791/// 792/// \headerfile <x86intrin.h> 793/// 794/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 795/// 796/// \param __a 797/// A 256-bit vector of [4 x double]. 798/// \param __c 799/// A 256-bit integer vector operand specifying how the values are to be 800/// copied. \n 801/// Bit [1]: \n 802/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 803/// vector. 
\n 804/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 805/// returned vector. \n 806/// Bit [65]: \n 807/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 808/// returned vector. \n 809/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 810/// returned vector. \n 811/// Bit [129]: \n 812/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 813/// returned vector. \n 814/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 815/// returned vector. \n 816/// Bit [193]: \n 817/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 818/// returned vector. \n 819/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 820/// returned vector. 821/// \returns A 256-bit vector of [4 x double] containing the copied values. 822static __inline __m256d __DEFAULT_FN_ATTRS 823_mm256_permutevar_pd(__m256d __a, __m256i __c) 824{ 825 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); 826} 827 828/// \brief Copies the values stored in a 128-bit vector of [4 x float] as 829/// specified by the 128-bit integer vector operand. 830/// \headerfile <x86intrin.h> 831/// 832/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 833/// 834/// \param __a 835/// A 128-bit vector of [4 x float]. 836/// \param __c 837/// A 128-bit integer vector operand specifying how the values are to be 838/// copied. \n 839/// Bits [1:0]: \n 840/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 841/// returned vector. \n 842/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 843/// returned vector. \n 844/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 845/// returned vector. \n 846/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 847/// returned vector. \n 848/// Bits [33:32]: \n 849/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 850/// returned vector. 
\n 851/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 852/// returned vector. \n 853/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 854/// returned vector. \n 855/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 856/// returned vector. \n 857/// Bits [65:64]: \n 858/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 859/// returned vector. \n 860/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 861/// returned vector. \n 862/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 863/// returned vector. \n 864/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 865/// returned vector. \n 866/// Bits [97:96]: \n 867/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 868/// returned vector. \n 869/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 870/// returned vector. \n 871/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 872/// returned vector. \n 873/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 874/// returned vector. 875/// \returns A 128-bit vector of [4 x float] containing the copied values. 876static __inline __m128 __DEFAULT_FN_ATTRS 877_mm_permutevar_ps(__m128 __a, __m128i __c) 878{ 879 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); 880} 881 882/// \brief Copies the values stored in a 256-bit vector of [8 x float] as 883/// specified by the 256-bit integer vector operand. 884/// 885/// \headerfile <x86intrin.h> 886/// 887/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 888/// 889/// \param __a 890/// A 256-bit vector of [8 x float]. 891/// \param __c 892/// A 256-bit integer vector operand specifying how the values are to be 893/// copied. \n 894/// Bits [1:0]: \n 895/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 896/// returned vector. 
\n 897/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 898/// returned vector. \n 899/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 900/// returned vector. \n 901/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 902/// returned vector. \n 903/// Bits [33:32]: \n 904/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 905/// returned vector. \n 906/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 907/// returned vector. \n 908/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 909/// returned vector. \n 910/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 911/// returned vector. \n 912/// Bits [65:64]: \n 913/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 914/// returned vector. \n 915/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 916/// returned vector. \n 917/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 918/// returned vector. \n 919/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 920/// returned vector. \n 921/// Bits [97:96]: \n 922/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 923/// returned vector. \n 924/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 925/// returned vector. \n 926/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 927/// returned vector. \n 928/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 929/// returned vector. \n 930/// Bits [129:128]: \n 931/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 932/// returned vector. \n 933/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 934/// returned vector. \n 935/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 936/// returned vector. \n 937/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 938/// returned vector. 
\n 939/// Bits [161:160]: \n 940/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 941/// returned vector. \n 942/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 943/// returned vector. \n 944/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 945/// returned vector. \n 946/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 947/// returned vector. \n 948/// Bits [193:192]: \n 949/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 950/// returned vector. \n 951/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 952/// returned vector. \n 953/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 954/// returned vector. \n 955/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 956/// returned vector. \n 957/// Bits [225:224]: \n 958/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 959/// returned vector. \n 960/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 961/// returned vector. \n 962/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 963/// returned vector. \n 964/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 965/// returned vector. 966/// \returns A 256-bit vector of [8 x float] containing the copied values. 967static __inline __m256 __DEFAULT_FN_ATTRS 968_mm256_permutevar_ps(__m256 __a, __m256i __c) 969{ 970 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); 971} 972 973/// \brief Copies the values in a 128-bit vector of [2 x double] as specified 974/// by the immediate integer operand. 975/// 976/// \headerfile <x86intrin.h> 977/// 978/// \code 979/// __m128d _mm_permute_pd(__m128d A, const int C); 980/// \endcode 981/// 982/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 983/// 984/// \param A 985/// A 128-bit vector of [2 x double]. 
986/// \param C 987/// An immediate integer operand specifying how the values are to be 988/// copied. \n 989/// Bit [0]: \n 990/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 991/// vector. \n 992/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 993/// returned vector. \n 994/// Bit [1]: \n 995/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 996/// returned vector. \n 997/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 998/// returned vector. 999/// \returns A 128-bit vector of [2 x double] containing the copied values. 1000#define _mm_permute_pd(A, C) __extension__ ({ \ 1001 (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ 1002 (__v2df)_mm_undefined_pd(), \ 1003 ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); }) 1004 1005/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by 1006/// the immediate integer operand. 1007/// 1008/// \headerfile <x86intrin.h> 1009/// 1010/// \code 1011/// __m256d _mm256_permute_pd(__m256d A, const int C); 1012/// \endcode 1013/// 1014/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction. 1015/// 1016/// \param A 1017/// A 256-bit vector of [4 x double]. 1018/// \param C 1019/// An immediate integer operand specifying how the values are to be 1020/// copied. \n 1021/// Bit [0]: \n 1022/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned 1023/// vector. \n 1024/// 1: Bits [127:64] of the source are copied to bits [63:0] of the 1025/// returned vector. \n 1026/// Bit [1]: \n 1027/// 0: Bits [63:0] of the source are copied to bits [127:64] of the 1028/// returned vector. \n 1029/// 1: Bits [127:64] of the source are copied to bits [127:64] of the 1030/// returned vector. \n 1031/// Bit [2]: \n 1032/// 0: Bits [191:128] of the source are copied to bits [191:128] of the 1033/// returned vector. \n 1034/// 1: Bits [255:192] of the source are copied to bits [191:128] of the 1035/// returned vector. 
\n 1036/// Bit [3]: \n 1037/// 0: Bits [191:128] of the source are copied to bits [255:192] of the 1038/// returned vector. \n 1039/// 1: Bits [255:192] of the source are copied to bits [255:192] of the 1040/// returned vector. 1041/// \returns A 256-bit vector of [4 x double] containing the copied values. 1042#define _mm256_permute_pd(A, C) __extension__ ({ \ 1043 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ 1044 (__v4df)_mm256_undefined_pd(), \ 1045 0 + (((C) >> 0) & 0x1), \ 1046 0 + (((C) >> 1) & 0x1), \ 1047 2 + (((C) >> 2) & 0x1), \ 1048 2 + (((C) >> 3) & 0x1)); }) 1049 1050/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by 1051/// the immediate integer operand. 1052/// 1053/// \headerfile <x86intrin.h> 1054/// 1055/// \code 1056/// __m128 _mm_permute_ps(__m128 A, const int C); 1057/// \endcode 1058/// 1059/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1060/// 1061/// \param A 1062/// A 128-bit vector of [4 x float]. 1063/// \param C 1064/// An immediate integer operand specifying how the values are to be 1065/// copied. \n 1066/// Bits [1:0]: \n 1067/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1068/// returned vector. \n 1069/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1070/// returned vector. \n 1071/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1072/// returned vector. \n 1073/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1074/// returned vector. \n 1075/// Bits [3:2]: \n 1076/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1077/// returned vector. \n 1078/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1079/// returned vector. \n 1080/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1081/// returned vector. \n 1082/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1083/// returned vector. 
\n 1084/// Bits [5:4]: \n 1085/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1086/// returned vector. \n 1087/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1088/// returned vector. \n 1089/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1090/// returned vector. \n 1091/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1092/// returned vector. \n 1093/// Bits [7:6]: \n 1094/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1095/// returned vector. \n 1096/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1097/// returned vector. \n 1098/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1099/// returned vector. \n 1100/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1101/// returned vector. 1102/// \returns A 128-bit vector of [4 x float] containing the copied values. 1103#define _mm_permute_ps(A, C) __extension__ ({ \ 1104 (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ 1105 (__v4sf)_mm_undefined_ps(), \ 1106 ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ 1107 ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) 1108 1109/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by 1110/// the immediate integer operand. 1111/// 1112/// \headerfile <x86intrin.h> 1113/// 1114/// \code 1115/// __m256 _mm256_permute_ps(__m256 A, const int C); 1116/// \endcode 1117/// 1118/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction. 1119/// 1120/// \param A 1121/// A 256-bit vector of [8 x float]. 1122/// \param C 1123/// An immediate integer operand specifying how the values are to be 1124/// copied. \n 1125/// Bits [1:0]: \n 1126/// 00: Bits [31:0] of the source are copied to bits [31:0] of the 1127/// returned vector. \n 1128/// 01: Bits [63:32] of the source are copied to bits [31:0] of the 1129/// returned vector.
\n 1130/// 10: Bits [95:64] of the source are copied to bits [31:0] of the 1131/// returned vector. \n 1132/// 11: Bits [127:96] of the source are copied to bits [31:0] of the 1133/// returned vector. \n 1134/// Bits [3:2]: \n 1135/// 00: Bits [31:0] of the source are copied to bits [63:32] of the 1136/// returned vector. \n 1137/// 01: Bits [63:32] of the source are copied to bits [63:32] of the 1138/// returned vector. \n 1139/// 10: Bits [95:64] of the source are copied to bits [63:32] of the 1140/// returned vector. \n 1141/// 11: Bits [127:96] of the source are copied to bits [63:32] of the 1142/// returned vector. \n 1143/// Bits [5:4]: \n 1144/// 00: Bits [31:0] of the source are copied to bits [95:64] of the 1145/// returned vector. \n 1146/// 01: Bits [63:32] of the source are copied to bits [95:64] of the 1147/// returned vector. \n 1148/// 10: Bits [95:64] of the source are copied to bits [95:64] of the 1149/// returned vector. \n 1150/// 11: Bits [127:96] of the source are copied to bits [95:64] of the 1151/// returned vector. \n 1152/// Bits [7:6]: \n 1153/// 00: Bits [31:0] of the source are copied to bits [127:96] of the 1154/// returned vector. \n 1155/// 01: Bits [63:32] of the source are copied to bits [127:96] of the 1156/// returned vector. \n 1157/// 10: Bits [95:64] of the source are copied to bits [127:96] of the 1158/// returned vector. \n 1159/// 11: Bits [127:96] of the source are copied to bits [127:96] of the 1160/// returned vector. \n 1161/// Bits [1:0]: \n 1162/// 00: Bits [159:128] of the source are copied to bits [159:128] of the 1163/// returned vector. \n 1164/// 01: Bits [191:160] of the source are copied to bits [159:128] of the 1165/// returned vector. \n 1166/// 10: Bits [223:192] of the source are copied to bits [159:128] of the 1167/// returned vector. \n 1168/// 11: Bits [255:224] of the source are copied to bits [159:128] of the 1169/// returned vector.
\n 1170/// Bits [3:2]: \n 1171/// 00: Bits [159:128] of the source are copied to bits [191:160] of the 1172/// returned vector. \n 1173/// 01: Bits [191:160] of the source are copied to bits [191:160] of the 1174/// returned vector. \n 1175/// 10: Bits [223:192] of the source are copied to bits [191:160] of the 1176/// returned vector. \n 1177/// 11: Bits [255:224] of the source are copied to bits [191:160] of the 1178/// returned vector. \n 1179/// Bits [5:4]: \n 1180/// 00: Bits [159:128] of the source are copied to bits [223:192] of the 1181/// returned vector. \n 1182/// 01: Bits [191:160] of the source are copied to bits [223:192] of the 1183/// returned vector. \n 1184/// 10: Bits [223:192] of the source are copied to bits [223:192] of the 1185/// returned vector. \n 1186/// 11: Bits [255:224] of the source are copied to bits [223:192] of the 1187/// returned vector. \n 1188/// Bits [7:6]: \n 1189/// 00: Bits [159:128] of the source are copied to bits [255:224] of the 1190/// returned vector. \n 1191/// 01: Bits [191:160] of the source are copied to bits [255:224] of the 1192/// returned vector. \n 1193/// 10: Bits [223:192] of the source are copied to bits [255:224] of the 1194/// returned vector. \n 1195/// 11: Bits [255:224] of the source are copied to bits [255:224] of the 1196/// returned vector. 1197/// \returns A 256-bit vector of [8 x float] containing the copied values. 1198#define _mm256_permute_ps(A, C) __extension__ ({ \ 1199 (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ 1200 (__v8sf)_mm256_undefined_ps(), \ 1201 0 + (((C) >> 0) & 0x3), \ 1202 0 + (((C) >> 2) & 0x3), \ 1203 0 + (((C) >> 4) & 0x3), \ 1204 0 + (((C) >> 6) & 0x3), \ 1205 4 + (((C) >> 0) & 0x3), \ 1206 4 + (((C) >> 2) & 0x3), \ 1207 4 + (((C) >> 4) & 0x3), \ 1208 4 + (((C) >> 6) & 0x3)); }) 1209 1210/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1211/// [4 x double], as specified by the immediate integer operand. 
1212/// 1213/// \headerfile <x86intrin.h> 1214/// 1215/// \code 1216/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); 1217/// \endcode 1218/// 1219/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1220/// 1221/// \param V1 1222/// A 256-bit vector of [4 x double]. 1223/// \param V2 1224/// A 256-bit vector of [4 x double]. 1225/// \param M 1226/// An immediate integer operand specifying how the values are to be 1227/// permuted. \n 1228/// Bits [1:0]: \n 1229/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1230/// destination. \n 1231/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1232/// destination. \n 1233/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1234/// destination. \n 1235/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1236/// destination. \n 1237/// Bits [5:4]: \n 1238/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1239/// destination. \n 1240/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1241/// destination. \n 1242/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1243/// destination. \n 1244/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1245/// destination. 1246/// \returns A 256-bit vector of [4 x double] containing the copied values. 1247#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 1248 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ 1249 (__v4df)(__m256d)(V2), (M)); }) 1250 1251/// \brief Permutes 128-bit data values stored in two 256-bit vectors of 1252/// [8 x float], as specified by the immediate integer operand. 1253/// 1254/// \headerfile <x86intrin.h> 1255/// 1256/// \code 1257/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); 1258/// \endcode 1259/// 1260/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
1261/// 1262/// \param V1 1263/// A 256-bit vector of [8 x float]. 1264/// \param V2 1265/// A 256-bit vector of [8 x float]. 1266/// \param M 1267/// An immediate integer operand specifying how the values are to be 1268/// permuted. \n 1269/// Bits [1:0]: \n 1270/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1271/// destination. \n 1272/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1273/// destination. \n 1274/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1275/// destination. \n 1276/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1277/// destination. \n 1278/// Bits [5:4]: \n 1279/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1280/// destination. \n 1281/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1282/// destination. \n 1283/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1284/// destination. \n 1285/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1286/// destination. 1287/// \returns A 256-bit vector of [8 x float] containing the copied values. 1288#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 1289 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ 1290 (__v8sf)(__m256)(V2), (M)); }) 1291 1292/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors, 1293/// as specified by the immediate integer operand. 1294/// 1295/// \headerfile <x86intrin.h> 1296/// 1297/// \code 1298/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); 1299/// \endcode 1300/// 1301/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction. 1302/// 1303/// \param V1 1304/// A 256-bit integer vector. 1305/// \param V2 1306/// A 256-bit integer vector. 1307/// \param M 1308/// An immediate integer operand specifying how the values are to be copied. 
1309/// Bits [1:0]: \n 1310/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the 1311/// destination. \n 1312/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the 1313/// destination. \n 1314/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the 1315/// destination. \n 1316/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the 1317/// destination. \n 1318/// Bits [5:4]: \n 1319/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the 1320/// destination. \n 1321/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the 1322/// destination. \n 1323/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the 1324/// destination. \n 1325/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the 1326/// destination. 1327/// \returns A 256-bit integer vector containing the copied values. 1328#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 1329 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ 1330 (__v8si)(__m256i)(V2), (M)); }) 1331 1332/* Vector Blend */ 1333/// \brief Merges 64-bit double-precision data values stored in either of the 1334/// two 256-bit vectors of [4 x double], as specified by the immediate 1335/// integer operand. 1336/// 1337/// \headerfile <x86intrin.h> 1338/// 1339/// \code 1340/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); 1341/// \endcode 1342/// 1343/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction. 1344/// 1345/// \param V1 1346/// A 256-bit vector of [4 x double]. 1347/// \param V2 1348/// A 256-bit vector of [4 x double]. 1349/// \param M 1350/// An immediate integer operand, with mask bits [3:0] specifying how the 1351/// values are to be copied. The position of the mask bit corresponds to the 1352/// index of a copied value. 
When a mask bit is 0, the corresponding 64-bit 1353/// element in operand \a V1 is copied to the same position in the 1354/// destination. When a mask bit is 1, the corresponding 64-bit element in 1355/// operand \a V2 is copied to the same position in the destination. 1356/// \returns A 256-bit vector of [4 x double] containing the copied values. 1357#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 1358 (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ 1359 (__v4df)(__m256d)(V2), \ 1360 (((M) & 0x01) ? 4 : 0), \ 1361 (((M) & 0x02) ? 5 : 1), \ 1362 (((M) & 0x04) ? 6 : 2), \ 1363 (((M) & 0x08) ? 7 : 3)); }) 1364 1365/// \brief Merges 32-bit single-precision data values stored in either of the 1366/// two 256-bit vectors of [8 x float], as specified by the immediate 1367/// integer operand. 1368/// 1369/// \headerfile <x86intrin.h> 1370/// 1371/// \code 1372/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); 1373/// \endcode 1374/// 1375/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction. 1376/// 1377/// \param V1 1378/// A 256-bit vector of [8 x float]. 1379/// \param V2 1380/// A 256-bit vector of [8 x float]. 1381/// \param M 1382/// An immediate integer operand, with mask bits [7:0] specifying how the 1383/// values are to be copied. The position of the mask bit corresponds to the 1384/// index of a copied value. When a mask bit is 0, the corresponding 32-bit 1385/// element in operand \a V1 is copied to the same position in the 1386/// destination. When a mask bit is 1, the corresponding 32-bit element in 1387/// operand \a V2 is copied to the same position in the destination. 1388/// \returns A 256-bit vector of [8 x float] containing the copied values. 1389#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 1390 (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ 1391 (__v8sf)(__m256)(V2), \ 1392 (((M) & 0x01) ? 8 : 0), \ 1393 (((M) & 0x02) ? 9 : 1), \ 1394 (((M) & 0x04) ? 10 : 2), \ 1395 (((M) & 0x08) ? 
11 : 3), \ 1396 (((M) & 0x10) ? 12 : 4), \ 1397 (((M) & 0x20) ? 13 : 5), \ 1398 (((M) & 0x40) ? 14 : 6), \ 1399 (((M) & 0x80) ? 15 : 7)); }) 1400 1401/// \brief Merges 64-bit double-precision data values stored in either of the 1402/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector 1403/// operand. 1404/// 1405/// \headerfile <x86intrin.h> 1406/// 1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction. 1408/// 1409/// \param __a 1410/// A 256-bit vector of [4 x double]. 1411/// \param __b 1412/// A 256-bit vector of [4 x double]. 1413/// \param __c 1414/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying 1415/// how the values are to be copied. The position of the mask bit corresponds 1416/// to the most significant bit of a copied value. When a mask bit is 0, the 1417/// corresponding 64-bit element in operand \a __a is copied to the same 1418/// position in the destination. When a mask bit is 1, the corresponding 1419/// 64-bit element in operand \a __b is copied to the same position in the 1420/// destination. 1421/// \returns A 256-bit vector of [4 x double] containing the copied values. 1422static __inline __m256d __DEFAULT_FN_ATTRS 1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) 1424{ 1425 return (__m256d)__builtin_ia32_blendvpd256( 1426 (__v4df)__a, (__v4df)__b, (__v4df)__c); 1427} 1428 1429/// \brief Merges 32-bit single-precision data values stored in either of the 1430/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector 1431/// operand. 1432/// 1433/// \headerfile <x86intrin.h> 1434/// 1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction. 1436/// 1437/// \param __a 1438/// A 256-bit vector of [8 x float]. 1439/// \param __b 1440/// A 256-bit vector of [8 x float]. 1441/// \param __c 1442/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, 1443/// and 31 specifying how the values are to be copied. 
The position of the 1444/// mask bit corresponds to the most significant bit of a copied value. When 1445/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is 1446/// copied to the same position in the destination. When a mask bit is 1, the 1447/// corresponding 32-bit element in operand \a __b is copied to the same 1448/// position in the destination. 1449/// \returns A 256-bit vector of [8 x float] containing the copied values. 1450static __inline __m256 __DEFAULT_FN_ATTRS 1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) 1452{ 1453 return (__m256)__builtin_ia32_blendvps256( 1454 (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); 1455} 1456 1457/* Vector Dot Product */ 1458/// \brief Computes two dot products in parallel, using the lower and upper 1459/// halves of two [8 x float] vectors as input to the two computations, and 1460/// returning the two dot products in the lower and upper halves of the 1461/// [8 x float] result. The immediate integer operand controls which input 1462/// elements will contribute to the dot product, and where the final results 1463/// are returned. In general, for each dot product, the four corresponding 1464/// elements of the input vectors are multiplied; the first two and second 1465/// two products are summed, then the two sums are added to form the final 1466/// result. 1467/// 1468/// \headerfile <x86intrin.h> 1469/// 1470/// \code 1471/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); 1472/// \endcode 1473/// 1474/// This intrinsic corresponds to the <c> VDPPS </c> instruction. 1475/// 1476/// \param V1 1477/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1478/// \param V2 1479/// A vector of [8 x float] values, treated as two [4 x float] vectors. 1480/// \param M 1481/// An immediate integer argument. 
Bits [7:4] determine which elements of 1482/// the input vectors are used, with bit [4] corresponding to the lowest 1483/// element and bit [7] corresponding to the highest element of each [4 x 1484/// float] subvector. If a bit is set, the corresponding elements from the 1485/// two input vectors are used as an input for dot product; otherwise that 1486/// input is treated as zero. Bits [3:0] determine which elements of the 1487/// result will receive a copy of the final dot product, with bit [0] 1488/// corresponding to the lowest element and bit [3] corresponding to the 1489/// highest element of each [4 x float] subvector. If a bit is set, the dot 1490/// product is returned in the corresponding element; otherwise that element 1491/// is set to zero. The bitmask is applied in the same way to each of the 1492/// two parallel dot product computations. 1493/// \returns A 256-bit vector of [8 x float] containing the two dot products. 1494#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 1495 (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ 1496 (__v8sf)(__m256)(V2), (M)); }) 1497 1498/* Vector shuffle */ 1499/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as 1500/// specified by the immediate value operand. The four selected elements in 1501/// each operand are copied to the destination according to the bits 1502/// specified in the immediate operand. The selected elements from the first 1503/// 256-bit operand are copied to bits [63:0] and bits [191:128] of the 1504/// destination, and the selected elements from the second 256-bit operand 1505/// are copied to bits [127:64] and bits [255:192] of the destination. For 1506/// example, if bits [7:0] of the immediate operand contain a value of 0xFF, 1507/// the 256-bit destination vector would contain the following values: b[7], 1508/// b[7], a[7], a[7], b[3], b[3], a[3], a[3]. 
1509/// 1510/// \headerfile <x86intrin.h> 1511/// 1512/// \code 1513/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); 1514/// \endcode 1515/// 1516/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction. 1517/// 1518/// \param a 1519/// A 256-bit vector of [8 x float]. The four selected elements in this 1520/// operand are copied to bits [63:0] and bits [191:128] in the destination, 1521/// according to the bits specified in the immediate operand. 1522/// \param b 1523/// A 256-bit vector of [8 x float]. The four selected elements in this 1524/// operand are copied to bits [127:64] and bits [255:192] in the 1525/// destination, according to the bits specified in the immediate operand. 1526/// \param mask 1527/// An immediate value containing an 8-bit value specifying which elements to 1528/// copy from \a a and \a b. \n 1529/// Bits [3:0] specify the values copied from operand \a a. \n 1530/// Bits [7:4] specify the values copied from operand \a b. \n 1531/// The destinations within the 256-bit destination are assigned values as 1532/// follows, according to the bit value assignments described below: \n 1533/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the 1534/// destination. \n 1535/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the 1536/// destination. \n 1537/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the 1538/// destination. \n 1539/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in 1540/// the destination. \n 1541/// Bit value assignments: \n 1542/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n 1543/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n 1544/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n 1545/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), \
                                  0  + (((mask) >> 0) & 0x3), \
                                  0  + (((mask) >> 2) & 0x3), \
                                  8  + (((mask) >> 4) & 0x3), \
                                  8  + (((mask) >> 6) & 0x3), \
                                  4  + (((mask) >> 0) & 0x3), \
                                  4  + (((mask) >> 2) & 0x3), \
                                  12 + (((mask) >> 4) & 0x3), \
                                  12 + (((mask) >> 6) & 0x3)); })

/// \brief Selects four double-precision values from the 256-bit operands of
///    [4 x double], as specified by the immediate value operand. The selected
///    elements from the first 256-bit operand are copied to bits [63:0] and
///    bits [191:128] in the destination, and the selected elements from the
///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
///    the destination. For example, if bits [3:0] of the immediate operand
///    contain a value of 0xF, the 256-bit destination vector would contain the
///    following values: b[3], a[3], b[1], a[1].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b: \n
///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
///    destination. \n
///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
///    destination. \n
///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
///    destination. \n
///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
///    destination. \n
///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
///    destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), \
                                   0 + (((mask) >> 0) & 0x1), \
                                   4 + (((mask) >> 1) & 0x1), \
                                   2 + (((mask) >> 2) & 0x1), \
                                   6 + (((mask) >> 3) & 0x1)); })

/* Compare */
/* Comparison predicates for the _mm*_cmp_* macros (bits [4:0] of the
 * immediate operand). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */

/// \brief Compares each of the corresponding double-precision values of two
///    128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. Returns a [2 x double] vector consisting of
///    two doubles corresponding to the two comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                (__v2df)(__m128d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand. Returns a [4 x float] vector consisting of four floats
///    corresponding to the four comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                               (__v4sf)(__m128)(b), (c)); })

/// \brief Compares each of the corresponding double-precision values of two
///    256-bit vectors of [4 x double], using the operation specified by the
///    immediate integer operand. Returns a [4 x double] vector consisting of
///    four doubles corresponding to the four comparison results: zero if the
///    comparison is false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
///
/// \param a
///    A 256-bit vector of [4 x double].
/// \param b
///    A 256-bit vector of [4 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                   (__v4df)(__m256d)(b), (c)); })

/// \brief Compares each of the corresponding values of two 256-bit vectors of
///    [8 x float], using the operation specified by the immediate integer
///    operand. Returns a [8 x float] vector consisting of eight floats
///    corresponding to the eight comparison results: zero if the comparison is
///    false, and all 1's if the comparison is true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
///
/// \param a
///    A 256-bit vector of [8 x float].
/// \param b
///    A 256-bit vector of [8 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                  (__v8sf)(__m256)(b), (c)); })

/// \brief Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand. If the result is true, all 64 bits of the
///    destination vector are set; otherwise they are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 : Equal (ordered, non-signaling) \n
///    0x01 : Less-than (ordered, signaling) \n
///    0x02 : Less-than-or-equal (ordered, signaling) \n
///    0x03 : Unordered (non-signaling) \n
///    0x04 : Not-equal (unordered, non-signaling) \n
///    0x05 : Not-less-than (unordered, signaling) \n
///    0x06 : Not-less-than-or-equal (unordered, signaling) \n
///    0x07 : Ordered (non-signaling) \n
///    0x08 : Equal (unordered, non-signaling) \n
///    0x09 : Not-greater-than-or-equal (unordered, signaling) \n
///    0x0a : Not-greater-than (unordered, signaling) \n
///    0x0b : False (ordered, non-signaling) \n
///    0x0c : Not-equal (ordered, non-signaling) \n
///    0x0d : Greater-than-or-equal (ordered, signaling) \n
///    0x0e : Greater-than (ordered, signaling) \n
///    0x0f : True (unordered, non-signaling) \n
///    0x10 : Equal (ordered, signaling) \n
///    0x11 : Less-than (ordered, non-signaling) \n
///    0x12 : Less-than-or-equal (ordered, non-signaling) \n
///    0x13 : Unordered (signaling) \n
///    0x14 : Not-equal (unordered, signaling) \n
///    0x15 : Not-less-than (unordered, non-signaling) \n
///    0x16 : Not-less-than-or-equal (unordered, non-signaling) \n
///    0x17 : Ordered (signaling) \n
///    0x18 : Equal (unordered, signaling) \n
///    0x19 : Not-greater-than-or-equal (unordered, non-signaling) \n
///    0x1a : Not-greater-than (unordered, non-signaling) \n
///    0x1b : False (ordered, signaling) \n
///    0x1c : Not-equal (ordered, signaling) \n
///    0x1d : Greater-than-or-equal (ordered, non-signaling) \n
///    0x1e : Greater-than (ordered, non-signaling) \n
///    0x1f : True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1927#define _mm_cmp_sd(a, b, c) __extension__ ({ \ 1928 (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ 1929 (__v2df)(__m128d)(b), (c)); }) 1930 1931/// \brief Compares each of the corresponding scalar values of two 128-bit 1932/// vectors of [4 x float], using the operation specified by the immediate 1933/// integer operand. If the result is true, all 32 bits of the destination 1934/// vector are set; otherwise they are cleared. 1935/// 1936/// \headerfile <x86intrin.h> 1937/// 1938/// \code 1939/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); 1940/// \endcode 1941/// 1942/// This intrinsic corresponds to the <c> VCMPSS </c> instruction. 1943/// 1944/// \param a 1945/// A 128-bit vector of [4 x float]. 1946/// \param b 1947/// A 128-bit vector of [4 x float]. 1948/// \param c 1949/// An immediate integer operand, with bits [4:0] specifying which comparison 1950/// operation to use: \n 1951/// 0x00 : Equal (ordered, non-signaling) 1952/// 0x01 : Less-than (ordered, signaling) 1953/// 0x02 : Less-than-or-equal (ordered, signaling) 1954/// 0x03 : Unordered (non-signaling) 1955/// 0x04 : Not-equal (unordered, non-signaling) 1956/// 0x05 : Not-less-than (unordered, signaling) 1957/// 0x06 : Not-less-than-or-equal (unordered, signaling) 1958/// 0x07 : Ordered (non-signaling) 1959/// 0x08 : Equal (unordered, non-signaling) 1960/// 0x09 : Not-greater-than-or-equal (unordered, signaling) 1961/// 0x0a : Not-greater-than (unordered, signaling) 1962/// 0x0b : False (ordered, non-signaling) 1963/// 0x0c : Not-equal (ordered, non-signaling) 1964/// 0x0d : Greater-than-or-equal (ordered, signaling) 1965/// 0x0e : Greater-than (ordered, signaling) 1966/// 0x0f : True (unordered, non-signaling) 1967/// 0x10 : Equal (ordered, signaling) 1968/// 0x11 : Less-than (ordered, non-signaling) 1969/// 0x12 : Less-than-or-equal (ordered, non-signaling) 1970/// 0x13 : Unordered (signaling) 1971/// 0x14 : Not-equal (unordered, signaling) 1972/// 0x15 : Not-less-than (unordered, 
non-signaling) 1973/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) 1974/// 0x17 : Ordered (signaling) 1975/// 0x18 : Equal (unordered, signaling) 1976/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) 1977/// 0x1a : Not-greater-than (unordered, non-signaling) 1978/// 0x1b : False (ordered, signaling) 1979/// 0x1c : Not-equal (ordered, signaling) 1980/// 0x1d : Greater-than-or-equal (ordered, non-signaling) 1981/// 0x1e : Greater-than (ordered, non-signaling) 1982/// 0x1f : True (unordered, signaling) 1983/// \returns A 128-bit vector of [4 x float] containing the comparison results. 1984#define _mm_cmp_ss(a, b, c) __extension__ ({ \ 1985 (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ 1986 (__v4sf)(__m128)(b), (c)); }) 1987 1988/// \brief Takes a [8 x i32] vector and returns the vector element value 1989/// indexed by the immediate constant operand. 1990/// 1991/// \headerfile <x86intrin.h> 1992/// 1993/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 1994/// instruction. 1995/// 1996/// \param __a 1997/// A 256-bit vector of [8 x i32]. 1998/// \param __imm 1999/// An immediate integer operand with bits [2:0] determining which vector 2000/// element is extracted and returned. 2001/// \returns A 32-bit integer containing the extracted 32 bits of extended 2002/// packed data. 2003static __inline int __DEFAULT_FN_ATTRS 2004_mm256_extract_epi32(__m256i __a, const int __imm) 2005{ 2006 __v8si __b = (__v8si)__a; 2007 return __b[__imm & 7]; 2008} 2009 2010/// \brief Takes a [16 x i16] vector and returns the vector element value 2011/// indexed by the immediate constant operand. 2012/// 2013/// \headerfile <x86intrin.h> 2014/// 2015/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2016/// instruction. 2017/// 2018/// \param __a 2019/// A 256-bit integer vector of [16 x i16]. 
2020/// \param __imm 2021/// An immediate integer operand with bits [3:0] determining which vector 2022/// element is extracted and returned. 2023/// \returns A 32-bit integer containing the extracted 16 bits of zero extended 2024/// packed data. 2025static __inline int __DEFAULT_FN_ATTRS 2026_mm256_extract_epi16(__m256i __a, const int __imm) 2027{ 2028 __v16hi __b = (__v16hi)__a; 2029 return (unsigned short)__b[__imm & 15]; 2030} 2031 2032/// \brief Takes a [32 x i8] vector and returns the vector element value 2033/// indexed by the immediate constant operand. 2034/// 2035/// \headerfile <x86intrin.h> 2036/// 2037/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2038/// instruction. 2039/// 2040/// \param __a 2041/// A 256-bit integer vector of [32 x i8]. 2042/// \param __imm 2043/// An immediate integer operand with bits [4:0] determining which vector 2044/// element is extracted and returned. 2045/// \returns A 32-bit integer containing the extracted 8 bits of zero extended 2046/// packed data. 2047static __inline int __DEFAULT_FN_ATTRS 2048_mm256_extract_epi8(__m256i __a, const int __imm) 2049{ 2050 __v32qi __b = (__v32qi)__a; 2051 return (unsigned char)__b[__imm & 31]; 2052} 2053 2054#ifdef __x86_64__ 2055/// \brief Takes a [4 x i64] vector and returns the vector element value 2056/// indexed by the immediate constant operand. 2057/// 2058/// \headerfile <x86intrin.h> 2059/// 2060/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c> 2061/// instruction. 2062/// 2063/// \param __a 2064/// A 256-bit integer vector of [4 x i64]. 2065/// \param __imm 2066/// An immediate integer operand with bits [1:0] determining which vector 2067/// element is extracted and returned. 2068/// \returns A 64-bit integer containing the extracted 64 bits of extended 2069/// packed data. 
2070static __inline long long __DEFAULT_FN_ATTRS 2071_mm256_extract_epi64(__m256i __a, const int __imm) 2072{ 2073 __v4di __b = (__v4di)__a; 2074 return __b[__imm & 3]; 2075} 2076#endif 2077 2078/// \brief Takes a [8 x i32] vector and replaces the vector element value 2079/// indexed by the immediate constant operand by a new value. Returns the 2080/// modified vector. 2081/// 2082/// \headerfile <x86intrin.h> 2083/// 2084/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2085/// instruction. 2086/// 2087/// \param __a 2088/// A vector of [8 x i32] to be used by the insert operation. 2089/// \param __b 2090/// An integer value. The replacement value for the insert operation. 2091/// \param __imm 2092/// An immediate integer specifying the index of the vector element to be 2093/// replaced. 2094/// \returns A copy of vector \a __a, after replacing its element indexed by 2095/// \a __imm with \a __b. 2096static __inline __m256i __DEFAULT_FN_ATTRS 2097_mm256_insert_epi32(__m256i __a, int __b, int const __imm) 2098{ 2099 __v8si __c = (__v8si)__a; 2100 __c[__imm & 7] = __b; 2101 return (__m256i)__c; 2102} 2103 2104 2105/// \brief Takes a [16 x i16] vector and replaces the vector element value 2106/// indexed by the immediate constant operand with a new value. Returns the 2107/// modified vector. 2108/// 2109/// \headerfile <x86intrin.h> 2110/// 2111/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2112/// instruction. 2113/// 2114/// \param __a 2115/// A vector of [16 x i16] to be used by the insert operation. 2116/// \param __b 2117/// An i16 integer value. The replacement value for the insert operation. 2118/// \param __imm 2119/// An immediate integer specifying the index of the vector element to be 2120/// replaced. 2121/// \returns A copy of vector \a __a, after replacing its element indexed by 2122/// \a __imm with \a __b. 
2123static __inline __m256i __DEFAULT_FN_ATTRS 2124_mm256_insert_epi16(__m256i __a, int __b, int const __imm) 2125{ 2126 __v16hi __c = (__v16hi)__a; 2127 __c[__imm & 15] = __b; 2128 return (__m256i)__c; 2129} 2130 2131/// \brief Takes a [32 x i8] vector and replaces the vector element value 2132/// indexed by the immediate constant operand with a new value. Returns the 2133/// modified vector. 2134/// 2135/// \headerfile <x86intrin.h> 2136/// 2137/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2138/// instruction. 2139/// 2140/// \param __a 2141/// A vector of [32 x i8] to be used by the insert operation. 2142/// \param __b 2143/// An i8 integer value. The replacement value for the insert operation. 2144/// \param __imm 2145/// An immediate integer specifying the index of the vector element to be 2146/// replaced. 2147/// \returns A copy of vector \a __a, after replacing its element indexed by 2148/// \a __imm with \a __b. 2149static __inline __m256i __DEFAULT_FN_ATTRS 2150_mm256_insert_epi8(__m256i __a, int __b, int const __imm) 2151{ 2152 __v32qi __c = (__v32qi)__a; 2153 __c[__imm & 31] = __b; 2154 return (__m256i)__c; 2155} 2156 2157#ifdef __x86_64__ 2158/// \brief Takes a [4 x i64] vector and replaces the vector element value 2159/// indexed by the immediate constant operand with a new value. Returns the 2160/// modified vector. 2161/// 2162/// \headerfile <x86intrin.h> 2163/// 2164/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c> 2165/// instruction. 2166/// 2167/// \param __a 2168/// A vector of [4 x i64] to be used by the insert operation. 2169/// \param __b 2170/// A 64-bit integer value. The replacement value for the insert operation. 2171/// \param __imm 2172/// An immediate integer specifying the index of the vector element to be 2173/// replaced. 2174/// \returns A copy of vector \a __a, after replacing its element indexed by 2175/// \a __imm with \a __b. 
2176static __inline __m256i __DEFAULT_FN_ATTRS 2177_mm256_insert_epi64(__m256i __a, long long __b, int const __imm) 2178{ 2179 __v4di __c = (__v4di)__a; 2180 __c[__imm & 3] = __b; 2181 return (__m256i)__c; 2182} 2183#endif 2184 2185/* Conversion */ 2186/// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. 2187/// 2188/// \headerfile <x86intrin.h> 2189/// 2190/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction. 2191/// 2192/// \param __a 2193/// A 128-bit integer vector of [4 x i32]. 2194/// \returns A 256-bit vector of [4 x double] containing the converted values. 2195static __inline __m256d __DEFAULT_FN_ATTRS 2196_mm256_cvtepi32_pd(__m128i __a) 2197{ 2198 return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); 2199} 2200 2201/// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. 2202/// 2203/// \headerfile <x86intrin.h> 2204/// 2205/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction. 2206/// 2207/// \param __a 2208/// A 256-bit integer vector. 2209/// \returns A 256-bit vector of [8 x float] containing the converted values. 2210static __inline __m256 __DEFAULT_FN_ATTRS 2211_mm256_cvtepi32_ps(__m256i __a) 2212{ 2213 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); 2214} 2215 2216/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 2217/// [4 x float]. 2218/// 2219/// \headerfile <x86intrin.h> 2220/// 2221/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction. 2222/// 2223/// \param __a 2224/// A 256-bit vector of [4 x double]. 2225/// \returns A 128-bit vector of [4 x float] containing the converted values. 2226static __inline __m128 __DEFAULT_FN_ATTRS 2227_mm256_cvtpd_ps(__m256d __a) 2228{ 2229 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); 2230} 2231 2232/// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. 
2233/// 2234/// \headerfile <x86intrin.h> 2235/// 2236/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction. 2237/// 2238/// \param __a 2239/// A 256-bit vector of [8 x float]. 2240/// \returns A 256-bit integer vector containing the converted values. 2241static __inline __m256i __DEFAULT_FN_ATTRS 2242_mm256_cvtps_epi32(__m256 __a) 2243{ 2244 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); 2245} 2246 2247/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 2248/// x double]. 2249/// 2250/// \headerfile <x86intrin.h> 2251/// 2252/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction. 2253/// 2254/// \param __a 2255/// A 128-bit vector of [4 x float]. 2256/// \returns A 256-bit vector of [4 x double] containing the converted values. 2257static __inline __m256d __DEFAULT_FN_ATTRS 2258_mm256_cvtps_pd(__m128 __a) 2259{ 2260 return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); 2261} 2262 2263/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2264/// x i32], truncating the result by rounding towards zero when it is 2265/// inexact. 2266/// 2267/// \headerfile <x86intrin.h> 2268/// 2269/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction. 2270/// 2271/// \param __a 2272/// A 256-bit vector of [4 x double]. 2273/// \returns A 128-bit integer vector containing the converted values. 2274static __inline __m128i __DEFAULT_FN_ATTRS 2275_mm256_cvttpd_epi32(__m256d __a) 2276{ 2277 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); 2278} 2279 2280/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 2281/// x i32]. When a conversion is inexact, the value returned is rounded 2282/// according to the rounding control bits in the MXCSR register. 2283/// 2284/// \headerfile <x86intrin.h> 2285/// 2286/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction. 
2287/// 2288/// \param __a 2289/// A 256-bit vector of [4 x double]. 2290/// \returns A 128-bit integer vector containing the converted values. 2291static __inline __m128i __DEFAULT_FN_ATTRS 2292_mm256_cvtpd_epi32(__m256d __a) 2293{ 2294 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); 2295} 2296 2297/// \brief Converts a vector of [8 x float] into a vector of [8 x i32], 2298/// truncating the result by rounding towards zero when it is inexact. 2299/// 2300/// \headerfile <x86intrin.h> 2301/// 2302/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction. 2303/// 2304/// \param __a 2305/// A 256-bit vector of [8 x float]. 2306/// \returns A 256-bit integer vector containing the converted values. 2307static __inline __m256i __DEFAULT_FN_ATTRS 2308_mm256_cvttps_epi32(__m256 __a) 2309{ 2310 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); 2311} 2312 2313/// \brief Returns the first element of the input vector of [4 x double]. 2314/// 2315/// \headerfile <avxintrin.h> 2316/// 2317/// This intrinsic is a utility function and does not correspond to a specific 2318/// instruction. 2319/// 2320/// \param __a 2321/// A 256-bit vector of [4 x double]. 2322/// \returns A 64 bit double containing the first element of the input vector. 2323static __inline double __DEFAULT_FN_ATTRS 2324_mm256_cvtsd_f64(__m256d __a) 2325{ 2326 return __a[0]; 2327} 2328 2329/// \brief Returns the first element of the input vector of [8 x i32]. 2330/// 2331/// \headerfile <avxintrin.h> 2332/// 2333/// This intrinsic is a utility function and does not correspond to a specific 2334/// instruction. 2335/// 2336/// \param __a 2337/// A 256-bit vector of [8 x i32]. 2338/// \returns A 32 bit integer containing the first element of the input vector. 2339static __inline int __DEFAULT_FN_ATTRS 2340_mm256_cvtsi256_si32(__m256i __a) 2341{ 2342 __v8si __b = (__v8si)__a; 2343 return __b[0]; 2344} 2345 2346/// \brief Returns the first element of the input vector of [8 x float]. 
2347/// 2348/// \headerfile <avxintrin.h> 2349/// 2350/// This intrinsic is a utility function and does not correspond to a specific 2351/// instruction. 2352/// 2353/// \param __a 2354/// A 256-bit vector of [8 x float]. 2355/// \returns A 32 bit float containing the first element of the input vector. 2356static __inline float __DEFAULT_FN_ATTRS 2357_mm256_cvtss_f32(__m256 __a) 2358{ 2359 return __a[0]; 2360} 2361 2362/* Vector replicate */ 2363/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit 2364/// vector of [8 x float] to float values in a 256-bit vector of 2365/// [8 x float]. 2366/// 2367/// \headerfile <x86intrin.h> 2368/// 2369/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 2370/// 2371/// \param __a 2372/// A 256-bit vector of [8 x float]. \n 2373/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of 2374/// the return value. \n 2375/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of 2376/// the return value. \n 2377/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the 2378/// return value. \n 2379/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the 2380/// return value. 2381/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2382/// values. 2383static __inline __m256 __DEFAULT_FN_ATTRS 2384_mm256_movehdup_ps(__m256 __a) 2385{ 2386 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); 2387} 2388 2389/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit 2390/// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. 2391/// 2392/// \headerfile <x86intrin.h> 2393/// 2394/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 2395/// 2396/// \param __a 2397/// A 256-bit vector of [8 x float]. \n 2398/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of 2399/// the return value. 
\n 2400/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of 2401/// the return value. \n 2402/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the 2403/// return value. \n 2404/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the 2405/// return value. 2406/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated 2407/// values. 2408static __inline __m256 __DEFAULT_FN_ATTRS 2409_mm256_moveldup_ps(__m256 __a) 2410{ 2411 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); 2412} 2413 2414/// \brief Moves and duplicates double-precision floating point values from a 2415/// 256-bit vector of [4 x double] to double-precision values in a 256-bit 2416/// vector of [4 x double]. 2417/// 2418/// \headerfile <x86intrin.h> 2419/// 2420/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 2421/// 2422/// \param __a 2423/// A 256-bit vector of [4 x double]. \n 2424/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the 2425/// return value. \n 2426/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of 2427/// the return value. 2428/// \returns A 256-bit vector of [4 x double] containing the moved and 2429/// duplicated values. 2430static __inline __m256d __DEFAULT_FN_ATTRS 2431_mm256_movedup_pd(__m256d __a) 2432{ 2433 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); 2434} 2435 2436/* Unpack and Interleave */ 2437/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of 2438/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2439/// 2440/// \headerfile <x86intrin.h> 2441/// 2442/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction. 2443/// 2444/// \param __a 2445/// A 256-bit floating-point vector of [4 x double]. \n 2446/// Bits [127:64] are written to bits [63:0] of the return value. 
\n 2447/// Bits [255:192] are written to bits [191:128] of the return value. \n 2448/// \param __b 2449/// A 256-bit floating-point vector of [4 x double]. \n 2450/// Bits [127:64] are written to bits [127:64] of the return value. \n 2451/// Bits [255:192] are written to bits [255:192] of the return value. \n 2452/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2453static __inline __m256d __DEFAULT_FN_ATTRS 2454_mm256_unpackhi_pd(__m256d __a, __m256d __b) 2455{ 2456 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); 2457} 2458 2459/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of 2460/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. 2461/// 2462/// \headerfile <x86intrin.h> 2463/// 2464/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction. 2465/// 2466/// \param __a 2467/// A 256-bit floating-point vector of [4 x double]. \n 2468/// Bits [63:0] are written to bits [63:0] of the return value. \n 2469/// Bits [191:128] are written to bits [191:128] of the return value. 2470/// \param __b 2471/// A 256-bit floating-point vector of [4 x double]. \n 2472/// Bits [63:0] are written to bits [127:64] of the return value. \n 2473/// Bits [191:128] are written to bits [255:192] of the return value. \n 2474/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 2475static __inline __m256d __DEFAULT_FN_ATTRS 2476_mm256_unpacklo_pd(__m256d __a, __m256d __b) 2477{ 2478 return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); 2479} 2480 2481/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the 2482/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2483/// vector of [8 x float]. 2484/// 2485/// \headerfile <x86intrin.h> 2486/// 2487/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction. 2488/// 2489/// \param __a 2490/// A 256-bit vector of [8 x float]. 
\n 2491/// Bits [95:64] are written to bits [31:0] of the return value. \n 2492/// Bits [127:96] are written to bits [95:64] of the return value. \n 2493/// Bits [223:192] are written to bits [159:128] of the return value. \n 2494/// Bits [255:224] are written to bits [223:192] of the return value. 2495/// \param __b 2496/// A 256-bit vector of [8 x float]. \n 2497/// Bits [95:64] are written to bits [63:32] of the return value. \n 2498/// Bits [127:96] are written to bits [127:96] of the return value. \n 2499/// Bits [223:192] are written to bits [191:160] of the return value. \n 2500/// Bits [255:224] are written to bits [255:224] of the return value. 2501/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2502static __inline __m256 __DEFAULT_FN_ATTRS 2503_mm256_unpackhi_ps(__m256 __a, __m256 __b) 2504{ 2505 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 2506} 2507 2508/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the 2509/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit 2510/// vector of [8 x float]. 2511/// 2512/// \headerfile <x86intrin.h> 2513/// 2514/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction. 2515/// 2516/// \param __a 2517/// A 256-bit vector of [8 x float]. \n 2518/// Bits [31:0] are written to bits [31:0] of the return value. \n 2519/// Bits [63:32] are written to bits [95:64] of the return value. \n 2520/// Bits [159:128] are written to bits [159:128] of the return value. \n 2521/// Bits [191:160] are written to bits [223:192] of the return value. 2522/// \param __b 2523/// A 256-bit vector of [8 x float]. \n 2524/// Bits [31:0] are written to bits [63:32] of the return value. \n 2525/// Bits [63:32] are written to bits [127:96] of the return value. \n 2526/// Bits [159:128] are written to bits [191:160] of the return value. \n 2527/// Bits [191:160] are written to bits [255:224] of the return value. 
2528/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 2529static __inline __m256 __DEFAULT_FN_ATTRS 2530_mm256_unpacklo_ps(__m256 __a, __m256 __b) 2531{ 2532 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 2533} 2534 2535/* Bit Test */ 2536/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2537/// element-by-element comparison of the double-precision element in the 2538/// first source vector and the corresponding element in the second source 2539/// vector. The EFLAGS register is updated as follows: \n 2540/// If there is at least one pair of double-precision elements where the 2541/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2542/// ZF flag is set to 1. \n 2543/// If there is at least one pair of double-precision elements where the 2544/// sign-bit of the first element is 0 and the sign-bit of the second element 2545/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2546/// This intrinsic returns the value of the ZF flag. 2547/// 2548/// \headerfile <x86intrin.h> 2549/// 2550/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2551/// 2552/// \param __a 2553/// A 128-bit vector of [2 x double]. 2554/// \param __b 2555/// A 128-bit vector of [2 x double]. 2556/// \returns the ZF flag in the EFLAGS register. 2557static __inline int __DEFAULT_FN_ATTRS 2558_mm_testz_pd(__m128d __a, __m128d __b) 2559{ 2560 return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); 2561} 2562 2563/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2564/// element-by-element comparison of the double-precision element in the 2565/// first source vector and the corresponding element in the second source 2566/// vector. 
The EFLAGS register is updated as follows: \n 2567/// If there is at least one pair of double-precision elements where the 2568/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2569/// ZF flag is set to 1. \n 2570/// If there is at least one pair of double-precision elements where the 2571/// sign-bit of the first element is 0 and the sign-bit of the second element 2572/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2573/// This intrinsic returns the value of the CF flag. 2574/// 2575/// \headerfile <x86intrin.h> 2576/// 2577/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2578/// 2579/// \param __a 2580/// A 128-bit vector of [2 x double]. 2581/// \param __b 2582/// A 128-bit vector of [2 x double]. 2583/// \returns the CF flag in the EFLAGS register. 2584static __inline int __DEFAULT_FN_ATTRS 2585_mm_testc_pd(__m128d __a, __m128d __b) 2586{ 2587 return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); 2588} 2589 2590/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an 2591/// element-by-element comparison of the double-precision element in the 2592/// first source vector and the corresponding element in the second source 2593/// vector. The EFLAGS register is updated as follows: \n 2594/// If there is at least one pair of double-precision elements where the 2595/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2596/// ZF flag is set to 1. \n 2597/// If there is at least one pair of double-precision elements where the 2598/// sign-bit of the first element is 0 and the sign-bit of the second element 2599/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2600/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2601/// otherwise it returns 0. 2602/// 2603/// \headerfile <x86intrin.h> 2604/// 2605/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 
2606/// 2607/// \param __a 2608/// A 128-bit vector of [2 x double]. 2609/// \param __b 2610/// A 128-bit vector of [2 x double]. 2611/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2612static __inline int __DEFAULT_FN_ATTRS 2613_mm_testnzc_pd(__m128d __a, __m128d __b) 2614{ 2615 return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); 2616} 2617 2618/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2619/// element-by-element comparison of the single-precision element in the 2620/// first source vector and the corresponding element in the second source 2621/// vector. The EFLAGS register is updated as follows: \n 2622/// If there is at least one pair of single-precision elements where the 2623/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2624/// ZF flag is set to 1. \n 2625/// If there is at least one pair of single-precision elements where the 2626/// sign-bit of the first element is 0 and the sign-bit of the second element 2627/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2628/// This intrinsic returns the value of the ZF flag. 2629/// 2630/// \headerfile <x86intrin.h> 2631/// 2632/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2633/// 2634/// \param __a 2635/// A 128-bit vector of [4 x float]. 2636/// \param __b 2637/// A 128-bit vector of [4 x float]. 2638/// \returns the ZF flag. 2639static __inline int __DEFAULT_FN_ATTRS 2640_mm_testz_ps(__m128 __a, __m128 __b) 2641{ 2642 return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); 2643} 2644 2645/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2646/// element-by-element comparison of the single-precision element in the 2647/// first source vector and the corresponding element in the second source 2648/// vector. 
The EFLAGS register is updated as follows: \n 2649/// If there is at least one pair of single-precision elements where the 2650/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2651/// ZF flag is set to 1. \n 2652/// If there is at least one pair of single-precision elements where the 2653/// sign-bit of the first element is 0 and the sign-bit of the second element 2654/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2655/// This intrinsic returns the value of the CF flag. 2656/// 2657/// \headerfile <x86intrin.h> 2658/// 2659/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2660/// 2661/// \param __a 2662/// A 128-bit vector of [4 x float]. 2663/// \param __b 2664/// A 128-bit vector of [4 x float]. 2665/// \returns the CF flag. 2666static __inline int __DEFAULT_FN_ATTRS 2667_mm_testc_ps(__m128 __a, __m128 __b) 2668{ 2669 return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); 2670} 2671 2672/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an 2673/// element-by-element comparison of the single-precision element in the 2674/// first source vector and the corresponding element in the second source 2675/// vector. The EFLAGS register is updated as follows: \n 2676/// If there is at least one pair of single-precision elements where the 2677/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2678/// ZF flag is set to 1. \n 2679/// If there is at least one pair of single-precision elements where the 2680/// sign-bit of the first element is 0 and the sign-bit of the second element 2681/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2682/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2683/// otherwise it returns 0. 2684/// 2685/// \headerfile <x86intrin.h> 2686/// 2687/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2688/// 2689/// \param __a 2690/// A 128-bit vector of [4 x float]. 
2691/// \param __b 2692/// A 128-bit vector of [4 x float]. 2693/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2694static __inline int __DEFAULT_FN_ATTRS 2695_mm_testnzc_ps(__m128 __a, __m128 __b) 2696{ 2697 return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); 2698} 2699 2700/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2701/// element-by-element comparison of the double-precision elements in the 2702/// first source vector and the corresponding elements in the second source 2703/// vector. The EFLAGS register is updated as follows: \n 2704/// If there is at least one pair of double-precision elements where the 2705/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2706/// ZF flag is set to 1. \n 2707/// If there is at least one pair of double-precision elements where the 2708/// sign-bit of the first element is 0 and the sign-bit of the second element 2709/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2710/// This intrinsic returns the value of the ZF flag. 2711/// 2712/// \headerfile <x86intrin.h> 2713/// 2714/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2715/// 2716/// \param __a 2717/// A 256-bit vector of [4 x double]. 2718/// \param __b 2719/// A 256-bit vector of [4 x double]. 2720/// \returns the ZF flag. 2721static __inline int __DEFAULT_FN_ATTRS 2722_mm256_testz_pd(__m256d __a, __m256d __b) 2723{ 2724 return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); 2725} 2726 2727/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2728/// element-by-element comparison of the double-precision elements in the 2729/// first source vector and the corresponding elements in the second source 2730/// vector. The EFLAGS register is updated as follows: \n 2731/// If there is at least one pair of double-precision elements where the 2732/// sign-bits of both elements are 1, the ZF flag is set to 0. 
Otherwise the 2733/// ZF flag is set to 1. \n 2734/// If there is at least one pair of double-precision elements where the 2735/// sign-bit of the first element is 0 and the sign-bit of the second element 2736/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2737/// This intrinsic returns the value of the CF flag. 2738/// 2739/// \headerfile <x86intrin.h> 2740/// 2741/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2742/// 2743/// \param __a 2744/// A 256-bit vector of [4 x double]. 2745/// \param __b 2746/// A 256-bit vector of [4 x double]. 2747/// \returns the CF flag. 2748static __inline int __DEFAULT_FN_ATTRS 2749_mm256_testc_pd(__m256d __a, __m256d __b) 2750{ 2751 return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); 2752} 2753 2754/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an 2755/// element-by-element comparison of the double-precision elements in the 2756/// first source vector and the corresponding elements in the second source 2757/// vector. The EFLAGS register is updated as follows: \n 2758/// If there is at least one pair of double-precision elements where the 2759/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2760/// ZF flag is set to 1. \n 2761/// If there is at least one pair of double-precision elements where the 2762/// sign-bit of the first element is 0 and the sign-bit of the second element 2763/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2764/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2765/// otherwise it returns 0. 2766/// 2767/// \headerfile <x86intrin.h> 2768/// 2769/// This intrinsic corresponds to the <c> VTESTPD </c> instruction. 2770/// 2771/// \param __a 2772/// A 256-bit vector of [4 x double]. 2773/// \param __b 2774/// A 256-bit vector of [4 x double]. 2775/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 
2776static __inline int __DEFAULT_FN_ATTRS 2777_mm256_testnzc_pd(__m256d __a, __m256d __b) 2778{ 2779 return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); 2780} 2781 2782/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2783/// element-by-element comparison of the single-precision element in the 2784/// first source vector and the corresponding element in the second source 2785/// vector. The EFLAGS register is updated as follows: \n 2786/// If there is at least one pair of single-precision elements where the 2787/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2788/// ZF flag is set to 1. \n 2789/// If there is at least one pair of single-precision elements where the 2790/// sign-bit of the first element is 0 and the sign-bit of the second element 2791/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2792/// This intrinsic returns the value of the ZF flag. 2793/// 2794/// \headerfile <x86intrin.h> 2795/// 2796/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2797/// 2798/// \param __a 2799/// A 256-bit vector of [8 x float]. 2800/// \param __b 2801/// A 256-bit vector of [8 x float]. 2802/// \returns the ZF flag. 2803static __inline int __DEFAULT_FN_ATTRS 2804_mm256_testz_ps(__m256 __a, __m256 __b) 2805{ 2806 return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); 2807} 2808 2809/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2810/// element-by-element comparison of the single-precision element in the 2811/// first source vector and the corresponding element in the second source 2812/// vector. The EFLAGS register is updated as follows: \n 2813/// If there is at least one pair of single-precision elements where the 2814/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2815/// ZF flag is set to 1. 
\n 2816/// If there is at least one pair of single-precision elements where the 2817/// sign-bit of the first element is 0 and the sign-bit of the second element 2818/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2819/// This intrinsic returns the value of the CF flag. 2820/// 2821/// \headerfile <x86intrin.h> 2822/// 2823/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2824/// 2825/// \param __a 2826/// A 256-bit vector of [8 x float]. 2827/// \param __b 2828/// A 256-bit vector of [8 x float]. 2829/// \returns the CF flag. 2830static __inline int __DEFAULT_FN_ATTRS 2831_mm256_testc_ps(__m256 __a, __m256 __b) 2832{ 2833 return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); 2834} 2835 2836/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an 2837/// element-by-element comparison of the single-precision elements in the 2838/// first source vector and the corresponding elements in the second source 2839/// vector. The EFLAGS register is updated as follows: \n 2840/// If there is at least one pair of single-precision elements where the 2841/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the 2842/// ZF flag is set to 1. \n 2843/// If there is at least one pair of single-precision elements where the 2844/// sign-bit of the first element is 0 and the sign-bit of the second element 2845/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n 2846/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2847/// otherwise it returns 0. 2848/// 2849/// \headerfile <x86intrin.h> 2850/// 2851/// This intrinsic corresponds to the <c> VTESTPS </c> instruction. 2852/// 2853/// \param __a 2854/// A 256-bit vector of [8 x float]. 2855/// \param __b 2856/// A 256-bit vector of [8 x float]. 2857/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 
2858static __inline int __DEFAULT_FN_ATTRS 2859_mm256_testnzc_ps(__m256 __a, __m256 __b) 2860{ 2861 return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); 2862} 2863 2864/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2865/// of the two source vectors and update the EFLAGS register as follows: \n 2866/// If there is at least one pair of bits where both bits are 1, the ZF flag 2867/// is set to 0. Otherwise the ZF flag is set to 1. \n 2868/// If there is at least one pair of bits where the bit from the first source 2869/// vector is 0 and the bit from the second source vector is 1, the CF flag 2870/// is set to 0. Otherwise the CF flag is set to 1. \n 2871/// This intrinsic returns the value of the ZF flag. 2872/// 2873/// \headerfile <x86intrin.h> 2874/// 2875/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2876/// 2877/// \param __a 2878/// A 256-bit integer vector. 2879/// \param __b 2880/// A 256-bit integer vector. 2881/// \returns the ZF flag. 2882static __inline int __DEFAULT_FN_ATTRS 2883_mm256_testz_si256(__m256i __a, __m256i __b) 2884{ 2885 return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); 2886} 2887 2888/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2889/// of the two source vectors and update the EFLAGS register as follows: \n 2890/// If there is at least one pair of bits where both bits are 1, the ZF flag 2891/// is set to 0. Otherwise the ZF flag is set to 1. \n 2892/// If there is at least one pair of bits where the bit from the first source 2893/// vector is 0 and the bit from the second source vector is 1, the CF flag 2894/// is set to 0. Otherwise the CF flag is set to 1. \n 2895/// This intrinsic returns the value of the CF flag. 2896/// 2897/// \headerfile <x86intrin.h> 2898/// 2899/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2900/// 2901/// \param __a 2902/// A 256-bit integer vector. 
2903/// \param __b 2904/// A 256-bit integer vector. 2905/// \returns the CF flag. 2906static __inline int __DEFAULT_FN_ATTRS 2907_mm256_testc_si256(__m256i __a, __m256i __b) 2908{ 2909 return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); 2910} 2911 2912/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison 2913/// of the two source vectors and update the EFLAGS register as follows: \n 2914/// If there is at least one pair of bits where both bits are 1, the ZF flag 2915/// is set to 0. Otherwise the ZF flag is set to 1. \n 2916/// If there is at least one pair of bits where the bit from the first source 2917/// vector is 0 and the bit from the second source vector is 1, the CF flag 2918/// is set to 0. Otherwise the CF flag is set to 1. \n 2919/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, 2920/// otherwise it returns 0. 2921/// 2922/// \headerfile <x86intrin.h> 2923/// 2924/// This intrinsic corresponds to the <c> VPTEST </c> instruction. 2925/// 2926/// \param __a 2927/// A 256-bit integer vector. 2928/// \param __b 2929/// A 256-bit integer vector. 2930/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. 2931static __inline int __DEFAULT_FN_ATTRS 2932_mm256_testnzc_si256(__m256i __a, __m256i __b) 2933{ 2934 return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); 2935} 2936 2937/* Vector extract sign mask */ 2938/// \brief Extracts the sign bits of double-precision floating point elements 2939/// in a 256-bit vector of [4 x double] and writes them to the lower order 2940/// bits of the return value. 2941/// 2942/// \headerfile <x86intrin.h> 2943/// 2944/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction. 2945/// 2946/// \param __a 2947/// A 256-bit vector of [4 x double] containing the double-precision 2948/// floating point values with sign bits to be extracted. 2949/// \returns The sign bits from the operand, written to bits [3:0]. 
2950static __inline int __DEFAULT_FN_ATTRS 2951_mm256_movemask_pd(__m256d __a) 2952{ 2953 return __builtin_ia32_movmskpd256((__v4df)__a); 2954} 2955 2956/// \brief Extracts the sign bits of double-precision floating point elements 2957/// in a 256-bit vector of [8 x float] and writes them to the lower order 2958/// bits of the return value. 2959/// 2960/// \headerfile <x86intrin.h> 2961/// 2962/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. 2963/// 2964/// \param __a 2965/// A 256-bit vector of [8 x float] containing the double-precision floating 2966/// point values with sign bits to be extracted. 2967/// \returns The sign bits from the operand, written to bits [7:0]. 2968static __inline int __DEFAULT_FN_ATTRS 2969_mm256_movemask_ps(__m256 __a) 2970{ 2971 return __builtin_ia32_movmskps256((__v8sf)__a); 2972} 2973 2974/* Vector __zero */ 2975/// \brief Zeroes the contents of all XMM or YMM registers. 2976/// 2977/// \headerfile <x86intrin.h> 2978/// 2979/// This intrinsic corresponds to the <c> VZEROALL </c> instruction. 2980static __inline void __DEFAULT_FN_ATTRS 2981_mm256_zeroall(void) 2982{ 2983 __builtin_ia32_vzeroall(); 2984} 2985 2986/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 2987/// 2988/// \headerfile <x86intrin.h> 2989/// 2990/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. 2991static __inline void __DEFAULT_FN_ATTRS 2992_mm256_zeroupper(void) 2993{ 2994 __builtin_ia32_vzeroupper(); 2995} 2996 2997/* Vector load with broadcast */ 2998/// \brief Loads a scalar single-precision floating point value from the 2999/// specified address pointed to by \a __a and broadcasts it to the elements 3000/// of a [4 x float] vector. 3001/// 3002/// \headerfile <x86intrin.h> 3003/// 3004/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3005/// 3006/// \param __a 3007/// The single-precision floating point value to be broadcast. 
3008/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set 3009/// equal to the broadcast value. 3010static __inline __m128 __DEFAULT_FN_ATTRS 3011_mm_broadcast_ss(float const *__a) 3012{ 3013 float __f = *__a; 3014 return (__m128)(__v4sf){ __f, __f, __f, __f }; 3015} 3016 3017/// \brief Loads a scalar double-precision floating point value from the 3018/// specified address pointed to by \a __a and broadcasts it to the elements 3019/// of a [4 x double] vector. 3020/// 3021/// \headerfile <x86intrin.h> 3022/// 3023/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction. 3024/// 3025/// \param __a 3026/// The double-precision floating point value to be broadcast. 3027/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set 3028/// equal to the broadcast value. 3029static __inline __m256d __DEFAULT_FN_ATTRS 3030_mm256_broadcast_sd(double const *__a) 3031{ 3032 double __d = *__a; 3033 return (__m256d)(__v4df){ __d, __d, __d, __d }; 3034} 3035 3036/// \brief Loads a scalar single-precision floating point value from the 3037/// specified address pointed to by \a __a and broadcasts it to the elements 3038/// of a [8 x float] vector. 3039/// 3040/// \headerfile <x86intrin.h> 3041/// 3042/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction. 3043/// 3044/// \param __a 3045/// The single-precision floating point value to be broadcast. 3046/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set 3047/// equal to the broadcast value. 3048static __inline __m256 __DEFAULT_FN_ATTRS 3049_mm256_broadcast_ss(float const *__a) 3050{ 3051 float __f = *__a; 3052 return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; 3053} 3054 3055/// \brief Loads the data from a 128-bit vector of [2 x double] from the 3056/// specified address pointed to by \a __a and broadcasts it to 128-bit 3057/// elements in a 256-bit vector of [4 x double]. 
3058/// 3059/// \headerfile <x86intrin.h> 3060/// 3061/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3062/// 3063/// \param __a 3064/// The 128-bit vector of [2 x double] to be broadcast. 3065/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set 3066/// equal to the broadcast value. 3067static __inline __m256d __DEFAULT_FN_ATTRS 3068_mm256_broadcast_pd(__m128d const *__a) 3069{ 3070 return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); 3071} 3072 3073/// \brief Loads the data from a 128-bit vector of [4 x float] from the 3074/// specified address pointed to by \a __a and broadcasts it to 128-bit 3075/// elements in a 256-bit vector of [8 x float]. 3076/// 3077/// \headerfile <x86intrin.h> 3078/// 3079/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction. 3080/// 3081/// \param __a 3082/// The 128-bit vector of [4 x float] to be broadcast. 3083/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set 3084/// equal to the broadcast value. 3085static __inline __m256 __DEFAULT_FN_ATTRS 3086_mm256_broadcast_ps(__m128 const *__a) 3087{ 3088 return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a); 3089} 3090 3091/* SIMD load ops */ 3092/// \brief Loads 4 double-precision floating point values from a 32-byte aligned 3093/// memory location pointed to by \a __p into a vector of [4 x double]. 3094/// 3095/// \headerfile <x86intrin.h> 3096/// 3097/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3098/// 3099/// \param __p 3100/// A 32-byte aligned pointer to a memory location containing 3101/// double-precision floating point values. 3102/// \returns A 256-bit vector of [4 x double] containing the moved values. 
3103static __inline __m256d __DEFAULT_FN_ATTRS 3104_mm256_load_pd(double const *__p) 3105{ 3106 return *(__m256d *)__p; 3107} 3108 3109/// \brief Loads 8 single-precision floating point values from a 32-byte aligned 3110/// memory location pointed to by \a __p into a vector of [8 x float]. 3111/// 3112/// \headerfile <x86intrin.h> 3113/// 3114/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3115/// 3116/// \param __p 3117/// A 32-byte aligned pointer to a memory location containing float values. 3118/// \returns A 256-bit vector of [8 x float] containing the moved values. 3119static __inline __m256 __DEFAULT_FN_ATTRS 3120_mm256_load_ps(float const *__p) 3121{ 3122 return *(__m256 *)__p; 3123} 3124 3125/// \brief Loads 4 double-precision floating point values from an unaligned 3126/// memory location pointed to by \a __p into a vector of [4 x double]. 3127/// 3128/// \headerfile <x86intrin.h> 3129/// 3130/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 3131/// 3132/// \param __p 3133/// A pointer to a memory location containing double-precision floating 3134/// point values. 3135/// \returns A 256-bit vector of [4 x double] containing the moved values. 3136static __inline __m256d __DEFAULT_FN_ATTRS 3137_mm256_loadu_pd(double const *__p) 3138{ 3139 struct __loadu_pd { 3140 __m256d __v; 3141 } __attribute__((__packed__, __may_alias__)); 3142 return ((struct __loadu_pd*)__p)->__v; 3143} 3144 3145/// \brief Loads 8 single-precision floating point values from an unaligned 3146/// memory location pointed to by \a __p into a vector of [8 x float]. 3147/// 3148/// \headerfile <x86intrin.h> 3149/// 3150/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3151/// 3152/// \param __p 3153/// A pointer to a memory location containing single-precision floating 3154/// point values. 3155/// \returns A 256-bit vector of [8 x float] containing the moved values. 
3156static __inline __m256 __DEFAULT_FN_ATTRS 3157_mm256_loadu_ps(float const *__p) 3158{ 3159 struct __loadu_ps { 3160 __m256 __v; 3161 } __attribute__((__packed__, __may_alias__)); 3162 return ((struct __loadu_ps*)__p)->__v; 3163} 3164 3165/// \brief Loads 256 bits of integer data from a 32-byte aligned memory 3166/// location pointed to by \a __p into elements of a 256-bit integer vector. 3167/// 3168/// \headerfile <x86intrin.h> 3169/// 3170/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3171/// 3172/// \param __p 3173/// A 32-byte aligned pointer to a 256-bit integer vector containing integer 3174/// values. 3175/// \returns A 256-bit integer vector containing the moved values. 3176static __inline __m256i __DEFAULT_FN_ATTRS 3177_mm256_load_si256(__m256i const *__p) 3178{ 3179 return *__p; 3180} 3181 3182/// \brief Loads 256 bits of integer data from an unaligned memory location 3183/// pointed to by \a __p into a 256-bit integer vector. 3184/// 3185/// \headerfile <x86intrin.h> 3186/// 3187/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3188/// 3189/// \param __p 3190/// A pointer to a 256-bit integer vector containing integer values. 3191/// \returns A 256-bit integer vector containing the moved values. 3192static __inline __m256i __DEFAULT_FN_ATTRS 3193_mm256_loadu_si256(__m256i const *__p) 3194{ 3195 struct __loadu_si256 { 3196 __m256i __v; 3197 } __attribute__((__packed__, __may_alias__)); 3198 return ((struct __loadu_si256*)__p)->__v; 3199} 3200 3201/// \brief Loads 256 bits of integer data from an unaligned memory location 3202/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may 3203/// perform better than \c _mm256_loadu_si256 when the data crosses a cache 3204/// line boundary. 3205/// 3206/// \headerfile <x86intrin.h> 3207/// 3208/// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 
3209/// 3210/// \param __p 3211/// A pointer to a 256-bit integer vector containing integer values. 3212/// \returns A 256-bit integer vector containing the moved values. 3213static __inline __m256i __DEFAULT_FN_ATTRS 3214_mm256_lddqu_si256(__m256i const *__p) 3215{ 3216 return (__m256i)__builtin_ia32_lddqu256((char const *)__p); 3217} 3218 3219/* SIMD store ops */ 3220/// \brief Stores double-precision floating point values from a 256-bit vector 3221/// of [4 x double] to a 32-byte aligned memory location pointed to by 3222/// \a __p. 3223/// 3224/// \headerfile <x86intrin.h> 3225/// 3226/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction. 3227/// 3228/// \param __p 3229/// A 32-byte aligned pointer to a memory location that will receive the 3230/// double-precision floaing point values. 3231/// \param __a 3232/// A 256-bit vector of [4 x double] containing the values to be moved. 3233static __inline void __DEFAULT_FN_ATTRS 3234_mm256_store_pd(double *__p, __m256d __a) 3235{ 3236 *(__m256d *)__p = __a; 3237} 3238 3239/// \brief Stores single-precision floating point values from a 256-bit vector 3240/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. 3241/// 3242/// \headerfile <x86intrin.h> 3243/// 3244/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction. 3245/// 3246/// \param __p 3247/// A 32-byte aligned pointer to a memory location that will receive the 3248/// float values. 3249/// \param __a 3250/// A 256-bit vector of [8 x float] containing the values to be moved. 3251static __inline void __DEFAULT_FN_ATTRS 3252_mm256_store_ps(float *__p, __m256 __a) 3253{ 3254 *(__m256 *)__p = __a; 3255} 3256 3257/// \brief Stores double-precision floating point values from a 256-bit vector 3258/// of [4 x double] to an unaligned memory location pointed to by \a __p. 3259/// 3260/// \headerfile <x86intrin.h> 3261/// 3262/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction. 
3263/// 3264/// \param __p 3265/// A pointer to a memory location that will receive the double-precision 3266/// floating point values. 3267/// \param __a 3268/// A 256-bit vector of [4 x double] containing the values to be moved. 3269static __inline void __DEFAULT_FN_ATTRS 3270_mm256_storeu_pd(double *__p, __m256d __a) 3271{ 3272 struct __storeu_pd { 3273 __m256d __v; 3274 } __attribute__((__packed__, __may_alias__)); 3275 ((struct __storeu_pd*)__p)->__v = __a; 3276} 3277 3278/// \brief Stores single-precision floating point values from a 256-bit vector 3279/// of [8 x float] to an unaligned memory location pointed to by \a __p. 3280/// 3281/// \headerfile <x86intrin.h> 3282/// 3283/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction. 3284/// 3285/// \param __p 3286/// A pointer to a memory location that will receive the float values. 3287/// \param __a 3288/// A 256-bit vector of [8 x float] containing the values to be moved. 3289static __inline void __DEFAULT_FN_ATTRS 3290_mm256_storeu_ps(float *__p, __m256 __a) 3291{ 3292 struct __storeu_ps { 3293 __m256 __v; 3294 } __attribute__((__packed__, __may_alias__)); 3295 ((struct __storeu_ps*)__p)->__v = __a; 3296} 3297 3298/// \brief Stores integer values from a 256-bit integer vector to a 32-byte 3299/// aligned memory location pointed to by \a __p. 3300/// 3301/// \headerfile <x86intrin.h> 3302/// 3303/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction. 3304/// 3305/// \param __p 3306/// A 32-byte aligned pointer to a memory location that will receive the 3307/// integer values. 3308/// \param __a 3309/// A 256-bit integer vector containing the values to be moved. 3310static __inline void __DEFAULT_FN_ATTRS 3311_mm256_store_si256(__m256i *__p, __m256i __a) 3312{ 3313 *__p = __a; 3314} 3315 3316/// \brief Stores integer values from a 256-bit integer vector to an unaligned 3317/// memory location pointed to by \a __p. 
3318/// 3319/// \headerfile <x86intrin.h> 3320/// 3321/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction. 3322/// 3323/// \param __p 3324/// A pointer to a memory location that will receive the integer values. 3325/// \param __a 3326/// A 256-bit integer vector containing the values to be moved. 3327static __inline void __DEFAULT_FN_ATTRS 3328_mm256_storeu_si256(__m256i *__p, __m256i __a) 3329{ 3330 struct __storeu_si256 { 3331 __m256i __v; 3332 } __attribute__((__packed__, __may_alias__)); 3333 ((struct __storeu_si256*)__p)->__v = __a; 3334} 3335 3336/* Conditional load ops */ 3337/// \brief Conditionally loads double-precision floating point elements from a 3338/// memory location pointed to by \a __p into a 128-bit vector of 3339/// [2 x double], depending on the mask bits associated with each data 3340/// element. 3341/// 3342/// \headerfile <x86intrin.h> 3343/// 3344/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3345/// 3346/// \param __p 3347/// A pointer to a memory location that contains the double-precision 3348/// floating point values. 3349/// \param __m 3350/// A 128-bit integer vector containing the mask. The most significant bit of 3351/// each data element represents the mask bits. If a mask bit is zero, the 3352/// corresponding value in the memory location is not loaded and the 3353/// corresponding field in the return value is set to zero. 3354/// \returns A 128-bit vector of [2 x double] containing the loaded values. 3355static __inline __m128d __DEFAULT_FN_ATTRS 3356_mm_maskload_pd(double const *__p, __m128i __m) 3357{ 3358 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); 3359} 3360 3361/// \brief Conditionally loads double-precision floating point elements from a 3362/// memory location pointed to by \a __p into a 256-bit vector of 3363/// [4 x double], depending on the mask bits associated with each data 3364/// element. 
3365/// 3366/// \headerfile <x86intrin.h> 3367/// 3368/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3369/// 3370/// \param __p 3371/// A pointer to a memory location that contains the double-precision 3372/// floating point values. 3373/// \param __m 3374/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3375/// significant bit of each quadword element represents the mask bits. If a 3376/// mask bit is zero, the corresponding value in the memory location is not 3377/// loaded and the corresponding field in the return value is set to zero. 3378/// \returns A 256-bit vector of [4 x double] containing the loaded values. 3379static __inline __m256d __DEFAULT_FN_ATTRS 3380_mm256_maskload_pd(double const *__p, __m256i __m) 3381{ 3382 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, 3383 (__v4di)__m); 3384} 3385 3386/// \brief Conditionally loads single-precision floating point elements from a 3387/// memory location pointed to by \a __p into a 128-bit vector of 3388/// [4 x float], depending on the mask bits associated with each data 3389/// element. 3390/// 3391/// \headerfile <x86intrin.h> 3392/// 3393/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3394/// 3395/// \param __p 3396/// A pointer to a memory location that contains the single-precision 3397/// floating point values. 3398/// \param __m 3399/// A 128-bit integer vector containing the mask. The most significant bit of 3400/// each data element represents the mask bits. If a mask bit is zero, the 3401/// corresponding value in the memory location is not loaded and the 3402/// corresponding field in the return value is set to zero. 3403/// \returns A 128-bit vector of [4 x float] containing the loaded values. 
3404static __inline __m128 __DEFAULT_FN_ATTRS 3405_mm_maskload_ps(float const *__p, __m128i __m) 3406{ 3407 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); 3408} 3409 3410/// \brief Conditionally loads single-precision floating point elements from a 3411/// memory location pointed to by \a __p into a 256-bit vector of 3412/// [8 x float], depending on the mask bits associated with each data 3413/// element. 3414/// 3415/// \headerfile <x86intrin.h> 3416/// 3417/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3418/// 3419/// \param __p 3420/// A pointer to a memory location that contains the single-precision 3421/// floating point values. 3422/// \param __m 3423/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3424/// significant bit of each dword element represents the mask bits. If a mask 3425/// bit is zero, the corresponding value in the memory location is not loaded 3426/// and the corresponding field in the return value is set to zero. 3427/// \returns A 256-bit vector of [8 x float] containing the loaded values. 3428static __inline __m256 __DEFAULT_FN_ATTRS 3429_mm256_maskload_ps(float const *__p, __m256i __m) 3430{ 3431 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); 3432} 3433 3434/* Conditional store ops */ 3435/// \brief Moves single-precision floating point values from a 256-bit vector 3436/// of [8 x float] to a memory location pointed to by \a __p, according to 3437/// the specified mask. 3438/// 3439/// \headerfile <x86intrin.h> 3440/// 3441/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3442/// 3443/// \param __p 3444/// A pointer to a memory location that will receive the float values. 3445/// \param __m 3446/// A 256-bit integer vector of [8 x dword] containing the mask. The most 3447/// significant bit of each dword element in the mask vector represents the 3448/// mask bits. 
If a mask bit is zero, the corresponding value from vector 3449/// \a __a is not stored and the corresponding field in the memory location 3450/// pointed to by \a __p is not changed. 3451/// \param __a 3452/// A 256-bit vector of [8 x float] containing the values to be stored. 3453static __inline void __DEFAULT_FN_ATTRS 3454_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) 3455{ 3456 __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); 3457} 3458 3459/// \brief Moves double-precision values from a 128-bit vector of [2 x double] 3460/// to a memory location pointed to by \a __p, according to the specified 3461/// mask. 3462/// 3463/// \headerfile <x86intrin.h> 3464/// 3465/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3466/// 3467/// \param __p 3468/// A pointer to a memory location that will receive the float values. 3469/// \param __m 3470/// A 128-bit integer vector containing the mask. The most significant bit of 3471/// each field in the mask vector represents the mask bits. If a mask bit is 3472/// zero, the corresponding value from vector \a __a is not stored and the 3473/// corresponding field in the memory location pointed to by \a __p is not 3474/// changed. 3475/// \param __a 3476/// A 128-bit vector of [2 x double] containing the values to be stored. 3477static __inline void __DEFAULT_FN_ATTRS 3478_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) 3479{ 3480 __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); 3481} 3482 3483/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3484/// to a memory location pointed to by \a __p, according to the specified 3485/// mask. 3486/// 3487/// \headerfile <x86intrin.h> 3488/// 3489/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction. 3490/// 3491/// \param __p 3492/// A pointer to a memory location that will receive the float values. 
3493/// \param __m 3494/// A 256-bit integer vector of [4 x quadword] containing the mask. The most 3495/// significant bit of each quadword element in the mask vector represents 3496/// the mask bits. If a mask bit is zero, the corresponding value from vector 3497/// __a is not stored and the corresponding field in the memory location 3498/// pointed to by \a __p is not changed. 3499/// \param __a 3500/// A 256-bit vector of [4 x double] containing the values to be stored. 3501static __inline void __DEFAULT_FN_ATTRS 3502_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) 3503{ 3504 __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); 3505} 3506 3507/// \brief Moves single-precision floating point values from a 128-bit vector 3508/// of [4 x float] to a memory location pointed to by \a __p, according to 3509/// the specified mask. 3510/// 3511/// \headerfile <x86intrin.h> 3512/// 3513/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction. 3514/// 3515/// \param __p 3516/// A pointer to a memory location that will receive the float values. 3517/// \param __m 3518/// A 128-bit integer vector containing the mask. The most significant bit of 3519/// each field in the mask vector represents the mask bits. If a mask bit is 3520/// zero, the corresponding value from vector __a is not stored and the 3521/// corresponding field in the memory location pointed to by \a __p is not 3522/// changed. 3523/// \param __a 3524/// A 128-bit vector of [4 x float] containing the values to be stored. 3525static __inline void __DEFAULT_FN_ATTRS 3526_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) 3527{ 3528 __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); 3529} 3530 3531/* Cacheability support ops */ 3532/// \brief Moves integer data from a 256-bit integer vector to a 32-byte 3533/// aligned memory location. To minimize caching, the data is flagged as 3534/// non-temporal (unlikely to be used again soon). 
3535/// 3536/// \headerfile <x86intrin.h> 3537/// 3538/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction. 3539/// 3540/// \param __a 3541/// A pointer to a 32-byte aligned memory location that will receive the 3542/// integer values. 3543/// \param __b 3544/// A 256-bit integer vector containing the values to be moved. 3545static __inline void __DEFAULT_FN_ATTRS 3546_mm256_stream_si256(__m256i *__a, __m256i __b) 3547{ 3548 __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a); 3549} 3550 3551/// \brief Moves double-precision values from a 256-bit vector of [4 x double] 3552/// to a 32-byte aligned memory location. To minimize caching, the data is 3553/// flagged as non-temporal (unlikely to be used again soon). 3554/// 3555/// \headerfile <x86intrin.h> 3556/// 3557/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction. 3558/// 3559/// \param __a 3560/// A pointer to a 32-byte aligned memory location that will receive the 3561/// integer values. 3562/// \param __b 3563/// A 256-bit vector of [4 x double] containing the values to be moved. 3564static __inline void __DEFAULT_FN_ATTRS 3565_mm256_stream_pd(double *__a, __m256d __b) 3566{ 3567 __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a); 3568} 3569 3570/// \brief Moves single-precision floating point values from a 256-bit vector 3571/// of [8 x float] to a 32-byte aligned memory location. To minimize 3572/// caching, the data is flagged as non-temporal (unlikely to be used again 3573/// soon). 3574/// 3575/// \headerfile <x86intrin.h> 3576/// 3577/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction. 3578/// 3579/// \param __p 3580/// A pointer to a 32-byte aligned memory location that will receive the 3581/// single-precision floating point values. 3582/// \param __a 3583/// A 256-bit vector of [8 x float] containing the values to be moved. 
3584static __inline void __DEFAULT_FN_ATTRS 3585_mm256_stream_ps(float *__p, __m256 __a) 3586{ 3587 __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p); 3588} 3589 3590/* Create vectors */ 3591/// \brief Create a 256-bit vector of [4 x double] with undefined values. 3592/// 3593/// \headerfile <x86intrin.h> 3594/// 3595/// This intrinsic has no corresponding instruction. 3596/// 3597/// \returns A 256-bit vector of [4 x double] containing undefined values. 3598static __inline__ __m256d __DEFAULT_FN_ATTRS 3599_mm256_undefined_pd(void) 3600{ 3601 return (__m256d)__builtin_ia32_undef256(); 3602} 3603 3604/// \brief Create a 256-bit vector of [8 x float] with undefined values. 3605/// 3606/// \headerfile <x86intrin.h> 3607/// 3608/// This intrinsic has no corresponding instruction. 3609/// 3610/// \returns A 256-bit vector of [8 x float] containing undefined values. 3611static __inline__ __m256 __DEFAULT_FN_ATTRS 3612_mm256_undefined_ps(void) 3613{ 3614 return (__m256)__builtin_ia32_undef256(); 3615} 3616 3617/// \brief Create a 256-bit integer vector with undefined values. 3618/// 3619/// \headerfile <x86intrin.h> 3620/// 3621/// This intrinsic has no corresponding instruction. 3622/// 3623/// \returns A 256-bit integer vector containing undefined values. 3624static __inline__ __m256i __DEFAULT_FN_ATTRS 3625_mm256_undefined_si256(void) 3626{ 3627 return (__m256i)__builtin_ia32_undef256(); 3628} 3629 3630/// \brief Constructs a 256-bit floating-point vector of [4 x double] 3631/// initialized with the specified double-precision floating-point values. 3632/// 3633/// \headerfile <x86intrin.h> 3634/// 3635/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3636/// instruction. 3637/// 3638/// \param __a 3639/// A double-precision floating-point value used to initialize bits [255:192] 3640/// of the result. 3641/// \param __b 3642/// A double-precision floating-point value used to initialize bits [191:128] 3643/// of the result. 
3644/// \param __c 3645/// A double-precision floating-point value used to initialize bits [127:64] 3646/// of the result. 3647/// \param __d 3648/// A double-precision floating-point value used to initialize bits [63:0] 3649/// of the result. 3650/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3651static __inline __m256d __DEFAULT_FN_ATTRS 3652_mm256_set_pd(double __a, double __b, double __c, double __d) 3653{ 3654 return (__m256d){ __d, __c, __b, __a }; 3655} 3656 3657/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized 3658/// with the specified single-precision floating-point values. 3659/// 3660/// \headerfile <x86intrin.h> 3661/// 3662/// This intrinsic is a utility function and does not correspond to a specific 3663/// instruction. 3664/// 3665/// \param __a 3666/// A single-precision floating-point value used to initialize bits [255:224] 3667/// of the result. 3668/// \param __b 3669/// A single-precision floating-point value used to initialize bits [223:192] 3670/// of the result. 3671/// \param __c 3672/// A single-precision floating-point value used to initialize bits [191:160] 3673/// of the result. 3674/// \param __d 3675/// A single-precision floating-point value used to initialize bits [159:128] 3676/// of the result. 3677/// \param __e 3678/// A single-precision floating-point value used to initialize bits [127:96] 3679/// of the result. 3680/// \param __f 3681/// A single-precision floating-point value used to initialize bits [95:64] 3682/// of the result. 3683/// \param __g 3684/// A single-precision floating-point value used to initialize bits [63:32] 3685/// of the result. 3686/// \param __h 3687/// A single-precision floating-point value used to initialize bits [31:0] 3688/// of the result. 3689/// \returns An initialized 256-bit floating-point vector of [8 x float]. 
3690static __inline __m256 __DEFAULT_FN_ATTRS 3691_mm256_set_ps(float __a, float __b, float __c, float __d, 3692 float __e, float __f, float __g, float __h) 3693{ 3694 return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; 3695} 3696 3697/// \brief Constructs a 256-bit integer vector initialized with the specified 3698/// 32-bit integral values. 3699/// 3700/// \headerfile <x86intrin.h> 3701/// 3702/// This intrinsic is a utility function and does not correspond to a specific 3703/// instruction. 3704/// 3705/// \param __i0 3706/// A 32-bit integral value used to initialize bits [255:224] of the result. 3707/// \param __i1 3708/// A 32-bit integral value used to initialize bits [223:192] of the result. 3709/// \param __i2 3710/// A 32-bit integral value used to initialize bits [191:160] of the result. 3711/// \param __i3 3712/// A 32-bit integral value used to initialize bits [159:128] of the result. 3713/// \param __i4 3714/// A 32-bit integral value used to initialize bits [127:96] of the result. 3715/// \param __i5 3716/// A 32-bit integral value used to initialize bits [95:64] of the result. 3717/// \param __i6 3718/// A 32-bit integral value used to initialize bits [63:32] of the result. 3719/// \param __i7 3720/// A 32-bit integral value used to initialize bits [31:0] of the result. 3721/// \returns An initialized 256-bit integer vector. 3722static __inline __m256i __DEFAULT_FN_ATTRS 3723_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, 3724 int __i4, int __i5, int __i6, int __i7) 3725{ 3726 return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; 3727} 3728 3729/// \brief Constructs a 256-bit integer vector initialized with the specified 3730/// 16-bit integral values. 3731/// 3732/// \headerfile <x86intrin.h> 3733/// 3734/// This intrinsic is a utility function and does not correspond to a specific 3735/// instruction. 3736/// 3737/// \param __w15 3738/// A 16-bit integral value used to initialize bits [255:240] of the result. 
/// \param __w14
///    A 16-bit integral value used to initialize bits [239:224] of the result.
/// \param __w13
///    A 16-bit integral value used to initialize bits [223:208] of the result.
/// \param __w12
///    A 16-bit integral value used to initialize bits [207:192] of the result.
/// \param __w11
///    A 16-bit integral value used to initialize bits [191:176] of the result.
/// \param __w10
///    A 16-bit integral value used to initialize bits [175:160] of the result.
/// \param __w09
///    A 16-bit integral value used to initialize bits [159:144] of the result.
/// \param __w08
///    A 16-bit integral value used to initialize bits [143:128] of the result.
/// \param __w07
///    A 16-bit integral value used to initialize bits [127:112] of the result.
/// \param __w06
///    A 16-bit integral value used to initialize bits [111:96] of the result.
/// \param __w05
///    A 16-bit integral value used to initialize bits [95:80] of the result.
/// \param __w04
///    A 16-bit integral value used to initialize bits [79:64] of the result.
/// \param __w03
///    A 16-bit integral value used to initialize bits [63:48] of the result.
/// \param __w02
///    A 16-bit integral value used to initialize bits [47:32] of the result.
/// \param __w01
///    A 16-bit integral value used to initialize bits [31:16] of the result.
/// \param __w00
///    A 16-bit integral value used to initialize bits [15:0] of the result.
/// \returns An initialized 256-bit integer vector.
3770static __inline __m256i __DEFAULT_FN_ATTRS 3771_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, 3772 short __w11, short __w10, short __w09, short __w08, 3773 short __w07, short __w06, short __w05, short __w04, 3774 short __w03, short __w02, short __w01, short __w00) 3775{ 3776 return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, 3777 __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; 3778} 3779 3780/// \brief Constructs a 256-bit integer vector initialized with the specified 3781/// 8-bit integral values. 3782/// 3783/// \headerfile <x86intrin.h> 3784/// 3785/// This intrinsic is a utility function and does not correspond to a specific 3786/// instruction. 3787/// 3788/// \param __b31 3789/// An 8-bit integral value used to initialize bits [255:248] of the result. 3790/// \param __b30 3791/// An 8-bit integral value used to initialize bits [247:240] of the result. 3792/// \param __b29 3793/// An 8-bit integral value used to initialize bits [239:232] of the result. 3794/// \param __b28 3795/// An 8-bit integral value used to initialize bits [231:224] of the result. 3796/// \param __b27 3797/// An 8-bit integral value used to initialize bits [223:216] of the result. 3798/// \param __b26 3799/// An 8-bit integral value used to initialize bits [215:208] of the result. 3800/// \param __b25 3801/// An 8-bit integral value used to initialize bits [207:200] of the result. 3802/// \param __b24 3803/// An 8-bit integral value used to initialize bits [199:192] of the result. 3804/// \param __b23 3805/// An 8-bit integral value used to initialize bits [191:184] of the result. 3806/// \param __b22 3807/// An 8-bit integral value used to initialize bits [183:176] of the result. 3808/// \param __b21 3809/// An 8-bit integral value used to initialize bits [175:168] of the result. 3810/// \param __b20 3811/// An 8-bit integral value used to initialize bits [167:160] of the result. 
/// \param __b19
///    An 8-bit integral value used to initialize bits [159:152] of the result.
/// \param __b18
///    An 8-bit integral value used to initialize bits [151:144] of the result.
/// \param __b17
///    An 8-bit integral value used to initialize bits [143:136] of the result.
/// \param __b16
///    An 8-bit integral value used to initialize bits [135:128] of the result.
/// \param __b15
///    An 8-bit integral value used to initialize bits [127:120] of the result.
/// \param __b14
///    An 8-bit integral value used to initialize bits [119:112] of the result.
/// \param __b13
///    An 8-bit integral value used to initialize bits [111:104] of the result.
/// \param __b12
///    An 8-bit integral value used to initialize bits [103:96] of the result.
/// \param __b11
///    An 8-bit integral value used to initialize bits [95:88] of the result.
/// \param __b10
///    An 8-bit integral value used to initialize bits [87:80] of the result.
/// \param __b09
///    An 8-bit integral value used to initialize bits [79:72] of the result.
/// \param __b08
///    An 8-bit integral value used to initialize bits [71:64] of the result.
/// \param __b07
///    An 8-bit integral value used to initialize bits [63:56] of the result.
/// \param __b06
///    An 8-bit integral value used to initialize bits [55:48] of the result.
/// \param __b05
///    An 8-bit integral value used to initialize bits [47:40] of the result.
/// \param __b04
///    An 8-bit integral value used to initialize bits [39:32] of the result.
/// \param __b03
///    An 8-bit integral value used to initialize bits [31:24] of the result.
/// \param __b02
///    An 8-bit integral value used to initialize bits [23:16] of the result.
/// \param __b01
///    An 8-bit integral value used to initialize bits [15:8] of the result.
3850/// \param __b00 3851/// An 8-bit integral value used to initialize bits [7:0] of the result. 3852/// \returns An initialized 256-bit integer vector. 3853static __inline __m256i __DEFAULT_FN_ATTRS 3854_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, 3855 char __b27, char __b26, char __b25, char __b24, 3856 char __b23, char __b22, char __b21, char __b20, 3857 char __b19, char __b18, char __b17, char __b16, 3858 char __b15, char __b14, char __b13, char __b12, 3859 char __b11, char __b10, char __b09, char __b08, 3860 char __b07, char __b06, char __b05, char __b04, 3861 char __b03, char __b02, char __b01, char __b00) 3862{ 3863 return (__m256i)(__v32qi){ 3864 __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, 3865 __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, 3866 __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, 3867 __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 3868 }; 3869} 3870 3871/// \brief Constructs a 256-bit integer vector initialized with the specified 3872/// 64-bit integral values. 3873/// 3874/// \headerfile <x86intrin.h> 3875/// 3876/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 3877/// instruction. 3878/// 3879/// \param __a 3880/// A 64-bit integral value used to initialize bits [255:192] of the result. 3881/// \param __b 3882/// A 64-bit integral value used to initialize bits [191:128] of the result. 3883/// \param __c 3884/// A 64-bit integral value used to initialize bits [127:64] of the result. 3885/// \param __d 3886/// A 64-bit integral value used to initialize bits [63:0] of the result. 3887/// \returns An initialized 256-bit integer vector. 
3888static __inline __m256i __DEFAULT_FN_ATTRS 3889_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) 3890{ 3891 return (__m256i)(__v4di){ __d, __c, __b, __a }; 3892} 3893 3894/* Create vectors with elements in reverse order */ 3895/// \brief Constructs a 256-bit floating-point vector of [4 x double], 3896/// initialized in reverse order with the specified double-precision 3897/// floating-point values. 3898/// 3899/// \headerfile <x86intrin.h> 3900/// 3901/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c> 3902/// instruction. 3903/// 3904/// \param __a 3905/// A double-precision floating-point value used to initialize bits [63:0] 3906/// of the result. 3907/// \param __b 3908/// A double-precision floating-point value used to initialize bits [127:64] 3909/// of the result. 3910/// \param __c 3911/// A double-precision floating-point value used to initialize bits [191:128] 3912/// of the result. 3913/// \param __d 3914/// A double-precision floating-point value used to initialize bits [255:192] 3915/// of the result. 3916/// \returns An initialized 256-bit floating-point vector of [4 x double]. 3917static __inline __m256d __DEFAULT_FN_ATTRS 3918_mm256_setr_pd(double __a, double __b, double __c, double __d) 3919{ 3920 return (__m256d){ __a, __b, __c, __d }; 3921} 3922 3923/// \brief Constructs a 256-bit floating-point vector of [8 x float], 3924/// initialized in reverse order with the specified single-precision 3925/// float-point values. 3926/// 3927/// \headerfile <x86intrin.h> 3928/// 3929/// This intrinsic is a utility function and does not correspond to a specific 3930/// instruction. 3931/// 3932/// \param __a 3933/// A single-precision floating-point value used to initialize bits [31:0] 3934/// of the result. 3935/// \param __b 3936/// A single-precision floating-point value used to initialize bits [63:32] 3937/// of the result. 
3938/// \param __c 3939/// A single-precision floating-point value used to initialize bits [95:64] 3940/// of the result. 3941/// \param __d 3942/// A single-precision floating-point value used to initialize bits [127:96] 3943/// of the result. 3944/// \param __e 3945/// A single-precision floating-point value used to initialize bits [159:128] 3946/// of the result. 3947/// \param __f 3948/// A single-precision floating-point value used to initialize bits [191:160] 3949/// of the result. 3950/// \param __g 3951/// A single-precision floating-point value used to initialize bits [223:192] 3952/// of the result. 3953/// \param __h 3954/// A single-precision floating-point value used to initialize bits [255:224] 3955/// of the result. 3956/// \returns An initialized 256-bit floating-point vector of [8 x float]. 3957static __inline __m256 __DEFAULT_FN_ATTRS 3958_mm256_setr_ps(float __a, float __b, float __c, float __d, 3959 float __e, float __f, float __g, float __h) 3960{ 3961 return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; 3962} 3963 3964/// \brief Constructs a 256-bit integer vector, initialized in reverse order 3965/// with the specified 32-bit integral values. 3966/// 3967/// \headerfile <x86intrin.h> 3968/// 3969/// This intrinsic is a utility function and does not correspond to a specific 3970/// instruction. 3971/// 3972/// \param __i0 3973/// A 32-bit integral value used to initialize bits [31:0] of the result. 3974/// \param __i1 3975/// A 32-bit integral value used to initialize bits [63:32] of the result. 3976/// \param __i2 3977/// A 32-bit integral value used to initialize bits [95:64] of the result. 3978/// \param __i3 3979/// A 32-bit integral value used to initialize bits [127:96] of the result. 3980/// \param __i4 3981/// A 32-bit integral value used to initialize bits [159:128] of the result. 3982/// \param __i5 3983/// A 32-bit integral value used to initialize bits [191:160] of the result. 
3984/// \param __i6 3985/// A 32-bit integral value used to initialize bits [223:192] of the result. 3986/// \param __i7 3987/// A 32-bit integral value used to initialize bits [255:224] of the result. 3988/// \returns An initialized 256-bit integer vector. 3989static __inline __m256i __DEFAULT_FN_ATTRS 3990_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, 3991 int __i4, int __i5, int __i6, int __i7) 3992{ 3993 return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; 3994} 3995 3996/// \brief Constructs a 256-bit integer vector, initialized in reverse order 3997/// with the specified 16-bit integral values. 3998/// 3999/// \headerfile <x86intrin.h> 4000/// 4001/// This intrinsic is a utility function and does not correspond to a specific 4002/// instruction. 4003/// 4004/// \param __w15 4005/// A 16-bit integral value used to initialize bits [15:0] of the result. 4006/// \param __w14 4007/// A 16-bit integral value used to initialize bits [31:16] of the result. 4008/// \param __w13 4009/// A 16-bit integral value used to initialize bits [47:32] of the result. 4010/// \param __w12 4011/// A 16-bit integral value used to initialize bits [63:48] of the result. 4012/// \param __w11 4013/// A 16-bit integral value used to initialize bits [79:64] of the result. 4014/// \param __w10 4015/// A 16-bit integral value used to initialize bits [95:80] of the result. 4016/// \param __w09 4017/// A 16-bit integral value used to initialize bits [111:96] of the result. 4018/// \param __w08 4019/// A 16-bit integral value used to initialize bits [127:112] of the result. 4020/// \param __w07 4021/// A 16-bit integral value used to initialize bits [143:128] of the result. 4022/// \param __w06 4023/// A 16-bit integral value used to initialize bits [159:144] of the result. 4024/// \param __w05 4025/// A 16-bit integral value used to initialize bits [175:160] of the result. 
4026/// \param __w04 4027/// A 16-bit integral value used to initialize bits [191:176] of the result. 4028/// \param __w03 4029/// A 16-bit integral value used to initialize bits [207:192] of the result. 4030/// \param __w02 4031/// A 16-bit integral value used to initialize bits [223:208] of the result. 4032/// \param __w01 4033/// A 16-bit integral value used to initialize bits [239:224] of the result. 4034/// \param __w00 4035/// A 16-bit integral value used to initialize bits [255:240] of the result. 4036/// \returns An initialized 256-bit integer vector. 4037static __inline __m256i __DEFAULT_FN_ATTRS 4038_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, 4039 short __w11, short __w10, short __w09, short __w08, 4040 short __w07, short __w06, short __w05, short __w04, 4041 short __w03, short __w02, short __w01, short __w00) 4042{ 4043 return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, 4044 __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; 4045} 4046 4047/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4048/// with the specified 8-bit integral values. 4049/// 4050/// \headerfile <x86intrin.h> 4051/// 4052/// This intrinsic is a utility function and does not correspond to a specific 4053/// instruction. 4054/// 4055/// \param __b31 4056/// An 8-bit integral value used to initialize bits [7:0] of the result. 4057/// \param __b30 4058/// An 8-bit integral value used to initialize bits [15:8] of the result. 4059/// \param __b29 4060/// An 8-bit integral value used to initialize bits [23:16] of the result. 4061/// \param __b28 4062/// An 8-bit integral value used to initialize bits [31:24] of the result. 4063/// \param __b27 4064/// An 8-bit integral value used to initialize bits [39:32] of the result. 4065/// \param __b26 4066/// An 8-bit integral value used to initialize bits [47:40] of the result. 
4067/// \param __b25 4068/// An 8-bit integral value used to initialize bits [55:48] of the result. 4069/// \param __b24 4070/// An 8-bit integral value used to initialize bits [63:56] of the result. 4071/// \param __b23 4072/// An 8-bit integral value used to initialize bits [71:64] of the result. 4073/// \param __b22 4074/// An 8-bit integral value used to initialize bits [79:72] of the result. 4075/// \param __b21 4076/// An 8-bit integral value used to initialize bits [87:80] of the result. 4077/// \param __b20 4078/// An 8-bit integral value used to initialize bits [95:88] of the result. 4079/// \param __b19 4080/// An 8-bit integral value used to initialize bits [103:96] of the result. 4081/// \param __b18 4082/// An 8-bit integral value used to initialize bits [111:104] of the result. 4083/// \param __b17 4084/// An 8-bit integral value used to initialize bits [119:112] of the result. 4085/// \param __b16 4086/// An 8-bit integral value used to initialize bits [127:120] of the result. 4087/// \param __b15 4088/// An 8-bit integral value used to initialize bits [135:128] of the result. 4089/// \param __b14 4090/// An 8-bit integral value used to initialize bits [143:136] of the result. 4091/// \param __b13 4092/// An 8-bit integral value used to initialize bits [151:144] of the result. 4093/// \param __b12 4094/// An 8-bit integral value used to initialize bits [159:152] of the result. 4095/// \param __b11 4096/// An 8-bit integral value used to initialize bits [167:160] of the result. 4097/// \param __b10 4098/// An 8-bit integral value used to initialize bits [175:168] of the result. 4099/// \param __b09 4100/// An 8-bit integral value used to initialize bits [183:176] of the result. 4101/// \param __b08 4102/// An 8-bit integral value used to initialize bits [191:184] of the result. 4103/// \param __b07 4104/// An 8-bit integral value used to initialize bits [199:192] of the result. 
4105/// \param __b06 4106/// An 8-bit integral value used to initialize bits [207:200] of the result. 4107/// \param __b05 4108/// An 8-bit integral value used to initialize bits [215:208] of the result. 4109/// \param __b04 4110/// An 8-bit integral value used to initialize bits [223:216] of the result. 4111/// \param __b03 4112/// An 8-bit integral value used to initialize bits [231:224] of the result. 4113/// \param __b02 4114/// An 8-bit integral value used to initialize bits [239:232] of the result. 4115/// \param __b01 4116/// An 8-bit integral value used to initialize bits [247:240] of the result. 4117/// \param __b00 4118/// An 8-bit integral value used to initialize bits [255:248] of the result. 4119/// \returns An initialized 256-bit integer vector. 4120static __inline __m256i __DEFAULT_FN_ATTRS 4121_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, 4122 char __b27, char __b26, char __b25, char __b24, 4123 char __b23, char __b22, char __b21, char __b20, 4124 char __b19, char __b18, char __b17, char __b16, 4125 char __b15, char __b14, char __b13, char __b12, 4126 char __b11, char __b10, char __b09, char __b08, 4127 char __b07, char __b06, char __b05, char __b04, 4128 char __b03, char __b02, char __b01, char __b00) 4129{ 4130 return (__m256i)(__v32qi){ 4131 __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, 4132 __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, 4133 __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, 4134 __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; 4135} 4136 4137/// \brief Constructs a 256-bit integer vector, initialized in reverse order 4138/// with the specified 64-bit integral values. 4139/// 4140/// \headerfile <x86intrin.h> 4141/// 4142/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c> 4143/// instruction. 4144/// 4145/// \param __a 4146/// A 64-bit integral value used to initialize bits [63:0] of the result. 
4147/// \param __b 4148/// A 64-bit integral value used to initialize bits [127:64] of the result. 4149/// \param __c 4150/// A 64-bit integral value used to initialize bits [191:128] of the result. 4151/// \param __d 4152/// A 64-bit integral value used to initialize bits [255:192] of the result. 4153/// \returns An initialized 256-bit integer vector. 4154static __inline __m256i __DEFAULT_FN_ATTRS 4155_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) 4156{ 4157 return (__m256i)(__v4di){ __a, __b, __c, __d }; 4158} 4159 4160/* Create vectors with repeated elements */ 4161/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each 4162/// of the four double-precision floating-point vector elements set to the 4163/// specified double-precision floating-point value. 4164/// 4165/// \headerfile <x86intrin.h> 4166/// 4167/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4168/// 4169/// \param __w 4170/// A double-precision floating-point value used to initialize each vector 4171/// element of the result. 4172/// \returns An initialized 256-bit floating-point vector of [4 x double]. 4173static __inline __m256d __DEFAULT_FN_ATTRS 4174_mm256_set1_pd(double __w) 4175{ 4176 return (__m256d){ __w, __w, __w, __w }; 4177} 4178 4179/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each 4180/// of the eight single-precision floating-point vector elements set to the 4181/// specified single-precision floating-point value. 4182/// 4183/// \headerfile <x86intrin.h> 4184/// 4185/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4186/// instruction. 4187/// 4188/// \param __w 4189/// A single-precision floating-point value used to initialize each vector 4190/// element of the result. 4191/// \returns An initialized 256-bit floating-point vector of [8 x float]. 
4192static __inline __m256 __DEFAULT_FN_ATTRS 4193_mm256_set1_ps(float __w) 4194{ 4195 return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; 4196} 4197 4198/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the 4199/// 32-bit integral vector elements set to the specified 32-bit integral 4200/// value. 4201/// 4202/// \headerfile <x86intrin.h> 4203/// 4204/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c> 4205/// instruction. 4206/// 4207/// \param __i 4208/// A 32-bit integral value used to initialize each vector element of the 4209/// result. 4210/// \returns An initialized 256-bit integer vector of [8 x i32]. 4211static __inline __m256i __DEFAULT_FN_ATTRS 4212_mm256_set1_epi32(int __i) 4213{ 4214 return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; 4215} 4216 4217/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the 4218/// 16-bit integral vector elements set to the specified 16-bit integral 4219/// value. 4220/// 4221/// \headerfile <x86intrin.h> 4222/// 4223/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4224/// 4225/// \param __w 4226/// A 16-bit integral value used to initialize each vector element of the 4227/// result. 4228/// \returns An initialized 256-bit integer vector of [16 x i16]. 4229static __inline __m256i __DEFAULT_FN_ATTRS 4230_mm256_set1_epi16(short __w) 4231{ 4232 return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, 4233 __w, __w, __w, __w, __w, __w }; 4234} 4235 4236/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the 4237/// 8-bit integral vector elements set to the specified 8-bit integral value. 4238/// 4239/// \headerfile <x86intrin.h> 4240/// 4241/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction. 4242/// 4243/// \param __b 4244/// An 8-bit integral value used to initialize each vector element of the 4245/// result. 
4246/// \returns An initialized 256-bit integer vector of [32 x i8]. 4247static __inline __m256i __DEFAULT_FN_ATTRS 4248_mm256_set1_epi8(char __b) 4249{ 4250 return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4251 __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 4252 __b, __b, __b, __b, __b, __b, __b }; 4253} 4254 4255/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the 4256/// 64-bit integral vector elements set to the specified 64-bit integral 4257/// value. 4258/// 4259/// \headerfile <x86intrin.h> 4260/// 4261/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction. 4262/// 4263/// \param __q 4264/// A 64-bit integral value used to initialize each vector element of the 4265/// result. 4266/// \returns An initialized 256-bit integer vector of [4 x i64]. 4267static __inline __m256i __DEFAULT_FN_ATTRS 4268_mm256_set1_epi64x(long long __q) 4269{ 4270 return (__m256i)(__v4di){ __q, __q, __q, __q }; 4271} 4272 4273/* Create __zeroed vectors */ 4274/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all 4275/// vector elements initialized to zero. 4276/// 4277/// \headerfile <x86intrin.h> 4278/// 4279/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4280/// 4281/// \returns A 256-bit vector of [4 x double] with all elements set to zero. 4282static __inline __m256d __DEFAULT_FN_ATTRS 4283_mm256_setzero_pd(void) 4284{ 4285 return (__m256d){ 0, 0, 0, 0 }; 4286} 4287 4288/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all 4289/// vector elements initialized to zero. 4290/// 4291/// \headerfile <x86intrin.h> 4292/// 4293/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4294/// 4295/// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
4296static __inline __m256 __DEFAULT_FN_ATTRS 4297_mm256_setzero_ps(void) 4298{ 4299 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 4300} 4301 4302/// \brief Constructs a 256-bit integer vector initialized to zero. 4303/// 4304/// \headerfile <x86intrin.h> 4305/// 4306/// This intrinsic corresponds to the <c> VXORPS </c> instruction. 4307/// 4308/// \returns A 256-bit integer vector initialized to zero. 4309static __inline __m256i __DEFAULT_FN_ATTRS 4310_mm256_setzero_si256(void) 4311{ 4312 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 4313} 4314 4315/* Cast between vector types */ 4316/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4317/// floating-point vector of [8 x float]. 4318/// 4319/// \headerfile <x86intrin.h> 4320/// 4321/// This intrinsic has no corresponding instruction. 4322/// 4323/// \param __a 4324/// A 256-bit floating-point vector of [4 x double]. 4325/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4326/// bitwise pattern as the parameter. 4327static __inline __m256 __DEFAULT_FN_ATTRS 4328_mm256_castpd_ps(__m256d __a) 4329{ 4330 return (__m256)__a; 4331} 4332 4333/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit 4334/// integer vector. 4335/// 4336/// \headerfile <x86intrin.h> 4337/// 4338/// This intrinsic has no corresponding instruction. 4339/// 4340/// \param __a 4341/// A 256-bit floating-point vector of [4 x double]. 4342/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4343/// parameter. 4344static __inline __m256i __DEFAULT_FN_ATTRS 4345_mm256_castpd_si256(__m256d __a) 4346{ 4347 return (__m256i)__a; 4348} 4349 4350/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4351/// floating-point vector of [4 x double]. 4352/// 4353/// \headerfile <x86intrin.h> 4354/// 4355/// This intrinsic has no corresponding instruction. 4356/// 4357/// \param __a 4358/// A 256-bit floating-point vector of [8 x float]. 
4359/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4360/// bitwise pattern as the parameter. 4361static __inline __m256d __DEFAULT_FN_ATTRS 4362_mm256_castps_pd(__m256 __a) 4363{ 4364 return (__m256d)__a; 4365} 4366 4367/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit 4368/// integer vector. 4369/// 4370/// \headerfile <x86intrin.h> 4371/// 4372/// This intrinsic has no corresponding instruction. 4373/// 4374/// \param __a 4375/// A 256-bit floating-point vector of [8 x float]. 4376/// \returns A 256-bit integer vector containing the same bitwise pattern as the 4377/// parameter. 4378static __inline __m256i __DEFAULT_FN_ATTRS 4379_mm256_castps_si256(__m256 __a) 4380{ 4381 return (__m256i)__a; 4382} 4383 4384/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4385/// of [8 x float]. 4386/// 4387/// \headerfile <x86intrin.h> 4388/// 4389/// This intrinsic has no corresponding instruction. 4390/// 4391/// \param __a 4392/// A 256-bit integer vector. 4393/// \returns A 256-bit floating-point vector of [8 x float] containing the same 4394/// bitwise pattern as the parameter. 4395static __inline __m256 __DEFAULT_FN_ATTRS 4396_mm256_castsi256_ps(__m256i __a) 4397{ 4398 return (__m256)__a; 4399} 4400 4401/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector 4402/// of [4 x double]. 4403/// 4404/// \headerfile <x86intrin.h> 4405/// 4406/// This intrinsic has no corresponding instruction. 4407/// 4408/// \param __a 4409/// A 256-bit integer vector. 4410/// \returns A 256-bit floating-point vector of [4 x double] containing the same 4411/// bitwise pattern as the parameter. 4412static __inline __m256d __DEFAULT_FN_ATTRS 4413_mm256_castsi256_pd(__m256i __a) 4414{ 4415 return (__m256d)__a; 4416} 4417 4418/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4419/// [4 x double] as a 128-bit floating-point vector of [2 x double]. 
4420/// 4421/// \headerfile <x86intrin.h> 4422/// 4423/// This intrinsic has no corresponding instruction. 4424/// 4425/// \param __a 4426/// A 256-bit floating-point vector of [4 x double]. 4427/// \returns A 128-bit floating-point vector of [2 x double] containing the 4428/// lower 128 bits of the parameter. 4429static __inline __m128d __DEFAULT_FN_ATTRS 4430_mm256_castpd256_pd128(__m256d __a) 4431{ 4432 return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); 4433} 4434 4435/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of 4436/// [8 x float] as a 128-bit floating-point vector of [4 x float]. 4437/// 4438/// \headerfile <x86intrin.h> 4439/// 4440/// This intrinsic has no corresponding instruction. 4441/// 4442/// \param __a 4443/// A 256-bit floating-point vector of [8 x float]. 4444/// \returns A 128-bit floating-point vector of [4 x float] containing the 4445/// lower 128 bits of the parameter. 4446static __inline __m128 __DEFAULT_FN_ATTRS 4447_mm256_castps256_ps128(__m256 __a) 4448{ 4449 return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); 4450} 4451 4452/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. 4453/// 4454/// \headerfile <x86intrin.h> 4455/// 4456/// This intrinsic has no corresponding instruction. 4457/// 4458/// \param __a 4459/// A 256-bit integer vector. 4460/// \returns A 128-bit integer vector containing the lower 128 bits of the 4461/// parameter. 4462static __inline __m128i __DEFAULT_FN_ATTRS 4463_mm256_castsi256_si128(__m256i __a) 4464{ 4465 return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); 4466} 4467 4468/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a 4469/// 128-bit floating-point vector of [2 x double]. The lower 128 bits 4470/// contain the value of the source vector. The contents of the upper 128 4471/// bits are undefined. 
4472/// 4473/// \headerfile <x86intrin.h> 4474/// 4475/// This intrinsic has no corresponding instruction. 4476/// 4477/// \param __a 4478/// A 128-bit vector of [2 x double]. 4479/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits 4480/// contain the value of the parameter. The contents of the upper 128 bits 4481/// are undefined. 4482static __inline __m256d __DEFAULT_FN_ATTRS 4483_mm256_castpd128_pd256(__m128d __a) 4484{ 4485 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); 4486} 4487 4488/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a 4489/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain 4490/// the value of the source vector. The contents of the upper 128 bits are 4491/// undefined. 4492/// 4493/// \headerfile <x86intrin.h> 4494/// 4495/// This intrinsic has no corresponding instruction. 4496/// 4497/// \param __a 4498/// A 128-bit vector of [4 x float]. 4499/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits 4500/// contain the value of the parameter. The contents of the upper 128 bits 4501/// are undefined. 4502static __inline __m256 __DEFAULT_FN_ATTRS 4503_mm256_castps128_ps256(__m128 __a) 4504{ 4505 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); 4506} 4507 4508/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. 4509/// The lower 128 bits contain the value of the source vector. The contents 4510/// of the upper 128 bits are undefined. 4511/// 4512/// \headerfile <x86intrin.h> 4513/// 4514/// This intrinsic has no corresponding instruction. 4515/// 4516/// \param __a 4517/// A 128-bit integer vector. 4518/// \returns A 256-bit integer vector. The lower 128 bits contain the value of 4519/// the parameter. The contents of the upper 128 bits are undefined. 
4520static __inline __m256i __DEFAULT_FN_ATTRS 4521_mm256_castsi128_si256(__m128i __a) 4522{ 4523 return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); 4524} 4525 4526/* 4527 Vector insert. 4528 We use macros rather than inlines because we only want to accept 4529 invocations where the immediate M is a constant expression. 4530*/ 4531/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating 4532/// a 256-bit vector of [8 x float] given in the first parameter, and then 4533/// replacing either the upper or the lower 128 bits with the contents of a 4534/// 128-bit vector of [4 x float] in the second parameter. The immediate 4535/// integer parameter determines between the upper or the lower 128 bits. 4536/// 4537/// \headerfile <x86intrin.h> 4538/// 4539/// \code 4540/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); 4541/// \endcode 4542/// 4543/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4544/// 4545/// \param V1 4546/// A 256-bit vector of [8 x float]. This vector is copied to the result 4547/// first, and then either the upper or the lower 128 bits of the result will 4548/// be replaced by the contents of \a V2. 4549/// \param V2 4550/// A 128-bit vector of [4 x float]. The contents of this parameter are 4551/// written to either the upper or the lower 128 bits of the result depending 4552/// on the value of parameter \a M. 4553/// \param M 4554/// An immediate integer. The least significant bit determines how the values 4555/// from the two parameters are interleaved: \n 4556/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4557/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4558/// result. \n 4559/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4560/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4561/// result. 
4562/// \returns A 256-bit vector of [8 x float] containing the interleaved values. 4563#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ 4564 (__m256)__builtin_shufflevector( \ 4565 (__v8sf)(__m256)(V1), \ 4566 (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ 4567 (((M) & 1) ? 0 : 8), \ 4568 (((M) & 1) ? 1 : 9), \ 4569 (((M) & 1) ? 2 : 10), \ 4570 (((M) & 1) ? 3 : 11), \ 4571 (((M) & 1) ? 8 : 4), \ 4572 (((M) & 1) ? 9 : 5), \ 4573 (((M) & 1) ? 10 : 6), \ 4574 (((M) & 1) ? 11 : 7) );}) 4575 4576/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating 4577/// a 256-bit vector of [4 x double] given in the first parameter, and then 4578/// replacing either the upper or the lower 128 bits with the contents of a 4579/// 128-bit vector of [2 x double] in the second parameter. The immediate 4580/// integer parameter determines between the upper or the lower 128 bits. 4581/// 4582/// \headerfile <x86intrin.h> 4583/// 4584/// \code 4585/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M); 4586/// \endcode 4587/// 4588/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4589/// 4590/// \param V1 4591/// A 256-bit vector of [4 x double]. This vector is copied to the result 4592/// first, and then either the upper or the lower 128 bits of the result will 4593/// be replaced by the contents of \a V2. 4594/// \param V2 4595/// A 128-bit vector of [2 x double]. The contents of this parameter are 4596/// written to either the upper or the lower 128 bits of the result depending 4597/// on the value of parameter \a M. 4598/// \param M 4599/// An immediate integer. The least significant bit determines how the values 4600/// from the two parameters are interleaved: \n 4601/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4602/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4603/// result. 
\n 4604/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the 4605/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the 4606/// result. 4607/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 4608#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ 4609 (__m256d)__builtin_shufflevector( \ 4610 (__v4df)(__m256d)(V1), \ 4611 (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ 4612 (((M) & 1) ? 0 : 4), \ 4613 (((M) & 1) ? 1 : 5), \ 4614 (((M) & 1) ? 4 : 2), \ 4615 (((M) & 1) ? 5 : 3) );}) 4616 4617/// \brief Constructs a new 256-bit integer vector by first duplicating a 4618/// 256-bit integer vector given in the first parameter, and then replacing 4619/// either the upper or the lower 128 bits with the contents of a 128-bit 4620/// integer vector in the second parameter. The immediate integer parameter 4621/// determines between the upper or the lower 128 bits. 4622/// 4623/// \headerfile <x86intrin.h> 4624/// 4625/// \code 4626/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M); 4627/// \endcode 4628/// 4629/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4630/// 4631/// \param V1 4632/// A 256-bit integer vector. This vector is copied to the result first, and 4633/// then either the upper or the lower 128 bits of the result will be 4634/// replaced by the contents of \a V2. 4635/// \param V2 4636/// A 128-bit integer vector. The contents of this parameter are written to 4637/// either the upper or the lower 128 bits of the result depending on the 4638/// value of parameter \a M. 4639/// \param M 4640/// An immediate integer. The least significant bit determines how the values 4641/// from the two parameters are interleaved: \n 4642/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, 4643/// and bits [255:128] of \a V1 are copied to bits [255:128] of the 4644/// result. 
/// \n
///    If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
  /* Mask indices 0-3 select elements of V1; indices 4-5 select the two */ \
  /* low elements of the widened V2. Bit 0 of M chooses which half of */ \
  /* the result receives V2. */ \
  (__m256i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V1), \
    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
    (((M) & 1) ? 0 : 4), \
    (((M) & 1) ? 1 : 5), \
    (((M) & 1) ? 4 : 2), \
    (((M) & 1) ? 5 : 3) );})

/*
   Vector extract.
   We use macros rather than inlines because we only want to accept
   invocations where the immediate M is a constant expression.
*/
/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [8 x float], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
  /* Every mask index below is in 0-7, so only elements of V are ever */ \
  /* selected; the second operand is never read from. */ \
  (__m128)__builtin_shufflevector( \
    (__v8sf)(__m256)(V), \
    (__v8sf)(_mm256_undefined_ps()), \
    (((M) & 1) ? 4 : 0), \
    (((M) & 1) ? 5 : 1), \
    (((M) & 1) ? 6 : 2), \
    (((M) & 1) ? 7 : 3) );})

/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
///    of [4 x double], as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
  /* Both mask indices are in 0-3, so only elements of V are ever */ \
  /* selected; the second operand is never read from. */ \
  (__m128d)__builtin_shufflevector( \
    (__v4df)(__m256d)(V), \
    (__v4df)(_mm256_undefined_pd()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})

/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
///    integer vector, as determined by the immediate integer parameter, and
///    returns the extracted bits as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit determines which bits are
///    extracted from the first parameter: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
  /* Both mask indices are in 0-3, so only elements of V are ever */ \
  /* selected; the second operand is never read from. */ \
  (__m128i)__builtin_shufflevector( \
    (__v4di)(__m256i)(V), \
    (__v4di)(_mm256_undefined_si256()), \
    (((M) & 1) ? 2 : 0), \
    (((M) & 1) ? 3 : 1) );})

/* SIMD load ops (unaligned) */
/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
///    unaligned memory locations and constructs a 256-bit floating-point vector
///    of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
///    <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits[255:128] of the result. The address of the memory location does not
///    have to be aligned.
/// \param __addr_lo
///    A pointer to a 128-bit memory location containing 4 consecutive
///    single-precision floating-point values. These values are to be copied to
///    bits[127:0] of the result. The address of the memory location does not
///    have to be aligned.
/// \returns A 256-bit floating-point vector of [8 x float] containing the
///    concatenated result.
4771static __inline __m256 __DEFAULT_FN_ATTRS 4772_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) 4773{ 4774 __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo)); 4775 return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); 4776} 4777 4778/// \brief Loads two 128-bit floating-point vectors of [2 x double] from 4779/// unaligned memory locations and constructs a 256-bit floating-point vector 4780/// of [4 x double] by concatenating the two 128-bit vectors. 4781/// 4782/// \headerfile <x86intrin.h> 4783/// 4784/// This intrinsic corresponds to load instructions followed by the 4785/// <c> VINSERTF128 </c> instruction. 4786/// 4787/// \param __addr_hi 4788/// A pointer to a 128-bit memory location containing two consecutive 4789/// double-precision floating-point values. These values are to be copied to 4790/// bits[255:128] of the result. The address of the memory location does not 4791/// have to be aligned. 4792/// \param __addr_lo 4793/// A pointer to a 128-bit memory location containing two consecutive 4794/// double-precision floating-point values. These values are to be copied to 4795/// bits[127:0] of the result. The address of the memory location does not 4796/// have to be aligned. 4797/// \returns A 256-bit floating-point vector of [4 x double] containing the 4798/// concatenated result. 4799static __inline __m256d __DEFAULT_FN_ATTRS 4800_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) 4801{ 4802 __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo)); 4803 return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); 4804} 4805 4806/// \brief Loads two 128-bit integer vectors from unaligned memory locations and 4807/// constructs a 256-bit integer vector by concatenating the two 128-bit 4808/// vectors. 4809/// 4810/// \headerfile <x86intrin.h> 4811/// 4812/// This intrinsic corresponds to load instructions followed by the 4813/// <c> VINSERTF128 </c> instruction. 
4814/// 4815/// \param __addr_hi 4816/// A pointer to a 128-bit memory location containing a 128-bit integer 4817/// vector. This vector is to be copied to bits[255:128] of the result. The 4818/// address of the memory location does not have to be aligned. 4819/// \param __addr_lo 4820/// A pointer to a 128-bit memory location containing a 128-bit integer 4821/// vector. This vector is to be copied to bits[127:0] of the result. The 4822/// address of the memory location does not have to be aligned. 4823/// \returns A 256-bit integer vector containing the concatenated result. 4824static __inline __m256i __DEFAULT_FN_ATTRS 4825_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) 4826{ 4827 __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo)); 4828 return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1); 4829} 4830 4831/* SIMD store ops (unaligned) */ 4832/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4833/// vector of [8 x float] into two different unaligned memory locations. 4834/// 4835/// \headerfile <x86intrin.h> 4836/// 4837/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4838/// store instructions. 4839/// 4840/// \param __addr_hi 4841/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4842/// copied to this memory location. The address of this memory location does 4843/// not have to be aligned. 4844/// \param __addr_lo 4845/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4846/// copied to this memory location. The address of this memory location does 4847/// not have to be aligned. 4848/// \param __a 4849/// A 256-bit floating-point vector of [8 x float]. 
4850static __inline void __DEFAULT_FN_ATTRS 4851_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) 4852{ 4853 __m128 __v128; 4854 4855 __v128 = _mm256_castps256_ps128(__a); 4856 _mm_storeu_ps(__addr_lo, __v128); 4857 __v128 = _mm256_extractf128_ps(__a, 1); 4858 _mm_storeu_ps(__addr_hi, __v128); 4859} 4860 4861/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point 4862/// vector of [4 x double] into two different unaligned memory locations. 4863/// 4864/// \headerfile <x86intrin.h> 4865/// 4866/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4867/// store instructions. 4868/// 4869/// \param __addr_hi 4870/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4871/// copied to this memory location. The address of this memory location does 4872/// not have to be aligned. 4873/// \param __addr_lo 4874/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4875/// copied to this memory location. The address of this memory location does 4876/// not have to be aligned. 4877/// \param __a 4878/// A 256-bit floating-point vector of [4 x double]. 4879static __inline void __DEFAULT_FN_ATTRS 4880_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) 4881{ 4882 __m128d __v128; 4883 4884 __v128 = _mm256_castpd256_pd128(__a); 4885 _mm_storeu_pd(__addr_lo, __v128); 4886 __v128 = _mm256_extractf128_pd(__a, 1); 4887 _mm_storeu_pd(__addr_hi, __v128); 4888} 4889 4890/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into 4891/// two different unaligned memory locations. 4892/// 4893/// \headerfile <x86intrin.h> 4894/// 4895/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the 4896/// store instructions. 4897/// 4898/// \param __addr_hi 4899/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be 4900/// copied to this memory location. 
The address of this memory location does 4901/// not have to be aligned. 4902/// \param __addr_lo 4903/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be 4904/// copied to this memory location. The address of this memory location does 4905/// not have to be aligned. 4906/// \param __a 4907/// A 256-bit integer vector. 4908static __inline void __DEFAULT_FN_ATTRS 4909_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) 4910{ 4911 __m128i __v128; 4912 4913 __v128 = _mm256_castsi256_si128(__a); 4914 _mm_storeu_si128(__addr_lo, __v128); 4915 __v128 = _mm256_extractf128_si256(__a, 1); 4916 _mm_storeu_si128(__addr_hi, __v128); 4917} 4918 4919/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 4920/// concatenating two 128-bit floating-point vectors of [4 x float]. 4921/// 4922/// \headerfile <x86intrin.h> 4923/// 4924/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4925/// 4926/// \param __hi 4927/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4928/// 128 bits of the result. 4929/// \param __lo 4930/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4931/// 128 bits of the result. 4932/// \returns A 256-bit floating-point vector of [8 x float] containing the 4933/// concatenated result. 4934static __inline __m256 __DEFAULT_FN_ATTRS 4935_mm256_set_m128 (__m128 __hi, __m128 __lo) 4936{ 4937 return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); 4938} 4939 4940/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 4941/// concatenating two 128-bit floating-point vectors of [2 x double]. 4942/// 4943/// \headerfile <x86intrin.h> 4944/// 4945/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4946/// 4947/// \param __hi 4948/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 4949/// 128 bits of the result. 
4950/// \param __lo 4951/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 4952/// 128 bits of the result. 4953/// \returns A 256-bit floating-point vector of [4 x double] containing the 4954/// concatenated result. 4955static __inline __m256d __DEFAULT_FN_ATTRS 4956_mm256_set_m128d (__m128d __hi, __m128d __lo) 4957{ 4958 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4959} 4960 4961/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 4962/// integer vectors. 4963/// 4964/// \headerfile <x86intrin.h> 4965/// 4966/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4967/// 4968/// \param __hi 4969/// A 128-bit integer vector to be copied to the upper 128 bits of the 4970/// result. 4971/// \param __lo 4972/// A 128-bit integer vector to be copied to the lower 128 bits of the 4973/// result. 4974/// \returns A 256-bit integer vector containing the concatenated result. 4975static __inline __m256i __DEFAULT_FN_ATTRS 4976_mm256_set_m128i (__m128i __hi, __m128i __lo) 4977{ 4978 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 4979} 4980 4981/// \brief Constructs a 256-bit floating-point vector of [8 x float] by 4982/// concatenating two 128-bit floating-point vectors of [4 x float]. This is 4983/// similar to _mm256_set_m128, but the order of the input parameters is 4984/// swapped. 4985/// 4986/// \headerfile <x86intrin.h> 4987/// 4988/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 4989/// 4990/// \param __lo 4991/// A 128-bit floating-point vector of [4 x float] to be copied to the lower 4992/// 128 bits of the result. 4993/// \param __hi 4994/// A 128-bit floating-point vector of [4 x float] to be copied to the upper 4995/// 128 bits of the result. 4996/// \returns A 256-bit floating-point vector of [8 x float] containing the 4997/// concatenated result. 
4998static __inline __m256 __DEFAULT_FN_ATTRS 4999_mm256_setr_m128 (__m128 __lo, __m128 __hi) 5000{ 5001 return _mm256_set_m128(__hi, __lo); 5002} 5003 5004/// \brief Constructs a 256-bit floating-point vector of [4 x double] by 5005/// concatenating two 128-bit floating-point vectors of [2 x double]. This is 5006/// similar to _mm256_set_m128d, but the order of the input parameters is 5007/// swapped. 5008/// 5009/// \headerfile <x86intrin.h> 5010/// 5011/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5012/// 5013/// \param __lo 5014/// A 128-bit floating-point vector of [2 x double] to be copied to the lower 5015/// 128 bits of the result. 5016/// \param __hi 5017/// A 128-bit floating-point vector of [2 x double] to be copied to the upper 5018/// 128 bits of the result. 5019/// \returns A 256-bit floating-point vector of [4 x double] containing the 5020/// concatenated result. 5021static __inline __m256d __DEFAULT_FN_ATTRS 5022_mm256_setr_m128d (__m128d __lo, __m128d __hi) 5023{ 5024 return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5025} 5026 5027/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit 5028/// integer vectors. This is similar to _mm256_set_m128i, but the order of 5029/// the input parameters is swapped. 5030/// 5031/// \headerfile <x86intrin.h> 5032/// 5033/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction. 5034/// 5035/// \param __lo 5036/// A 128-bit integer vector to be copied to the lower 128 bits of the 5037/// result. 5038/// \param __hi 5039/// A 128-bit integer vector to be copied to the upper 128 bits of the 5040/// result. 5041/// \returns A 256-bit integer vector containing the concatenated result. 5042static __inline __m256i __DEFAULT_FN_ATTRS 5043_mm256_setr_m128i (__m128i __lo, __m128i __hi) 5044{ 5045 return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); 5046} 5047 5048#undef __DEFAULT_FN_ATTRS 5049 5050#endif /* __AVXINTRIN_H */ 5051