1/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef _SMMINTRIN_H 25#define _SMMINTRIN_H 26 27#include <tmmintrin.h> 28 29/* Define the default attributes for the functions in this file. */ 30#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"))) 31 32/* SSE4 Rounding macros. 
*/ 33#define _MM_FROUND_TO_NEAREST_INT 0x00 34#define _MM_FROUND_TO_NEG_INF 0x01 35#define _MM_FROUND_TO_POS_INF 0x02 36#define _MM_FROUND_TO_ZERO 0x03 37#define _MM_FROUND_CUR_DIRECTION 0x04 38 39#define _MM_FROUND_RAISE_EXC 0x00 40#define _MM_FROUND_NO_EXC 0x08 41 42#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 43#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 44#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 45#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 46#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 47#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 48 49/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an 50/// integer and returns the rounded values in a 128-bit vector of 51/// [4 x float]. 52/// 53/// \headerfile <x86intrin.h> 54/// 55/// \code 56/// __m128 _mm_ceil_ps(__m128 X); 57/// \endcode 58/// 59/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. 60/// 61/// \param X 62/// A 128-bit vector of [4 x float] values to be rounded up. 63/// \returns A 128-bit vector of [4 x float] containing the rounded values. 64#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) 65 66/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an 67/// integer and returns the rounded values in a 128-bit vector of 68/// [2 x double]. 69/// 70/// \headerfile <x86intrin.h> 71/// 72/// \code 73/// __m128d _mm_ceil_pd(__m128d X); 74/// \endcode 75/// 76/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction. 77/// 78/// \param X 79/// A 128-bit vector of [2 x double] values to be rounded up. 80/// \returns A 128-bit vector of [2 x double] containing the rounded values. 
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) __extension__ ({ \
  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) __extension__ ({ \
  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                 (__v4sf)(__m128)(Y), (M)); })

/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) __extension__ ({ \
  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) __extension__ ({ \
  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                  (__v2df)(__m128d)(Y), (M)); })

/* SSE4 Packed Blending Intrinsics.  */
/// \brief Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
                                   (__v2df)(__m128d)(V2), \
                                   (((M) & 0x01) ? 2 : 0), \
                                   (((M) & 0x02) ? 3 : 1)); })

/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
                                  (((M) & 0x01) ? 4 : 0), \
                                  (((M) & 0x02) ? 5 : 1), \
                                  (((M) & 0x04) ? 6 : 2), \
                                  (((M) & 0x08) ? 7 : 3)); })

/// \brief Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
///
/// \param __V1
///    A 128-bit vector of [2 x double].
439/// \param __V2 440/// A 128-bit vector of [2 x double]. 441/// \param __M 442/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 443/// values are to be copied. The position of the mask bit corresponds to the 444/// most significant bit of a copied value. When a mask bit is 0, the 445/// corresponding 64-bit element in operand \a __V1 is copied to the same 446/// position in the result. When a mask bit is 1, the corresponding 64-bit 447/// element in operand \a __V2 is copied to the same position in the result. 448/// \returns A 128-bit vector of [2 x double] containing the copied values. 449static __inline__ __m128d __DEFAULT_FN_ATTRS 450_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) 451{ 452 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, 453 (__v2df)__M); 454} 455 456/// \brief Returns a 128-bit vector of [4 x float] where the values are 457/// selected from either the first or second operand as specified by the 458/// third operand, the control mask. 459/// 460/// \headerfile <x86intrin.h> 461/// 462/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 463/// 464/// \param __V1 465/// A 128-bit vector of [4 x float]. 466/// \param __V2 467/// A 128-bit vector of [4 x float]. 468/// \param __M 469/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 470/// how the values are to be copied. The position of the mask bit corresponds 471/// to the most significant bit of a copied value. When a mask bit is 0, the 472/// corresponding 32-bit element in operand \a __V1 is copied to the same 473/// position in the result. When a mask bit is 1, the corresponding 32-bit 474/// element in operand \a __V2 is copied to the same position in the result. 475/// \returns A 128-bit vector of [4 x float] containing the copied values. 
476static __inline__ __m128 __DEFAULT_FN_ATTRS 477_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) 478{ 479 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, 480 (__v4sf)__M); 481} 482 483/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected 484/// from either of the first or second operand as specified by the third 485/// operand, the control mask. 486/// 487/// \headerfile <x86intrin.h> 488/// 489/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 490/// 491/// \param __V1 492/// A 128-bit vector of [16 x i8]. 493/// \param __V2 494/// A 128-bit vector of [16 x i8]. 495/// \param __M 496/// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying 497/// how the values are to be copied. The position of the mask bit corresponds 498/// to the most significant bit of a copied value. When a mask bit is 0, the 499/// corresponding 8-bit element in operand \a __V1 is copied to the same 500/// position in the result. When a mask bit is 1, the corresponding 8-bit 501/// element in operand \a __V2 is copied to the same position in the result. 502/// \returns A 128-bit vector of [16 x i8] containing the copied values. 503static __inline__ __m128i __DEFAULT_FN_ATTRS 504_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) 505{ 506 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, 507 (__v16qi)__M); 508} 509 510/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected 511/// from either of the first or second operand as specified by the third 512/// operand, the control mask. 513/// 514/// \headerfile <x86intrin.h> 515/// 516/// \code 517/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 518/// \endcode 519/// 520/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 521/// 522/// \param V1 523/// A 128-bit vector of [8 x i16]. 524/// \param V2 525/// A 128-bit vector of [8 x i16]. 
526/// \param M 527/// An immediate integer operand, with mask bits [7:0] specifying how the 528/// values are to be copied. The position of the mask bit corresponds to the 529/// index of a copied value. When a mask bit is 0, the corresponding 16-bit 530/// element in operand \a V1 is copied to the same position in the result. 531/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 532/// is copied to the same position in the result. 533/// \returns A 128-bit vector of [8 x i16] containing the copied values. 534#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ 535 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ 536 (__v8hi)(__m128i)(V2), \ 537 (((M) & 0x01) ? 8 : 0), \ 538 (((M) & 0x02) ? 9 : 1), \ 539 (((M) & 0x04) ? 10 : 2), \ 540 (((M) & 0x08) ? 11 : 3), \ 541 (((M) & 0x10) ? 12 : 4), \ 542 (((M) & 0x20) ? 13 : 5), \ 543 (((M) & 0x40) ? 14 : 6), \ 544 (((M) & 0x80) ? 15 : 7)); }) 545 546/* SSE4 Dword Multiply Instructions. */ 547/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32] 548/// and returns the lower 32 bits of the each product in a 128-bit vector of 549/// [4 x i32]. 550/// 551/// \headerfile <x86intrin.h> 552/// 553/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 554/// 555/// \param __V1 556/// A 128-bit integer vector. 557/// \param __V2 558/// A 128-bit integer vector. 559/// \returns A 128-bit integer vector containing the products of both operands. 560static __inline__ __m128i __DEFAULT_FN_ATTRS 561_mm_mullo_epi32 (__m128i __V1, __m128i __V2) 562{ 563 return (__m128i) ((__v4su)__V1 * (__v4su)__V2); 564} 565 566/// \brief Multiplies corresponding even-indexed elements of two 128-bit 567/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 568/// containing the products. 569/// 570/// \headerfile <x86intrin.h> 571/// 572/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 
573/// 574/// \param __V1 575/// A 128-bit vector of [4 x i32]. 576/// \param __V2 577/// A 128-bit vector of [4 x i32]. 578/// \returns A 128-bit vector of [2 x i64] containing the products of both 579/// operands. 580static __inline__ __m128i __DEFAULT_FN_ATTRS 581_mm_mul_epi32 (__m128i __V1, __m128i __V2) 582{ 583 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); 584} 585 586/* SSE4 Floating Point Dot Product Instructions. */ 587/// \brief Computes the dot product of the two 128-bit vectors of [4 x float] 588/// and returns it in the elements of the 128-bit result vector of 589/// [4 x float]. 590/// 591/// The immediate integer operand controls which input elements 592/// will contribute to the dot product, and where the final results are 593/// returned. 594/// 595/// \headerfile <x86intrin.h> 596/// 597/// \code 598/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 599/// \endcode 600/// 601/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 602/// 603/// \param X 604/// A 128-bit vector of [4 x float]. 605/// \param Y 606/// A 128-bit vector of [4 x float]. 607/// \param M 608/// An immediate integer operand. Mask bits [7:4] determine which elements 609/// of the input vectors are used, with bit [4] corresponding to the lowest 610/// element and bit [7] corresponding to the highest element of each [4 x 611/// float] vector. If a bit is set, the corresponding elements from the two 612/// input vectors are used as an input for dot product; otherwise that input 613/// is treated as zero. Bits [3:0] determine which elements of the result 614/// will receive a copy of the final dot product, with bit [0] corresponding 615/// to the lowest element and bit [3] corresponding to the highest element of 616/// each [4 x float] subvector. If a bit is set, the dot product is returned 617/// in the corresponding element; otherwise that element is set to zero. 
618/// \returns A 128-bit vector of [4 x float] containing the dot product. 619#define _mm_dp_ps(X, Y, M) __extension__ ({ \ 620 (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ 621 (__v4sf)(__m128)(Y), (M)); }) 622 623/// \brief Computes the dot product of the two 128-bit vectors of [2 x double] 624/// and returns it in the elements of the 128-bit result vector of 625/// [2 x double]. 626/// 627/// The immediate integer operand controls which input 628/// elements will contribute to the dot product, and where the final results 629/// are returned. 630/// 631/// \headerfile <x86intrin.h> 632/// 633/// \code 634/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); 635/// \endcode 636/// 637/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction. 638/// 639/// \param X 640/// A 128-bit vector of [2 x double]. 641/// \param Y 642/// A 128-bit vector of [2 x double]. 643/// \param M 644/// An immediate integer operand. Mask bits [5:4] determine which elements 645/// of the input vectors are used, with bit [4] corresponding to the lowest 646/// element and bit [5] corresponding to the highest element of each of [2 x 647/// double] vector. If a bit is set, the corresponding elements from the two 648/// input vectors are used as an input for dot product; otherwise that input 649/// is treated as zero. Bits [1:0] determine which elements of the result 650/// will receive a copy of the final dot product, with bit [0] corresponding 651/// to the lowest element and bit [3] corresponding to the highest element of 652/// each [2 x double] vector. If a bit is set, the dot product is returned in 653/// the corresponding element; otherwise that element is set to zero. 654#define _mm_dp_pd(X, Y, M) __extension__ ({\ 655 (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ 656 (__v2df)(__m128d)(Y), (M)); }) 657 658/* SSE4 Streaming Load Hint Instruction. 
*/ 659/// \brief Loads integer values from a 128-bit aligned memory location to a 660/// 128-bit integer vector. 661/// 662/// \headerfile <x86intrin.h> 663/// 664/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 665/// 666/// \param __V 667/// A pointer to a 128-bit aligned memory location that contains the integer 668/// values. 669/// \returns A 128-bit integer vector containing the data stored at the 670/// specified memory location. 671static __inline__ __m128i __DEFAULT_FN_ATTRS 672_mm_stream_load_si128 (__m128i const *__V) 673{ 674 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); 675} 676 677/* SSE4 Packed Integer Min/Max Instructions. */ 678/// \brief Compares the corresponding elements of two 128-bit vectors of 679/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 680/// of the two values. 681/// 682/// \headerfile <x86intrin.h> 683/// 684/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 685/// 686/// \param __V1 687/// A 128-bit vector of [16 x i8]. 688/// \param __V2 689/// A 128-bit vector of [16 x i8] 690/// \returns A 128-bit vector of [16 x i8] containing the lesser values. 691static __inline__ __m128i __DEFAULT_FN_ATTRS 692_mm_min_epi8 (__m128i __V1, __m128i __V2) 693{ 694 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); 695} 696 697/// \brief Compares the corresponding elements of two 128-bit vectors of 698/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 699/// greater value of the two. 700/// 701/// \headerfile <x86intrin.h> 702/// 703/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 704/// 705/// \param __V1 706/// A 128-bit vector of [16 x i8]. 707/// \param __V2 708/// A 128-bit vector of [16 x i8]. 709/// \returns A 128-bit vector of [16 x i8] containing the greater values. 
710static __inline__ __m128i __DEFAULT_FN_ATTRS 711_mm_max_epi8 (__m128i __V1, __m128i __V2) 712{ 713 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); 714} 715 716/// \brief Compares the corresponding elements of two 128-bit vectors of 717/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 718/// value of the two. 719/// 720/// \headerfile <x86intrin.h> 721/// 722/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 723/// 724/// \param __V1 725/// A 128-bit vector of [8 x u16]. 726/// \param __V2 727/// A 128-bit vector of [8 x u16]. 728/// \returns A 128-bit vector of [8 x u16] containing the lesser values. 729static __inline__ __m128i __DEFAULT_FN_ATTRS 730_mm_min_epu16 (__m128i __V1, __m128i __V2) 731{ 732 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); 733} 734 735/// \brief Compares the corresponding elements of two 128-bit vectors of 736/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 737/// greater value of the two. 738/// 739/// \headerfile <x86intrin.h> 740/// 741/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 742/// 743/// \param __V1 744/// A 128-bit vector of [8 x u16]. 745/// \param __V2 746/// A 128-bit vector of [8 x u16]. 747/// \returns A 128-bit vector of [8 x u16] containing the greater values. 748static __inline__ __m128i __DEFAULT_FN_ATTRS 749_mm_max_epu16 (__m128i __V1, __m128i __V2) 750{ 751 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); 752} 753 754/// \brief Compares the corresponding elements of two 128-bit vectors of 755/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 756/// value of the two. 757/// 758/// \headerfile <x86intrin.h> 759/// 760/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 761/// 762/// \param __V1 763/// A 128-bit vector of [4 x i32]. 
764/// \param __V2 765/// A 128-bit vector of [4 x i32]. 766/// \returns A 128-bit vector of [4 x i32] containing the lesser values. 767static __inline__ __m128i __DEFAULT_FN_ATTRS 768_mm_min_epi32 (__m128i __V1, __m128i __V2) 769{ 770 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); 771} 772 773/// \brief Compares the corresponding elements of two 128-bit vectors of 774/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 775/// greater value of the two. 776/// 777/// \headerfile <x86intrin.h> 778/// 779/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 780/// 781/// \param __V1 782/// A 128-bit vector of [4 x i32]. 783/// \param __V2 784/// A 128-bit vector of [4 x i32]. 785/// \returns A 128-bit vector of [4 x i32] containing the greater values. 786static __inline__ __m128i __DEFAULT_FN_ATTRS 787_mm_max_epi32 (__m128i __V1, __m128i __V2) 788{ 789 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); 790} 791 792/// \brief Compares the corresponding elements of two 128-bit vectors of 793/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 794/// value of the two. 795/// 796/// \headerfile <x86intrin.h> 797/// 798/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 799/// 800/// \param __V1 801/// A 128-bit vector of [4 x u32]. 802/// \param __V2 803/// A 128-bit vector of [4 x u32]. 804/// \returns A 128-bit vector of [4 x u32] containing the lesser values. 805static __inline__ __m128i __DEFAULT_FN_ATTRS 806_mm_min_epu32 (__m128i __V1, __m128i __V2) 807{ 808 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); 809} 810 811/// \brief Compares the corresponding elements of two 128-bit vectors of 812/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 813/// greater value of the two. 
814/// 815/// \headerfile <x86intrin.h> 816/// 817/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 818/// 819/// \param __V1 820/// A 128-bit vector of [4 x u32]. 821/// \param __V2 822/// A 128-bit vector of [4 x u32]. 823/// \returns A 128-bit vector of [4 x u32] containing the greater values. 824static __inline__ __m128i __DEFAULT_FN_ATTRS 825_mm_max_epu32 (__m128i __V1, __m128i __V2) 826{ 827 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); 828} 829 830/* SSE4 Insertion and Extraction from XMM Register Instructions. */ 831/// \brief Takes the first argument \a X and inserts an element from the second 832/// argument \a Y as selected by the third argument \a N. That result then 833/// has elements zeroed out also as selected by the third argument \a N. The 834/// resulting 128-bit vector of [4 x float] is then returned. 835/// 836/// \headerfile <x86intrin.h> 837/// 838/// \code 839/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 840/// \endcode 841/// 842/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 843/// 844/// \param X 845/// A 128-bit vector source operand of [4 x float]. With the exception of 846/// those bits in the result copied from parameter \a Y and zeroed by bits 847/// [3:0] of \a N, all bits from this parameter are copied to the result. 848/// \param Y 849/// A 128-bit vector source operand of [4 x float]. One single-precision 850/// floating-point element from this source, as determined by the immediate 851/// parameter, is copied to the result. 852/// \param N 853/// Specifies which bits from operand \a Y will be copied, which bits in the 854/// result they will be be copied to, and which bits in the result will be 855/// cleared. The following assignments are made: \n 856/// Bits [7:6] specify the bits to copy from operand \a Y: \n 857/// 00: Selects bits [31:0] from operand \a Y. \n 858/// 01: Selects bits [63:32] from operand \a Y. 
\n 859/// 10: Selects bits [95:64] from operand \a Y. \n 860/// 11: Selects bits [127:96] from operand \a Y. \n 861/// Bits [5:4] specify the bits in the result to which the selected bits 862/// from operand \a Y are copied: \n 863/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 864/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 865/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 866/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 867/// Bits[3:0]: If any of these bits are set, the corresponding result 868/// element is cleared. 869/// \returns A 128-bit vector of [4 x float] containing the copied single- 870/// precision floating point elements from the operands. 871#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) 872 873/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and 874/// returns it, using the immediate value parameter \a N as a selector. 875/// 876/// \headerfile <x86intrin.h> 877/// 878/// \code 879/// int _mm_extract_ps(__m128 X, const int N); 880/// \endcode 881/// 882/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c> 883/// instruction. 884/// 885/// \param X 886/// A 128-bit vector of [4 x float]. 887/// \param N 888/// An immediate value. Bits [1:0] determines which bits from the argument 889/// \a X are extracted and returned: \n 890/// 00: Bits [31:0] of parameter \a X are returned. \n 891/// 01: Bits [63:32] of parameter \a X are returned. \n 892/// 10: Bits [95:64] of parameter \a X are returned. \n 893/// 11: Bits [127:96] of parameter \a X are returned. 894/// \returns A 32-bit integer containing the extracted 32 bits of float data. 895#define _mm_extract_ps(X, N) (__extension__ \ 896 ({ union { int __i; float __f; } __t; \ 897 __v4sf __a = (__v4sf)(__m128)(X); \ 898 __t.__f = __a[(N) & 3]; \ 899 __t.__i;})) 900 901/* Miscellaneous insert and extract macros. 
*/
/* Extract a single-precision float from X at index N into D.  Only bits [1:0]
   of N select the element, as in _mm_extract_ps, so the vector is never
   indexed out of range.  */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
                                                    (D) = __a[(N) & 3]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  X lands in bits [7:6], Y in bits
   [5:4] and Z in the low bits.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  The
   zeroing mask 0x0e clears result elements 1 through 3.  */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion.
/// \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) (__extension__                        \
  ({ __v16qi __vec = (__v16qi)(__m128i)(X);                            \
     /* Only the low four bits of N select the element. */             \
     __vec[(N) & 15] = (I);                                            \
     (__m128i)__vec;}))

/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
///    integer parameter \a I at the offset specified by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] specify the bit offset in the result at
///    which the integer \a I is written. \n
///    00: Bits [31:0] of the result are used for insertion.
\n 984/// 01: Bits [63:32] of the result are used for insertion. \n 985/// 10: Bits [95:64] of the result are used for insertion. \n 986/// 11: Bits [127:96] of the result are used for insertion. 987/// \returns A 128-bit integer vector containing the constructed values. 988#define _mm_insert_epi32(X, I, N) (__extension__ \ 989 ({ __v4si __a = (__v4si)(__m128i)(X); \ 990 __a[(N) & 3] = (I); \ 991 (__m128i)__a;})) 992 993#ifdef __x86_64__ 994/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of 995/// the 128-bit integer vector parameter, and then inserting the 64-bit 996/// integer parameter \a I, using the immediate value parameter \a N as an 997/// insertion location selector. 998/// 999/// \headerfile <x86intrin.h> 1000/// 1001/// \code 1002/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); 1003/// \endcode 1004/// 1005/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction. 1006/// 1007/// \param X 1008/// A 128-bit integer vector of [2 x i64]. This vector is copied to the 1009/// result and then one of the two elements in the result vector is replaced 1010/// by \a I. 1011/// \param I 1012/// A 64-bit integer that is written to the result beginning at the offset 1013/// specified by \a N. 1014/// \param N 1015/// An immediate value. Bit [0] specifies the bit offset in the result at 1016/// which the integer \a I is written. \n 1017/// 0: Bits [63:0] of the result are used for insertion. \n 1018/// 1: Bits [127:64] of the result are used for insertion. \n 1019/// \returns A 128-bit integer vector containing the constructed values. 1020#define _mm_insert_epi64(X, I, N) (__extension__ \ 1021 ({ __v2di __a = (__v2di)(__m128i)(X); \ 1022 __a[(N) & 1] = (I); \ 1023 (__m128i)__a;})) 1024#endif /* __x86_64__ */ 1025 1026/* Extract int from packed integer array at index. This returns the element 1027 * as a zero extended value, so it is unsigned. 
1028 */ 1029/// \brief Extracts an 8-bit element from the 128-bit integer vector of 1030/// [16 x i8], using the immediate value parameter \a N as a selector. 1031/// 1032/// \headerfile <x86intrin.h> 1033/// 1034/// \code 1035/// int _mm_extract_epi8(__m128i X, const int N); 1036/// \endcode 1037/// 1038/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction. 1039/// 1040/// \param X 1041/// A 128-bit integer vector. 1042/// \param N 1043/// An immediate value. Bits [3:0] specify which 8-bit vector element from 1044/// the argument \a X to extract and copy to the result. \n 1045/// 0000: Bits [7:0] of parameter \a X are extracted. \n 1046/// 0001: Bits [15:8] of the parameter \a X are extracted. \n 1047/// 0010: Bits [23:16] of the parameter \a X are extracted. \n 1048/// 0011: Bits [31:24] of the parameter \a X are extracted. \n 1049/// 0100: Bits [39:32] of the parameter \a X are extracted. \n 1050/// 0101: Bits [47:40] of the parameter \a X are extracted. \n 1051/// 0110: Bits [55:48] of the parameter \a X are extracted. \n 1052/// 0111: Bits [63:56] of the parameter \a X are extracted. \n 1053/// 1000: Bits [71:64] of the parameter \a X are extracted. \n 1054/// 1001: Bits [79:72] of the parameter \a X are extracted. \n 1055/// 1010: Bits [87:80] of the parameter \a X are extracted. \n 1056/// 1011: Bits [95:88] of the parameter \a X are extracted. \n 1057/// 1100: Bits [103:96] of the parameter \a X are extracted. \n 1058/// 1101: Bits [111:104] of the parameter \a X are extracted. \n 1059/// 1110: Bits [119:112] of the parameter \a X are extracted. \n 1060/// 1111: Bits [127:120] of the parameter \a X are extracted. 1061/// \returns An unsigned integer, whose lower 8 bits are selected from the 1062/// 128-bit integer vector parameter and the remaining bits are assigned 1063/// zeros. 
1064#define _mm_extract_epi8(X, N) (__extension__ \ 1065 ({ __v16qi __a = (__v16qi)(__m128i)(X); \ 1066 (int)(unsigned char) __a[(N) & 15];})) 1067 1068/// \brief Extracts a 32-bit element from the 128-bit integer vector of 1069/// [4 x i32], using the immediate value parameter \a N as a selector. 1070/// 1071/// \headerfile <x86intrin.h> 1072/// 1073/// \code 1074/// int _mm_extract_epi32(__m128i X, const int N); 1075/// \endcode 1076/// 1077/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction. 1078/// 1079/// \param X 1080/// A 128-bit integer vector. 1081/// \param N 1082/// An immediate value. Bits [1:0] specify which 32-bit vector element from 1083/// the argument \a X to extract and copy to the result. \n 1084/// 00: Bits [31:0] of the parameter \a X are extracted. \n 1085/// 01: Bits [63:32] of the parameter \a X are extracted. \n 1086/// 10: Bits [95:64] of the parameter \a X are extracted. \n 1087/// 11: Bits [127:96] of the parameter \a X are exracted. 1088/// \returns An integer, whose lower 32 bits are selected from the 128-bit 1089/// integer vector parameter and the remaining bits are assigned zeros. 1090#define _mm_extract_epi32(X, N) (__extension__ \ 1091 ({ __v4si __a = (__v4si)(__m128i)(X); \ 1092 (int)__a[(N) & 3];})) 1093 1094#ifdef __x86_64__ 1095/// \brief Extracts a 64-bit element from the 128-bit integer vector of 1096/// [2 x i64], using the immediate value parameter \a N as a selector. 1097/// 1098/// \headerfile <x86intrin.h> 1099/// 1100/// \code 1101/// long long _mm_extract_epi64(__m128i X, const int N); 1102/// \endcode 1103/// 1104/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1105/// 1106/// \param X 1107/// A 128-bit integer vector. 1108/// \param N 1109/// An immediate value. Bit [0] specifies which 64-bit vector element from 1110/// the argument \a X to return. \n 1111/// 0: Bits [63:0] are returned. \n 1112/// 1: Bits [127:64] are returned. 
\n 1113/// \returns A 64-bit integer. 1114#define _mm_extract_epi64(X, N) (__extension__ \ 1115 ({ __v2di __a = (__v2di)(__m128i)(X); \ 1116 (long long)__a[(N) & 1];})) 1117#endif /* __x86_64 */ 1118 1119/* SSE4 128-bit Packed Integer Comparisons. */ 1120/// \brief Tests whether the specified bits in a 128-bit integer vector are all 1121/// zeros. 1122/// 1123/// \headerfile <x86intrin.h> 1124/// 1125/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1126/// 1127/// \param __M 1128/// A 128-bit integer vector containing the bits to be tested. 1129/// \param __V 1130/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1131/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1132static __inline__ int __DEFAULT_FN_ATTRS 1133_mm_testz_si128(__m128i __M, __m128i __V) 1134{ 1135 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1136} 1137 1138/// \brief Tests whether the specified bits in a 128-bit integer vector are all 1139/// ones. 1140/// 1141/// \headerfile <x86intrin.h> 1142/// 1143/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1144/// 1145/// \param __M 1146/// A 128-bit integer vector containing the bits to be tested. 1147/// \param __V 1148/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1149/// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1150static __inline__ int __DEFAULT_FN_ATTRS 1151_mm_testc_si128(__m128i __M, __m128i __V) 1152{ 1153 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1154} 1155 1156/// \brief Tests whether the specified bits in a 128-bit integer vector are 1157/// neither all zeros nor all ones. 1158/// 1159/// \headerfile <x86intrin.h> 1160/// 1161/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1162/// 1163/// \param __M 1164/// A 128-bit integer vector containing the bits to be tested. 
1165/// \param __V 1166/// A 128-bit integer vector selecting which bits to test in operand \a __M. 1167/// \returns TRUE if the specified bits are neither all zeros nor all ones; 1168/// FALSE otherwise. 1169static __inline__ int __DEFAULT_FN_ATTRS 1170_mm_testnzc_si128(__m128i __M, __m128i __V) 1171{ 1172 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1173} 1174 1175/// \brief Tests whether the specified bits in a 128-bit integer vector are all 1176/// ones. 1177/// 1178/// \headerfile <x86intrin.h> 1179/// 1180/// \code 1181/// int _mm_test_all_ones(__m128i V); 1182/// \endcode 1183/// 1184/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1185/// 1186/// \param V 1187/// A 128-bit integer vector containing the bits to be tested. 1188/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 1189/// otherwise. 1190#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) 1191 1192/// \brief Tests whether the specified bits in a 128-bit integer vector are 1193/// neither all zeros nor all ones. 1194/// 1195/// \headerfile <x86intrin.h> 1196/// 1197/// \code 1198/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 1199/// \endcode 1200/// 1201/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1202/// 1203/// \param M 1204/// A 128-bit integer vector containing the bits to be tested. 1205/// \param V 1206/// A 128-bit integer vector selecting which bits to test in operand \a M. 1207/// \returns TRUE if the specified bits are neither all zeros nor all ones; 1208/// FALSE otherwise. 1209#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 1210 1211/// \brief Tests whether the specified bits in a 128-bit integer vector are all 1212/// zeros. 
1213/// 1214/// \headerfile <x86intrin.h> 1215/// 1216/// \code 1217/// int _mm_test_all_zeros(__m128i M, __m128i V); 1218/// \endcode 1219/// 1220/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1221/// 1222/// \param M 1223/// A 128-bit integer vector containing the bits to be tested. 1224/// \param V 1225/// A 128-bit integer vector selecting which bits to test in operand \a M. 1226/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1227#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) 1228 1229/* SSE4 64-bit Packed Integer Comparisons. */ 1230/// \brief Compares each of the corresponding 64-bit values of the 128-bit 1231/// integer vectors for equality. 1232/// 1233/// \headerfile <x86intrin.h> 1234/// 1235/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. 1236/// 1237/// \param __V1 1238/// A 128-bit integer vector. 1239/// \param __V2 1240/// A 128-bit integer vector. 1241/// \returns A 128-bit integer vector containing the comparison results. 1242static __inline__ __m128i __DEFAULT_FN_ATTRS 1243_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) 1244{ 1245 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1246} 1247 1248/* SSE4 Packed Integer Sign-Extension. */ 1249/// \brief Sign-extends each of the lower eight 8-bit integer elements of a 1250/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1251/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1252/// are unused. 1253/// 1254/// \headerfile <x86intrin.h> 1255/// 1256/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1257/// 1258/// \param __V 1259/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- 1260/// extended to 16-bit values. 1261/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 
1262static __inline__ __m128i __DEFAULT_FN_ATTRS 1263_mm_cvtepi8_epi16(__m128i __V) 1264{ 1265 /* This function always performs a signed extension, but __v16qi is a char 1266 which may be signed or unsigned, so use __v16qs. */ 1267 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1268} 1269 1270/// \brief Sign-extends each of the lower four 8-bit integer elements of a 1271/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1272/// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1273/// vector are unused. 1274/// 1275/// \headerfile <x86intrin.h> 1276/// 1277/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1278/// 1279/// \param __V 1280/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign- 1281/// extended to 32-bit values. 1282/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1283static __inline__ __m128i __DEFAULT_FN_ATTRS 1284_mm_cvtepi8_epi32(__m128i __V) 1285{ 1286 /* This function always performs a signed extension, but __v16qi is a char 1287 which may be signed or unsigned, so use __v16qs. */ 1288 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1289} 1290 1291/// \brief Sign-extends each of the lower two 8-bit integer elements of a 1292/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1293/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1294/// vector are unused. 1295/// 1296/// \headerfile <x86intrin.h> 1297/// 1298/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1299/// 1300/// \param __V 1301/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign- 1302/// extended to 64-bit values. 1303/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
1304static __inline__ __m128i __DEFAULT_FN_ATTRS 1305_mm_cvtepi8_epi64(__m128i __V) 1306{ 1307 /* This function always performs a signed extension, but __v16qi is a char 1308 which may be signed or unsigned, so use __v16qs. */ 1309 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1310} 1311 1312/// \brief Sign-extends each of the lower four 16-bit integer elements of a 1313/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1314/// a 128-bit vector of [4 x i32]. The upper four elements of the input 1315/// vector are unused. 1316/// 1317/// \headerfile <x86intrin.h> 1318/// 1319/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1320/// 1321/// \param __V 1322/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign- 1323/// extended to 32-bit values. 1324/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1325static __inline__ __m128i __DEFAULT_FN_ATTRS 1326_mm_cvtepi16_epi32(__m128i __V) 1327{ 1328 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1329} 1330 1331/// \brief Sign-extends each of the lower two 16-bit integer elements of a 1332/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1333/// a 128-bit vector of [2 x i64]. The upper six elements of the input 1334/// vector are unused. 1335/// 1336/// \headerfile <x86intrin.h> 1337/// 1338/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1339/// 1340/// \param __V 1341/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign- 1342/// extended to 64-bit values. 1343/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
1344static __inline__ __m128i __DEFAULT_FN_ATTRS 1345_mm_cvtepi16_epi64(__m128i __V) 1346{ 1347 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1348} 1349 1350/// \brief Sign-extends each of the lower two 32-bit integer elements of a 1351/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1352/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1353/// are unused. 1354/// 1355/// \headerfile <x86intrin.h> 1356/// 1357/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1358/// 1359/// \param __V 1360/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign- 1361/// extended to 64-bit values. 1362/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1363static __inline__ __m128i __DEFAULT_FN_ATTRS 1364_mm_cvtepi32_epi64(__m128i __V) 1365{ 1366 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1367} 1368 1369/* SSE4 Packed Integer Zero-Extension. */ 1370/// \brief Zero-extends each of the lower eight 8-bit integer elements of a 1371/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1372/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1373/// are unused. 1374/// 1375/// \headerfile <x86intrin.h> 1376/// 1377/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1378/// 1379/// \param __V 1380/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero- 1381/// extended to 16-bit values. 1382/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 
1383static __inline__ __m128i __DEFAULT_FN_ATTRS 1384_mm_cvtepu8_epi16(__m128i __V) 1385{ 1386 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1387} 1388 1389/// \brief Zero-extends each of the lower four 8-bit integer elements of a 1390/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1391/// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1392/// vector are unused. 1393/// 1394/// \headerfile <x86intrin.h> 1395/// 1396/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1397/// 1398/// \param __V 1399/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero- 1400/// extended to 32-bit values. 1401/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1402static __inline__ __m128i __DEFAULT_FN_ATTRS 1403_mm_cvtepu8_epi32(__m128i __V) 1404{ 1405 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1406} 1407 1408/// \brief Zero-extends each of the lower two 8-bit integer elements of a 1409/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1410/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1411/// vector are unused. 1412/// 1413/// \headerfile <x86intrin.h> 1414/// 1415/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1416/// 1417/// \param __V 1418/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero- 1419/// extended to 64-bit values. 1420/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1421static __inline__ __m128i __DEFAULT_FN_ATTRS 1422_mm_cvtepu8_epi64(__m128i __V) 1423{ 1424 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1425} 1426 1427/// \brief Zero-extends each of the lower four 16-bit integer elements of a 1428/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1429/// a 128-bit vector of [4 x i32]. The upper four elements of the input 1430/// vector are unused. 1431/// 1432/// \headerfile <x86intrin.h> 1433/// 1434/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1435/// 1436/// \param __V 1437/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero- 1438/// extended to 32-bit values. 1439/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1440static __inline__ __m128i __DEFAULT_FN_ATTRS 1441_mm_cvtepu16_epi32(__m128i __V) 1442{ 1443 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1444} 1445 1446/// \brief Zero-extends each of the lower two 16-bit integer elements of a 1447/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1448/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1449/// are unused. 1450/// 1451/// \headerfile <x86intrin.h> 1452/// 1453/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1454/// 1455/// \param __V 1456/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero- 1457/// extended to 64-bit values. 1458/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1459static __inline__ __m128i __DEFAULT_FN_ATTRS 1460_mm_cvtepu16_epi64(__m128i __V) 1461{ 1462 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1463} 1464 1465/// \brief Zero-extends each of the lower two 32-bit integer elements of a 1466/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1467/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1468/// are unused. 1469/// 1470/// \headerfile <x86intrin.h> 1471/// 1472/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1473/// 1474/// \param __V 1475/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero- 1476/// extended to 64-bit values. 1477/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1478static __inline__ __m128i __DEFAULT_FN_ATTRS 1479_mm_cvtepu32_epi64(__m128i __V) 1480{ 1481 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1482} 1483 1484/* SSE4 Pack with Unsigned Saturation. */ 1485/// \brief Converts 32-bit signed integers from both 128-bit integer vector 1486/// operands into 16-bit unsigned integers, and returns the packed result. 1487/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1488/// 0x0000 are saturated to 0x0000. 1489/// 1490/// \headerfile <x86intrin.h> 1491/// 1492/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1493/// 1494/// \param __V1 1495/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1496/// signed integer and is converted to a 16-bit unsigned integer with 1497/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1498/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1499/// are written to the lower 64 bits of the result. 1500/// \param __V2 1501/// A 128-bit vector of [4 x i32]. 
Each 32-bit element is treated as a 1502/// signed integer and is converted to a 16-bit unsigned integer with 1503/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1504/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1505/// are written to the higher 64 bits of the result. 1506/// \returns A 128-bit vector of [8 x i16] containing the converted values. 1507static __inline__ __m128i __DEFAULT_FN_ATTRS 1508_mm_packus_epi32(__m128i __V1, __m128i __V2) 1509{ 1510 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1511} 1512 1513/* SSE4 Multiple Packed Sums of Absolute Difference. */ 1514/// \brief Subtracts 8-bit unsigned integer values and computes the absolute 1515/// values of the differences to the corresponding bits in the destination. 1516/// Then sums of the absolute differences are returned according to the bit 1517/// fields in the immediate operand. 1518/// 1519/// \headerfile <x86intrin.h> 1520/// 1521/// \code 1522/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 1523/// \endcode 1524/// 1525/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. 1526/// 1527/// \param X 1528/// A 128-bit vector of [16 x i8]. 1529/// \param Y 1530/// A 128-bit vector of [16 x i8]. 
1531/// \param M 1532/// An 8-bit immediate operand specifying how the absolute differences are to 1533/// be calculated, according to the following algorithm: 1534/// \code 1535/// // M2 represents bit 2 of the immediate operand 1536/// // M10 represents bits [1:0] of the immediate operand 1537/// i = M2 * 4 1538/// j = M10 * 4 1539/// for (k = 0; k < 8; k = k + 1) { 1540/// d0 = abs(X[i + k + 0] - Y[j + 0]) 1541/// d1 = abs(X[i + k + 1] - Y[j + 1]) 1542/// d2 = abs(X[i + k + 2] - Y[j + 2]) 1543/// d3 = abs(X[i + k + 3] - Y[j + 3]) 1544/// r[k] = d0 + d1 + d2 + d3 1545/// } 1546/// \endcode 1547/// \returns A 128-bit integer vector containing the sums of the sets of 1548/// absolute differences between both operands. 1549#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ 1550 (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1551 (__v16qi)(__m128i)(Y), (M)); }) 1552 1553/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit 1554/// vector of [8 x u16] and returns it and along with its index. 1555/// 1556/// \headerfile <x86intrin.h> 1557/// 1558/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> 1559/// instruction. 1560/// 1561/// \param __V 1562/// A 128-bit vector of [8 x u16]. 1563/// \returns A 128-bit value where bits [15:0] contain the minimum value found 1564/// in parameter \a __V, bits [18:16] contain the index of the minimum value 1565/// and the remaining bits are set to 0. 1566static __inline__ __m128i __DEFAULT_FN_ATTRS 1567_mm_minpos_epu16(__m128i __V) 1568{ 1569 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); 1570} 1571 1572/* Handle the sse4.2 definitions here. */ 1573 1574/* These definitions are normally in nmmintrin.h, but gcc puts them in here 1575 so we'll do the same. */ 1576 1577#undef __DEFAULT_FN_ATTRS 1578#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1579 1580/* These specify the type of data that we're comparing. 
*/ 1581#define _SIDD_UBYTE_OPS 0x00 1582#define _SIDD_UWORD_OPS 0x01 1583#define _SIDD_SBYTE_OPS 0x02 1584#define _SIDD_SWORD_OPS 0x03 1585 1586/* These specify the type of comparison operation. */ 1587#define _SIDD_CMP_EQUAL_ANY 0x00 1588#define _SIDD_CMP_RANGES 0x04 1589#define _SIDD_CMP_EQUAL_EACH 0x08 1590#define _SIDD_CMP_EQUAL_ORDERED 0x0c 1591 1592/* These macros specify the polarity of the operation. */ 1593#define _SIDD_POSITIVE_POLARITY 0x00 1594#define _SIDD_NEGATIVE_POLARITY 0x10 1595#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1596#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1597 1598/* These macros are used in _mm_cmpXstri() to specify the return. */ 1599#define _SIDD_LEAST_SIGNIFICANT 0x00 1600#define _SIDD_MOST_SIGNIFICANT 0x40 1601 1602/* These macros are used in _mm_cmpXstri() to specify the return. */ 1603#define _SIDD_BIT_MASK 0x00 1604#define _SIDD_UNIT_MASK 0x40 1605 1606/* SSE4.2 Packed Comparison Intrinsics. */ 1607/// \brief Uses the immediate operand \a M to perform a comparison of string 1608/// data with implicitly defined lengths that is contained in source operands 1609/// \a A and \a B. Returns a 128-bit integer vector representing the result 1610/// mask of the comparison. 1611/// 1612/// \headerfile <x86intrin.h> 1613/// 1614/// \code 1615/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1616/// \endcode 1617/// 1618/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1619/// instruction. 1620/// 1621/// \param A 1622/// A 128-bit integer vector containing one of the source operands to be 1623/// compared. 1624/// \param B 1625/// A 128-bit integer vector containing one of the source operands to be 1626/// compared. 1627/// \param M 1628/// An 8-bit immediate operand specifying whether the characters are bytes or 1629/// words, the type of comparison to perform, and the format of the return 1630/// value. \n 1631/// Bits [1:0]: Determine source data format. 
\n 1632/// 00: 16 unsigned bytes \n 1633/// 01: 8 unsigned words \n 1634/// 10: 16 signed bytes \n 1635/// 11: 8 signed words \n 1636/// Bits [3:2]: Determine comparison type and aggregation method. \n 1637/// 00: Subset: Each character in \a B is compared for equality with all 1638/// the characters in \a A. \n 1639/// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1640/// basis is greater than or equal for even-indexed elements in \a A, 1641/// and less than or equal for odd-indexed elements in \a A. \n 1642/// 10: Match: Compare each pair of corresponding characters in \a A and 1643/// \a B for equality. \n 1644/// 11: Substring: Search \a B for substring matches of \a A. \n 1645/// Bits [5:4]: Determine whether to perform a one's complement on the bit 1646/// mask of the comparison results. \n 1647/// 00: No effect. \n 1648/// 01: Negate the bit mask. \n 1649/// 10: No effect. \n 1650/// 11: Negate the bit mask only for bits with an index less than or equal 1651/// to the size of \a A or \a B. \n 1652/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 1653/// bytes. \n 1654/// 0: The result is zero-extended to 16 bytes. \n 1655/// 1: The result is expanded to 16 bytes (this expansion is performed by 1656/// repeating each bit 8 or 16 times). 1657/// \returns Returns a 128-bit integer vector representing the result mask of 1658/// the comparison. 1659#define _mm_cmpistrm(A, B, M) \ 1660 (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ 1661 (__v16qi)(__m128i)(B), (int)(M)) 1662 1663/// \brief Uses the immediate operand \a M to perform a comparison of string 1664/// data with implicitly defined lengths that is contained in source operands 1665/// \a A and \a B. Returns an integer representing the result index of the 1666/// comparison. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
2297#define _mm_cmpestrs(A, LA, B, LB, M) \ 2298 (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ 2299 (__v16qi)(__m128i)(B), (int)(LB), \ 2300 (int)(M)) 2301 2302/// \brief Uses the immediate operand \a M to perform a comparison of string 2303/// data with explicitly defined lengths that is contained in source operands 2304/// \a A and \a B. Returns 1 if the length of the string in \a B is less than 2305/// the maximum, otherwise, returns 0. 2306/// 2307/// \headerfile <x86intrin.h> 2308/// 2309/// \code 2310/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); 2311/// \endcode 2312/// 2313/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction. 2314/// 2315/// \param A 2316/// A 128-bit integer vector containing one of the source operands to be 2317/// compared. 2318/// \param LA 2319/// An integer that specifies the length of the string in \a A. 2320/// \param B 2321/// A 128-bit integer vector containing one of the source operands to be 2322/// compared. 2323/// \param LB 2324/// An integer that specifies the length of the string in \a B. 2325/// \param M 2326/// An 8-bit immediate operand specifying whether the characters are bytes or 2327/// words and the type of comparison to perform. \n 2328/// Bits [1:0]: Determine source data format. \n 2329/// 00: 16 unsigned bytes \n 2330/// 01: 8 unsigned words \n 2331/// 10: 16 signed bytes \n 2332/// 11: 8 signed words \n 2333/// Bits [3:2]: Determine comparison type and aggregation method. \n 2334/// 00: Subset: Each character in \a B is compared for equality with all 2335/// the characters in \a A. \n 2336/// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2337/// basis is greater than or equal for even-indexed elements in \a A, 2338/// and less than or equal for odd-indexed elements in \a A. \n 2339/// 10: Match: Compare each pair of corresponding characters in \a A and 2340/// \a B for equality. 
\n 2341/// 11: Substring: Search \a B for substring matches of \a A. \n 2342/// Bits [5:4]: Determine whether to perform a one's complement on the bit 2343/// mask of the comparison results. \n 2344/// 00: No effect. \n 2345/// 01: Negate the bit mask. \n 2346/// 10: No effect. \n 2347/// 11: Negate the bit mask only for bits with an index less than or equal 2348/// to the size of \a A or \a B. 2349/// \returns Returns 1 if the length of the string in \a B is less than the 2350/// maximum, otherwise, returns 0. 2351#define _mm_cmpestrz(A, LA, B, LB, M) \ 2352 (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ 2353 (__v16qi)(__m128i)(B), (int)(LB), \ 2354 (int)(M)) 2355 2356/* SSE4.2 Compare Packed Data -- Greater Than. */ 2357/// \brief Compares each of the corresponding 64-bit values of the 128-bit 2358/// integer vectors to determine if the values in the first operand are 2359/// greater than those in the second operand. 2360/// 2361/// \headerfile <x86intrin.h> 2362/// 2363/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. 2364/// 2365/// \param __V1 2366/// A 128-bit integer vector. 2367/// \param __V2 2368/// A 128-bit integer vector. 2369/// \returns A 128-bit integer vector containing the comparison results. 2370static __inline__ __m128i __DEFAULT_FN_ATTRS 2371_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) 2372{ 2373 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2374} 2375 2376/* SSE4.2 Accumulate CRC32. */ 2377/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2378/// unsigned char operand. 2379/// 2380/// \headerfile <x86intrin.h> 2381/// 2382/// This intrinsic corresponds to the <c> CRC32B </c> instruction. 2383/// 2384/// \param __C 2385/// An unsigned integer operand to add to the CRC-32C checksum of operand 2386/// \a __D. 2387/// \param __D 2388/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. 
2389/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2390/// operand \a __D. 2391static __inline__ unsigned int __DEFAULT_FN_ATTRS 2392_mm_crc32_u8(unsigned int __C, unsigned char __D) 2393{ 2394 return __builtin_ia32_crc32qi(__C, __D); 2395} 2396 2397/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2398/// unsigned short operand. 2399/// 2400/// \headerfile <x86intrin.h> 2401/// 2402/// This intrinsic corresponds to the <c> CRC32W </c> instruction. 2403/// 2404/// \param __C 2405/// An unsigned integer operand to add to the CRC-32C checksum of operand 2406/// \a __D. 2407/// \param __D 2408/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. 2409/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2410/// operand \a __D. 2411static __inline__ unsigned int __DEFAULT_FN_ATTRS 2412_mm_crc32_u16(unsigned int __C, unsigned short __D) 2413{ 2414 return __builtin_ia32_crc32hi(__C, __D); 2415} 2416 2417/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of 2418/// the second unsigned integer operand. 2419/// 2420/// \headerfile <x86intrin.h> 2421/// 2422/// This intrinsic corresponds to the <c> CRC32L </c> instruction. 2423/// 2424/// \param __C 2425/// An unsigned integer operand to add to the CRC-32C checksum of operand 2426/// \a __D. 2427/// \param __D 2428/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. 2429/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2430/// operand \a __D. 2431static __inline__ unsigned int __DEFAULT_FN_ATTRS 2432_mm_crc32_u32(unsigned int __C, unsigned int __D) 2433{ 2434 return __builtin_ia32_crc32si(__C, __D); 2435} 2436 2437#ifdef __x86_64__ 2438/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2439/// unsigned 64-bit integer operand. 
2440/// 2441/// \headerfile <x86intrin.h> 2442/// 2443/// This intrinsic corresponds to the <c> CRC32Q </c> instruction. 2444/// 2445/// \param __C 2446/// An unsigned integer operand to add to the CRC-32C checksum of operand 2447/// \a __D. 2448/// \param __D 2449/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. 2450/// \returns The result of adding operand \a __C to the CRC-32C checksum of 2451/// operand \a __D. 2452static __inline__ unsigned long long __DEFAULT_FN_ATTRS 2453_mm_crc32_u64(unsigned long long __C, unsigned long long __D) 2454{ 2455 return __builtin_ia32_crc32di(__C, __D); 2456} 2457#endif /* __x86_64__ */ 2458 2459#undef __DEFAULT_FN_ATTRS 2460 2461#ifdef __POPCNT__ 2462#include <popcntintrin.h> 2463#endif 2464 2465#endif /* _SMMINTRIN_H */ 2466