avxintrin.h revision 347208968c303a9c11fe29012f6dc49680465182
/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));

typedef float __m256 __attribute__ ((__vector_size__ (32)));
typedef double __m256d __attribute__((__vector_size__(32)));
typedef long long __m256i __attribute__((__vector_size__(32)));

/* Arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_add_pd(__m256d a, __m256d b)
{
  return a+b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_add_ps(__m256 a, __m256 b)
{
  return a+b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sub_pd(__m256d a, __m256d b)
{
  return a-b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sub_ps(__m256 a, __m256 b)
{
  return a-b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_addsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_div_pd(__m256d a, __m256d b)
{
  return a / b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_div_ps(__m256 a, __m256 b)
{
  return a / b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_max_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_max_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_min_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_min_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_mul_pd(__m256d a, __m256d b)
{
  return a * b;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_mul_ps(__m256 a, __m256 b)
{
  return a * b;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_pd(__m256d a)
{
  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_sqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_sqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rsqrt_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_rcp_ps(__m256 a)
{
  return (__m256)__builtin_ia32_rcpps256((__v8sf)a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_round_pd(__m256d v, const int m)
{
  return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_round_ps(__m256 v, const int m)
{
  return (__m256)__builtin_ia32_roundps256((__v8sf)v, m);
}

#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)

/* Logical */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_and_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_and_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_pd(__m256d a, __m256d b)
{
  return (__m256d)(~(__v4di)a & (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_andnot_ps(__m256 a, __m256 b)
{
  return (__m256)(~(__v8si)a & (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_or_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a | (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_or_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a | (__v8si)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_xor_pd(__m256d a, __m256d b)
{
  return (__m256d)((__v4di)a ^ (__v4di)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_xor_ps(__m256 a, __m256 b)
{
  return (__m256)((__v8si)a ^ (__v8si)b);
}

/* Horizontal arithmetic */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hadd_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_pd(__m256d a, __m256d b)
{
  return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_hsub_ps(__m256 a, __m256 b)
{
  return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b);
}

/* Vector permutations */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d a, __m128i c)
{
  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_pd(__m256d a, __m256i c)
{
  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 a, __m128i c)
{
  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permutevar_ps(__m256 a, __m256i c)
{
  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, (__v8si)c);
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permute_pd(__m128d a, const int c)
{
  return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute_pd(__m256d a, const int c)
{
  return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permute_ps(__m128 a, const int c)
{
  return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute_ps(__m256 a, const int c)
{
  return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_pd(__m256d a, __m256d b, const int c)
{
  return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_ps(__m256 a, __m256 b, const int c)
{
  return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_permute2f128_si256(__m256i a, __m256i b, const int c)
{
  return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c);
}

/* Vector Blend */
#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
  __m256d __V1 = (V1); \
  __m256d __V2 = (V2); \
  (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, M); })

#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, M); })

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_pd(__m256d a, __m256d b, __m256d c)
{
  return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_blendv_ps(__m256 a, __m256 b, __m256 c)
{
  return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c);
}

/* Vector Dot Product */
#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
  __m256 __V1 = (V1); \
  __m256 __V2 = (V2); \
  (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, M); })

/* Vector shuffle */
#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \
    (mask) & 0x3, ((mask) & 0xc) >> 2, \
    (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \
    ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \
    (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); })

#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \
    (mask) & 0x1, \
    (((mask) & 0x2) >> 1) + 4, \
    (((mask) & 0x4) >> 2) + 2, \
    (((mask) & 0x8) >> 3) + 6); })

/* Compare */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling) */
#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling) */
#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling) */
#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling) */
#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling) */
#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling) */
#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling) */
#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling) */
#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling) */
#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling) */
#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling) */
#define _CMP_UNORD_S  0x13 /* Unordered (signaling) */
#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling) */
#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling) */
#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling) */
#define _CMP_ORD_S    0x17 /* Ordered (signaling) */
#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling) */
#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign) */
#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling) */
#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling) */
#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling) */
#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling) */
#define _CMP_TRUE_US  0x1f /* True (unordered, signaling) */

#define _mm_cmp_pd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ps(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); })

#define _mm256_cmp_pd(a, b, c) __extension__ ({ \
  __m256d __a = (a); \
  __m256d __b = (b); \
  (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); })

#define _mm256_cmp_ps(a, b, c) __extension__ ({ \
  __m256 __a = (a); \
  __m256 __b = (b); \
  (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); })

#define _mm_cmp_sd(a, b, c) __extension__ ({ \
  __m128d __a = (a); \
  __m128d __b = (b); \
  (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); })

#define _mm_cmp_ss(a, b, c) __extension__ ({ \
  __m128 __a = (a); \
  __m128 __b = (b); \
  (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })

/* Vector extract */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_pd(__m256d a, const int o)
{
  return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_ps(__m256 a, const int o)
{
  return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_extractf128_si256(__m256i a, const int o)
{
  return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi32(__m256i a, int const imm)
{
  __v8si b = (__v8si)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi16(__m256i a, int const imm)
{
  __v16hi b = (__v16hi)a;
  return b[imm];
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi8(__m256i a, int const imm)
{
  __v32qi b = (__v32qi)a;
  return b[imm];
}

#ifdef __x86_64__
static __inline long long __attribute__((__always_inline__, __nodebug__))
_mm256_extract_epi64(__m256i a, const int imm)
{
  __v4di b = (__v4di)a;
  return b[imm];
}
#endif

/* Vector insert */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_pd(__m256d a, __m128d b, const int o)
{
  return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_ps(__m256 a, __m128 b, const int o)
{
  return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insertf128_si256(__m256i a, __m128i b, const int o)
{
  return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi32(__m256i a, int b, int const imm)
{
  __v8si c = (__v8si)a;
  c[imm & 7] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi16(__m256i a, int b, int const imm)
{
  __v16hi c = (__v16hi)a;
  c[imm & 15] = b;
  return (__m256i)c;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi8(__m256i a, int b, int const imm)
{
  __v32qi c = (__v32qi)a;
  c[imm & 31] = b;
  return (__m256i)c;
}

#ifdef __x86_64__
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_insert_epi64(__m256i a, long long b, int const imm)
{
  __v4di c = (__v4di)a;
  c[imm & 3] = b;
  return (__m256i)c;
}
#endif

/* Conversion */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_pd(__m128i a)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtepi32_ps(__m256i a)
{
  return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_ps(__m256d a)
{
  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_cvtps_pd(__m128 a)
{
  return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_cvtpd_epi32(__m256d a)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_cvttps_epi32(__m256 a)
{
  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a);
}

/* Vector replicate */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_movehdup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_moveldup_ps(__m256 a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_movedup_pd(__m256d a)
{
  return __builtin_shufflevector(a, a, 0, 0, 2, 2);
}

/* Unpack and Interleave */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_pd(__m256d a, __m256d b)
{
  return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_ps(__m256 a, __m256 b)
{
  return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
}

/* Bit Test */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_pd(__m128d a, __m128d b)
{
  return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testz_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm_testnzc_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_pd(__m256d a, __m256d b)
{
  return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_ps(__m256 a, __m256 b)
{
  return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testz_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_testnzc_si256(__m256i a, __m256i b)
{
  return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b);
}

/* Vector extract sign mask */
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

/* Cacheability support ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

/* Create vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}