avxintrin.h revision 7fc3702694996d7d373e3280812a4172cf451aac
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 
20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __AVXINTRIN_H 25#define __AVXINTRIN_H 26 27#ifndef __AVX__ 28#error "AVX instruction set not enabled" 29#else 30 31typedef double __v4df __attribute__ ((__vector_size__ (32))); 32typedef float __v8sf __attribute__ ((__vector_size__ (32))); 33typedef long long __v4di __attribute__ ((__vector_size__ (32))); 34typedef int __v8si __attribute__ ((__vector_size__ (32))); 35typedef short __v16hi __attribute__ ((__vector_size__ (32))); 36typedef char __v32qi __attribute__ ((__vector_size__ (32))); 37 38typedef float __m256 __attribute__ ((__vector_size__ (32))); 39typedef double __m256d __attribute__((__vector_size__(32))); 40typedef long long __m256i __attribute__((__vector_size__(32))); 41 42/* Arithmetic */ 43static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 44_mm256_add_pd(__m256d a, __m256d b) 45{ 46 return a+b; 47} 48 49static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 50_mm256_add_ps(__m256 a, __m256 b) 51{ 52 return a+b; 53} 54 55static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 56_mm256_sub_pd(__m256d a, __m256d b) 57{ 58 return a-b; 59} 60 61static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 62_mm256_sub_ps(__m256 a, __m256 b) 63{ 64 return a-b; 65} 66 67static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 68_mm256_addsub_pd(__m256d a, __m256d b) 69{ 70 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b); 71} 72 73static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 74_mm256_addsub_ps(__m256 a, __m256 b) 75{ 76 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b); 77} 78 79static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 80_mm256_div_pd(__m256d a, __m256d b) 81{ 82 return a / b; 83} 84 85static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 86_mm256_div_ps(__m256 a, __m256 b) 
87{ 88 return a / b; 89} 90 91static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 92_mm256_max_pd(__m256d a, __m256d b) 93{ 94 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, (__v4df)b); 95} 96 97static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 98_mm256_max_ps(__m256 a, __m256 b) 99{ 100 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b); 101} 102 103static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 104_mm256_min_pd(__m256d a, __m256d b) 105{ 106 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b); 107} 108 109static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 110_mm256_min_ps(__m256 a, __m256 b) 111{ 112 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b); 113} 114 115static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 116_mm256_mul_pd(__m256d a, __m256d b) 117{ 118 return a * b; 119} 120 121static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 122_mm256_mul_ps(__m256 a, __m256 b) 123{ 124 return a * b; 125} 126 127static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 128_mm256_sqrt_pd(__m256d a) 129{ 130 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a); 131} 132 133static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 134_mm256_sqrt_ps(__m256 a) 135{ 136 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a); 137} 138 139static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 140_mm256_rsqrt_ps(__m256 a) 141{ 142 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a); 143} 144 145static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 146_mm256_rcp_ps(__m256 a) 147{ 148 return (__m256)__builtin_ia32_rcpps256((__v8sf)a); 149} 150 151static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 152_mm256_round_pd(__m256d v, const int m) 153{ 154 return (__m256d)__builtin_ia32_roundpd256((__v4df)v, m); 155} 156 157static __inline __m256 
__attribute__((__always_inline__, __nodebug__)) 158_mm256_round_ps(__m256 v, const int m) 159{ 160 return (__m256)__builtin_ia32_roundps256((__v8sf)v, m); 161} 162 163#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) 164#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) 165#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 166#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 167 168/* Logical */ 169static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 170_mm256_and_pd(__m256d a, __m256d b) 171{ 172 return (__m256d)((__v4di)a & (__v4di)b); 173} 174 175static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 176_mm256_and_ps(__m256 a, __m256 b) 177{ 178 return (__m256)((__v8si)a & (__v8si)b); 179} 180 181static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 182_mm256_andnot_pd(__m256d a, __m256d b) 183{ 184 return (__m256d)(~(__v4di)a & (__v4di)b); 185} 186 187static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 188_mm256_andnot_ps(__m256 a, __m256 b) 189{ 190 return (__m256)(~(__v8si)a & (__v8si)b); 191} 192 193static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 194_mm256_or_pd(__m256d a, __m256d b) 195{ 196 return (__m256d)((__v4di)a | (__v4di)b); 197} 198 199static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 200_mm256_or_ps(__m256 a, __m256 b) 201{ 202 return (__m256)((__v8si)a | (__v8si)b); 203} 204 205static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 206_mm256_xor_pd(__m256d a, __m256d b) 207{ 208 return (__m256d)((__v4di)a ^ (__v4di)b); 209} 210 211static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 212_mm256_xor_ps(__m256 a, __m256 b) 213{ 214 return (__m256)((__v8si)a ^ (__v8si)b); 215} 216 217/* Horizontal arithmetic */ 218static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 219_mm256_hadd_pd(__m256d a, __m256d b) 220{ 221 return 
(__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b); 222} 223 224static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 225_mm256_hadd_ps(__m256 a, __m256 b) 226{ 227 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b); 228} 229 230static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 231_mm256_hsub_pd(__m256d a, __m256d b) 232{ 233 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b); 234} 235 236static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 237_mm256_hsub_ps(__m256 a, __m256 b) 238{ 239 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b); 240} 241 242/* Vector permutations */ 243static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 244_mm_permutevar_pd(__m128d a, __m128i c) 245{ 246 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c); 247} 248 249static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 250_mm256_permutevar_pd(__m256d a, __m256i c) 251{ 252 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c); 253} 254 255static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 256_mm_permutevar_ps(__m128 a, __m128i c) 257{ 258 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c); 259} 260 261static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 262_mm256_permutevar_ps(__m256 a, __m256i c) 263{ 264 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, 265 (__v8si)c); 266} 267 268static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 269_mm_permute_pd(__m128d a, const int c) 270{ 271 return (__m128d)__builtin_ia32_vpermilpd((__v2df)a, c); 272} 273 274static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 275_mm256_permute_pd(__m256d a, const int c) 276{ 277 return (__m256d)__builtin_ia32_vpermilpd256((__v4df)a, c); 278} 279 280static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 281_mm_permute_ps(__m128 a, const int 
c) 282{ 283 return (__m128)__builtin_ia32_vpermilps((__v4sf)a, c); 284} 285 286static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 287_mm256_permute_ps(__m256 a, const int c) 288{ 289 return (__m256)__builtin_ia32_vpermilps256((__v8sf)a, c); 290} 291 292static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 293_mm256_permute2f128_pd(__m256d a, __m256d b, const int c) 294{ 295 return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)a, (__v4df)b, c); 296} 297 298static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 299_mm256_permute2f128_ps(__m256 a, __m256 b, const int c) 300{ 301 return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)a, (__v8sf)b, c); 302} 303 304static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 305_mm256_permute2f128_si256(__m256i a, __m256i b, const int c) 306{ 307 return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)a, (__v8si)b, c); 308} 309 310/* Vector Blend */ 311static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 312_mm256_blend_pd(__m256d a, __m256d b, const int c) 313{ 314 return (__m256d)__builtin_ia32_blendpd256((__v4df)a, (__v4df)b, c); 315} 316 317static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 318_mm256_blend_ps(__m256 a, __m256 b, const int c) 319{ 320 return (__m256)__builtin_ia32_blendps256((__v8sf)a, (__v8sf)b, c); 321} 322 323static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 324_mm256_blendv_pd(__m256d a, __m256d b, __m256d c) 325{ 326 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c); 327} 328 329static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 330_mm256_blendv_ps(__m256 a, __m256 b, __m256 c) 331{ 332 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c); 333} 334 335/* Vector Dot Product */ 336static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 337_mm256_dp_ps(__m256 a, __m256 b, const int c) 338{ 
339 return (__m256)__builtin_ia32_dpps256((__v8sf)a, (__v8sf)b, c); 340} 341 342/* Vector shuffle */ 343#define _mm256_shuffle_ps(a, b, mask) \ 344 (__builtin_shufflevector((__v8sf)(a), (__v8sf)(b), \ 345 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 346 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8 \ 347 (mask) & 0x3 + 4, (((mask) & 0xc) >> 2) + 4, \ 348 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12)) 349 350#define _mm256_shuffle_pd(a, b, mask) \ 351 (__builtin_shufflevector((__v4df)(a), (__v4df)(b), \ 352 (mask) & 0x1, \ 353 (((mask) & 0x2) >> 1) + 4, \ 354 (((mask) & 0x4) >> 2) + 2, \ 355 (((mask) & 0x8) >> 3) + 6)) 356 357/* Compare */ 358#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ 359#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ 360#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ 361#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ 362#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ 363#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ 364#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ 365#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ 366#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ 367#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ 368#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ 369#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ 370#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ 371#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ 372#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ 373#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ 374#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ 375#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ 376#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ 377#define _CMP_UNORD_S 0x13 
/* Unordered (signaling) */ 378#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ 379#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ 380#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ 381#define _CMP_ORD_S 0x17 /* Ordered (signaling) */ 382#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ 383#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ 384#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ 385#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ 386#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ 387#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ 388#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ 389#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ 390 391static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 392_mm_cmp_pd(__m128d a, __m128d b, const int c) 393{ 394 return (__m128d)__builtin_ia32_cmppd((__v2df)a, (__v2df)b, c); 395} 396 397static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 398_mm_cmp_ps(__m128 a, __m128 b, const int c) 399{ 400 return (__m128)__builtin_ia32_cmpps((__v4sf)a, (__v4sf)b, c); 401} 402 403static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 404_mm256_cmp_pd(__m256d a, __m256d b, const int c) 405{ 406 return (__m256d)__builtin_ia32_cmppd256((__v4df)a, (__v4df)b, c); 407} 408 409static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 410_mm256_cmp_ps(__m256 a, __m256 b, const int c) 411{ 412 return (__m256)__builtin_ia32_cmpps256((__v8sf)a, (__v8sf)b, c); 413} 414 415static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 416_mm_cmp_sd(__m128d a, __m128d b, const int c) 417{ 418 return (__m128d)__builtin_ia32_cmpsd((__v2df)a, (__v2df)b, c); 419} 420 421static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 422_mm_cmp_ss(__m128 a, 
__m128 b, const int c) 423{ 424 return (__m128)__builtin_ia32_cmpss((__v4sf)a, (__v4sf)b, c); 425} 426 427/* Vector extract */ 428static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 429_mm256_extractf128_pd(__m256d a, const int o) 430{ 431 return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)a, o); 432} 433 434static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 435_mm256_extractf128_ps(__m256 a, const int o) 436{ 437 return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)a, o); 438} 439 440static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 441_mm256_extractf128_si256(__m256i a, const int o) 442{ 443 return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)a, o); 444} 445 446static __inline int __attribute__((__always_inline__, __nodebug__)) 447_mm256_extract_epi32(__m256i a, int const imm) 448{ 449 __v8si b = (__v8si)a; 450 return b[imm]; 451} 452 453static __inline int __attribute__((__always_inline__, __nodebug__)) 454_mm256_extract_epi16(__m256i a, int const imm) 455{ 456 __v16hi b = (__v16hi)a; 457 return b[imm]; 458} 459 460static __inline int __attribute__((__always_inline__, __nodebug__)) 461_mm256_extract_epi8(__m256i a, int const imm) 462{ 463 __v32qi b = (__v32qi)a; 464 return b[imm]; 465} 466 467#ifdef __x86_64__ 468static __inline long long __attribute__((__always_inline__, __nodebug__)) 469_mm256_extract_epi64(__m256i a, const int imm) 470{ 471 __v4di b = (__v4di)a; 472 return b[imm]; 473} 474#endif 475 476/* Vector insert */ 477static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 478_mm256_insertf128_pd(__m256d a, __m128d b, const int o) 479{ 480 return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)a, (__v2df)b, o); 481} 482 483static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 484_mm256_insertf128_ps(__m256 a, __m128 b, const int o) 485{ 486 return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)a, (__v4sf)b, o); 487} 488 489static __inline 
__m256i __attribute__((__always_inline__, __nodebug__)) 490_mm256_insertf128_si256(__m256i a, __m128i b, const int o) 491{ 492 return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)a, (__v4si)b, o); 493} 494 495static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 496_mm256_insert_epi32(__m256i a, int b, int const imm) 497{ 498 __v8si c = (__v8si)a; 499 c[imm & 7] = b; 500 return (__m256i)c; 501} 502 503static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 504_mm256_insert_epi16(__m256i a, int b, int const imm) 505{ 506 __v16hi c = (__v16hi)a; 507 c[imm & 15] = b; 508 return (__m256i)c; 509} 510 511static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 512_mm256_insert_epi8(__m256i a, int b, int const imm) 513{ 514 __v32qi c = (__v32qi)a; 515 c[imm & 31] = b; 516 return (__m256i)c; 517} 518 519#ifdef __x86_64__ 520static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 521_mm256_insert_epi64(__m256i a, int b, int const imm) 522{ 523 __v4di c = (__v4di)a; 524 c[imm & 3] = b; 525 return (__m256i)c; 526} 527#endif 528 529/* Conversion */ 530static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 531_mm256_cvtepi32_pd(__m128i a) 532{ 533 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a); 534} 535 536static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 537_mm256_cvtepi32_ps(__m256i a) 538{ 539 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a); 540} 541 542static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 543_mm256_cvtpd_ps(__m256d a) 544{ 545 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a); 546} 547 548static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 549_mm256_cvtps_epi32(__m256 a) 550{ 551 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a); 552} 553 554static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 555_mm256_cvtps_pd(__m128 a) 556{ 557 return 
(__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a); 558} 559 560static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 561_mm256_cvttpd_epi32(__m256d a) 562{ 563 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a); 564} 565 566static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 567_mm256_cvtpd_epi32(__m256d a) 568{ 569 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a); 570} 571 572static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 573_mm256_cvttps_epi32(__m256 a) 574{ 575 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a); 576} 577 578/* Vector replicate */ 579static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 580_mm256_movehdup_ps(__m256 a) 581{ 582 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7); 583} 584 585static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 586_mm256_moveldup_ps(__m256 a) 587{ 588 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6); 589} 590 591static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 592_mm256_movedup_pd(__m256d a) 593{ 594 return __builtin_shufflevector(a, a, 0, 0, 2, 2); 595} 596 597/* Unpack and Interleave */ 598static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 599_mm256_unpackhi_pd(__m256d a, __m256d b) 600{ 601 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2); 602} 603 604static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 605_mm256_unpacklo_pd(__m256d a, __m256d b) 606{ 607 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2); 608} 609 610static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 611_mm256_unpackhi_ps(__m256 a, __m256 b) 612{ 613 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 614} 615 616static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 617_mm256_unpacklo_ps(__m256 a, __m256 b) 618{ 619 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 
620} 621 622/* Bit Test */ 623static __inline int __attribute__((__always_inline__, __nodebug__)) 624_mm_testz_pd(__m128d a, __m128d b) 625{ 626 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b); 627} 628 629static __inline int __attribute__((__always_inline__, __nodebug__)) 630_mm_testc_pd(__m128d a, __m128d b) 631{ 632 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b); 633} 634 635static __inline int __attribute__((__always_inline__, __nodebug__)) 636_mm_testnzc_pd(__m128d a, __m128d b) 637{ 638 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b); 639} 640 641static __inline int __attribute__((__always_inline__, __nodebug__)) 642_mm_testz_ps(__m128 a, __m128 b) 643{ 644 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b); 645} 646 647static __inline int __attribute__((__always_inline__, __nodebug__)) 648_mm_testc_ps(__m128 a, __m128 b) 649{ 650 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b); 651} 652 653static __inline int __attribute__((__always_inline__, __nodebug__)) 654_mm_testnzc_ps(__m128 a, __m128 b) 655{ 656 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b); 657} 658 659static __inline int __attribute__((__always_inline__, __nodebug__)) 660_mm256_testz_pd(__m256d a, __m256d b) 661{ 662 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b); 663} 664 665static __inline int __attribute__((__always_inline__, __nodebug__)) 666_mm256_testc_pd(__m256d a, __m256d b) 667{ 668 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b); 669} 670 671static __inline int __attribute__((__always_inline__, __nodebug__)) 672_mm256_testnzc_pd(__m256d a, __m256d b) 673{ 674 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b); 675} 676 677static __inline int __attribute__((__always_inline__, __nodebug__)) 678_mm256_testz_ps(__m256 a, __m256 b) 679{ 680 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b); 681} 682 683static __inline int __attribute__((__always_inline__, __nodebug__)) 684_mm256_testc_ps(__m256 a, __m256 b) 685{ 686 return 
__builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b); 687} 688 689static __inline int __attribute__((__always_inline__, __nodebug__)) 690_mm256_testnzc_ps(__m256 a, __m256 b) 691{ 692 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b); 693} 694 695static __inline int __attribute__((__always_inline__, __nodebug__)) 696_mm256_testz_si256(__m256i a, __m256i b) 697{ 698 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b); 699} 700 701static __inline int __attribute__((__always_inline__, __nodebug__)) 702_mm256_testc_si256(__m256i a, __m256i b) 703{ 704 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b); 705} 706 707static __inline int __attribute__((__always_inline__, __nodebug__)) 708_mm256_testnzc_si256(__m256i a, __m256i b) 709{ 710 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b); 711} 712 713/* Vector extract sign mask */ 714static __inline int __attribute__((__always_inline__, __nodebug__)) 715_mm256_movemask_pd(__m256d a) 716{ 717 return __builtin_ia32_movmskpd256((__v4df)a); 718} 719 720static __inline int __attribute__((__always_inline__, __nodebug__)) 721_mm256_movemask_ps(__m256 a) 722{ 723 return __builtin_ia32_movmskps256((__v8sf)a); 724} 725 726/* Vector zero */ 727static __inline void __attribute__((__always_inline__, __nodebug__)) 728_mm256_zeroall(void) 729{ 730 __builtin_ia32_vzeroall(); 731} 732 733static __inline void __attribute__((__always_inline__, __nodebug__)) 734_mm256_zeroupper(void) 735{ 736 __builtin_ia32_vzeroupper(); 737} 738 739/* Vector load with broadcast */ 740static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 741_mm_broadcast_ss(float const *a) 742{ 743 return (__m128)__builtin_ia32_vbroadcastss(a); 744} 745 746static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 747_mm256_broadcast_sd(double const *a) 748{ 749 return (__m256d)__builtin_ia32_vbroadcastsd256(a); 750} 751 752static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 753_mm256_broadcast_ss(float const *a) 
754{ 755 return (__m256)__builtin_ia32_vbroadcastss256(a); 756} 757 758static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 759_mm256_broadcast_pd(__m128d const *a) 760{ 761 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a); 762} 763 764static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 765_mm256_broadcast_ps(__m128 const *a) 766{ 767 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a); 768} 769 770/* SIMD load ops */ 771static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 772_mm256_load_pd(double const *p) 773{ 774 return *(__m256d *)p; 775} 776 777static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 778_mm256_load_ps(float const *p) 779{ 780 return *(__m256 *)p; 781} 782 783static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 784_mm256_loadu_pd(double const *p) 785{ 786 return (__m256d)__builtin_ia32_loadupd256(p); 787} 788 789static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 790_mm256_loadu_ps(float const *p) 791{ 792 return (__m256)__builtin_ia32_loadups256(p); 793} 794 795static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 796_mm256_load_si256(__m256i const *p) 797{ 798 return *p; 799} 800 801static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 802_mm256_loadu_si256(__m256i const *p) 803{ 804 return (__m256i)__builtin_ia32_loaddqu256((char const *)p); 805} 806 807static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 808_mm256_lddqu_si256(__m256i const *p) 809{ 810 return (__m256i)__builtin_ia32_lddqu256((char const *)p); 811} 812 813/* SIMD store ops */ 814static __inline void __attribute__((__always_inline__, __nodebug__)) 815_mm256_store_pd(double *p, __m256d a) 816{ 817 *(__m256d *)p = a; 818} 819 820static __inline void __attribute__((__always_inline__, __nodebug__)) 821_mm256_store_ps(float *p, __m256 a) 822{ 823 *(__m256 *)p = a; 824} 825 826static __inline void 
__attribute__((__always_inline__, __nodebug__)) 827_mm256_storeu_pd(double *p, __m256d a) 828{ 829 __builtin_ia32_storeupd256(p, (__v4df)a); 830} 831 832static __inline void __attribute__((__always_inline__, __nodebug__)) 833_mm256_storeu_ps(float *p, __m256 a) 834{ 835 __builtin_ia32_storeups256(p, (__v8sf)a); 836} 837 838static __inline void __attribute__((__always_inline__, __nodebug__)) 839_mm256_store_si256(__m256i *p, __m256i a) 840{ 841 *p = a; 842} 843 844static __inline void __attribute__((__always_inline__, __nodebug__)) 845_mm256_storeu_si256(__m256i *p, __m256i a) 846{ 847 __builtin_ia32_storedqu256((char *)p, (__v32qi)a); 848} 849 850/* Conditional load ops */ 851static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 852_mm_maskload_pd(double const *p, __m128d m) 853{ 854 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m); 855} 856 857static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 858_mm256_maskload_pd(double const *p, __m256d m) 859{ 860 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m); 861} 862 863static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 864_mm_maskload_ps(float const *p, __m128 m) 865{ 866 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m); 867} 868 869static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 870_mm256_maskload_ps(float const *p, __m256 m) 871{ 872 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m); 873} 874 875/* Conditional store ops */ 876static __inline void __attribute__((__always_inline__, __nodebug__)) 877_mm256_maskstore_ps(float *p, __m256 m, __m256 a) 878{ 879 __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a); 880} 881 882static __inline void __attribute__((__always_inline__, __nodebug__)) 883_mm_maskstore_pd(double *p, __m128d m, __m128d a) 884{ 885 __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a); 886} 887 888static 
__inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  /* Conditional 256-bit store: elements of a are written to p under
     control of the vector mask m (via the maskstorepd256 builtin). */
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

/* 128-bit masked store of single-precision elements.
   NOTE(review): which bit of each mask element selects a lane is defined
   by the builtin (presumably the element's sign bit) -- confirm against
   the Intel Intrinsics Guide. */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

/* Cacheability support ops */

/* Streaming 256-bit integer store (movntdq builtin). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

/* Streaming store of four doubles (movntpd builtin). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

/* Streaming store of eight floats (movntps builtin). */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

/* Create vectors */

/* _mm256_set_*: following the Intel convention, the FIRST argument ends
   up in the HIGHEST-indexed element -- hence the reversed initializer
   lists in the bodies below. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  /* Parameter wNN names the element index it lands in. */
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  /* Parameter bNN names the byte index it lands in. */
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order */

/* _mm256_setr_*: the FIRST argument becomes element 0. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  /* Here wNN lands in element (15 - NN): first argument is element 0. */
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  /* Here bNN lands in byte (31 - NN): first argument is byte 0. */
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */

/* _mm256_set1_*: broadcast one scalar into every element. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types */

/* The six casts below reinterpret the 256 bits under a different element
   type; no value conversion is performed. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

/* 256 -> 128 casts: extract the low half (elements 0..N/2-1). */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

/* 128 -> 256 casts: the input becomes the low half.
   NOTE(review): this implementation zeroes the upper 128 bits by
   shuffling in elements of a zero vector; the Intel spec leaves the
   upper bits undefined -- confirm before relying on the zeroing. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

#endif /* __AVX__ */

#endif /* __AVXINTRIN_H */