avxintrin.h revision 34a1da4354959522cd1721ce9ca099cc5c743f01
1/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __IMMINTRIN_H 25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 
26#endif 27 28typedef double __v4df __attribute__ ((__vector_size__ (32))); 29typedef float __v8sf __attribute__ ((__vector_size__ (32))); 30typedef long long __v4di __attribute__ ((__vector_size__ (32))); 31typedef int __v8si __attribute__ ((__vector_size__ (32))); 32typedef short __v16hi __attribute__ ((__vector_size__ (32))); 33typedef char __v32qi __attribute__ ((__vector_size__ (32))); 34 35typedef float __m256 __attribute__ ((__vector_size__ (32))); 36typedef double __m256d __attribute__((__vector_size__(32))); 37typedef long long __m256i __attribute__((__vector_size__(32))); 38 39/* Arithmetic */ 40static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 41_mm256_add_pd(__m256d a, __m256d b) 42{ 43 return a+b; 44} 45 46static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 47_mm256_add_ps(__m256 a, __m256 b) 48{ 49 return a+b; 50} 51 52static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 53_mm256_sub_pd(__m256d a, __m256d b) 54{ 55 return a-b; 56} 57 58static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 59_mm256_sub_ps(__m256 a, __m256 b) 60{ 61 return a-b; 62} 63 64static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 65_mm256_addsub_pd(__m256d a, __m256d b) 66{ 67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b); 68} 69 70static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 71_mm256_addsub_ps(__m256 a, __m256 b) 72{ 73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b); 74} 75 76static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 77_mm256_div_pd(__m256d a, __m256d b) 78{ 79 return a / b; 80} 81 82static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 83_mm256_div_ps(__m256 a, __m256 b) 84{ 85 return a / b; 86} 87 88static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 89_mm256_max_pd(__m256d a, __m256d b) 90{ 91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, 
(__v4df)b); 92} 93 94static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 95_mm256_max_ps(__m256 a, __m256 b) 96{ 97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b); 98} 99 100static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 101_mm256_min_pd(__m256d a, __m256d b) 102{ 103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b); 104} 105 106static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 107_mm256_min_ps(__m256 a, __m256 b) 108{ 109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b); 110} 111 112static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 113_mm256_mul_pd(__m256d a, __m256d b) 114{ 115 return a * b; 116} 117 118static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 119_mm256_mul_ps(__m256 a, __m256 b) 120{ 121 return a * b; 122} 123 124static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 125_mm256_sqrt_pd(__m256d a) 126{ 127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a); 128} 129 130static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 131_mm256_sqrt_ps(__m256 a) 132{ 133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a); 134} 135 136static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 137_mm256_rsqrt_ps(__m256 a) 138{ 139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a); 140} 141 142static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 143_mm256_rcp_ps(__m256 a) 144{ 145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a); 146} 147 148#define _mm256_round_pd(V, M) __extension__ ({ \ 149 __m256d __V = (V); \ 150 (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); }) 151 152#define _mm256_round_ps(V, M) __extension__ ({ \ 153 __m256 __V = (V); \ 154 (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); }) 155 156#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) 157#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) 158#define 
_mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 159#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 160 161/* Logical */ 162static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 163_mm256_and_pd(__m256d a, __m256d b) 164{ 165 return (__m256d)((__v4di)a & (__v4di)b); 166} 167 168static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 169_mm256_and_ps(__m256 a, __m256 b) 170{ 171 return (__m256)((__v8si)a & (__v8si)b); 172} 173 174static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 175_mm256_andnot_pd(__m256d a, __m256d b) 176{ 177 return (__m256d)(~(__v4di)a & (__v4di)b); 178} 179 180static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 181_mm256_andnot_ps(__m256 a, __m256 b) 182{ 183 return (__m256)(~(__v8si)a & (__v8si)b); 184} 185 186static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 187_mm256_or_pd(__m256d a, __m256d b) 188{ 189 return (__m256d)((__v4di)a | (__v4di)b); 190} 191 192static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 193_mm256_or_ps(__m256 a, __m256 b) 194{ 195 return (__m256)((__v8si)a | (__v8si)b); 196} 197 198static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 199_mm256_xor_pd(__m256d a, __m256d b) 200{ 201 return (__m256d)((__v4di)a ^ (__v4di)b); 202} 203 204static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 205_mm256_xor_ps(__m256 a, __m256 b) 206{ 207 return (__m256)((__v8si)a ^ (__v8si)b); 208} 209 210/* Horizontal arithmetic */ 211static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 212_mm256_hadd_pd(__m256d a, __m256d b) 213{ 214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b); 215} 216 217static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 218_mm256_hadd_ps(__m256 a, __m256 b) 219{ 220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b); 221} 222 223static __inline __m256d 
__attribute__((__always_inline__, __nodebug__)) 224_mm256_hsub_pd(__m256d a, __m256d b) 225{ 226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b); 227} 228 229static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 230_mm256_hsub_ps(__m256 a, __m256 b) 231{ 232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b); 233} 234 235/* Vector permutations */ 236static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 237_mm_permutevar_pd(__m128d a, __m128i c) 238{ 239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c); 240} 241 242static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 243_mm256_permutevar_pd(__m256d a, __m256i c) 244{ 245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c); 246} 247 248static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 249_mm_permutevar_ps(__m128 a, __m128i c) 250{ 251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c); 252} 253 254static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 255_mm256_permutevar_ps(__m256 a, __m256i c) 256{ 257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, 258 (__v8si)c); 259} 260 261#define _mm_permute_pd(A, C) __extension__ ({ \ 262 __m128d __A = (A); \ 263 (__m128d)__builtin_ia32_vpermilpd((__v2df)__A, (C)); }) 264 265#define _mm256_permute_pd(A, C) __extension__ ({ \ 266 __m256d __A = (A); \ 267 (__m256d)__builtin_ia32_vpermilpd256((__v4df)__A, (C)); }) 268 269#define _mm_permute_ps(A, C) __extension__ ({ \ 270 __m128 __A = (A); \ 271 (__m128)__builtin_ia32_vpermilps((__v4sf)__A, (C)); }) 272 273#define _mm256_permute_ps(A, C) __extension__ ({ \ 274 __m256 __A = (A); \ 275 (__m256)__builtin_ia32_vpermilps256((__v8sf)__A, (C)); }) 276 277#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 278 __m256d __V1 = (V1); \ 279 __m256d __V2 = (V2); \ 280 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); }) 281 282#define 
_mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 283 __m256 __V1 = (V1); \ 284 __m256 __V2 = (V2); \ 285 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 286 287#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 288 __m256i __V1 = (V1); \ 289 __m256i __V2 = (V2); \ 290 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); }) 291 292/* Vector Blend */ 293#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 294 __m256d __V1 = (V1); \ 295 __m256d __V2 = (V2); \ 296 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); }) 297 298#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 299 __m256 __V1 = (V1); \ 300 __m256 __V2 = (V2); \ 301 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 302 303static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 304_mm256_blendv_pd(__m256d a, __m256d b, __m256d c) 305{ 306 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c); 307} 308 309static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 310_mm256_blendv_ps(__m256 a, __m256 b, __m256 c) 311{ 312 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c); 313} 314 315/* Vector Dot Product */ 316#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 317 __m256 __V1 = (V1); \ 318 __m256 __V2 = (V2); \ 319 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 320 321/* Vector shuffle */ 322#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ 323 __m256 __a = (a); \ 324 __m256 __b = (b); \ 325 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \ 326 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 327 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \ 328 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \ 329 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); }) 330 331#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ 332 __m256d __a = (a); \ 333 __m256d __b = (b); \ 334 
(__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \ 335 (mask) & 0x1, \ 336 (((mask) & 0x2) >> 1) + 4, \ 337 (((mask) & 0x4) >> 2) + 2, \ 338 (((mask) & 0x8) >> 3) + 6); }) 339 340/* Compare */ 341#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ 342#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ 343#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ 344#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ 345#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ 346#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ 347#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ 348#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ 349#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ 350#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ 351#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ 352#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ 353#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ 354#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ 355#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ 356#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ 357#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ 358#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ 359#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ 360#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ 361#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ 362#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ 363#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ 364#define _CMP_ORD_S 0x17 /* Ordered (signaling) */ 365#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ 366#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ 367#define 
_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ 368#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ 369#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ 370#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ 371#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ 372#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ 373 374#define _mm_cmp_pd(a, b, c) __extension__ ({ \ 375 __m128d __a = (a); \ 376 __m128d __b = (b); \ 377 (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); }) 378 379#define _mm_cmp_ps(a, b, c) __extension__ ({ \ 380 __m128 __a = (a); \ 381 __m128 __b = (b); \ 382 (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); }) 383 384#define _mm256_cmp_pd(a, b, c) __extension__ ({ \ 385 __m256d __a = (a); \ 386 __m256d __b = (b); \ 387 (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); }) 388 389#define _mm256_cmp_ps(a, b, c) __extension__ ({ \ 390 __m256 __a = (a); \ 391 __m256 __b = (b); \ 392 (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); }) 393 394#define _mm_cmp_sd(a, b, c) __extension__ ({ \ 395 __m128d __a = (a); \ 396 __m128d __b = (b); \ 397 (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); }) 398 399#define _mm_cmp_ss(a, b, c) __extension__ ({ \ 400 __m128 __a = (a); \ 401 __m128 __b = (b); \ 402 (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); }) 403 404/* Vector extract */ 405#define _mm256_extractf128_pd(A, O) __extension__ ({ \ 406 __m256d __A = (A); \ 407 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); }) 408 409#define _mm256_extractf128_ps(A, O) __extension__ ({ \ 410 __m256 __A = (A); \ 411 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); }) 412 413#define _mm256_extractf128_si256(A, O) __extension__ ({ \ 414 __m256i __A = (A); \ 415 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); }) 416 417static __inline int 
__attribute__((__always_inline__, __nodebug__)) 418_mm256_extract_epi32(__m256i a, int const imm) 419{ 420 __v8si b = (__v8si)a; 421 return b[imm]; 422} 423 424static __inline int __attribute__((__always_inline__, __nodebug__)) 425_mm256_extract_epi16(__m256i a, int const imm) 426{ 427 __v16hi b = (__v16hi)a; 428 return b[imm]; 429} 430 431static __inline int __attribute__((__always_inline__, __nodebug__)) 432_mm256_extract_epi8(__m256i a, int const imm) 433{ 434 __v32qi b = (__v32qi)a; 435 return b[imm]; 436} 437 438#ifdef __x86_64__ 439static __inline long long __attribute__((__always_inline__, __nodebug__)) 440_mm256_extract_epi64(__m256i a, const int imm) 441{ 442 __v4di b = (__v4di)a; 443 return b[imm]; 444} 445#endif 446 447/* Vector insert */ 448#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \ 449 __m256d __V1 = (V1); \ 450 __m128d __V2 = (V2); \ 451 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); }) 452 453#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \ 454 __m256 __V1 = (V1); \ 455 __m128 __V2 = (V2); \ 456 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); }) 457 458#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \ 459 __m256i __V1 = (V1); \ 460 __m128i __V2 = (V2); \ 461 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); }) 462 463static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 464_mm256_insert_epi32(__m256i a, int b, int const imm) 465{ 466 __v8si c = (__v8si)a; 467 c[imm & 7] = b; 468 return (__m256i)c; 469} 470 471static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 472_mm256_insert_epi16(__m256i a, int b, int const imm) 473{ 474 __v16hi c = (__v16hi)a; 475 c[imm & 15] = b; 476 return (__m256i)c; 477} 478 479static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 480_mm256_insert_epi8(__m256i a, int b, int const imm) 481{ 482 __v32qi c = (__v32qi)a; 483 c[imm & 31] = b; 484 return 
(__m256i)c; 485} 486 487#ifdef __x86_64__ 488static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 489_mm256_insert_epi64(__m256i a, int b, int const imm) 490{ 491 __v4di c = (__v4di)a; 492 c[imm & 3] = b; 493 return (__m256i)c; 494} 495#endif 496 497/* Conversion */ 498static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 499_mm256_cvtepi32_pd(__m128i a) 500{ 501 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a); 502} 503 504static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 505_mm256_cvtepi32_ps(__m256i a) 506{ 507 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a); 508} 509 510static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 511_mm256_cvtpd_ps(__m256d a) 512{ 513 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a); 514} 515 516static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 517_mm256_cvtps_epi32(__m256 a) 518{ 519 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a); 520} 521 522static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 523_mm256_cvtps_pd(__m128 a) 524{ 525 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a); 526} 527 528static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 529_mm256_cvttpd_epi32(__m256d a) 530{ 531 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a); 532} 533 534static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 535_mm256_cvtpd_epi32(__m256d a) 536{ 537 return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a); 538} 539 540static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 541_mm256_cvttps_epi32(__m256 a) 542{ 543 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a); 544} 545 546/* Vector replicate */ 547static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 548_mm256_movehdup_ps(__m256 a) 549{ 550 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7); 551} 552 553static __inline __m256 
__attribute__((__always_inline__, __nodebug__)) 554_mm256_moveldup_ps(__m256 a) 555{ 556 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6); 557} 558 559static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 560_mm256_movedup_pd(__m256d a) 561{ 562 return __builtin_shufflevector(a, a, 0, 0, 2, 2); 563} 564 565/* Unpack and Interleave */ 566static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 567_mm256_unpackhi_pd(__m256d a, __m256d b) 568{ 569 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2); 570} 571 572static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 573_mm256_unpacklo_pd(__m256d a, __m256d b) 574{ 575 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2); 576} 577 578static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 579_mm256_unpackhi_ps(__m256 a, __m256 b) 580{ 581 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 582} 583 584static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 585_mm256_unpacklo_ps(__m256 a, __m256 b) 586{ 587 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 588} 589 590/* Bit Test */ 591static __inline int __attribute__((__always_inline__, __nodebug__)) 592_mm_testz_pd(__m128d a, __m128d b) 593{ 594 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b); 595} 596 597static __inline int __attribute__((__always_inline__, __nodebug__)) 598_mm_testc_pd(__m128d a, __m128d b) 599{ 600 return __builtin_ia32_vtestcpd((__v2df)a, (__v2df)b); 601} 602 603static __inline int __attribute__((__always_inline__, __nodebug__)) 604_mm_testnzc_pd(__m128d a, __m128d b) 605{ 606 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b); 607} 608 609static __inline int __attribute__((__always_inline__, __nodebug__)) 610_mm_testz_ps(__m128 a, __m128 b) 611{ 612 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b); 613} 614 615static __inline int __attribute__((__always_inline__, __nodebug__)) 616_mm_testc_ps(__m128 
a, __m128 b) 617{ 618 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b); 619} 620 621static __inline int __attribute__((__always_inline__, __nodebug__)) 622_mm_testnzc_ps(__m128 a, __m128 b) 623{ 624 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b); 625} 626 627static __inline int __attribute__((__always_inline__, __nodebug__)) 628_mm256_testz_pd(__m256d a, __m256d b) 629{ 630 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b); 631} 632 633static __inline int __attribute__((__always_inline__, __nodebug__)) 634_mm256_testc_pd(__m256d a, __m256d b) 635{ 636 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b); 637} 638 639static __inline int __attribute__((__always_inline__, __nodebug__)) 640_mm256_testnzc_pd(__m256d a, __m256d b) 641{ 642 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b); 643} 644 645static __inline int __attribute__((__always_inline__, __nodebug__)) 646_mm256_testz_ps(__m256 a, __m256 b) 647{ 648 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b); 649} 650 651static __inline int __attribute__((__always_inline__, __nodebug__)) 652_mm256_testc_ps(__m256 a, __m256 b) 653{ 654 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b); 655} 656 657static __inline int __attribute__((__always_inline__, __nodebug__)) 658_mm256_testnzc_ps(__m256 a, __m256 b) 659{ 660 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b); 661} 662 663static __inline int __attribute__((__always_inline__, __nodebug__)) 664_mm256_testz_si256(__m256i a, __m256i b) 665{ 666 return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b); 667} 668 669static __inline int __attribute__((__always_inline__, __nodebug__)) 670_mm256_testc_si256(__m256i a, __m256i b) 671{ 672 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b); 673} 674 675static __inline int __attribute__((__always_inline__, __nodebug__)) 676_mm256_testnzc_si256(__m256i a, __m256i b) 677{ 678 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b); 679} 680 681/* Vector extract sign mask */ 
static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_pd(__m256d a)
{
  return __builtin_ia32_movmskpd256((__v4df)a);
}

static __inline int __attribute__((__always_inline__, __nodebug__))
_mm256_movemask_ps(__m256 a)
{
  return __builtin_ia32_movmskps256((__v8sf)a);
}

/* Vector zero */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void)
{
  __builtin_ia32_vzeroall();
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void)
{
  __builtin_ia32_vzeroupper();
}

/* Vector load with broadcast */
static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_broadcast_ss(float const *a)
{
  return (__m128)__builtin_ia32_vbroadcastss(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_sd(double const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastsd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ss(float const *a)
{
  return (__m256)__builtin_ia32_vbroadcastss256(a);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_pd(__m128d const *a)
{
  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_broadcast_ps(__m128 const *a)
{
  return (__m256)__builtin_ia32_vbroadcastf128_ps256(a);
}

/* SIMD load ops.  The `load` forms go through a vector-typed
   dereference (aligned access); the `loadu`/`lddqu` forms use
   unaligned-load builtins. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_load_pd(double const *p)
{
  return *(__m256d *)p;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_load_ps(float const *p)
{
  return *(__m256 *)p;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_pd(double const *p)
{
  return (__m256d)__builtin_ia32_loadupd256(p);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_ps(float const *p)
{
  return (__m256)__builtin_ia32_loadups256(p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_load_si256(__m256i const *p)
{
  return *p;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_loadu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_loaddqu256((char const *)p);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_lddqu_si256(__m256i const *p)
{
  return (__m256i)__builtin_ia32_lddqu256((char const *)p);
}

/* SIMD store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_pd(double *p, __m256d a)
{
  *(__m256d *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_ps(float *p, __m256 a)
{
  *(__m256 *)p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_pd(double *p, __m256d a)
{
  __builtin_ia32_storeupd256(p, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_ps(float *p, __m256 a)
{
  __builtin_ia32_storeups256(p, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_store_si256(__m256i *p, __m256i a)
{
  *p = a;
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_storeu_si256(__m256i *p, __m256i a)
{
  __builtin_ia32_storedqu256((char *)p, (__v32qi)a);
}

/* Conditional load ops: m supplies a per-element mask. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_maskload_pd(double const *p, __m128d m)
{
  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m);
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_pd(double const *p, __m256d m)
{
  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_maskload_ps(float const *p, __m128 m)
{
  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_maskload_ps(float const *p, __m256 m)
{
  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m);
}

/* Conditional store ops */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_ps(float *p, __m256 m, __m256 a)
{
  __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_pd(double *p, __m128d m, __m128d a)
{
  __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_maskstore_pd(double *p, __m256d m, __m256d a)
{
  __builtin_ia32_maskstorepd256((__v4df *)p, (__v4df)m, (__v4df)a);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskstore_ps(float *p, __m128 m, __m128 a)
{
  __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a);
}

/* Cacheability support ops (non-temporal stores) */
static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_si256(__m256i *a, __m256i b)
{
  __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_pd(double *a, __m256d b)
{
  __builtin_ia32_movntpd256(a, (__v4df)b);
}

static __inline void __attribute__((__always_inline__, __nodebug__))
_mm256_stream_ps(float *p, __m256 a)
{
  __builtin_ia32_movntps256(p, (__v8sf)a);
}

/* Create vectors.  _mm256_set_* take arguments highest element
   first, so the initializer lists are written in reverse. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set_pd(double a, double b, double c, double d)
{
  return (__m256d){ d, c, b, a };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set_ps(float a, float b, float c, float d,
              float e, float f, float g, float h)
{
  return (__m256){ h, g, f, e, d, c, b, a };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi32(int i0, int i1, int i2, int i3,
                 int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi16(short w15, short w14, short w13, short w12,
                 short w11, short w10, short w09, short w08,
                 short w07, short w06, short w05, short w04,
                 short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07,
                             w08, w09, w10, w11, w12, w13, w14, w15 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi8(char b31, char b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order (first argument is
   the lowest element). */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_pd(void)
{
  return (__m256d){ 0, 0, 0, 0 };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_ps(void)
{
  return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setzero_si256(void)
{
  return (__m256i){ 0LL, 0LL, 0LL, 0LL };
}

/* Cast between vector types: bit-for-bit reinterpretation, no
   value conversion. */
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_ps(__m256d in)
{
  return (__m256)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castpd_si256(__m256d in)
{
  return (__m256i)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castps_pd(__m256 in)
{
  return (__m256d)in;
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castps_si256(__m256 in)
{
  return (__m256i)in;
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_ps(__m256i in)
{
  return (__m256)in;
}

static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_pd(__m256i in)
{
  return (__m256d)in;
}

/* Narrowing casts keep the low 128 bits. */
static __inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd256_pd128(__m256d in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

static __inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm256_castps256_ps128(__m256 in)
{
  return __builtin_shufflevector(in, in, 0, 1, 2, 3);
}

static __inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi256_si128(__m256i in)
{
  return __builtin_shufflevector(in, in, 0, 1);
}

/* Widening casts: low 128 bits come from `in`; the upper half is
   built here by shuffling in a zero vector. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_castpd128_pd256(__m128d in)
{
  __m128d zero = _mm_setzero_pd();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 2);
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_castps128_ps256(__m128 in)
{
  __m128 zero = _mm_setzero_ps();
  return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4);
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_castsi128_si256(__m128i in)
{
  __m128i zero = _mm_setzero_si128();
  return
__builtin_shufflevector(in, zero, 0, 1, 2, 2); 1127} 1128