1/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __IMMINTRIN_H 25#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." 
26#endif 27 28typedef double __v4df __attribute__ ((__vector_size__ (32))); 29typedef float __v8sf __attribute__ ((__vector_size__ (32))); 30typedef long long __v4di __attribute__ ((__vector_size__ (32))); 31typedef int __v8si __attribute__ ((__vector_size__ (32))); 32typedef short __v16hi __attribute__ ((__vector_size__ (32))); 33typedef char __v32qi __attribute__ ((__vector_size__ (32))); 34 35typedef float __m256 __attribute__ ((__vector_size__ (32))); 36typedef double __m256d __attribute__((__vector_size__(32))); 37typedef long long __m256i __attribute__((__vector_size__(32))); 38 39/* Arithmetic */ 40static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 41_mm256_add_pd(__m256d a, __m256d b) 42{ 43 return a+b; 44} 45 46static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 47_mm256_add_ps(__m256 a, __m256 b) 48{ 49 return a+b; 50} 51 52static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 53_mm256_sub_pd(__m256d a, __m256d b) 54{ 55 return a-b; 56} 57 58static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 59_mm256_sub_ps(__m256 a, __m256 b) 60{ 61 return a-b; 62} 63 64static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 65_mm256_addsub_pd(__m256d a, __m256d b) 66{ 67 return (__m256d)__builtin_ia32_addsubpd256((__v4df)a, (__v4df)b); 68} 69 70static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 71_mm256_addsub_ps(__m256 a, __m256 b) 72{ 73 return (__m256)__builtin_ia32_addsubps256((__v8sf)a, (__v8sf)b); 74} 75 76static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 77_mm256_div_pd(__m256d a, __m256d b) 78{ 79 return a / b; 80} 81 82static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 83_mm256_div_ps(__m256 a, __m256 b) 84{ 85 return a / b; 86} 87 88static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 89_mm256_max_pd(__m256d a, __m256d b) 90{ 91 return (__m256d)__builtin_ia32_maxpd256((__v4df)a, 
(__v4df)b); 92} 93 94static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 95_mm256_max_ps(__m256 a, __m256 b) 96{ 97 return (__m256)__builtin_ia32_maxps256((__v8sf)a, (__v8sf)b); 98} 99 100static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 101_mm256_min_pd(__m256d a, __m256d b) 102{ 103 return (__m256d)__builtin_ia32_minpd256((__v4df)a, (__v4df)b); 104} 105 106static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 107_mm256_min_ps(__m256 a, __m256 b) 108{ 109 return (__m256)__builtin_ia32_minps256((__v8sf)a, (__v8sf)b); 110} 111 112static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 113_mm256_mul_pd(__m256d a, __m256d b) 114{ 115 return a * b; 116} 117 118static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 119_mm256_mul_ps(__m256 a, __m256 b) 120{ 121 return a * b; 122} 123 124static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 125_mm256_sqrt_pd(__m256d a) 126{ 127 return (__m256d)__builtin_ia32_sqrtpd256((__v4df)a); 128} 129 130static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 131_mm256_sqrt_ps(__m256 a) 132{ 133 return (__m256)__builtin_ia32_sqrtps256((__v8sf)a); 134} 135 136static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 137_mm256_rsqrt_ps(__m256 a) 138{ 139 return (__m256)__builtin_ia32_rsqrtps256((__v8sf)a); 140} 141 142static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 143_mm256_rcp_ps(__m256 a) 144{ 145 return (__m256)__builtin_ia32_rcpps256((__v8sf)a); 146} 147 148#define _mm256_round_pd(V, M) __extension__ ({ \ 149 __m256d __V = (V); \ 150 (__m256d)__builtin_ia32_roundpd256((__v4df)__V, (M)); }) 151 152#define _mm256_round_ps(V, M) __extension__ ({ \ 153 __m256 __V = (V); \ 154 (__m256)__builtin_ia32_roundps256((__v8sf)__V, (M)); }) 155 156#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) 157#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) 158#define 
_mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) 159#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) 160 161/* Logical */ 162static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 163_mm256_and_pd(__m256d a, __m256d b) 164{ 165 return (__m256d)((__v4di)a & (__v4di)b); 166} 167 168static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 169_mm256_and_ps(__m256 a, __m256 b) 170{ 171 return (__m256)((__v8si)a & (__v8si)b); 172} 173 174static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 175_mm256_andnot_pd(__m256d a, __m256d b) 176{ 177 return (__m256d)(~(__v4di)a & (__v4di)b); 178} 179 180static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 181_mm256_andnot_ps(__m256 a, __m256 b) 182{ 183 return (__m256)(~(__v8si)a & (__v8si)b); 184} 185 186static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 187_mm256_or_pd(__m256d a, __m256d b) 188{ 189 return (__m256d)((__v4di)a | (__v4di)b); 190} 191 192static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 193_mm256_or_ps(__m256 a, __m256 b) 194{ 195 return (__m256)((__v8si)a | (__v8si)b); 196} 197 198static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 199_mm256_xor_pd(__m256d a, __m256d b) 200{ 201 return (__m256d)((__v4di)a ^ (__v4di)b); 202} 203 204static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 205_mm256_xor_ps(__m256 a, __m256 b) 206{ 207 return (__m256)((__v8si)a ^ (__v8si)b); 208} 209 210/* Horizontal arithmetic */ 211static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 212_mm256_hadd_pd(__m256d a, __m256d b) 213{ 214 return (__m256d)__builtin_ia32_haddpd256((__v4df)a, (__v4df)b); 215} 216 217static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 218_mm256_hadd_ps(__m256 a, __m256 b) 219{ 220 return (__m256)__builtin_ia32_haddps256((__v8sf)a, (__v8sf)b); 221} 222 223static __inline __m256d 
__attribute__((__always_inline__, __nodebug__)) 224_mm256_hsub_pd(__m256d a, __m256d b) 225{ 226 return (__m256d)__builtin_ia32_hsubpd256((__v4df)a, (__v4df)b); 227} 228 229static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 230_mm256_hsub_ps(__m256 a, __m256 b) 231{ 232 return (__m256)__builtin_ia32_hsubps256((__v8sf)a, (__v8sf)b); 233} 234 235/* Vector permutations */ 236static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 237_mm_permutevar_pd(__m128d a, __m128i c) 238{ 239 return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)a, (__v2di)c); 240} 241 242static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 243_mm256_permutevar_pd(__m256d a, __m256i c) 244{ 245 return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)a, (__v4di)c); 246} 247 248static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 249_mm_permutevar_ps(__m128 a, __m128i c) 250{ 251 return (__m128)__builtin_ia32_vpermilvarps((__v4sf)a, (__v4si)c); 252} 253 254static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 255_mm256_permutevar_ps(__m256 a, __m256i c) 256{ 257 return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)a, 258 (__v8si)c); 259} 260 261#define _mm_permute_pd(A, C) __extension__ ({ \ 262 __m128d __A = (A); \ 263 (__m128d)__builtin_shufflevector((__v2df)__A, (__v2df) _mm_setzero_pd(), \ 264 (C) & 0x1, ((C) & 0x2) >> 1); }) 265 266#define _mm256_permute_pd(A, C) __extension__ ({ \ 267 __m256d __A = (A); \ 268 (__m256d)__builtin_shufflevector((__v4df)__A, (__v4df) _mm256_setzero_pd(), \ 269 (C) & 0x1, ((C) & 0x2) >> 1, \ 270 2 + (((C) & 0x4) >> 2), \ 271 2 + (((C) & 0x8) >> 3)); }) 272 273#define _mm_permute_ps(A, C) __extension__ ({ \ 274 __m128 __A = (A); \ 275 (__m128)__builtin_shufflevector((__v4sf)__A, (__v4sf) _mm_setzero_ps(), \ 276 (C) & 0x3, ((C) & 0xc) >> 2, \ 277 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) 278 279#define _mm256_permute_ps(A, C) __extension__ ({ \ 280 __m256 __A = (A); \ 281 
(__m256)__builtin_shufflevector((__v8sf)__A, (__v8sf) _mm256_setzero_ps(), \ 282 (C) & 0x3, ((C) & 0xc) >> 2, \ 283 ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6, \ 284 4 + (((C) & 0x03) >> 0), \ 285 4 + (((C) & 0x0c) >> 2), \ 286 4 + (((C) & 0x30) >> 4), \ 287 4 + (((C) & 0xc0) >> 6)); }) 288 289#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ 290 __m256d __V1 = (V1); \ 291 __m256d __V2 = (V2); \ 292 (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__V1, (__v4df)__V2, (M)); }) 293 294#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ 295 __m256 __V1 = (V1); \ 296 __m256 __V2 = (V2); \ 297 (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 298 299#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ 300 __m256i __V1 = (V1); \ 301 __m256i __V2 = (V2); \ 302 (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__V1, (__v8si)__V2, (M)); }) 303 304/* Vector Blend */ 305#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ 306 __m256d __V1 = (V1); \ 307 __m256d __V2 = (V2); \ 308 (__m256d)__builtin_ia32_blendpd256((__v4df)__V1, (__v4df)__V2, (M)); }) 309 310#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ 311 __m256 __V1 = (V1); \ 312 __m256 __V2 = (V2); \ 313 (__m256)__builtin_ia32_blendps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 314 315static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 316_mm256_blendv_pd(__m256d a, __m256d b, __m256d c) 317{ 318 return (__m256d)__builtin_ia32_blendvpd256((__v4df)a, (__v4df)b, (__v4df)c); 319} 320 321static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 322_mm256_blendv_ps(__m256 a, __m256 b, __m256 c) 323{ 324 return (__m256)__builtin_ia32_blendvps256((__v8sf)a, (__v8sf)b, (__v8sf)c); 325} 326 327/* Vector Dot Product */ 328#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ 329 __m256 __V1 = (V1); \ 330 __m256 __V2 = (V2); \ 331 (__m256)__builtin_ia32_dpps256((__v8sf)__V1, (__v8sf)__V2, (M)); }) 332 333/* Vector shuffle */ 334#define 
_mm256_shuffle_ps(a, b, mask) __extension__ ({ \ 335 __m256 __a = (a); \ 336 __m256 __b = (b); \ 337 (__m256)__builtin_shufflevector((__v8sf)__a, (__v8sf)__b, \ 338 (mask) & 0x3, ((mask) & 0xc) >> 2, \ 339 (((mask) & 0x30) >> 4) + 8, (((mask) & 0xc0) >> 6) + 8, \ 340 ((mask) & 0x3) + 4, (((mask) & 0xc) >> 2) + 4, \ 341 (((mask) & 0x30) >> 4) + 12, (((mask) & 0xc0) >> 6) + 12); }) 342 343#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ 344 __m256d __a = (a); \ 345 __m256d __b = (b); \ 346 (__m256d)__builtin_shufflevector((__v4df)__a, (__v4df)__b, \ 347 (mask) & 0x1, \ 348 (((mask) & 0x2) >> 1) + 4, \ 349 (((mask) & 0x4) >> 2) + 2, \ 350 (((mask) & 0x8) >> 3) + 6); }) 351 352/* Compare */ 353#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ 354#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ 355#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ 356#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ 357#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ 358#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ 359#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ 360#define _CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ 361#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ 362#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ 363#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ 364#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ 365#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ 366#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ 367#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ 368#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ 369#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ 370#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ 371#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, 
non-signaling) */ 372#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ 373#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ 374#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ 375#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ 376#define _CMP_ORD_S 0x17 /* Ordered (signaling) */ 377#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ 378#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ 379#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ 380#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ 381#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ 382#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ 383#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ 384#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ 385 386#define _mm_cmp_pd(a, b, c) __extension__ ({ \ 387 __m128d __a = (a); \ 388 __m128d __b = (b); \ 389 (__m128d)__builtin_ia32_cmppd((__v2df)__a, (__v2df)__b, (c)); }) 390 391#define _mm_cmp_ps(a, b, c) __extension__ ({ \ 392 __m128 __a = (a); \ 393 __m128 __b = (b); \ 394 (__m128)__builtin_ia32_cmpps((__v4sf)__a, (__v4sf)__b, (c)); }) 395 396#define _mm256_cmp_pd(a, b, c) __extension__ ({ \ 397 __m256d __a = (a); \ 398 __m256d __b = (b); \ 399 (__m256d)__builtin_ia32_cmppd256((__v4df)__a, (__v4df)__b, (c)); }) 400 401#define _mm256_cmp_ps(a, b, c) __extension__ ({ \ 402 __m256 __a = (a); \ 403 __m256 __b = (b); \ 404 (__m256)__builtin_ia32_cmpps256((__v8sf)__a, (__v8sf)__b, (c)); }) 405 406#define _mm_cmp_sd(a, b, c) __extension__ ({ \ 407 __m128d __a = (a); \ 408 __m128d __b = (b); \ 409 (__m128d)__builtin_ia32_cmpsd((__v2df)__a, (__v2df)__b, (c)); }) 410 411#define _mm_cmp_ss(a, b, c) __extension__ ({ \ 412 __m128 __a = (a); \ 413 __m128 __b = (b); \ 414 (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); }) 415 416/* Vector extract */ 417#define 
_mm256_extractf128_pd(A, O) __extension__ ({ \ 418 __m256d __A = (A); \ 419 (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); }) 420 421#define _mm256_extractf128_ps(A, O) __extension__ ({ \ 422 __m256 __A = (A); \ 423 (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); }) 424 425#define _mm256_extractf128_si256(A, O) __extension__ ({ \ 426 __m256i __A = (A); \ 427 (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); }) 428 429static __inline int __attribute__((__always_inline__, __nodebug__)) 430_mm256_extract_epi32(__m256i a, int const imm) 431{ 432 __v8si b = (__v8si)a; 433 return b[imm]; 434} 435 436static __inline int __attribute__((__always_inline__, __nodebug__)) 437_mm256_extract_epi16(__m256i a, int const imm) 438{ 439 __v16hi b = (__v16hi)a; 440 return b[imm]; 441} 442 443static __inline int __attribute__((__always_inline__, __nodebug__)) 444_mm256_extract_epi8(__m256i a, int const imm) 445{ 446 __v32qi b = (__v32qi)a; 447 return b[imm]; 448} 449 450#ifdef __x86_64__ 451static __inline long long __attribute__((__always_inline__, __nodebug__)) 452_mm256_extract_epi64(__m256i a, const int imm) 453{ 454 __v4di b = (__v4di)a; 455 return b[imm]; 456} 457#endif 458 459/* Vector insert */ 460#define _mm256_insertf128_pd(V1, V2, O) __extension__ ({ \ 461 __m256d __V1 = (V1); \ 462 __m128d __V2 = (V2); \ 463 (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__V1, (__v2df)__V2, (O)); }) 464 465#define _mm256_insertf128_ps(V1, V2, O) __extension__ ({ \ 466 __m256 __V1 = (V1); \ 467 __m128 __V2 = (V2); \ 468 (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__V1, (__v4sf)__V2, (O)); }) 469 470#define _mm256_insertf128_si256(V1, V2, O) __extension__ ({ \ 471 __m256i __V1 = (V1); \ 472 __m128i __V2 = (V2); \ 473 (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__V1, (__v4si)__V2, (O)); }) 474 475static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 476_mm256_insert_epi32(__m256i a, int b, int const imm) 477{ 478 __v8si c 
= (__v8si)a; 479 c[imm & 7] = b; 480 return (__m256i)c; 481} 482 483static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 484_mm256_insert_epi16(__m256i a, int b, int const imm) 485{ 486 __v16hi c = (__v16hi)a; 487 c[imm & 15] = b; 488 return (__m256i)c; 489} 490 491static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 492_mm256_insert_epi8(__m256i a, int b, int const imm) 493{ 494 __v32qi c = (__v32qi)a; 495 c[imm & 31] = b; 496 return (__m256i)c; 497} 498 499#ifdef __x86_64__ 500static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 501_mm256_insert_epi64(__m256i a, int b, int const imm) 502{ 503 __v4di c = (__v4di)a; 504 c[imm & 3] = b; 505 return (__m256i)c; 506} 507#endif 508 509/* Conversion */ 510static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 511_mm256_cvtepi32_pd(__m128i a) 512{ 513 return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) a); 514} 515 516static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 517_mm256_cvtepi32_ps(__m256i a) 518{ 519 return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) a); 520} 521 522static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 523_mm256_cvtpd_ps(__m256d a) 524{ 525 return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) a); 526} 527 528static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 529_mm256_cvtps_epi32(__m256 a) 530{ 531 return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) a); 532} 533 534static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 535_mm256_cvtps_pd(__m128 a) 536{ 537 return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) a); 538} 539 540static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 541_mm256_cvttpd_epi32(__m256d a) 542{ 543 return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) a); 544} 545 546static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 547_mm256_cvtpd_epi32(__m256d a) 548{ 549 return 
(__m128i)__builtin_ia32_cvtpd2dq256((__v4df) a); 550} 551 552static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 553_mm256_cvttps_epi32(__m256 a) 554{ 555 return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) a); 556} 557 558/* Vector replicate */ 559static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 560_mm256_movehdup_ps(__m256 a) 561{ 562 return __builtin_shufflevector(a, a, 1, 1, 3, 3, 5, 5, 7, 7); 563} 564 565static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 566_mm256_moveldup_ps(__m256 a) 567{ 568 return __builtin_shufflevector(a, a, 0, 0, 2, 2, 4, 4, 6, 6); 569} 570 571static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 572_mm256_movedup_pd(__m256d a) 573{ 574 return __builtin_shufflevector(a, a, 0, 0, 2, 2); 575} 576 577/* Unpack and Interleave */ 578static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 579_mm256_unpackhi_pd(__m256d a, __m256d b) 580{ 581 return __builtin_shufflevector(a, b, 1, 5, 1+2, 5+2); 582} 583 584static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 585_mm256_unpacklo_pd(__m256d a, __m256d b) 586{ 587 return __builtin_shufflevector(a, b, 0, 4, 0+2, 4+2); 588} 589 590static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 591_mm256_unpackhi_ps(__m256 a, __m256 b) 592{ 593 return __builtin_shufflevector(a, b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); 594} 595 596static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 597_mm256_unpacklo_ps(__m256 a, __m256 b) 598{ 599 return __builtin_shufflevector(a, b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); 600} 601 602/* Bit Test */ 603static __inline int __attribute__((__always_inline__, __nodebug__)) 604_mm_testz_pd(__m128d a, __m128d b) 605{ 606 return __builtin_ia32_vtestzpd((__v2df)a, (__v2df)b); 607} 608 609static __inline int __attribute__((__always_inline__, __nodebug__)) 610_mm_testc_pd(__m128d a, __m128d b) 611{ 612 return __builtin_ia32_vtestcpd((__v2df)a, 
(__v2df)b); 613} 614 615static __inline int __attribute__((__always_inline__, __nodebug__)) 616_mm_testnzc_pd(__m128d a, __m128d b) 617{ 618 return __builtin_ia32_vtestnzcpd((__v2df)a, (__v2df)b); 619} 620 621static __inline int __attribute__((__always_inline__, __nodebug__)) 622_mm_testz_ps(__m128 a, __m128 b) 623{ 624 return __builtin_ia32_vtestzps((__v4sf)a, (__v4sf)b); 625} 626 627static __inline int __attribute__((__always_inline__, __nodebug__)) 628_mm_testc_ps(__m128 a, __m128 b) 629{ 630 return __builtin_ia32_vtestcps((__v4sf)a, (__v4sf)b); 631} 632 633static __inline int __attribute__((__always_inline__, __nodebug__)) 634_mm_testnzc_ps(__m128 a, __m128 b) 635{ 636 return __builtin_ia32_vtestnzcps((__v4sf)a, (__v4sf)b); 637} 638 639static __inline int __attribute__((__always_inline__, __nodebug__)) 640_mm256_testz_pd(__m256d a, __m256d b) 641{ 642 return __builtin_ia32_vtestzpd256((__v4df)a, (__v4df)b); 643} 644 645static __inline int __attribute__((__always_inline__, __nodebug__)) 646_mm256_testc_pd(__m256d a, __m256d b) 647{ 648 return __builtin_ia32_vtestcpd256((__v4df)a, (__v4df)b); 649} 650 651static __inline int __attribute__((__always_inline__, __nodebug__)) 652_mm256_testnzc_pd(__m256d a, __m256d b) 653{ 654 return __builtin_ia32_vtestnzcpd256((__v4df)a, (__v4df)b); 655} 656 657static __inline int __attribute__((__always_inline__, __nodebug__)) 658_mm256_testz_ps(__m256 a, __m256 b) 659{ 660 return __builtin_ia32_vtestzps256((__v8sf)a, (__v8sf)b); 661} 662 663static __inline int __attribute__((__always_inline__, __nodebug__)) 664_mm256_testc_ps(__m256 a, __m256 b) 665{ 666 return __builtin_ia32_vtestcps256((__v8sf)a, (__v8sf)b); 667} 668 669static __inline int __attribute__((__always_inline__, __nodebug__)) 670_mm256_testnzc_ps(__m256 a, __m256 b) 671{ 672 return __builtin_ia32_vtestnzcps256((__v8sf)a, (__v8sf)b); 673} 674 675static __inline int __attribute__((__always_inline__, __nodebug__)) 676_mm256_testz_si256(__m256i a, __m256i b) 677{ 678 
return __builtin_ia32_ptestz256((__v4di)a, (__v4di)b); 679} 680 681static __inline int __attribute__((__always_inline__, __nodebug__)) 682_mm256_testc_si256(__m256i a, __m256i b) 683{ 684 return __builtin_ia32_ptestc256((__v4di)a, (__v4di)b); 685} 686 687static __inline int __attribute__((__always_inline__, __nodebug__)) 688_mm256_testnzc_si256(__m256i a, __m256i b) 689{ 690 return __builtin_ia32_ptestnzc256((__v4di)a, (__v4di)b); 691} 692 693/* Vector extract sign mask */ 694static __inline int __attribute__((__always_inline__, __nodebug__)) 695_mm256_movemask_pd(__m256d a) 696{ 697 return __builtin_ia32_movmskpd256((__v4df)a); 698} 699 700static __inline int __attribute__((__always_inline__, __nodebug__)) 701_mm256_movemask_ps(__m256 a) 702{ 703 return __builtin_ia32_movmskps256((__v8sf)a); 704} 705 706/* Vector zero */ 707static __inline void __attribute__((__always_inline__, __nodebug__)) 708_mm256_zeroall(void) 709{ 710 __builtin_ia32_vzeroall(); 711} 712 713static __inline void __attribute__((__always_inline__, __nodebug__)) 714_mm256_zeroupper(void) 715{ 716 __builtin_ia32_vzeroupper(); 717} 718 719/* Vector load with broadcast */ 720static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 721_mm_broadcast_ss(float const *a) 722{ 723 return (__m128)__builtin_ia32_vbroadcastss(a); 724} 725 726static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 727_mm256_broadcast_sd(double const *a) 728{ 729 return (__m256d)__builtin_ia32_vbroadcastsd256(a); 730} 731 732static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 733_mm256_broadcast_ss(float const *a) 734{ 735 return (__m256)__builtin_ia32_vbroadcastss256(a); 736} 737 738static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 739_mm256_broadcast_pd(__m128d const *a) 740{ 741 return (__m256d)__builtin_ia32_vbroadcastf128_pd256(a); 742} 743 744static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 745_mm256_broadcast_ps(__m128 const *a) 
746{ 747 return (__m256)__builtin_ia32_vbroadcastf128_ps256(a); 748} 749 750/* SIMD load ops */ 751static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 752_mm256_load_pd(double const *p) 753{ 754 return *(__m256d *)p; 755} 756 757static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 758_mm256_load_ps(float const *p) 759{ 760 return *(__m256 *)p; 761} 762 763static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 764_mm256_loadu_pd(double const *p) 765{ 766 struct __loadu_pd { 767 __m256d v; 768 } __attribute__((packed, may_alias)); 769 return ((struct __loadu_pd*)p)->v; 770} 771 772static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 773_mm256_loadu_ps(float const *p) 774{ 775 struct __loadu_ps { 776 __m256 v; 777 } __attribute__((packed, may_alias)); 778 return ((struct __loadu_ps*)p)->v; 779} 780 781static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 782_mm256_load_si256(__m256i const *p) 783{ 784 return *p; 785} 786 787static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 788_mm256_loadu_si256(__m256i const *p) 789{ 790 struct __loadu_si256 { 791 __m256i v; 792 } __attribute__((packed, may_alias)); 793 return ((struct __loadu_si256*)p)->v; 794} 795 796static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 797_mm256_lddqu_si256(__m256i const *p) 798{ 799 return (__m256i)__builtin_ia32_lddqu256((char const *)p); 800} 801 802/* SIMD store ops */ 803static __inline void __attribute__((__always_inline__, __nodebug__)) 804_mm256_store_pd(double *p, __m256d a) 805{ 806 *(__m256d *)p = a; 807} 808 809static __inline void __attribute__((__always_inline__, __nodebug__)) 810_mm256_store_ps(float *p, __m256 a) 811{ 812 *(__m256 *)p = a; 813} 814 815static __inline void __attribute__((__always_inline__, __nodebug__)) 816_mm256_storeu_pd(double *p, __m256d a) 817{ 818 __builtin_ia32_storeupd256(p, (__v4df)a); 819} 820 821static __inline void 
__attribute__((__always_inline__, __nodebug__)) 822_mm256_storeu_ps(float *p, __m256 a) 823{ 824 __builtin_ia32_storeups256(p, (__v8sf)a); 825} 826 827static __inline void __attribute__((__always_inline__, __nodebug__)) 828_mm256_store_si256(__m256i *p, __m256i a) 829{ 830 *p = a; 831} 832 833static __inline void __attribute__((__always_inline__, __nodebug__)) 834_mm256_storeu_si256(__m256i *p, __m256i a) 835{ 836 __builtin_ia32_storedqu256((char *)p, (__v32qi)a); 837} 838 839/* Conditional load ops */ 840static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 841_mm_maskload_pd(double const *p, __m128d m) 842{ 843 return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)p, (__v2df)m); 844} 845 846static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 847_mm256_maskload_pd(double const *p, __m256d m) 848{ 849 return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)p, (__v4df)m); 850} 851 852static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 853_mm_maskload_ps(float const *p, __m128 m) 854{ 855 return (__m128)__builtin_ia32_maskloadps((const __v4sf *)p, (__v4sf)m); 856} 857 858static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 859_mm256_maskload_ps(float const *p, __m256 m) 860{ 861 return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)p, (__v8sf)m); 862} 863 864/* Conditional store ops */ 865static __inline void __attribute__((__always_inline__, __nodebug__)) 866_mm256_maskstore_ps(float *p, __m256 m, __m256 a) 867{ 868 __builtin_ia32_maskstoreps256((__v8sf *)p, (__v8sf)m, (__v8sf)a); 869} 870 871static __inline void __attribute__((__always_inline__, __nodebug__)) 872_mm_maskstore_pd(double *p, __m128d m, __m128d a) 873{ 874 __builtin_ia32_maskstorepd((__v2df *)p, (__v2df)m, (__v2df)a); 875} 876 877static __inline void __attribute__((__always_inline__, __nodebug__)) 878_mm256_maskstore_pd(double *p, __m256d m, __m256d a) 879{ 880 __builtin_ia32_maskstorepd256((__v4df *)p, 
(__v4df)m, (__v4df)a); 881} 882 883static __inline void __attribute__((__always_inline__, __nodebug__)) 884_mm_maskstore_ps(float *p, __m128 m, __m128 a) 885{ 886 __builtin_ia32_maskstoreps((__v4sf *)p, (__v4sf)m, (__v4sf)a); 887} 888 889/* Cacheability support ops */ 890static __inline void __attribute__((__always_inline__, __nodebug__)) 891_mm256_stream_si256(__m256i *a, __m256i b) 892{ 893 __builtin_ia32_movntdq256((__v4di *)a, (__v4di)b); 894} 895 896static __inline void __attribute__((__always_inline__, __nodebug__)) 897_mm256_stream_pd(double *a, __m256d b) 898{ 899 __builtin_ia32_movntpd256(a, (__v4df)b); 900} 901 902static __inline void __attribute__((__always_inline__, __nodebug__)) 903_mm256_stream_ps(float *p, __m256 a) 904{ 905 __builtin_ia32_movntps256(p, (__v8sf)a); 906} 907 908/* Create vectors */ 909static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 910_mm256_set_pd(double a, double b, double c, double d) 911{ 912 return (__m256d){ d, c, b, a }; 913} 914 915static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 916_mm256_set_ps(float a, float b, float c, float d, 917 float e, float f, float g, float h) 918{ 919 return (__m256){ h, g, f, e, d, c, b, a }; 920} 921 922static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 923_mm256_set_epi32(int i0, int i1, int i2, int i3, 924 int i4, int i5, int i6, int i7) 925{ 926 return (__m256i)(__v8si){ i7, i6, i5, i4, i3, i2, i1, i0 }; 927} 928 929static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 930_mm256_set_epi16(short w15, short w14, short w13, short w12, 931 short w11, short w10, short w09, short w08, 932 short w07, short w06, short w05, short w04, 933 short w03, short w02, short w01, short w00) 934{ 935 return (__m256i)(__v16hi){ w00, w01, w02, w03, w04, w05, w06, w07, 936 w08, w09, w10, w11, w12, w13, w14, w15 }; 937} 938 939static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 940_mm256_set_epi8(char b31, char 
b30, char b29, char b28,
                char b27, char b26, char b25, char b24,
                char b23, char b22, char b21, char b20,
                char b19, char b18, char b17, char b16,
                char b15, char b14, char b13, char b12,
                char b11, char b10, char b09, char b08,
                char b07, char b06, char b05, char b04,
                char b03, char b02, char b01, char b00)
{
  /* _mm256_set_* takes arguments highest element first, so the
     initializer list reverses them: b00 is element 0 of the vector. */
  return (__m256i)(__v32qi){
    b00, b01, b02, b03, b04, b05, b06, b07,
    b08, b09, b10, b11, b12, b13, b14, b15,
    b16, b17, b18, b19, b20, b21, b22, b23,
    b24, b25, b26, b27, b28, b29, b30, b31
  };
}

/* Build a 256-bit integer vector from four 64-bit values; the first
   argument becomes the highest (element 3), the last becomes element 0. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ d, c, b, a };
}

/* Create vectors with elements in reverse order: the _mm256_setr_*
   variants take their arguments element 0 first, so the initializer
   lists below are in argument order. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_setr_pd(double a, double b, double c, double d)
{
  return (__m256d){ a, b, c, d };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_setr_ps(float a, float b, float c, float d,
               float e, float f, float g, float h)
{
  return (__m256){ a, b, c, d, e, f, g, h };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi32(int i0, int i1, int i2, int i3,
                  int i4, int i5, int i6, int i7)
{
  return (__m256i)(__v8si){ i0, i1, i2, i3, i4, i5, i6, i7 };
}

/* NOTE: the parameter names here count downward (w15 first), but this is
   still setr order — the FIRST argument becomes element 0. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi16(short w15, short w14, short w13, short w12,
                  short w11, short w10, short w09, short w08,
                  short w07, short w06, short w05, short w04,
                  short w03, short w02, short w01, short w00)
{
  return (__m256i)(__v16hi){ w15, w14, w13, w12, w11, w10, w09, w08,
                             w07, w06, w05, w04, w03, w02, w01, w00 };
}

/* As above: first argument (b31, by name only) is element 0. */
static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi8(char b31, char b30, char b29, char b28,
                 char b27, char b26, char b25, char b24,
                 char b23, char b22, char b21, char b20,
                 char b19, char b18, char b17, char b16,
                 char b15, char b14, char b13, char b12,
                 char b11, char b10, char b09, char b08,
                 char b07, char b06, char b05, char b04,
                 char b03, char b02, char b01, char b00)
{
  return (__m256i)(__v32qi){
    b31, b30, b29, b28, b27, b26, b25, b24,
    b23, b22, b21, b20, b19, b18, b17, b16,
    b15, b14, b13, b12, b11, b10, b09, b08,
    b07, b06, b05, b04, b03, b02, b01, b00 };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_setr_epi64x(long long a, long long b, long long c, long long d)
{
  return (__m256i)(__v4di){ a, b, c, d };
}

/* Create vectors with repeated elements: broadcast one scalar to every
   lane of the 256-bit vector. */
static __inline __m256d __attribute__((__always_inline__, __nodebug__))
_mm256_set1_pd(double w)
{
  return (__m256d){ w, w, w, w };
}

static __inline __m256 __attribute__((__always_inline__, __nodebug__))
_mm256_set1_ps(float w)
{
  return (__m256){ w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi32(int i)
{
  return (__m256i)(__v8si){ i, i, i, i, i, i, i, i };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi16(short w)
{
  return (__m256i)(__v16hi){ w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi8(char b)
{
  return (__m256i)(__v32qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b,
                             b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}

static __inline __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_set1_epi64x(long long q)
{
  return (__m256i)(__v4di){ q, q, q, q };
}

/* Create zeroed vectors
*/ 1056static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 1057_mm256_setzero_pd(void) 1058{ 1059 return (__m256d){ 0, 0, 0, 0 }; 1060} 1061 1062static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 1063_mm256_setzero_ps(void) 1064{ 1065 return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; 1066} 1067 1068static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 1069_mm256_setzero_si256(void) 1070{ 1071 return (__m256i){ 0LL, 0LL, 0LL, 0LL }; 1072} 1073 1074/* Cast between vector types */ 1075static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 1076_mm256_castpd_ps(__m256d in) 1077{ 1078 return (__m256)in; 1079} 1080 1081static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 1082_mm256_castpd_si256(__m256d in) 1083{ 1084 return (__m256i)in; 1085} 1086 1087static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 1088_mm256_castps_pd(__m256 in) 1089{ 1090 return (__m256d)in; 1091} 1092 1093static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 1094_mm256_castps_si256(__m256 in) 1095{ 1096 return (__m256i)in; 1097} 1098 1099static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 1100_mm256_castsi256_ps(__m256i in) 1101{ 1102 return (__m256)in; 1103} 1104 1105static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 1106_mm256_castsi256_pd(__m256i in) 1107{ 1108 return (__m256d)in; 1109} 1110 1111static __inline __m128d __attribute__((__always_inline__, __nodebug__)) 1112_mm256_castpd256_pd128(__m256d in) 1113{ 1114 return __builtin_shufflevector(in, in, 0, 1); 1115} 1116 1117static __inline __m128 __attribute__((__always_inline__, __nodebug__)) 1118_mm256_castps256_ps128(__m256 in) 1119{ 1120 return __builtin_shufflevector(in, in, 0, 1, 2, 3); 1121} 1122 1123static __inline __m128i __attribute__((__always_inline__, __nodebug__)) 1124_mm256_castsi256_si128(__m256i in) 1125{ 1126 return __builtin_shufflevector(in, in, 0, 1); 1127} 1128 
1129static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 1130_mm256_castpd128_pd256(__m128d in) 1131{ 1132 __m128d zero = _mm_setzero_pd(); 1133 return __builtin_shufflevector(in, zero, 0, 1, 2, 2); 1134} 1135 1136static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 1137_mm256_castps128_ps256(__m128 in) 1138{ 1139 __m128 zero = _mm_setzero_ps(); 1140 return __builtin_shufflevector(in, zero, 0, 1, 2, 3, 4, 4, 4, 4); 1141} 1142 1143static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 1144_mm256_castsi128_si256(__m128i in) 1145{ 1146 __m128i zero = _mm_setzero_si128(); 1147 return __builtin_shufflevector(in, zero, 0, 1, 2, 2); 1148} 1149 1150/* SIMD load ops (unaligned) */ 1151static __inline __m256 __attribute__((__always_inline__, __nodebug__)) 1152_mm256_loadu2_m128(float const *addr_hi, float const *addr_lo) 1153{ 1154 struct __loadu_ps { 1155 __m128 v; 1156 } __attribute__((__packed__, __may_alias__)); 1157 1158 __m256 v256 = _mm256_castps128_ps256(((struct __loadu_ps*)addr_lo)->v); 1159 return _mm256_insertf128_ps(v256, ((struct __loadu_ps*)addr_hi)->v, 1); 1160} 1161 1162static __inline __m256d __attribute__((__always_inline__, __nodebug__)) 1163_mm256_loadu2_m128d(double const *addr_hi, double const *addr_lo) 1164{ 1165 struct __loadu_pd { 1166 __m128d v; 1167 } __attribute__((__packed__, __may_alias__)); 1168 1169 __m256d v256 = _mm256_castpd128_pd256(((struct __loadu_pd*)addr_lo)->v); 1170 return _mm256_insertf128_pd(v256, ((struct __loadu_pd*)addr_hi)->v, 1); 1171} 1172 1173static __inline __m256i __attribute__((__always_inline__, __nodebug__)) 1174_mm256_loadu2_m128i(__m128i const *addr_hi, __m128i const *addr_lo) 1175{ 1176 struct __loadu_si128 { 1177 __m128i v; 1178 } __attribute__((packed, may_alias)); 1179 __m256i v256 = _mm256_castsi128_si256(((struct __loadu_si128*)addr_lo)->v); 1180 return _mm256_insertf128_si256(v256, ((struct __loadu_si128*)addr_hi)->v, 1); 1181} 1182 1183/* SIMD store ops 
(unaligned) */ 1184static __inline void __attribute__((__always_inline__, __nodebug__)) 1185_mm256_storeu2_m128(float *addr_hi, float *addr_lo, __m256 a) 1186{ 1187 __m128 v128; 1188 1189 v128 = _mm256_castps256_ps128(a); 1190 __builtin_ia32_storeups(addr_lo, v128); 1191 v128 = _mm256_extractf128_ps(a, 1); 1192 __builtin_ia32_storeups(addr_hi, v128); 1193} 1194 1195static __inline void __attribute__((__always_inline__, __nodebug__)) 1196_mm256_storeu2_m128d(double *addr_hi, double *addr_lo, __m256d a) 1197{ 1198 __m128d v128; 1199 1200 v128 = _mm256_castpd256_pd128(a); 1201 __builtin_ia32_storeupd(addr_lo, v128); 1202 v128 = _mm256_extractf128_pd(a, 1); 1203 __builtin_ia32_storeupd(addr_hi, v128); 1204} 1205 1206static __inline void __attribute__((__always_inline__, __nodebug__)) 1207_mm256_storeu2_m128i(__m128i *addr_hi, __m128i *addr_lo, __m256i a) 1208{ 1209 __m128i v128; 1210 1211 v128 = _mm256_castsi256_si128(a); 1212 __builtin_ia32_storedqu((char *)addr_lo, (__v16qi)v128); 1213 v128 = _mm256_extractf128_si256(a, 1); 1214 __builtin_ia32_storedqu((char *)addr_hi, (__v16qi)v128); 1215} 1216