1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24#ifndef __IMMINTRIN_H 25#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 26#endif 27 28/* SSE4 Multiple Packed Sums of Absolute Difference. 
*/ 29#define _mm256_mpsadbw_epu8(X, Y, M) __builtin_ia32_mpsadbw256((X), (Y), (M)) 30 31static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 32_mm256_abs_epi8(__m256i __a) 33{ 34 return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a); 35} 36 37static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 38_mm256_abs_epi16(__m256i __a) 39{ 40 return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a); 41} 42 43static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 44_mm256_abs_epi32(__m256i __a) 45{ 46 return (__m256i)__builtin_ia32_pabsd256((__v8si)__a); 47} 48 49static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 50_mm256_packs_epi16(__m256i __a, __m256i __b) 51{ 52 return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); 53} 54 55static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 56_mm256_packs_epi32(__m256i __a, __m256i __b) 57{ 58 return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); 59} 60 61static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 62_mm256_packus_epi16(__m256i __a, __m256i __b) 63{ 64 return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); 65} 66 67static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 68_mm256_packus_epi32(__m256i __V1, __m256i __V2) 69{ 70 return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); 71} 72 73static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 74_mm256_add_epi8(__m256i __a, __m256i __b) 75{ 76 return (__m256i)((__v32qi)__a + (__v32qi)__b); 77} 78 79static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 80_mm256_add_epi16(__m256i __a, __m256i __b) 81{ 82 return (__m256i)((__v16hi)__a + (__v16hi)__b); 83} 84 85static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 86_mm256_add_epi32(__m256i __a, __m256i __b) 87{ 88 return (__m256i)((__v8si)__a + (__v8si)__b); 89} 90 
91static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 92_mm256_add_epi64(__m256i __a, __m256i __b) 93{ 94 return __a + __b; 95} 96 97static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 98_mm256_adds_epi8(__m256i __a, __m256i __b) 99{ 100 return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b); 101} 102 103static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 104_mm256_adds_epi16(__m256i __a, __m256i __b) 105{ 106 return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b); 107} 108 109static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 110_mm256_adds_epu8(__m256i __a, __m256i __b) 111{ 112 return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b); 113} 114 115static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 116_mm256_adds_epu16(__m256i __a, __m256i __b) 117{ 118 return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b); 119} 120 121#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \ 122 __m256i __a = (a); \ 123 __m256i __b = (b); \ 124 (__m256i)__builtin_ia32_palignr256((__v32qi)__a, (__v32qi)__b, (n)); }) 125 126static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 127_mm256_and_si256(__m256i __a, __m256i __b) 128{ 129 return __a & __b; 130} 131 132static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 133_mm256_andnot_si256(__m256i __a, __m256i __b) 134{ 135 return ~__a & __b; 136} 137 138static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 139_mm256_avg_epu8(__m256i __a, __m256i __b) 140{ 141 return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); 142} 143 144static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 145_mm256_avg_epu16(__m256i __a, __m256i __b) 146{ 147 return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); 148} 149 150static __inline__ __m256i __attribute__((__always_inline__, 
__nodebug__)) 151_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) 152{ 153 return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, 154 (__v32qi)__M); 155} 156 157#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \ 158 __m256i __V1 = (V1); \ 159 __m256i __V2 = (V2); \ 160 (__m256i)__builtin_ia32_pblendw256((__v16hi)__V1, (__v16hi)__V2, (M)); }) 161 162static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 163_mm256_cmpeq_epi8(__m256i __a, __m256i __b) 164{ 165 return (__m256i)((__v32qi)__a == (__v32qi)__b); 166} 167 168static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 169_mm256_cmpeq_epi16(__m256i __a, __m256i __b) 170{ 171 return (__m256i)((__v16hi)__a == (__v16hi)__b); 172} 173 174static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 175_mm256_cmpeq_epi32(__m256i __a, __m256i __b) 176{ 177 return (__m256i)((__v8si)__a == (__v8si)__b); 178} 179 180static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 181_mm256_cmpeq_epi64(__m256i __a, __m256i __b) 182{ 183 return (__m256i)(__a == __b); 184} 185 186static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 187_mm256_cmpgt_epi8(__m256i __a, __m256i __b) 188{ 189 return (__m256i)((__v32qi)__a > (__v32qi)__b); 190} 191 192static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 193_mm256_cmpgt_epi16(__m256i __a, __m256i __b) 194{ 195 return (__m256i)((__v16hi)__a > (__v16hi)__b); 196} 197 198static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 199_mm256_cmpgt_epi32(__m256i __a, __m256i __b) 200{ 201 return (__m256i)((__v8si)__a > (__v8si)__b); 202} 203 204static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 205_mm256_cmpgt_epi64(__m256i __a, __m256i __b) 206{ 207 return (__m256i)(__a > __b); 208} 209 210static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 211_mm256_hadd_epi16(__m256i __a, __m256i __b) 212{ 213 
return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); 214} 215 216static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 217_mm256_hadd_epi32(__m256i __a, __m256i __b) 218{ 219 return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); 220} 221 222static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 223_mm256_hadds_epi16(__m256i __a, __m256i __b) 224{ 225 return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); 226} 227 228static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 229_mm256_hsub_epi16(__m256i __a, __m256i __b) 230{ 231 return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); 232} 233 234static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 235_mm256_hsub_epi32(__m256i __a, __m256i __b) 236{ 237 return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); 238} 239 240static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 241_mm256_hsubs_epi16(__m256i __a, __m256i __b) 242{ 243 return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); 244} 245 246static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 247_mm256_maddubs_epi16(__m256i __a, __m256i __b) 248{ 249 return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); 250} 251 252static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 253_mm256_madd_epi16(__m256i __a, __m256i __b) 254{ 255 return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); 256} 257 258static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 259_mm256_max_epi8(__m256i __a, __m256i __b) 260{ 261 return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b); 262} 263 264static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 265_mm256_max_epi16(__m256i __a, __m256i __b) 266{ 267 return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b); 268} 269 
270static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 271_mm256_max_epi32(__m256i __a, __m256i __b) 272{ 273 return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b); 274} 275 276static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 277_mm256_max_epu8(__m256i __a, __m256i __b) 278{ 279 return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b); 280} 281 282static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 283_mm256_max_epu16(__m256i __a, __m256i __b) 284{ 285 return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b); 286} 287 288static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 289_mm256_max_epu32(__m256i __a, __m256i __b) 290{ 291 return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b); 292} 293 294static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 295_mm256_min_epi8(__m256i __a, __m256i __b) 296{ 297 return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b); 298} 299 300static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 301_mm256_min_epi16(__m256i __a, __m256i __b) 302{ 303 return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b); 304} 305 306static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 307_mm256_min_epi32(__m256i __a, __m256i __b) 308{ 309 return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b); 310} 311 312static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 313_mm256_min_epu8(__m256i __a, __m256i __b) 314{ 315 return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b); 316} 317 318static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 319_mm256_min_epu16(__m256i __a, __m256i __b) 320{ 321 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b); 322} 323 324static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 325_mm256_min_epu32(__m256i 
__a, __m256i __b) 326{ 327 return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b); 328} 329 330static __inline__ int __attribute__((__always_inline__, __nodebug__)) 331_mm256_movemask_epi8(__m256i __a) 332{ 333 return __builtin_ia32_pmovmskb256((__v32qi)__a); 334} 335 336static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 337_mm256_cvtepi8_epi16(__m128i __V) 338{ 339 return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__V); 340} 341 342static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 343_mm256_cvtepi8_epi32(__m128i __V) 344{ 345 return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__V); 346} 347 348static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 349_mm256_cvtepi8_epi64(__m128i __V) 350{ 351 return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__V); 352} 353 354static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 355_mm256_cvtepi16_epi32(__m128i __V) 356{ 357 return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__V); 358} 359 360static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 361_mm256_cvtepi16_epi64(__m128i __V) 362{ 363 return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__V); 364} 365 366static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 367_mm256_cvtepi32_epi64(__m128i __V) 368{ 369 return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__V); 370} 371 372static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 373_mm256_cvtepu8_epi16(__m128i __V) 374{ 375 return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__V); 376} 377 378static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 379_mm256_cvtepu8_epi32(__m128i __V) 380{ 381 return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__V); 382} 383 384static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 385_mm256_cvtepu8_epi64(__m128i __V) 386{ 387 return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__V); 388} 389 
390static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 391_mm256_cvtepu16_epi32(__m128i __V) 392{ 393 return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__V); 394} 395 396static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 397_mm256_cvtepu16_epi64(__m128i __V) 398{ 399 return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__V); 400} 401 402static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 403_mm256_cvtepu32_epi64(__m128i __V) 404{ 405 return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__V); 406} 407 408static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 409_mm256_mul_epi32(__m256i __a, __m256i __b) 410{ 411 return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); 412} 413 414static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 415_mm256_mulhrs_epi16(__m256i __a, __m256i __b) 416{ 417 return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); 418} 419 420static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 421_mm256_mulhi_epu16(__m256i __a, __m256i __b) 422{ 423 return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); 424} 425 426static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 427_mm256_mulhi_epi16(__m256i __a, __m256i __b) 428{ 429 return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); 430} 431 432static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 433_mm256_mullo_epi16(__m256i __a, __m256i __b) 434{ 435 return (__m256i)((__v16hi)__a * (__v16hi)__b); 436} 437 438static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 439_mm256_mullo_epi32 (__m256i __a, __m256i __b) 440{ 441 return (__m256i)((__v8si)__a * (__v8si)__b); 442} 443 444static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 445_mm256_mul_epu32(__m256i __a, __m256i __b) 446{ 447 return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); 448} 

/* Bitwise OR of 256-bit values (VPOR). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_or_si256(__m256i __a, __m256i __b)
{
  return __a | __b;
}

/* Sums of absolute byte differences within 8-byte groups (VPSADBW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sad_epu8(__m256i __a, __m256i __b)
{
  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
}

/* Per-lane byte shuffle of __a controlled by __b (VPSHUFB). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_shuffle_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
}

/* Shuffle 32-bit elements within each 128-bit lane by immediate (VPSHUFD).
 * Each 2-bit field of imm selects one element; the pattern is applied to
 * both lanes (indices 0-3 low lane, 4-7 high lane). */
#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
  __m256i __a = (a); \
  (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)_mm256_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })

/* Shuffle the high 4 words of each lane by immediate; low words pass
 * through unchanged (VPSHUFHW). */
#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
  __m256i __a = (a); \
  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6), \
                                   8, 9, 10, 11, \
                                   12 + (((imm) & 0x03) >> 0), \
                                   12 + (((imm) & 0x0c) >> 2), \
                                   12 + (((imm) & 0x30) >> 4), \
                                   12 + (((imm) & 0xc0) >> 6)); })

/* Shuffle the low 4 words of each lane by immediate; high words pass
 * through unchanged (VPSHUFLW). */
#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
  __m256i __a = (a); \
  (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)_mm256_set1_epi16(0), \
                                   (imm) & 0x3,((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7, \
                                   8 + (((imm) & 0x03) >> 0), \
                                   8 + (((imm) & 0x0c) >> 2), \
                                   8 + (((imm) & 0x30) >> 4), \
                                   8 + (((imm) & 0xc0) >> 6), \
                                   12, 13, 14, 15); })

/* Negate/zero/keep bytes of __a per the sign of bytes in __b (VPSIGNB). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sign_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
}

/* Negate/zero/keep words of __a per the sign of words in __b (VPSIGNW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sign_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
}

/* Negate/zero/keep dwords of __a per the sign of dwords in __b (VPSIGND). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sign_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
}

/* Byte-shift each 128-bit lane left by count bytes (VPSLLDQ).
 * The builtin takes a bit count, hence (count)*8. */
#define _mm256_slli_si256(a, count) __extension__ ({ \
  __m256i __a = (a); \
  (__m256i)__builtin_ia32_pslldqi256(__a, (count)*8); })

/* Shift packed 16-bit integers left by a scalar count (VPSLLW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_slli_epi16(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
}

/* Shift packed 16-bit integers left by the count in the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sll_epi16(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
}

/* Shift packed 32-bit integers left by a scalar count (VPSLLD). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_slli_epi32(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
}

/* Shift packed 32-bit integers left by the count in the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sll_epi32(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
}

/* Shift packed 64-bit integers left by a scalar count (VPSLLQ). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_slli_epi64(__m256i __a, int __count)
{
  return __builtin_ia32_psllqi256(__a, __count);
}

/* Shift packed 64-bit integers left by the count in the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sll_epi64(__m256i __a, __m128i __count)
{
  return __builtin_ia32_psllq256(__a, __count);
}

/* Arithmetic right shift of packed 16-bit integers by a scalar count (VPSRAW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srai_epi16(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
}

/* Arithmetic right shift of packed 16-bit integers by the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sra_epi16(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
}

/* Arithmetic right shift of packed 32-bit integers by a scalar count (VPSRAD). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srai_epi32(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
}

/* Arithmetic right shift of packed 32-bit integers by the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sra_epi32(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
}

/* Byte-shift each 128-bit lane right by count bytes (VPSRLDQ).
 * The builtin takes a bit count, hence (count)*8. */
#define _mm256_srli_si256(a, count) __extension__ ({ \
  __m256i __a = (a); \
  (__m256i)__builtin_ia32_psrldqi256(__a, (count)*8); })

/* Logical right shift of packed 16-bit integers by a scalar count (VPSRLW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srli_epi16(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
}

/* Logical right shift of packed 16-bit integers by the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srl_epi16(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
}

/* Logical right shift of packed 32-bit integers by a scalar count (VPSRLD). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srli_epi32(__m256i __a, int __count)
{
  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
}

/* Logical right shift of packed 32-bit integers by the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srl_epi32(__m256i __a, __m128i __count)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
}

/* Logical right shift of packed 64-bit integers by a scalar count (VPSRLQ). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srli_epi64(__m256i __a, int __count)
{
  return __builtin_ia32_psrlqi256(__a, __count);
}

/* Logical right shift of packed 64-bit integers by the low quadword of __count. */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_srl_epi64(__m256i __a, __m128i __count)
{
  return __builtin_ia32_psrlq256(__a, __count);
}

/* Wrapping subtraction of packed 8-bit integers (VPSUBB). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sub_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)((__v32qi)__a - (__v32qi)__b);
}

/* Wrapping subtraction of packed 16-bit integers (VPSUBW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sub_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)((__v16hi)__a - (__v16hi)__b);
}

/* Wrapping subtraction of packed 32-bit integers (VPSUBD). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sub_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)((__v8si)__a - (__v8si)__b);
}

/* Wrapping subtraction of packed 64-bit integers (VPSUBQ). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_sub_epi64(__m256i __a, __m256i __b)
{
  return __a - __b;
}

/* Saturating subtraction of packed signed 8-bit integers (VPSUBSB). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_subs_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
}

/* Saturating subtraction of packed signed 16-bit integers (VPSUBSW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_subs_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
}

/* Saturating subtraction of packed unsigned 8-bit integers (VPSUBUSB). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_subs_epu8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
}

/* Saturating subtraction of packed unsigned 16-bit integers (VPSUBUSW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_subs_epu16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
}

/* Interleave the high bytes of each 128-bit lane of __a and __b (VPUNPCKHBW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
}

/* Interleave the high words of each 128-bit lane (VPUNPCKHWD). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

/* Interleave the high dwords of each 128-bit lane (VPUNPCKHDQ). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
}

/* Interleave the high quadwords of each 128-bit lane (VPUNPCKHQDQ). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector(__a, __b, 1, 4+1, 3, 4+3);
}

/* Interleave the low bytes of each 128-bit lane (VPUNPCKLBW). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
}

/* Interleave the low words of each 128-bit lane (VPUNPCKLWD). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
}

/* Interleave the low dwords of each 128-bit lane (VPUNPCKLDQ). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
}

/* Interleave the low quadwords of each 128-bit lane (VPUNPCKLQDQ). */
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
{
  return (__m256i)__builtin_shufflevector(__a, __b, 0, 4+0, 2, 4+2);
}

722static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 723_mm256_xor_si256(__m256i __a, __m256i __b) 724{ 725 return __a ^ __b; 726} 727 728static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 729_mm256_stream_load_si256(__m256i *__V) 730{ 731 return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__V); 732} 733 734static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) 735_mm_broadcastss_ps(__m128 __X) 736{ 737 return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X); 738} 739 740static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) 741_mm256_broadcastss_ps(__m128 __X) 742{ 743 return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X); 744} 745 746static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) 747_mm256_broadcastsd_pd(__m128d __X) 748{ 749 return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X); 750} 751 752static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 753_mm_broadcastsi128_si256(__m128i const *__a) 754{ 755 return (__m256i)__builtin_ia32_vbroadcastsi256(__a); 756} 757 758#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \ 759 __m128i __V1 = (V1); \ 760 __m128i __V2 = (V2); \ 761 (__m128i)__builtin_ia32_pblendd128((__v4si)__V1, (__v4si)__V2, (M)); }) 762 763#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \ 764 __m256i __V1 = (V1); \ 765 __m256i __V2 = (V2); \ 766 (__m256i)__builtin_ia32_pblendd256((__v8si)__V1, (__v8si)__V2, (M)); }) 767 768static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 769_mm256_broadcastb_epi8(__m128i __X) 770{ 771 return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X); 772} 773 774static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 775_mm256_broadcastw_epi16(__m128i __X) 776{ 777 return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X); 778} 779 780static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 
781_mm256_broadcastd_epi32(__m128i __X) 782{ 783 return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X); 784} 785 786static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 787_mm256_broadcastq_epi64(__m128i __X) 788{ 789 return (__m256i)__builtin_ia32_pbroadcastq256(__X); 790} 791 792static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 793_mm_broadcastb_epi8(__m128i __X) 794{ 795 return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X); 796} 797 798static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 799_mm_broadcastw_epi16(__m128i __X) 800{ 801 return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X); 802} 803 804 805static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 806_mm_broadcastd_epi32(__m128i __X) 807{ 808 return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X); 809} 810 811static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 812_mm_broadcastq_epi64(__m128i __X) 813{ 814 return (__m128i)__builtin_ia32_pbroadcastq128(__X); 815} 816 817static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 818_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) 819{ 820 return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); 821} 822 823#define _mm256_permute4x64_pd(V, M) __extension__ ({ \ 824 __m256d __V = (V); \ 825 (__m256d)__builtin_shufflevector((__v4df)__V, (__v4df) _mm256_setzero_pd(), \ 826 (M) & 0x3, ((M) & 0xc) >> 2, \ 827 ((M) & 0x30) >> 4, ((M) & 0xc0) >> 6); }) 828 829static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) 830_mm256_permutevar8x32_ps(__m256 __a, __m256 __b) 831{ 832 return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8sf)__b); 833} 834 835#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \ 836 __m256i __V = (V); \ 837 (__m256i)__builtin_shufflevector((__v4di)__V, (__v4di) _mm256_setzero_si256(), \ 838 (M) & 0x3, ((M) & 0xc) >> 2, \ 839 ((M) & 0x30) >> 4, ((M) & 0xc0) 
>> 6); }) 840 841#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \ 842 __m256i __V1 = (V1); \ 843 __m256i __V2 = (V2); \ 844 (__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); }) 845 846#define _mm256_extracti128_si256(A, O) __extension__ ({ \ 847 __m256i __A = (A); \ 848 (__m128i)__builtin_ia32_extract128i256(__A, (O)); }) 849 850#define _mm256_inserti128_si256(V1, V2, O) __extension__ ({ \ 851 __m256i __V1 = (V1); \ 852 __m128i __V2 = (V2); \ 853 (__m256i)__builtin_ia32_insert128i256(__V1, __V2, (O)); }) 854 855static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 856_mm256_maskload_epi32(int const *__X, __m256i __M) 857{ 858 return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); 859} 860 861static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 862_mm256_maskload_epi64(long long const *__X, __m256i __M) 863{ 864 return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, __M); 865} 866 867static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 868_mm_maskload_epi32(int const *__X, __m128i __M) 869{ 870 return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); 871} 872 873static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 874_mm_maskload_epi64(long long const *__X, __m128i __M) 875{ 876 return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); 877} 878 879static __inline__ void __attribute__((__always_inline__, __nodebug__)) 880_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) 881{ 882 __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 883} 884 885static __inline__ void __attribute__((__always_inline__, __nodebug__)) 886_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) 887{ 888 __builtin_ia32_maskstoreq256((__v4di *)__X, __M, __Y); 889} 890 891static __inline__ void __attribute__((__always_inline__, __nodebug__)) 892_mm_maskstore_epi32(int *__X, __m128i __M, 
__m128i __Y) 893{ 894 __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 895} 896 897static __inline__ void __attribute__((__always_inline__, __nodebug__)) 898_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) 899{ 900 __builtin_ia32_maskstoreq(( __v2di *)__X, __M, __Y); 901} 902 903static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 904_mm256_sllv_epi32(__m256i __X, __m256i __Y) 905{ 906 return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); 907} 908 909static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 910_mm_sllv_epi32(__m128i __X, __m128i __Y) 911{ 912 return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); 913} 914 915static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 916_mm256_sllv_epi64(__m256i __X, __m256i __Y) 917{ 918 return (__m256i)__builtin_ia32_psllv4di(__X, __Y); 919} 920 921static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 922_mm_sllv_epi64(__m128i __X, __m128i __Y) 923{ 924 return (__m128i)__builtin_ia32_psllv2di(__X, __Y); 925} 926 927static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 928_mm256_srav_epi32(__m256i __X, __m256i __Y) 929{ 930 return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); 931} 932 933static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 934_mm_srav_epi32(__m128i __X, __m128i __Y) 935{ 936 return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); 937} 938 939static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) 940_mm256_srlv_epi32(__m256i __X, __m256i __Y) 941{ 942 return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); 943} 944 945static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 946_mm_srlv_epi32(__m128i __X, __m128i __Y) 947{ 948 return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); 949} 950 951static __inline__ __m256i __attribute__((__always_inline__, 
__nodebug__)) 952_mm256_srlv_epi64(__m256i __X, __m256i __Y) 953{ 954 return (__m256i)__builtin_ia32_psrlv4di(__X, __Y); 955} 956 957static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) 958_mm_srlv_epi64(__m128i __X, __m128i __Y) 959{ 960 return (__m128i)__builtin_ia32_psrlv2di(__X, __Y); 961} 962 963#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \ 964 __m128d __a = (a); \ 965 double const *__m = (m); \ 966 __m128i __i = (i); \ 967 __m128d __mask = (mask); \ 968 (__m128d)__builtin_ia32_gatherd_pd((__v2df)__a, (const __v2df *)__m, \ 969 (__v4si)__i, (__v2df)__mask, (s)); }) 970 971#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \ 972 __m256d __a = (a); \ 973 double const *__m = (m); \ 974 __m128i __i = (i); \ 975 __m256d __mask = (mask); \ 976 (__m256d)__builtin_ia32_gatherd_pd256((__v4df)__a, (const __v4df *)__m, \ 977 (__v4si)__i, (__v4df)__mask, (s)); }) 978 979#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \ 980 __m128d __a = (a); \ 981 double const *__m = (m); \ 982 __m128i __i = (i); \ 983 __m128d __mask = (mask); \ 984 (__m128d)__builtin_ia32_gatherq_pd((__v2df)__a, (const __v2df *)__m, \ 985 (__v2di)__i, (__v2df)__mask, (s)); }) 986 987#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \ 988 __m256d __a = (a); \ 989 double const *__m = (m); \ 990 __m256i __i = (i); \ 991 __m256d __mask = (mask); \ 992 (__m256d)__builtin_ia32_gatherq_pd256((__v4df)__a, (const __v4df *)__m, \ 993 (__v4di)__i, (__v4df)__mask, (s)); }) 994 995#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \ 996 __m128 __a = (a); \ 997 float const *__m = (m); \ 998 __m128i __i = (i); \ 999 __m128 __mask = (mask); \ 1000 (__m128)__builtin_ia32_gatherd_ps((__v4sf)__a, (const __v4sf *)__m, \ 1001 (__v4si)__i, (__v4sf)__mask, (s)); }) 1002 1003#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \ 1004 __m256 __a = (a); \ 1005 float const *__m = (m); \ 1006 __m256i __i = (i); 
\ 1007 __m256 __mask = (mask); \ 1008 (__m256)__builtin_ia32_gatherd_ps256((__v8sf)__a, (const __v8sf *)__m, \ 1009 (__v8si)__i, (__v8sf)__mask, (s)); }) 1010 1011#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \ 1012 __m128 __a = (a); \ 1013 float const *__m = (m); \ 1014 __m128i __i = (i); \ 1015 __m128 __mask = (mask); \ 1016 (__m128)__builtin_ia32_gatherq_ps((__v4sf)__a, (const __v4sf *)__m, \ 1017 (__v2di)__i, (__v4sf)__mask, (s)); }) 1018 1019#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \ 1020 __m128 __a = (a); \ 1021 float const *__m = (m); \ 1022 __m256i __i = (i); \ 1023 __m128 __mask = (mask); \ 1024 (__m128)__builtin_ia32_gatherq_ps256((__v4sf)__a, (const __v4sf *)__m, \ 1025 (__v4di)__i, (__v4sf)__mask, (s)); }) 1026 1027#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \ 1028 __m128i __a = (a); \ 1029 int const *__m = (m); \ 1030 __m128i __i = (i); \ 1031 __m128i __mask = (mask); \ 1032 (__m128i)__builtin_ia32_gatherd_d((__v4si)__a, (const __v4si *)__m, \ 1033 (__v4si)__i, (__v4si)__mask, (s)); }) 1034 1035#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \ 1036 __m256i __a = (a); \ 1037 int const *__m = (m); \ 1038 __m256i __i = (i); \ 1039 __m256i __mask = (mask); \ 1040 (__m256i)__builtin_ia32_gatherd_d256((__v8si)__a, (const __v8si *)__m, \ 1041 (__v8si)__i, (__v8si)__mask, (s)); }) 1042 1043#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \ 1044 __m128i __a = (a); \ 1045 int const *__m = (m); \ 1046 __m128i __i = (i); \ 1047 __m128i __mask = (mask); \ 1048 (__m128i)__builtin_ia32_gatherq_d((__v4si)__a, (const __v4si *)__m, \ 1049 (__v2di)__i, (__v4si)__mask, (s)); }) 1050 1051#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \ 1052 __m128i __a = (a); \ 1053 int const *__m = (m); \ 1054 __m256i __i = (i); \ 1055 __m128i __mask = (mask); \ 1056 (__m128i)__builtin_ia32_gatherq_d256((__v4si)__a, (const __v4si *)__m, \ 1057 
(__v4di)__i, (__v4si)__mask, (s)); }) 1058 1059#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \ 1060 __m128i __a = (a); \ 1061 int const *__m = (m); \ 1062 __m128i __i = (i); \ 1063 __m128i __mask = (mask); \ 1064 (__m128i)__builtin_ia32_gatherd_q((__v2di)__a, (const __v2di *)__m, \ 1065 (__v4si)__i, (__v2di)__mask, (s)); }) 1066 1067#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \ 1068 __m256i __a = (a); \ 1069 int const *__m = (m); \ 1070 __m128i __i = (i); \ 1071 __m256i __mask = (mask); \ 1072 (__m256i)__builtin_ia32_gatherd_q256((__v4di)__a, (const __v4di *)__m, \ 1073 (__v4si)__i, (__v4di)__mask, (s)); }) 1074 1075#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \ 1076 __m128i __a = (a); \ 1077 int const *__m = (m); \ 1078 __m128i __i = (i); \ 1079 __m128i __mask = (mask); \ 1080 (__m128i)__builtin_ia32_gatherq_q((__v2di)__a, (const __v2di *)__m, \ 1081 (__v2di)__i, (__v2di)__mask, (s)); }) 1082 1083#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \ 1084 __m256i __a = (a); \ 1085 int const *__m = (m); \ 1086 __m256i __i = (i); \ 1087 __m256i __mask = (mask); \ 1088 (__m256i)__builtin_ia32_gatherq_q256((__v4di)__a, (const __v4di *)__m, \ 1089 (__v4di)__i, (__v4di)__mask, (s)); }) 1090 1091#define _mm_i32gather_pd(m, i, s) __extension__ ({ \ 1092 double const *__m = (m); \ 1093 __m128i __i = (i); \ 1094 (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_setzero_pd(), \ 1095 (const __v2df *)__m, (__v4si)__i, \ 1096 (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); }) 1097 1098#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \ 1099 double const *__m = (m); \ 1100 __m128i __i = (i); \ 1101 (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_setzero_pd(), \ 1102 (const __v4df *)__m, (__v4si)__i, \ 1103 (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); }) 1104 1105#define _mm_i64gather_pd(m, i, s) __extension__ ({ \ 1106 double const *__m = (m); \ 1107 __m128i __i = 
(i); \ 1108 (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_setzero_pd(), \ 1109 (const __v2df *)__m, (__v2di)__i, \ 1110 (__v2df)_mm_set1_pd((double)(long long int)-1), (s)); }) 1111 1112#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \ 1113 double const *__m = (m); \ 1114 __m256i __i = (i); \ 1115 (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_setzero_pd(), \ 1116 (const __v4df *)__m, (__v4di)__i, \ 1117 (__v4df)_mm256_set1_pd((double)(long long int)-1), (s)); }) 1118 1119#define _mm_i32gather_ps(m, i, s) __extension__ ({ \ 1120 float const *__m = (m); \ 1121 __m128i __i = (i); \ 1122 (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_setzero_ps(), \ 1123 (const __v4sf *)__m, (__v4si)__i, \ 1124 (__v4sf)_mm_set1_ps((float)(int)-1), (s)); }) 1125 1126#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \ 1127 float const *__m = (m); \ 1128 __m256i __i = (i); \ 1129 (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_setzero_ps(), \ 1130 (const __v8sf *)__m, (__v8si)__i, \ 1131 (__v8sf)_mm256_set1_ps((float)(int)-1), (s)); }) 1132 1133#define _mm_i64gather_ps(m, i, s) __extension__ ({ \ 1134 float const *__m = (m); \ 1135 __m128i __i = (i); \ 1136 (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_setzero_ps(), \ 1137 (const __v4sf *)__m, (__v2di)__i, \ 1138 (__v4sf)_mm_set1_ps((float)(int)-1), (s)); }) 1139 1140#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \ 1141 float const *__m = (m); \ 1142 __m256i __i = (i); \ 1143 (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_setzero_ps(), \ 1144 (const __v4sf *)__m, (__v4di)__i, \ 1145 (__v4sf)_mm_set1_ps((float)(int)-1), (s)); }) 1146 1147#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \ 1148 int const *__m = (m); \ 1149 __m128i __i = (i); \ 1150 (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_setzero_si128(), \ 1151 (const __v4si *)__m, (__v4si)__i, \ 1152 (__v4si)_mm_set1_epi32(-1), (s)); }) 1153 1154#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \ 1155 int const *__m = (m); \ 1156 __m256i __i = 
(i); \ 1157 (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_setzero_si256(), \ 1158 (const __v8si *)__m, (__v8si)__i, \ 1159 (__v8si)_mm256_set1_epi32(-1), (s)); }) 1160 1161#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \ 1162 int const *__m = (m); \ 1163 __m128i __i = (i); \ 1164 (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_setzero_si128(), \ 1165 (const __v4si *)__m, (__v2di)__i, \ 1166 (__v4si)_mm_set1_epi32(-1), (s)); }) 1167 1168#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \ 1169 int const *__m = (m); \ 1170 __m256i __i = (i); \ 1171 (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_setzero_si128(), \ 1172 (const __v4si *)__m, (__v4di)__i, \ 1173 (__v4si)_mm_set1_epi32(-1), (s)); }) 1174 1175#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \ 1176 int const *__m = (m); \ 1177 __m128i __i = (i); \ 1178 (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_setzero_si128(), \ 1179 (const __v2di *)__m, (__v4si)__i, \ 1180 (__v2di)_mm_set1_epi64x(-1), (s)); }) 1181 1182#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \ 1183 int const *__m = (m); \ 1184 __m128i __i = (i); \ 1185 (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_setzero_si256(), \ 1186 (const __v4di *)__m, (__v4si)__i, \ 1187 (__v4di)_mm256_set1_epi64x(-1), (s)); }) 1188 1189#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \ 1190 int const *__m = (m); \ 1191 __m128i __i = (i); \ 1192 (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_setzero_si128(), \ 1193 (const __v2di *)__m, (__v2di)__i, \ 1194 (__v2di)_mm_set1_epi64x(-1), (s)); }) 1195 1196#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \ 1197 int const *__m = (m); \ 1198 __m256i __i = (i); \ 1199 (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_setzero_si256(), \ 1200 (const __v4di *)__m, (__v4di)__i, \ 1201 (__v4di)_mm256_set1_epi64x(-1), (s)); }) 1202