1/**************************************************************************** 2* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 
22****************************************************************************/ 23 24#ifndef __SWR_SIMD16INTRIN_H__ 25#define __SWR_SIMD16INTRIN_H__ 26 27#if ENABLE_AVX512_SIMD16 28 29#if KNOB_SIMD16_WIDTH == 16 30 31#if ENABLE_AVX512_EMULATION 32struct simd16scalar 33{ 34 __m256 lo; 35 __m256 hi; 36}; 37struct simd16scalard 38{ 39 __m256d lo; 40 __m256d hi; 41}; 42struct simd16scalari 43{ 44 __m256i lo; 45 __m256i hi; 46}; 47typedef uint16_t simd16mask; 48 49#define _simd16_masklo(mask) ((mask) & 0xFF) 50#define _simd16_maskhi(mask) (((mask) >> 8)) 51#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo)) 52 53#else 54typedef __m512 simd16scalar; 55typedef __m512d simd16scalard; 56typedef __m512i simd16scalari; 57typedef __mmask16 simd16mask; 58#endif//ENABLE_AVX512_EMULATION 59#else 60#error Unsupported vector width 61#endif//KNOB_SIMD16_WIDTH == 16 62 63OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector 64{ 65 simd16scalar v[4]; 66 struct 67 { 68 simd16scalar x, y, z, w; 69 }; 70 71 simd16scalar& operator[] (const int i) { return v[i]; } 72 const simd16scalar& operator[] (const int i) const { return v[i]; } 73}; 74 75#if ENABLE_AVX512_EMULATION 76 77#define SIMD16_EMU_AVX512_0(type, func, intrin) \ 78INLINE type func()\ 79{\ 80 type result;\ 81\ 82 result.lo = intrin();\ 83 result.hi = intrin();\ 84\ 85 return result;\ 86} 87 88#define SIMD16_EMU_AVX512_1(type, func, intrin) \ 89INLINE type func(type a)\ 90{\ 91 type result;\ 92\ 93 result.lo = intrin(a.lo);\ 94 result.hi = intrin(a.hi);\ 95\ 96 return result;\ 97} 98 99#define SIMD16_EMU_AVX512_2(type, func, intrin) \ 100INLINE type func(type a, type b)\ 101{\ 102 type result;\ 103\ 104 result.lo = intrin(a.lo, b.lo);\ 105 result.hi = intrin(a.hi, b.hi);\ 106\ 107 return result;\ 108} 109 110#define SIMD16_EMU_AVX512_3(type, func, intrin) \ 111INLINE type func(type a, type b, type c)\ 112{\ 113 type result;\ 114\ 115 result.lo = intrin(a.lo, b.lo, c.lo);\ 116 result.hi = intrin(a.hi, b.hi, c.hi);\ 117\ 118 return 
result;\ 119} 120 121SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps) 122SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256) 123 124INLINE simd16scalar _simd16_set1_ps(float a) 125{ 126 simd16scalar result; 127 128 result.lo = _mm256_set1_ps(a); 129 result.hi = _mm256_set1_ps(a); 130 131 return result; 132} 133 134INLINE simd16scalari _simd16_set1_epi8(char a) 135{ 136 simd16scalari result; 137 138 result.lo = _mm256_set1_epi8(a); 139 result.hi = _mm256_set1_epi8(a); 140 141 return result; 142} 143 144INLINE simd16scalari _simd16_set1_epi32(int a) 145{ 146 simd16scalari result; 147 148 result.lo = _mm256_set1_epi32(a); 149 result.hi = _mm256_set1_epi32(a); 150 151 return result; 152} 153 154INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) 155{ 156 simd16scalar result; 157 158 result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0); 159 result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8); 160 161 return result; 162} 163 164INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) 165{ 166 simd16scalari result; 167 168 result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); 169 result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8); 170 171 return result; 172} 173 174INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) 175{ 176 simd16scalar result; 177 178 result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0); 179 result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0); 180 181 return result; 182} 183 184INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) 185{ 186 simd16scalari result; 187 188 result.lo = 
_mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); 189 result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); 190 191 return result; 192} 193 194INLINE simd16scalar _simd16_load_ps(float const *m) 195{ 196 simd16scalar result; 197 198 float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo)); 199 200 result.lo = _mm256_load_ps(m); 201 result.hi = _mm256_load_ps(n); 202 203 return result; 204} 205 206INLINE simd16scalar _simd16_loadu_ps(float const *m) 207{ 208 simd16scalar result; 209 210 float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo)); 211 212 result.lo = _mm256_loadu_ps(m); 213 result.hi = _mm256_loadu_ps(n); 214 215 return result; 216} 217 218INLINE simd16scalar _simd16_load1_ps(float const *m) 219{ 220 simd16scalar result; 221 222 result.lo = _mm256_broadcast_ss(m); 223 result.hi = _mm256_broadcast_ss(m); 224 225 return result; 226} 227 228INLINE simd16scalari _simd16_load_si(simd16scalari const *m) 229{ 230 simd16scalari result; 231 232 result.lo = _mm256_load_si256(&m[0].lo); 233 result.hi = _mm256_load_si256(&m[0].hi); 234 235 return result; 236} 237 238INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m) 239{ 240 simd16scalari result; 241 242 result.lo = _mm256_loadu_si256(&m[0].lo); 243 result.hi = _mm256_loadu_si256(&m[0].hi); 244 245 return result; 246} 247 248INLINE simd16scalar _simd16_broadcast_ss(float const *m) 249{ 250 simd16scalar result; 251 252 result.lo = _mm256_broadcast_ss(m); 253 result.hi = _mm256_broadcast_ss(m); 254 255 return result; 256} 257 258INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m) 259{ 260 simd16scalar result; 261 262 result.lo = _mm256_broadcast_ps(m); 263 result.hi = _mm256_broadcast_ps(m); 264 265 return result; 266} 267 268INLINE void _simd16_store_ps(float *m, simd16scalar a) 269{ 270 float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo)); 271 272 
_mm256_store_ps(m, a.lo); 273 _mm256_store_ps(n, a.hi); 274} 275 276INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a) 277{ 278 float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo)); 279 280 _mm256_maskstore_ps(m, mask.lo, a.lo); 281 _mm256_maskstore_ps(n, mask.hi, a.hi); 282} 283 284INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a) 285{ 286 _mm256_store_si256(&m[0].lo, a.lo); 287 _mm256_store_si256(&m[0].hi, a.hi); 288} 289 290INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8) 291{ 292 switch (imm8) 293 { 294 case 0: 295 return a.lo; 296 case 1: 297 return a.hi; 298 } 299 return _simd_set1_ps(0.0f); 300} 301 302INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8) 303{ 304 switch (imm8) 305 { 306 case 0: 307 return a.lo; 308 case 1: 309 return a.hi; 310 } 311 return _simd_set1_epi32(0); 312} 313 314INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8) 315{ 316 switch (imm8) 317 { 318 case 0: 319 a.lo = b; 320 break; 321 case 1: 322 a.hi = b; 323 break; 324 } 325 return a; 326} 327 328INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8) 329{ 330 switch (imm8) 331 { 332 case 0: 333 a.lo = b; 334 break; 335 case 1: 336 a.hi = b; 337 break; 338 } 339 return a; 340} 341 342template <simd16mask mask> 343INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b) 344{ 345 simd16scalar result; 346 347 result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask)); 348 result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask)); 349 350 return result; 351} 352 353#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b) 354 355SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps) 356 357INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask) 358{ 359 simd16scalari result; 360 361 result.lo = 
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo)); 362 result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi)); 363 364 return result; 365} 366 367INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask) 368{ 369 simd16scalari result; 370 371 result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo))); 372 result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi))); 373 374 return result; 375} 376 377SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps) 378SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps) 379SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps) 380SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps) 381SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps) 382SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps) 383 384INLINE simd16mask _simd16_movemask_ps(simd16scalar a) 385{ 386 simd16mask mask; 387 388 reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo); 389 reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi); 390 391 return mask; 392} 393 394INLINE simd16mask _simd16_movemask_pd(simd16scalard a) 395{ 396 simd16mask mask; 397 398 reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo); 399 reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi); 400 401 return mask; 402} 403 404INLINE simd16mask _simd16_movemask_epi8(simd16scalari a) 405{ 406 simd16mask mask; 407 408 reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo); 409 reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi); 410 411 return mask; 412} 413 414INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a) 415{ 416 simd16scalari result; 417 418 
result.lo = _mm256_cvtps_epi32(a.lo); 419 result.hi = _mm256_cvtps_epi32(a.hi); 420 421 return result; 422} 423 424INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a) 425{ 426 simd16scalari result; 427 428 result.lo = _mm256_cvttps_epi32(a.lo); 429 result.hi = _mm256_cvttps_epi32(a.hi); 430 431 return result; 432} 433 434INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a) 435{ 436 simd16scalar result; 437 438 result.lo = _mm256_cvtepi32_ps(a.lo); 439 result.hi = _mm256_cvtepi32_ps(a.hi); 440 441 return result; 442} 443 444template <int comp> 445INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b) 446{ 447 simd16scalar result; 448 449 result.lo = _mm256_cmp_ps(a.lo, b.lo, comp); 450 result.hi = _mm256_cmp_ps(a.hi, b.hi, comp); 451 452 return result; 453} 454 455#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b) 456#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b) 457#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b) 458#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b) 459#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b) 460#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b) 461 462SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps) 463SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps) 464SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps) 465SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps) 466 467INLINE simd16scalar _simd16_castsi_ps(simd16scalari a) 468{ 469 return *reinterpret_cast<simd16scalar *>(&a); 470} 471 472INLINE simd16scalari _simd16_castps_si(simd16scalar a) 473{ 474 return *reinterpret_cast<simd16scalari *>(&a); 475} 476 477INLINE simd16scalard _simd16_castsi_pd(simd16scalari a) 478{ 479 return *reinterpret_cast<simd16scalard *>(&a); 480} 481 482INLINE simd16scalari _simd16_castpd_si(simd16scalard a) 483{ 484 return *reinterpret_cast<simd16scalari *>(&a); 485} 486 487INLINE simd16scalar 
_simd16_castpd_ps(simd16scalard a) 488{ 489 return *reinterpret_cast<simd16scalar *>(&a); 490} 491 492INLINE simd16scalard _simd16_castps_pd(simd16scalar a) 493{ 494 return *reinterpret_cast<simd16scalard *>(&a); 495} 496 497SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps) 498 499template <int mode> 500INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a) 501{ 502 simd16scalar result; 503 504 result.lo = _mm256_round_ps(a.lo, mode); 505 result.hi = _mm256_round_ps(a.hi, mode); 506 507 return result; 508} 509 510#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a) 511 512SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32) 513SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32) 514SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32) 515SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64) 516SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32) 517SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32) 518SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32) 519SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32) 520SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32) 521SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si) 522SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si) 523SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si) 524SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si) 525SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32) 526SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32) 527SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32) 528 529INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) 530{ 531 int lo = _mm256_testz_ps(a.lo, b.lo); 532 int hi = _mm256_testz_ps(a.hi, b.hi); 533 534 return lo & hi; 535} 536 537#define 
_simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a) 538 539SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps) 540SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps) 541SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd) 542SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd) 543 544SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8) 545SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8) 546SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16) 547SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16) 548SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32) 549SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32) 550SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64) 551SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64) 552 553template <int imm8> 554INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a) 555{ 556 simd16scalari result; 557 558 result.lo = _simd_slli_epi32(a.lo, imm8); 559 result.hi = _simd_slli_epi32(a.hi, imm8); 560 561 return result; 562} 563 564#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a) 565 566template <int imm8> 567INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a) 568{ 569 simd16scalari result; 570 571 result.lo = _simd_srai_epi32(a.lo, imm8); 572 result.hi = _simd_srai_epi32(a.hi, imm8); 573 574 return result; 575} 576 577#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a) 578 579template <int imm8> 580INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a) 581{ 582 simd16scalari result; 583 584 result.lo = _simd_srli_epi32(a.lo, imm8); 585 result.hi = _simd_srli_epi32(a.hi, imm8); 586 587 return result; 588} 589 590#define _simd16_srli_epi32(a, imm8) 
_simd16_srli_epi32_temp<imm8>(a) 591 592SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps) 593SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps) 594 595//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) 596template <int scale> 597INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index) 598{ 599 simd16scalar result; 600 601 result.lo = _simd_i32gather_ps(m, index.lo, scale); 602 result.hi = _simd_i32gather_ps(m, index.hi, scale); 603 604 return result; 605} 606 607#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index) 608 609//__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) 610template <int scale> 611INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask) 612{ 613 simd16scalar result; 614 615 result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale); 616 result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale); 617 618 return result; 619} 620 621#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, mask, index) 622 623SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8) 624SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8) 625SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8) 626SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8) 627SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32) 628SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64) 629SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64) 630SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16) 631SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16) 
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)

// Full 16-lane permute emulated on two 8-wide halves: each output half is a
// blend of "source lane from lo" and "source lane from hi", selected by
// whether the (0-15) index exceeds 7.
INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    const simdscalari mask = _simd_set1_epi32(7);

    // Gather candidates using only the low 3 bits of each index.
    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    // Select the hi-half candidate wherever the original index is > 7.
    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}

// Integer permute via bit-cast to the float permute above.
INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)

// 128-bit-lane shuffles: imm8 holds four 2-bit quarter selectors.  Each
// output half is built from one input's (lo, hi) pair with a remapped
// immediate for the 8-wide permute2f128.
template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)

// Within-128-bit-lane shuffle with a compile-time immediate.
template <int imm8>
INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)

// pd shuffle: low 4 immediate bits drive the lo half, high 4 the hi half.
template <int imm8>
INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

// Integer shuffles implemented via bit-casts to the float/double shuffles.
template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

// Zero-extending widening conversions; each 128-bit slice of the source
// feeds one 8-wide half of the result.
INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));

    return result;
}

INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi32(a);
    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));

    return result;
}

INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)

// In emulation mode the mask already is a plain integer; pass through.
INLINE simd16mask _simd16_int2mask(int mask)
{
    return mask;
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return mask;
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    // Lane i becomes all-ones iff bit i of the input mask is set.
    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#else

// Native AVX-512 path: convert a per-lane integer mask (nonzero = active)
// into a compact __mmask16.
INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}

#if 0
INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
{
    return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64());
}
#endif

#define _simd16_setzero_ps      _mm512_setzero_ps
#define _simd16_setzero_si      _mm512_setzero_si512
#define _simd16_set1_ps         _mm512_set1_ps
#define _simd16_set1_epi8       _mm512_set1_epi8
#define _simd16_set1_epi32      _mm512_set1_epi32

// Set 16 distinct lanes; e0 is lane 0 (lowest).
INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

// 8-argument forms replicate the same 8 values into both halves.
INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps         _mm512_load_ps
#define _simd16_loadu_ps        _mm512_loadu_ps
#if 1
#define _simd16_load1_ps        _simd16_broadcast_ss
#endif
#define _simd16_load_si         _mm512_load_si512
#define _simd16_loadu_si        _mm512_loadu_si512
// NOTE(review): _mm512_extload_ps is a KNC-era intrinsic — confirm it is
// available on the targeted AVX-512 toolchains.
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps        _mm512_store_ps
#define _simd16_store_si        _mm512_store_si512
#define _simd16_extract_ps      _mm512_extractf32x8_ps
#define _simd16_extract_si      _mm512_extracti32x8_epi32
857#define _simd16_insert_ps _mm512_insertf32x8 858#define _simd16_insert_si _mm512_inserti32x8 859 860INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a) 861{ 862 simd16mask k = _simd16_scalari2mask(mask); 863 864 _mm512_mask_store_ps(m, k, a); 865} 866 867#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b) 868 869INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask) 870{ 871 simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask)); 872 873 _mm512_mask_blend_ps(k, a, b); 874} 875 876INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask) 877{ 878 simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask)); 879 880 _mm512_mask_blend_epi32(k, a, b); 881} 882 883INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask) 884{ 885 simd16mask k = _simd16_scalari2mask(mask); 886 887 _mm512_mask_blend_epi32(k, a, b); 888} 889 890#define _simd16_mul_ps _mm512_mul_ps 891#define _simd16_add_ps _mm512_add_ps 892#define _simd16_sub_ps _mm512_sub_ps 893#define _simd16_rsqrt_ps _mm512_rsqrt14_ps 894#define _simd16_min_ps _mm512_min_ps 895#define _simd16_max_ps _mm512_max_ps 896 897INLINE simd16mask _simd16_movemask_ps(simd16scalar a) 898{ 899 return _simd16_scalari2mask(_mm512_castps_si512(a)); 900} 901 902#if 0 903INLINE simd16mask _simd16_movemask_pd(simd16scalard a) 904{ 905 return _simd16_scalard2mask(_mm512i_castpd_si512(a)); 906} 907#endif 908 909#if 0 910INLINE int _simd16_movemask_epi8(simd16scalari a) 911{ 912 return _simd16_scalar2mask(a); 913} 914#endif 915 916#define _simd16_cvtps_epi32 _mm512_cvtps_epi32 917#define _simd16_cvttps_epi32 _mm512_cvttps_epi32 918#define _simd16_cvtepi32_ps _mm512_cvtepi32_ps 919 920template <int comp> 921INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b) 922{ 923 simd16mask k = _mm512_cmpeq_ps_mask(a, b); 924 925 return 
_mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF))); 926} 927 928#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b) 929 930#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b) 931#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b) 932#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b) 933#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b) 934#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b) 935#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b) 936 937#define _simd16_castsi_ps _mm512_castsi512_ps 938#define _simd16_castps_si _mm512_castps_si512 939#define _simd16_castsi_pd _mm512_castsi512_pd 940#define _simd16_castpd_si _mm512_castpd_si512 941#define _simd16_castpd_ps _mm512_castpd_ps 942#define _simd16_castps_pd _mm512_castps_pd 943 944#define _simd16_andnot_ps _mm512_andnot_ps 945 946template <int mode> 947INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a) 948{ 949 return _mm512_roundscale_ps(a, mode); 950} 951 952#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a) 953 954#define _simd16_mul_epi32 _mm512_mul_epi32 955#define _simd16_mullo_epi32 _mm512_mullo_epi32 956#define _simd16_sub_epi32 _mm512_sub_epi32 957#define _simd16_sub_epi64 _mm512_sub_epi64 958#define _simd16_min_epi32 _mm512_min_epi32 959#define _simd16_max_epi32 _mm512_max_epi32 960#define _simd16_min_epu32 _mm512_min_epu32 961#define _simd16_max_epu32 _mm512_max_epu32 962#define _simd16_add_epi32 _mm512_add_epi32 963#define _simd16_and_si _mm512_and_si512 964#define _simd16_andnot_si _mm512_andnot_si512 965#define _simd16_or_si _mm512_or_si512 966#define _simd16_xor_si _mm512_xor_si512 967 968INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b) 969{ 970 simd16mask k = _mm512_cmpeq_epi32_mask(a, b); 971 972 return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)); 973} 974 975INLINE 
simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b) 976{ 977 simd16mask k = _mm512_cmpgt_epi32_mask(a, b); 978 979 return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)); 980} 981 982INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b) 983{ 984 simd16mask k = _mm512_cmplt_epi32_mask(a, b); 985 986 return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)); 987} 988 989#if 0 990INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) 991{ 992 int lo = _mm256_testz_ps(a.lo, b.lo); 993 int hi = _mm256_testz_ps(a.hi, b.hi); 994 995 return lo & hi; 996} 997 998#endif 999 1000#define _simd16_unpacklo_ps _mm512_unpacklo_ps 1001#define _simd16_unpackhi_ps _mm512_unpackhi_ps 1002#define _simd16_unpacklo_pd _mm512_unpacklo_pd 1003#define _simd16_unpackhi_pd _mm512_unpackhi_pd 1004#define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8 1005#define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8 1006#define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16 1007#define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16 1008#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32 1009#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32 1010#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64 1011#define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64 1012#define _simd16_slli_epi32 _mm512_slli_epi32 1013#define _simd16_srli_epi32 _mm512_srli_epi32 1014#define _simd16_srai_epi32 _mm512_srai_epi32 1015#define _simd16_fmadd_ps _mm512_fmadd_ps 1016#define _simd16_fmsub_ps _mm512_fmsub_ps 1017#define _simd16_adds_epu8 _mm512_adds_epu8 1018#define _simd16_subs_epu8 _mm512_subs_epu8 1019#define _simd16_add_epi8 _mm512_add_epi8 1020#define _simd16_shuffle_epi8 _mm512_shuffle_epi8 1021 1022#define _simd16_fmadd_ps _mm512_fmadd_ps 1023#define _simd16_fmsub_ps _mm512_fmsub_ps 1024 1025#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale) 1026#define _simd16_mask_i32gather_ps(a, m, 
index, mask, scale) _mm512_mask_i32gather_ps(a, m, index, mask, scale) 1027 1028#define _simd16_abs_epi32 _mm512_abs_epi32 1029#define _simd16_cmpeq_epi64 _mm512_abs_epi32 1030 1031INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b) 1032{ 1033 __mmask8 k = _mm512_cmpeq_epi64_mask(a, b); 1034 1035 return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF)); 1036} 1037 1038INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b) 1039{ 1040 __mmask8 k = _mm512_cmpgt_epi64_mask(a, b); 1041 1042 return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF)); 1043} 1044 1045INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b) 1046{ 1047 __mmask32 k = _mm512_cmpeq_epi16_mask(a, b); 1048 1049 return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF)); 1050} 1051 1052INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b) 1053{ 1054 __mmask32 k = _mm512_cmpgt_epi16_mask(a, b); 1055 1056 return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF)); 1057} 1058 1059INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b) 1060{ 1061 __mmask64 k = _mm512_cmpeq_epi8_mask(a, b); 1062 1063 return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF)); 1064} 1065 1066INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b) 1067{ 1068 __mmask64 k = _mm512_cmpgt_epi8_mask(a, b); 1069 1070 return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF)); 1071} 1072 1073#define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a) 1074#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a) 1075#define _simd16_sllv_epi32 _mm512_srlv_epi32 1076#define _simd16_srlv_epi32 _mm512_sllv_epi32 1077#define _simd16_permute2f128_ps _mm512_shuffle_f32x4 1078#define _simd16_permute2f128_pd _mm512_shuffle_f64x2 
#define _simd16_permute2f128_si _mm512_shuffle_i32x4
#define _simd16_shuffle_ps _mm512_shuffle_ps
#define _simd16_shuffle_pd _mm512_shuffle_pd
// Widening (zero-extend) conversions and saturating packs.
#define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16
#define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32
#define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32
#define _simd16_packus_epi16 _mm512_packus_epi16
#define _simd16_packs_epi16 _mm512_packs_epi16
#define _simd16_packus_epi32 _mm512_packus_epi32
#define _simd16_packs_epi32 _mm512_packs_epi32

// Integer shuffle routed through the float shuffle: imm8 must be a
// compile-time constant, hence the template indirection.
template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

// 64-bit integer shuffle routed through the double shuffle (same imm8
// compile-time-constant trick as above).
template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

// kmask <-> scalar int conversions.
INLINE simd16mask _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}

// Float less-than compare returning the raw kmask (no vector expansion).
INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    // Broadcast the 16-bit mask to every lane.
    simd16scalari temp = _simd16_set1_epi32(mask);

    // Per-lane bit selectors: lane i (set_epi32 lists the highest lane
    // first) holds bit i of the mask.
    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    // Lane i becomes all-ones when its bit is set: 0 < (mask & bit).
    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}
#endif//ENABLE_AVX512_EMULATION

#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMD16INTRIN_H__