utils.h revision 33fa4c99f7fa68fd8c33c75c4fe66c4cca76779f
1/**************************************************************************** 2* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22* 23* @file utils.h 24* 25* @brief Utilities used by SWR core. 
*
******************************************************************************/
#pragma once

#include <string.h>
#include <type_traits>
#include <algorithm>
#include "common/os.h"
#include "common/simdintrin.h"
#include "common/swr_assert.h"
#include "core/api.h"

#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
//////////////////////////////////////////////////////////////////////////
/// @brief Scalar fallback for _mm_extract_epi64 on targets without the
///        64-bit element intrinsics (non-x86_64, non-Win64 builds).
/// @param a   - source vector
/// @param ndx - which 64-bit lane to extract; 0 selects the low lane,
///              any other value selects the high lane
INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    // Spill to aligned memory and reassemble the requested 64-bit lane
    // from its two 32-bit halves.
    OSALIGNLINE(uint32_t) elems[4];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        uint64_t foo = elems[0];
        foo |= (uint64_t)elems[1] << 32;
        return foo;
    }
    else
    {
        uint64_t foo = elems[2];
        foo |= (uint64_t)elems[3] << 32;
        return foo;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Scalar fallback for _mm_insert_epi64 (see extract above).
/// @param a   - source vector
/// @param b   - 64-bit value to insert
/// @param ndx - which 64-bit lane to replace; 0 selects the low lane,
///              any other value selects the high lane
INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    // Round-trip through aligned memory to overwrite one 64-bit lane.
    OSALIGNLINE(int64_t) elems[2];
    _mm_store_si128((__m128i*)elems, a);
    if (ndx == 0)
    {
        elems[0] = b;
    }
    else
    {
        elems[1] = b;
    }
    __m128i out;
    out = _mm_load_si128((const __m128i*)elems);
    return out;
}
#endif

//////////////////////////////////////////////////////////////////////////
/// @brief SIMD bounding box: one integer bbox per SIMD lane, SOA layout.
struct simdBBox
{
    simdscalari ymin;
    simdscalari ymax;
    simdscalari xmin;
    simdscalari xmax;
};

//////////////////////////////////////////////////////////////////////////
/// @brief In-place 4x4 transpose of four float rows, performed with
///        integer unpacks on the bit-cast rows.
INLINE
void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
{
    __m128i row0i = _mm_castps_si128(row0);
    __m128i row1i = _mm_castps_si128(row1);
    __m128i row2i = _mm_castps_si128(row2);
    __m128i row3i = _mm_castps_si128(row3);

    // 32-bit interleave of row pairs...
    __m128i vTemp = row2i;
    row2i = _mm_unpacklo_epi32(row2i, row3i);
    vTemp = _mm_unpackhi_epi32(vTemp, row3i);

    row3i = row0i;
    row0i = _mm_unpacklo_epi32(row0i, row1i);
    row3i = _mm_unpackhi_epi32(row3i, row1i);

    // ...then 64-bit interleave to finish the transpose.
    row1i = row0i;
    row0i = _mm_unpacklo_epi64(row0i, row2i);
    row1i = _mm_unpackhi_epi64(row1i, row2i);

    row2i = row3i;
    row2i = _mm_unpacklo_epi64(row2i, vTemp);
    row3i = _mm_unpackhi_epi64(row3i, vTemp);

    row0 = _mm_castsi128_ps(row0i);
    row1
         = _mm_castsi128_ps(row1i);
    row2 = _mm_castsi128_ps(row2i);
    row3 = _mm_castsi128_ps(row3i);
}

//////////////////////////////////////////////////////////////////////////
/// @brief In-place 4x4 transpose of four integer rows.
INLINE
void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
{
    // 32-bit interleave of row pairs, then 64-bit interleave.
    __m128i vTemp = row2;
    row2 = _mm_unpacklo_epi32(row2, row3);
    vTemp = _mm_unpackhi_epi32(vTemp, row3);

    row3 = row0;
    row0 = _mm_unpacklo_epi32(row0, row1);
    row3 = _mm_unpackhi_epi32(row3, row1);

    row1 = row0;
    row0 = _mm_unpacklo_epi64(row0, row2);
    row1 = _mm_unpackhi_epi64(row1, row2);

    row2 = row3;
    row2 = _mm_unpacklo_epi64(row2, vTemp);
    row3 = _mm_unpackhi_epi64(row3, vTemp);
}

// Combined GCC version number, e.g. 4.9.0 -> 40900.
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// clang and gcc < 4.9 lack the _mm*_undefined_* intrinsics; fall back to
// the zeroing forms, which are valid (if slightly costlier) substitutes.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif

#if KNOB_SIMD_WIDTH == 8
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes 3 SOA component vectors (8 lanes each) into 8 AOS
///        xyzw outputs; the w component is a don't-care lane.
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
{
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                 //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);             //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);             //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6y7w7
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);             //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);             //x3y3z3w3 x7y7z7w7

    // Low 128-bit lane -> outputs 0-3, high lane -> outputs 4-7.
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4]
            = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes 4 SOA component vectors (8 lanes each) into 8 AOS
///        xyzw outputs.
INLINE
void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
{
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);     //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);     //y0w0y1w1 y4w4y5w5
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);            //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3);            //y2w2y3w3 y6w6y7w7
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7

    // Low 128-bit lane -> outputs 0-3, high lane -> outputs 4-7.
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Full 8x8 transpose: 8 SOA input rows to 8 AOS output rows.
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
    __m256 __t7 =
        _mm256_unpackhi_ps(vMask6, vMask7);
    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    // Exchange 128-bit halves (0x20 = low halves, 0x31 = high halves) to
    // complete the cross-lane transpose.
    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Integer-vector overload; forwards to the float 8x8 transpose
///        via bit casts.
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
{
    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
}
#endif

//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
//////////////////////////////////////////////////////////////////////////
template<uint32_t bpp>
struct TransposeSingleComponent
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Pass-thru for single component.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        // With a single component, SOA and AOS layouts are identical:
        // just copy bpp bits per lane.
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 has no 256-bit byte shuffle; interleave with 128-bit unpacks.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                          // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                       // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                       // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                         // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                         // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                      // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                      // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: byte-shuffle each half into place (0x80 selectors produce
        // zero bytes) and OR the two shuffled results together.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari
            dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        // Same scheme as the AVX2 SIMD8 path: byte-shuffle each permuted
        // half into place (0x80 selectors zero the byte) and OR them.
        simd16scalari mask0 = _simd16_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simd16scalari dst01 = _simd16_shuffle_epi8(src, mask0);

        simd16scalari perm1 = _simd16_permute2f128_si(src, src, 1);

        simd16scalari mask1 = _simd16_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simd16scalari dst23 = _simd16_shuffle_epi8(perm1, mask1);

        simd16scalari dst = _simd16_or_si(dst01, dst23);

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no SOA->AOS conversion is implemented for this
    ///       format, so any use is a compile-time error.
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose8_8
//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        __m128i rg = _mm256_castsi256_si128(src);  // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);    // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);             // rgrgrgrg rgrgrgrg
        _mm_store_si128((__m128i*)pDst, rg);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari r = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));   // rrrrrrrrrrrrrrrrgggggggggggggggg

        simdscalari g = _simd_permute2f128_si(r, r, 1);                               // ggggggggggggggggxxxxxxxxxxxxxxxx

        r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1); // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1); // ggggggggxxxxxxxxggggggggxxxxxxxx

        simdscalari dst = _simd_unpacklo_epi8(r, g);                                  // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // Four 8-wide component vectors, laid out back to back in SOA.
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst+4, vDst[1]);
        _mm_store_ps((float*)pDst+8, vDst[2]);
        _mm_store_ps((float*)pDst+12, vDst[3]);
        _mm_store_ps((float*)pDst+16, vDst[4]);
        _mm_store_ps((float*)pDst+20, vDst[5]);
        _mm_store_ps((float*)pDst+24, vDst[6]);
        _mm_store_ps((float*)pDst+28, vDst[7]);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);

        __m128 vDst[8];

        // Transpose the low 8 lanes first, then the high 8 lanes.
        vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0));

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar
            *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);

        vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1));

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // Three 8-wide component vectors; w is filled by vTranspose3x8 as
        // a don't-care lane.
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) +
            16);
        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);

        __m128 vDst[8];

        // Transpose the low 8 lanes first, then the high 8 lanes.
        vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0));

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);

        vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));

        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_32
//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
490 /// @param pSrc - source data in SOA form 491 /// @param pDst - output data in AOS form 492 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 493 { 494#if KNOB_SIMD_WIDTH == 8 495 const float* pfSrc = (const float*)pSrc; 496 __m128 src_r0 = _mm_load_ps(pfSrc + 0); 497 __m128 src_r1 = _mm_load_ps(pfSrc + 4); 498 __m128 src_g0 = _mm_load_ps(pfSrc + 8); 499 __m128 src_g1 = _mm_load_ps(pfSrc + 12); 500 501 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); 502 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); 503 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); 504 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); 505 506 float* pfDst = (float*)pDst; 507 _mm_store_ps(pfDst + 0, dst0); 508 _mm_store_ps(pfDst + 4, dst1); 509 _mm_store_ps(pfDst + 8, dst2); 510 _mm_store_ps(pfDst + 12, dst3); 511#else 512#error Unsupported vector width 513#endif 514 } 515#if ENABLE_AVX512_SIMD16 516 517 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 518 { 519 simdscalar src_r0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc)); 520 simdscalar src_r1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 8); 521 simdscalar src_g0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 16); 522 simdscalar src_g1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 24); 523 524 simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0); 525 simdscalar dst1 = _simd_unpacklo_ps(src_r0, src_g0); 526 simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1); 527 simdscalar dst3 = _simd_unpacklo_ps(src_r1, src_g1); 528 529 _simd_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); 530 _simd_store_ps(reinterpret_cast<float *>(pDst) + 8, dst1); 531 _simd_store_ps(reinterpret_cast<float *>(pDst) + 16, dst2); 532 _simd_store_ps(reinterpret_cast<float *>(pDst) + 24, dst3); 533 } 534#endif 535}; 536 537////////////////////////////////////////////////////////////////////////// 538/// Transpose16_16_16_16 539////////////////////////////////////////////////////////////////////////// 
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // Interleave 16-bit r/g and b/a pairs, then 32-bit rg/ba pairs.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
        simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari)));

        simdscalari src_r = _simd16_extract_si(src_rg, 0);
        simdscalari src_g = _simd16_extract_si(src_rg, 1);
        simdscalari src_b = _simd16_extract_si(src_ba, 0);
        simdscalari src_a = _simd16_extract_si(src_ba, 1);

        // Same interleave pattern as the SIMD8 path, on 8-wide vectors.
        simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
        simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
        simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
        simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);

        simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
        simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
        simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
        simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        // Alpha is a don't-care lane for this 3-component format.
        __m128i src_a = _mm_undefined_si128();

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        simdscalari src_r = _simd16_extract_si(src_rg, 0);
        simdscalari src_g = _simd16_extract_si(src_rg, 1);
        simdscalari src_b = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc + sizeof(simd16scalari)));
        // Alpha is a don't-care lane for this 3-component format.
        simdscalari src_a = _mm256_undefined_si256();

        simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
        simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
        simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
        simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);

        simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
        simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
        simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
        simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose16_16
//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        // Interleave the two 16-bit component streams.
        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalari result = _simd16_setzero_si();

        simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

        simdscalari srclo = _simd16_extract_si(src, 0);
        simdscalari srchi = _simd16_extract_si(src, 1);

        result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0);
        result = _simd16_insert_si(result,
            _simd_unpackhi_epi16(srclo, srchi), 1);

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result);
    }
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no conversion is implemented for this format, so any
    ///       use is a compile-time error (same for the stubs below).
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};

//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
806 /// @param pSrc - source data in SOA form 807 /// @param pDst - output data in AOS form 808 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 809#if ENABLE_AVX512_SIMD16 810 811 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 812#endif 813}; 814 815////////////////////////////////////////////////////////////////////////// 816/// Transpose1_5_5_5 817////////////////////////////////////////////////////////////////////////// 818struct Transpose1_5_5_5 819{ 820 ////////////////////////////////////////////////////////////////////////// 821 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 822 /// @param pSrc - source data in SOA form 823 /// @param pDst - output data in AOS form 824 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 825}; 826 827////////////////////////////////////////////////////////////////////////// 828/// Transpose10_10_10_2 829////////////////////////////////////////////////////////////////////////// 830struct Transpose10_10_10_2 831{ 832 ////////////////////////////////////////////////////////////////////////// 833 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. 834 /// @param pSrc - source data in SOA form 835 /// @param pDst - output data in AOS form 836 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 837#if ENABLE_AVX512_SIMD16 838 839 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 840#endif 841}; 842 843////////////////////////////////////////////////////////////////////////// 844/// Transpose11_11_10 845////////////////////////////////////////////////////////////////////////// 846struct Transpose11_11_10 847{ 848 ////////////////////////////////////////////////////////////////////////// 849 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
850 /// @param pSrc - source data in SOA form 851 /// @param pDst - output data in AOS form 852 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 853#if ENABLE_AVX512_SIMD16 854 855 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 856#endif 857}; 858 859////////////////////////////////////////////////////////////////////////// 860/// Transpose64 861////////////////////////////////////////////////////////////////////////// 862struct Transpose64 863{ 864 ////////////////////////////////////////////////////////////////////////// 865 /// @brief Performs an SOA to AOS conversion 866 /// @param pSrc - source data in SOA form 867 /// @param pDst - output data in AOS form 868 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 869#if ENABLE_AVX512_SIMD16 870 871 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 872#endif 873}; 874 875////////////////////////////////////////////////////////////////////////// 876/// Transpose64_64 877////////////////////////////////////////////////////////////////////////// 878struct Transpose64_64 879{ 880 ////////////////////////////////////////////////////////////////////////// 881 /// @brief Performs an SOA to AOS conversion 882 /// @param pSrc - source data in SOA form 883 /// @param pDst - output data in AOS form 884 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 885#if ENABLE_AVX512_SIMD16 886 887 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 888#endif 889}; 890 891////////////////////////////////////////////////////////////////////////// 892/// Transpose64_64_64 893////////////////////////////////////////////////////////////////////////// 894struct Transpose64_64_64 895{ 896 ////////////////////////////////////////////////////////////////////////// 897 /// @brief Performs an SOA to AOS conversion 898 /// @param pSrc - source data in SOA form 899 /// @param pDst - output data in AOS form 900 static void 
Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 901#if ENABLE_AVX512_SIMD16 902 903 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 904#endif 905}; 906 907////////////////////////////////////////////////////////////////////////// 908/// Transpose64_64_64_64 909////////////////////////////////////////////////////////////////////////// 910struct Transpose64_64_64_64 911{ 912 ////////////////////////////////////////////////////////////////////////// 913 /// @brief Performs an SOA to AOS conversion 914 /// @param pSrc - source data in SOA form 915 /// @param pDst - output data in AOS form 916 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 917#if ENABLE_AVX512_SIMD16 918 919 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 920#endif 921}; 922 923// helper function to unroll loops 924template<int Begin, int End, int Step = 1> 925struct UnrollerL { 926 template<typename Lambda> 927 INLINE static void step(Lambda& func) { 928 func(Begin); 929 UnrollerL<Begin + Step, End, Step>::step(func); 930 } 931}; 932 933template<int End, int Step> 934struct UnrollerL<End, End, Step> { 935 template<typename Lambda> 936 static void step(Lambda& func) { 937 } 938}; 939 940// helper function to unroll loops, with mask to skip specific iterations 941template<int Begin, int End, int Step = 1, int Mask = 0x7f> 942struct UnrollerLMask { 943 template<typename Lambda> 944 INLINE static void step(Lambda& func) { 945 if(Mask & (1 << Begin)) 946 { 947 func(Begin); 948 } 949 UnrollerL<Begin + Step, End, Step>::step(func); 950 } 951}; 952 953template<int End, int Step, int Mask> 954struct UnrollerLMask<End, End, Step, Mask> { 955 template<typename Lambda> 956 static void step(Lambda& func) { 957 } 958}; 959 960// general CRC compute 961INLINE 962uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) 963{ 964#if defined(_WIN64) || defined(__x86_64__) 965 uint32_t sizeInQwords = size / sizeof(uint64_t); 966 uint32_t 
sizeRemainderBytes = size % sizeof(uint64_t); 967 uint64_t* pDataWords = (uint64_t*)pData; 968 for (uint32_t i = 0; i < sizeInQwords; ++i) 969 { 970 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); 971 } 972#else 973 uint32_t sizeInDwords = size / sizeof(uint32_t); 974 uint32_t sizeRemainderBytes = size % sizeof(uint32_t); 975 uint32_t* pDataWords = (uint32_t*)pData; 976 for (uint32_t i = 0; i < sizeInDwords; ++i) 977 { 978 crc = _mm_crc32_u32(crc, *pDataWords++); 979 } 980#endif 981 982 uint8_t* pRemainderBytes = (uint8_t*)pDataWords; 983 for (uint32_t i = 0; i < sizeRemainderBytes; ++i) 984 { 985 crc = _mm_crc32_u8(crc, *pRemainderBytes++); 986 } 987 988 return crc; 989} 990 991////////////////////////////////////////////////////////////////////////// 992/// Add byte offset to any-type pointer 993////////////////////////////////////////////////////////////////////////// 994template <typename T> 995INLINE 996static T* PtrAdd(T* p, intptr_t offset) 997{ 998 intptr_t intp = reinterpret_cast<intptr_t>(p); 999 return reinterpret_cast<T*>(intp + offset); 1000} 1001 1002////////////////////////////////////////////////////////////////////////// 1003/// Is a power-of-2? 
1004////////////////////////////////////////////////////////////////////////// 1005template <typename T> 1006INLINE 1007static bool IsPow2(T value) 1008{ 1009 return value == (value & (0 - value)); 1010} 1011 1012////////////////////////////////////////////////////////////////////////// 1013/// Align down to specified alignment 1014/// Note: IsPow2(alignment) MUST be true 1015////////////////////////////////////////////////////////////////////////// 1016template <typename T1, typename T2> 1017INLINE 1018static T1 AlignDownPow2(T1 value, T2 alignment) 1019{ 1020 SWR_ASSERT(IsPow2(alignment)); 1021 return value & ~T1(alignment - 1); 1022} 1023 1024////////////////////////////////////////////////////////////////////////// 1025/// Align up to specified alignment 1026/// Note: IsPow2(alignment) MUST be true 1027////////////////////////////////////////////////////////////////////////// 1028template <typename T1, typename T2> 1029INLINE 1030static T1 AlignUpPow2(T1 value, T2 alignment) 1031{ 1032 return AlignDownPow2(value + T1(alignment - 1), alignment); 1033} 1034 1035////////////////////////////////////////////////////////////////////////// 1036/// Align up ptr to specified alignment 1037/// Note: IsPow2(alignment) MUST be true 1038////////////////////////////////////////////////////////////////////////// 1039template <typename T1, typename T2> 1040INLINE 1041static T1* AlignUpPow2(T1* value, T2 alignment) 1042{ 1043 return reinterpret_cast<T1*>( 1044 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); 1045} 1046 1047////////////////////////////////////////////////////////////////////////// 1048/// Align down to specified alignment 1049////////////////////////////////////////////////////////////////////////// 1050template <typename T1, typename T2> 1051INLINE 1052static T1 AlignDown(T1 value, T2 alignment) 1053{ 1054 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } 1055 return value - T1(value % alignment); 1056} 
1057 1058////////////////////////////////////////////////////////////////////////// 1059/// Align down to specified alignment 1060////////////////////////////////////////////////////////////////////////// 1061template <typename T1, typename T2> 1062INLINE 1063static T1* AlignDown(T1* value, T2 alignment) 1064{ 1065 return (T1*)AlignDown(uintptr_t(value), alignment); 1066} 1067 1068////////////////////////////////////////////////////////////////////////// 1069/// Align up to specified alignment 1070/// Note: IsPow2(alignment) MUST be true 1071////////////////////////////////////////////////////////////////////////// 1072template <typename T1, typename T2> 1073INLINE 1074static T1 AlignUp(T1 value, T2 alignment) 1075{ 1076 return AlignDown(value + T1(alignment - 1), alignment); 1077} 1078 1079////////////////////////////////////////////////////////////////////////// 1080/// Align up to specified alignment 1081/// Note: IsPow2(alignment) MUST be true 1082////////////////////////////////////////////////////////////////////////// 1083template <typename T1, typename T2> 1084INLINE 1085static T1* AlignUp(T1* value, T2 alignment) 1086{ 1087 return AlignDown(PtrAdd(value, alignment - 1), alignment); 1088} 1089 1090////////////////////////////////////////////////////////////////////////// 1091/// Helper structure used to access an array of elements that don't 1092/// correspond to a typical word size. 
//////////////////////////////////////////////////////////////////////////
// Packs ArrayLenT elements of BitsPerElementT bits each into size_t words.
// The static_assert below guarantees elements never straddle a word
// boundary, so a lookup is a single word read plus shift and mask.
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    // Number of words needed for ArrayLenT elements, rounded up.
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    // Storage is zero-initialized; all elements start as T(0).
    size_t m_words[NUM_WORDS] = {};

public:

    // Reads element elementIndex: select the containing word, shift the
    // element down to bit 0, then mask off the neighboring elements.
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};

// Ranged integer argument for TemplateArgUnroller: carries a runtime value
// known to lie in [TMin, TMax].
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};

// Recursive template used to auto-nest conditionals. Converts dynamic boolean function
// arguments to static template arguments.
// Each GetFunc overload peels one runtime argument, appends the matching
// std::integral_constant (or true_type/false_type) to ArgsB..., and finally asks
// TermT for the function instantiated with the accumulated static arguments.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Linear search: compare val against TMax, TMax-1, ... until it matches;
    // each step re-instantiates with the range shrunk by one.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        // Unreachable when iArg.val is within [TMin, TMax] as required.
        SWR_ASSUME(false); return nullptr;
    }
    // Degenerate single-value range [TVal, TVal]: no search needed.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        // Unreachable when iArg.val is within [TMin, TMax] as required.
        SWR_ASSUME(false); return nullptr;
    }
    // Degenerate single-value range with more args remaining.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};

