utils.h revision 812b45d04958e31e7a3bfc7331308374e8b73afa
1/**************************************************************************** 2* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22* 23* @file utils.h 24* 25* @brief Utilities used by SWR core. 26* 27******************************************************************************/ 28#pragma once 29 30#include <string.h> 31#include <type_traits> 32#include <algorithm> 33#include "common/os.h" 34#include "common/simdintrin.h" 35#include "common/swr_assert.h" 36 37#if defined(_WIN64) || defined(__x86_64__) 38#define _MM_INSERT_EPI64 _mm_insert_epi64 39#define _MM_EXTRACT_EPI64 _mm_extract_epi64 40#else 41INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) 42{ 43 OSALIGNLINE(uint32_t) elems[4]; 44 _mm_store_si128((__m128i*)elems, a); 45 if (ndx == 0) 46 { 47 uint64_t foo = elems[0]; 48 foo |= (uint64_t)elems[1] << 32; 49 return foo; 50 } 51 else 52 { 53 uint64_t foo = elems[2]; 54 foo |= (uint64_t)elems[3] << 32; 55 return foo; 56 } 57} 58 59INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) 60{ 61 OSALIGNLINE(int64_t) elems[2]; 62 _mm_store_si128((__m128i*)elems, a); 63 if (ndx == 0) 64 { 65 elems[0] = b; 66 } 67 else 68 { 69 elems[1] = b; 70 } 71 __m128i out; 72 out = _mm_load_si128((const __m128i*)elems); 73 return out; 74} 75#endif 76 77OSALIGNLINE(struct) BBOX 78{ 79 int top{ 0 }; 80 int bottom{ 0 }; 81 int left{ 0 }; 82 int right{ 0 }; 83 84 BBOX() {} 85 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} 86 87 bool operator==(const BBOX& rhs) 88 { 89 return (this->top == rhs.top && 90 this->bottom == rhs.bottom && 91 this->left == rhs.left && 92 this->right == rhs.right); 93 } 94 95 bool operator!=(const BBOX& rhs) 96 { 97 return !(*this == rhs); 98 } 99 100 BBOX& Intersect(const BBOX& other) 101 { 102 this->top = std::max(this->top, other.top); 103 this->bottom = std::min(this->bottom, other.bottom); 104 this->left = std::max(this->left, other.left); 105 this->right = std::min(this->right, other.right); 106 107 if (right - left < 0 || 108 bottom - top < 0) 109 { 110 // Zero area 111 top = bottom = left = right = 0; 112 } 113 114 return *this; 115 } 116}; 117 118struct simdBBox 119{ 120 simdscalari top; 121 simdscalari bottom; 122 simdscalari left; 123 simdscalari right; 124}; 125 126INLINE 127void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) 128{ 129 __m128i row0i = _mm_castps_si128(row0); 130 __m128i row1i = _mm_castps_si128(row1); 131 __m128i row2i = _mm_castps_si128(row2); 132 __m128i row3i = _mm_castps_si128(row3); 133 134 __m128i vTemp = row2i; 135 row2i = _mm_unpacklo_epi32(row2i, row3i); 136 vTemp = _mm_unpackhi_epi32(vTemp, row3i); 137 138 row3i = row0i; 139 row0i = _mm_unpacklo_epi32(row0i, row1i); 140 row3i = _mm_unpackhi_epi32(row3i, row1i); 141 142 row1i = row0i; 143 row0i = _mm_unpacklo_epi64(row0i, row2i); 144 row1i = _mm_unpackhi_epi64(row1i, row2i); 145 146 row2i = row3i; 147 row2i = _mm_unpacklo_epi64(row2i, vTemp); 148 row3i = _mm_unpackhi_epi64(row3i, vTemp); 149 150 row0 = _mm_castsi128_ps(row0i); 151 row1 = _mm_castsi128_ps(row1i); 152 row2 = _mm_castsi128_ps(row2i); 153 row3 = _mm_castsi128_ps(row3i); 154} 155 156INLINE 157void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) 158{ 159 __m128i vTemp = row2; 160 row2 = _mm_unpacklo_epi32(row2, row3); 161 vTemp = _mm_unpackhi_epi32(vTemp, row3); 162 163 row3 = row0; 164 row0 = _mm_unpacklo_epi32(row0, row1); 165 row3 = _mm_unpackhi_epi32(row3, row1); 166 167 row1 = row0; 168 row0 = _mm_unpacklo_epi64(row0, row2); 169 row1 = _mm_unpackhi_epi64(row1, row2); 170 171 row2 = row3; 172 row2 = _mm_unpacklo_epi64(row2, vTemp); 173 row3 = _mm_unpackhi_epi64(row3, vTemp); 174} 175 176#define GCC_VERSION (__GNUC__ * 10000 \ 177 + __GNUC_MINOR__ * 100 \ 178 + __GNUC_PATCHLEVEL__) 179 180#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900)) 181#define _mm_undefined_ps _mm_setzero_ps 182#define _mm_undefined_si128 _mm_setzero_si128 183#if KNOB_SIMD_WIDTH == 8 184#define _mm256_undefined_ps _mm256_setzero_ps 185#endif 186#endif 187 188#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16 189INLINE 190void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) 191{ 192 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 193 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 194 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 195 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 196 197 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 198 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77 199 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 200 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 201 202 vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 203 vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 204 vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 205 vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 206 207 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 208 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 209 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 210 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 211} 212 213INLINE 214void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3) 215{ 216 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 217 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 218 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 219 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 220 221 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 222 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77 223 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 224 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 225 226 vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 227 vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 228 vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 229 vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 230 231 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 232 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 233 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 234 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 235} 236 237INLINE 238void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) 239{ 240 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1); 241 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1); 242 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3); 243 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3); 244 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5); 245 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5); 246 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7); 247 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7); 248 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); 249 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); 250 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); 251 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); 252 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); 253 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); 254 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); 255 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); 256 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); 257 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); 258 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); 259 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); 260 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); 261 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); 262 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); 263 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); 264} 265 266INLINE 267void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7) 268{ 269 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3), 270 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7)); 271} 272#endif 273 274////////////////////////////////////////////////////////////////////////// 275/// TranposeSingleComponent 276////////////////////////////////////////////////////////////////////////// 277template<uint32_t bpp> 278struct TransposeSingleComponent 279{ 280 ////////////////////////////////////////////////////////////////////////// 281 /// @brief Pass-thru for single component. 282 /// @param pSrc - source data in SOA form 283 /// @param pDst - output data in AOS form 284 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 285 { 286 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); 287 } 288}; 289 290////////////////////////////////////////////////////////////////////////// 291/// Transpose8_8_8_8 292////////////////////////////////////////////////////////////////////////// 293struct Transpose8_8_8_8 294{ 295 ////////////////////////////////////////////////////////////////////////// 296 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 297 /// @param pSrc - source data in SOA form 298 /// @param pDst - output data in AOS form 299 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 300 { 301 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 302 303#if KNOB_SIMD_WIDTH == 8 304#if KNOB_ARCH == KNOB_ARCH_AVX 305 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg 306 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa 307 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb 308 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa 309 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg 310 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa 311 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba 312 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba 313 _mm_store_si128((__m128i*)pDst, c0123lo); 314 _mm_store_si128((__m128i*)(pDst + 16), c0123hi); 315#elif KNOB_ARCH == KNOB_ARCH_AVX2 316 simdscalari dst01 = _mm256_shuffle_epi8(src, 317 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); 318 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); 319 dst23 = _mm256_shuffle_epi8(dst23, 320 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); 321 simdscalari dst = _mm256_or_si256(dst01, dst23); 322 _simd_store_si((simdscalari*)pDst, dst); 323#endif 324#elif KNOB_SIMD_WIDTH == 16 325 simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800); 326 327 simdscalari dst01 = _simd_shuffle_epi8(src, mask0); 328 329 simdscalari perm1 = _simd_permute_128(src, src, 1); 330 331 simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080); 332 333 simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1); 334 335 simdscalari dst = _simd_or_si(dst01, dst23); 336 337 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); 338#else 339#error Unsupported vector width 340#endif 341 } 342}; 343 344////////////////////////////////////////////////////////////////////////// 345/// Transpose8_8_8 346////////////////////////////////////////////////////////////////////////// 347struct Transpose8_8_8 348{ 349 ////////////////////////////////////////////////////////////////////////// 350 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. 351 /// @param pSrc - source data in SOA form 352 /// @param pDst - output data in AOS form 353 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 354}; 355 356////////////////////////////////////////////////////////////////////////// 357/// Transpose8_8 358////////////////////////////////////////////////////////////////////////// 359struct Transpose8_8 360{ 361 ////////////////////////////////////////////////////////////////////////// 362 /// @brief Performs an SOA to AOS conversion for packed 8_8 data. 363 /// @param pSrc - source data in SOA form 364 /// @param pDst - output data in AOS form 365 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 366 { 367#if KNOB_SIMD_WIDTH == 8 368 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 369 370 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg 371 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg 372 rg = _mm_unpacklo_epi8(rg, g); 373 _mm_store_si128((__m128i*)pDst, rg); 374#elif KNOB_SIMD_WIDTH == 16 375 __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg 376 377 __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx 378 379 __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx 380 381 __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg 382 383 _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst); 384#else 385#error Unsupported vector width 386#endif 387 } 388}; 389 390////////////////////////////////////////////////////////////////////////// 391/// Transpose32_32_32_32 392////////////////////////////////////////////////////////////////////////// 393struct Transpose32_32_32_32 394{ 395 ////////////////////////////////////////////////////////////////////////// 396 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 397 /// @param pSrc - source data in SOA form 398 /// @param pDst - output data in AOS form 399 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 400 { 401#if KNOB_SIMD_WIDTH == 8 402 simdscalar src0 = _simd_load_ps((const float*)pSrc); 403 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 404 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 405 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); 406 407 __m128 vDst[8]; 408 vTranspose4x8(vDst, src0, src1, src2, src3); 409 _mm_store_ps((float*)pDst, vDst[0]); 410 _mm_store_ps((float*)pDst+4, vDst[1]); 411 _mm_store_ps((float*)pDst+8, vDst[2]); 412 _mm_store_ps((float*)pDst+12, vDst[3]); 413 _mm_store_ps((float*)pDst+16, vDst[4]); 414 _mm_store_ps((float*)pDst+20, vDst[5]); 415 _mm_store_ps((float*)pDst+24, vDst[6]); 416 _mm_store_ps((float*)pDst+28, vDst[7]); 417#elif KNOB_SIMD_WIDTH == 16 418#if ENABLE_AVX512_EMULATION 419 simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); 420 simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16); 421 simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32); 422 simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48); 423 424 __m128 vDst[8]; 425 426 vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo); 427 428 _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]); 429 _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]); 430 _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]); 431 _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]); 432 _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]); 433 _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]); 434 _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]); 435 _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]); 436 437 vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi); 438 439 _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]); 440 _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]); 441 _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]); 442 _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]); 443 _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]); 444 _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]); 445 _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]); 446 _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]); 447#endif 448#else 449#error Unsupported vector width 450#endif 451 } 452}; 453 454////////////////////////////////////////////////////////////////////////// 455/// Transpose32_32_32 456////////////////////////////////////////////////////////////////////////// 457struct Transpose32_32_32 458{ 459 ////////////////////////////////////////////////////////////////////////// 460 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 461 /// @param pSrc - source data in SOA form 462 /// @param pDst - output data in AOS form 463 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 464 { 465#if KNOB_SIMD_WIDTH == 8 466 simdscalar src0 = _simd_load_ps((const float*)pSrc); 467 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 468 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 469 470 __m128 vDst[8]; 471 vTranspose3x8(vDst, src0, src1, src2); 472 _mm_store_ps((float*)pDst, vDst[0]); 473 _mm_store_ps((float*)pDst + 4, vDst[1]); 474 _mm_store_ps((float*)pDst + 8, vDst[2]); 475 _mm_store_ps((float*)pDst + 12, vDst[3]); 476 _mm_store_ps((float*)pDst + 16, vDst[4]); 477 _mm_store_ps((float*)pDst + 20, vDst[5]); 478 _mm_store_ps((float*)pDst + 24, vDst[6]); 479 _mm_store_ps((float*)pDst + 28, vDst[7]); 480#elif KNOB_SIMD_WIDTH == 16 481#if ENABLE_AVX512_EMULATION 482 simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); 483 simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16); 484 simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32); 485 486 __m128 vDst[8]; 487 488 vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo); 489 490 _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]); 491 _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]); 492 _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]); 493 _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]); 494 _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]); 495 _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]); 496 _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]); 497 _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]); 498 499 vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi); 500 501 _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]); 502 _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]); 503 _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]); 504 _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]); 505 _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]); 506 _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]); 507 _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]); 508 _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]); 509#endif 510#else 511#error Unsupported vector width 512#endif 513 } 514}; 515 516////////////////////////////////////////////////////////////////////////// 517/// Transpose32_32 518////////////////////////////////////////////////////////////////////////// 519struct Transpose32_32 520{ 521 ////////////////////////////////////////////////////////////////////////// 522 /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 523 /// @param pSrc - source data in SOA form 524 /// @param pDst - output data in AOS form 525 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 526 { 527#if KNOB_SIMD_WIDTH == 8 528 const float* pfSrc = (const float*)pSrc; 529 __m128 src_r0 = _mm_load_ps(pfSrc + 0); 530 __m128 src_r1 = _mm_load_ps(pfSrc + 4); 531 __m128 src_g0 = _mm_load_ps(pfSrc + 8); 532 __m128 src_g1 = _mm_load_ps(pfSrc + 12); 533 534 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); 535 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); 536 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); 537 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); 538 539 float* pfDst = (float*)pDst; 540 _mm_store_ps(pfDst + 0, dst0); 541 _mm_store_ps(pfDst + 4, dst1); 542 _mm_store_ps(pfDst + 8, dst2); 543 _mm_store_ps(pfDst + 12, dst3); 544#elif KNOB_SIMD_WIDTH == 16 545 const float* pfSrc = (const float*)pSrc; 546 __m256 src_r0 = _mm256_load_ps(pfSrc + 0); 547 __m256 src_r1 = _mm256_load_ps(pfSrc + 8); 548 __m256 src_g0 = _mm256_load_ps(pfSrc + 16); 549 __m256 src_g1 = _mm256_load_ps(pfSrc + 24); 550 551 __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0); 552 __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0); 553 __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1); 554 __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1); 555 556 float* pfDst = (float*)pDst; 557 _mm256_store_ps(pfDst + 0, dst0); 558 _mm256_store_ps(pfDst + 8, dst1); 559 _mm256_store_ps(pfDst + 16, dst2); 560 _mm256_store_ps(pfDst + 24, dst3); 561#else 562#error Unsupported vector width 563#endif 564 } 565}; 566 567////////////////////////////////////////////////////////////////////////// 568/// Transpose16_16_16_16 569////////////////////////////////////////////////////////////////////////// 570struct Transpose16_16_16_16 571{ 572 ////////////////////////////////////////////////////////////////////////// 573 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 574 /// @param pSrc - source data in SOA form 575 /// @param pDst - output data in AOS form 576 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 577 { 578#if KNOB_SIMD_WIDTH == 8 579 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 580 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); 581 582 __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 583 __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 584 __m128i src_b = _mm256_extractf128_si256(src_ba, 0); 585 __m128i src_a = _mm256_extractf128_si256(src_ba, 1); 586 587 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 588 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 589 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 590 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 591 592 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 593 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 594 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 595 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 596 597 _mm_store_si128(((__m128i*)pDst) + 0, dst0); 598 _mm_store_si128(((__m128i*)pDst) + 1, dst1); 599 _mm_store_si128(((__m128i*)pDst) + 2, dst2); 600 _mm_store_si128(((__m128i*)pDst) + 3, dst3); 601#elif KNOB_SIMD_WIDTH == 16 602#if ENABLE_AVX512_EMULATION 603 simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); 604 simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari))); 605 606 __m256i src_r = src_rg.lo; 607 __m256i src_g = src_rg.hi; 608 __m256i src_b = src_ba.lo; 609 __m256i src_a = src_ba.hi; 610 611 __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); 612 __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); 613 __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); 614 __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); 615 616 __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); 617 __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); 618 __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); 619 __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); 620 621 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); 622 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1); 623 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); 624 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); 625#endif 626#else 627#error Unsupported vector width 628#endif 629 } 630}; 631 632////////////////////////////////////////////////////////////////////////// 633/// Transpose16_16_16 634////////////////////////////////////////////////////////////////////////// 635struct Transpose16_16_16 636{ 637 ////////////////////////////////////////////////////////////////////////// 638 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 639 /// @param pSrc - source data in SOA form 640 /// @param pDst - output data in AOS form 641 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 642 { 643#if KNOB_SIMD_WIDTH == 8 644 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 645 646 __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 647 __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 648 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); 649 __m128i src_a = _mm_undefined_si128(); 650 651 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 652 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 653 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 654 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 655 656 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 657 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 658 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 659 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 660 661 _mm_store_si128(((__m128i*)pDst) + 0, dst0); 662 _mm_store_si128(((__m128i*)pDst) + 1, dst1); 663 _mm_store_si128(((__m128i*)pDst) + 2, dst2); 664 _mm_store_si128(((__m128i*)pDst) + 3, dst3); 665#elif KNOB_SIMD_WIDTH == 16 666#if ENABLE_AVX512_EMULATION 667 simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); 668 669 __m256i src_r = src_rg.lo; 670 __m256i src_g = src_rg.hi; 671 __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari))); 672 __m256i src_a = _mm256_undefined_si256(); 673 674 __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); 675 __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); 676 __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); 677 __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); 678 679 __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); 680 __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); 681 __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); 682 __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); 683 684 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); 685 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1); 686 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); 687 _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); 688#endif 689#else 690#error Unsupported vector width 691#endif 692 } 693}; 694 695////////////////////////////////////////////////////////////////////////// 696/// Transpose16_16 697////////////////////////////////////////////////////////////////////////// 698struct Transpose16_16 699{ 700 ////////////////////////////////////////////////////////////////////////// 701 /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 702 /// @param pSrc - source data in SOA form 703 /// @param pDst - output data in AOS form 704 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 705 { 706#if KNOB_SIMD_WIDTH == 8 707 simdscalar src = _simd_load_ps((const float*)pSrc); 708 709 __m128 comp0 = _mm256_castps256_ps128(src); 710 __m128 comp1 = _mm256_extractf128_ps(src, 1); 711 712 __m128i comp0i = _mm_castps_si128(comp0); 713 __m128i comp1i = _mm_castps_si128(comp1); 714 715 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); 716 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); 717 718 _mm_store_si128((__m128i*)pDst, resLo); 719 _mm_store_si128((__m128i*)pDst + 1, resHi); 720#elif KNOB_SIMD_WIDTH == 16 721#if ENABLE_AVX512_EMULATION 722 simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc))); 723 724 simdscalari result; 725 726 result.lo = _mm256_unpacklo_epi16(src.lo, src.hi); 727 result.hi = _mm256_unpackhi_epi16(src.lo, src.hi); 728 729 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result); 730#endif 731#else 732#error Unsupported vector width 733#endif 734 } 735}; 736 737////////////////////////////////////////////////////////////////////////// 738/// Transpose24_8 739////////////////////////////////////////////////////////////////////////// 740struct Transpose24_8 741{ 742 ////////////////////////////////////////////////////////////////////////// 743 /// @brief Performs an SOA to AOS conversion for packed 24_8 data. 744 /// @param pSrc - source data in SOA form 745 /// @param pDst - output data in AOS form 746 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 747}; 748 749////////////////////////////////////////////////////////////////////////// 750/// Transpose32_8_24 751////////////////////////////////////////////////////////////////////////// 752struct Transpose32_8_24 753{ 754 ////////////////////////////////////////////////////////////////////////// 755 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. 756 /// @param pSrc - source data in SOA form 757 /// @param pDst - output data in AOS form 758 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 759}; 760 761 762 763////////////////////////////////////////////////////////////////////////// 764/// Transpose4_4_4_4 765////////////////////////////////////////////////////////////////////////// 766struct Transpose4_4_4_4 767{ 768 ////////////////////////////////////////////////////////////////////////// 769 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. 770 /// @param pSrc - source data in SOA form 771 /// @param pDst - output data in AOS form 772 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 773}; 774 775////////////////////////////////////////////////////////////////////////// 776/// Transpose5_6_5 777////////////////////////////////////////////////////////////////////////// 778struct Transpose5_6_5 779{ 780 ////////////////////////////////////////////////////////////////////////// 781 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 782 /// @param pSrc - source data in SOA form 783 /// @param pDst - output data in AOS form 784 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 785}; 786 787////////////////////////////////////////////////////////////////////////// 788/// Transpose9_9_9_5 789////////////////////////////////////////////////////////////////////////// 790struct Transpose9_9_9_5 791{ 792 ////////////////////////////////////////////////////////////////////////// 793 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 794 /// @param pSrc - source data in SOA form 795 /// @param pDst - output data in AOS form 796 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 797}; 798 799////////////////////////////////////////////////////////////////////////// 800/// Transpose5_5_5_1 801////////////////////////////////////////////////////////////////////////// 802struct Transpose5_5_5_1 803{ 804 ////////////////////////////////////////////////////////////////////////// 805 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 806 /// @param pSrc - source data in SOA form 807 /// @param pDst - output data in AOS form 808 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 809}; 810 811////////////////////////////////////////////////////////////////////////// 812/// Transpose10_10_10_2 813////////////////////////////////////////////////////////////////////////// 814struct Transpose10_10_10_2 815{ 816 ////////////////////////////////////////////////////////////////////////// 817 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. 818 /// @param pSrc - source data in SOA form 819 /// @param pDst - output data in AOS form 820 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 821}; 822 823////////////////////////////////////////////////////////////////////////// 824/// Transpose11_11_10 825////////////////////////////////////////////////////////////////////////// 826struct Transpose11_11_10 827{ 828 ////////////////////////////////////////////////////////////////////////// 829 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 830 /// @param pSrc - source data in SOA form 831 /// @param pDst - output data in AOS form 832 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 833}; 834 835// helper function to unroll loops 836template<int Begin, int End, int Step = 1> 837struct UnrollerL { 838 template<typename Lambda> 839 INLINE static void step(Lambda& func) { 840 func(Begin); 841 UnrollerL<Begin + Step, End, Step>::step(func); 842 } 843}; 844 845template<int End, int Step> 846struct UnrollerL<End, End, Step> { 847 template<typename Lambda> 848 static void step(Lambda& func) { 849 } 850}; 851 852// helper function to unroll loops, with mask to skip specific iterations 853template<int Begin, int End, int Step = 1, int Mask = 0x7f> 854struct UnrollerLMask { 855 template<typename Lambda> 856 INLINE static void step(Lambda& func) { 857 if(Mask & (1 << Begin)) 858 { 859 func(Begin); 860 } 861 UnrollerL<Begin + Step, End, Step>::step(func); 862 } 863}; 864 865template<int End, int Step, int Mask> 866struct UnrollerLMask<End, End, Step, Mask> { 867 template<typename Lambda> 868 static void step(Lambda& func) { 869 } 870}; 871 872// general CRC compute 873INLINE 874uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) 875{ 876#if defined(_WIN64) || defined(__x86_64__) 877 uint32_t sizeInQwords = size / sizeof(uint64_t); 878 uint32_t sizeRemainderBytes = size % sizeof(uint64_t); 879 uint64_t* pDataWords = (uint64_t*)pData; 880 for (uint32_t i = 0; i < sizeInQwords; ++i) 881 { 882 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); 883 } 884#else 885 uint32_t sizeInDwords = size / sizeof(uint32_t); 886 uint32_t sizeRemainderBytes = size % sizeof(uint32_t); 887 uint32_t* pDataWords = (uint32_t*)pData; 888 for (uint32_t i = 0; i < sizeInDwords; ++i) 889 { 890 crc = _mm_crc32_u32(crc, *pDataWords++); 891 } 892#endif 893 894 uint8_t* pRemainderBytes = (uint8_t*)pDataWords; 895 for (uint32_t i = 0; i < sizeRemainderBytes; ++i) 896 { 897 crc = _mm_crc32_u8(crc, *pRemainderBytes++); 898 } 899 900 return crc; 901} 902 903////////////////////////////////////////////////////////////////////////// 904/// Add byte offset to any-type pointer 905////////////////////////////////////////////////////////////////////////// 906template <typename T> 907INLINE 908static T* PtrAdd(T* p, intptr_t offset) 909{ 910 intptr_t intp = reinterpret_cast<intptr_t>(p); 911 return reinterpret_cast<T*>(intp + offset); 912} 913 914////////////////////////////////////////////////////////////////////////// 915/// Is a power-of-2? 916////////////////////////////////////////////////////////////////////////// 917template <typename T> 918INLINE 919static bool IsPow2(T value) 920{ 921 return value == (value & (0 - value)); 922} 923 924////////////////////////////////////////////////////////////////////////// 925/// Align down to specified alignment 926/// Note: IsPow2(alignment) MUST be true 927////////////////////////////////////////////////////////////////////////// 928template <typename T1, typename T2> 929INLINE 930static T1 AlignDownPow2(T1 value, T2 alignment) 931{ 932 SWR_ASSERT(IsPow2(alignment)); 933 return value & ~T1(alignment - 1); 934} 935 936////////////////////////////////////////////////////////////////////////// 937/// Align up to specified alignment 938/// Note: IsPow2(alignment) MUST be true 939////////////////////////////////////////////////////////////////////////// 940template <typename T1, typename T2> 941INLINE 942static T1 AlignUpPow2(T1 value, T2 alignment) 943{ 944 return AlignDownPow2(value + T1(alignment - 1), alignment); 945} 946 947////////////////////////////////////////////////////////////////////////// 948/// Align up ptr to specified alignment 949/// Note: IsPow2(alignment) MUST be true 950////////////////////////////////////////////////////////////////////////// 951template <typename T1, typename T2> 952INLINE 953static T1* AlignUpPow2(T1* value, T2 alignment) 954{ 955 return reinterpret_cast<T1*>( 956 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); 957} 958 959////////////////////////////////////////////////////////////////////////// 960/// Align down to specified alignment 961////////////////////////////////////////////////////////////////////////// 962template <typename T1, typename T2> 963INLINE 964static T1 AlignDown(T1 value, T2 alignment) 965{ 966 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } 967 return value - T1(value % alignment); 968} 969 970////////////////////////////////////////////////////////////////////////// 971/// Align down to specified alignment 972////////////////////////////////////////////////////////////////////////// 973template <typename T1, typename T2> 974INLINE 975static T1* AlignDown(T1* value, T2 alignment) 976{ 977 return (T1*)AlignDown(uintptr_t(value), alignment); 978} 979 980////////////////////////////////////////////////////////////////////////// 981/// Align up to specified alignment 982/// Note: IsPow2(alignment) MUST be true 983////////////////////////////////////////////////////////////////////////// 984template <typename T1, typename T2> 985INLINE 986static T1 AlignUp(T1 value, T2 alignment) 987{ 988 return AlignDown(value + T1(alignment - 1), alignment); 989} 990 991////////////////////////////////////////////////////////////////////////// 992/// Align up to specified alignment 993/// Note: IsPow2(alignment) MUST be true 994////////////////////////////////////////////////////////////////////////// 995template <typename T1, typename T2> 996INLINE 997static T1* AlignUp(T1* value, T2 alignment) 998{ 999 return AlignDown(PtrAdd(value, alignment - 1), alignment); 1000} 1001 1002////////////////////////////////////////////////////////////////////////// 1003/// Helper structure used to access an array of elements that don't 1004/// correspond to a typical word size. 1005////////////////////////////////////////////////////////////////////////// 1006template<typename T, size_t BitsPerElementT, size_t ArrayLenT> 1007class BitsArray 1008{ 1009private: 1010 static const size_t BITS_PER_WORD = sizeof(size_t) * 8; 1011 static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT; 1012 static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD; 1013 static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1; 1014 1015 static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD, 1016 "Element size must an integral fraction of pointer size"); 1017 1018 size_t m_words[NUM_WORDS] = {}; 1019 1020public: 1021 1022 T operator[] (size_t elementIndex) const 1023 { 1024 size_t word = m_words[elementIndex / ELEMENTS_PER_WORD]; 1025 word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT); 1026 return T(word & ELEMENT_MASK); 1027 } 1028}; 1029 1030// Ranged integer argument for TemplateArgUnroller 1031template <uint32_t TMin, uint32_t TMax> 1032struct IntArg 1033{ 1034 uint32_t val; 1035}; 1036 1037// Recursive template used to auto-nest conditionals. Converts dynamic boolean function 1038// arguments to static template arguments. 1039template <typename TermT, typename... ArgsB> 1040struct TemplateArgUnroller 1041{ 1042 //----------------------------------------- 1043 // Boolean value 1044 //----------------------------------------- 1045 1046 // Last Arg Terminator 1047 static typename TermT::FuncType GetFunc(bool bArg) 1048 { 1049 if (bArg) 1050 { 1051 return TermT::template GetFunc<ArgsB..., std::true_type>(); 1052 } 1053 1054 return TermT::template GetFunc<ArgsB..., std::false_type>(); 1055 } 1056 1057 // Recursively parse args 1058 template <typename... TArgsT> 1059 static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs) 1060 { 1061 if (bArg) 1062 { 1063 return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...); 1064 } 1065 1066 return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...); 1067 } 1068 1069 //----------------------------------------- 1070 // Integer value (within specified range) 1071 //----------------------------------------- 1072 1073 // Last Arg Terminator 1074 template <uint32_t TMin, uint32_t TMax> 1075 static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg) 1076 { 1077 if (iArg.val == TMax) 1078 { 1079 return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>(); 1080 } 1081 if (TMax > TMin) 1082 { 1083 return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val}); 1084 } 1085 SWR_ASSUME(false); return nullptr; 1086 } 1087 template <uint32_t TVal> 1088 static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg) 1089 { 1090 SWR_ASSERT(iArg.val == TVal); 1091 return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>(); 1092 } 1093 1094 // Recursively parse args 1095 template <uint32_t TMin, uint32_t TMax, typename... TArgsT> 1096 static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs) 1097 { 1098 if (iArg.val == TMax) 1099 { 1100 return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...); 1101 } 1102 if (TMax > TMin) 1103 { 1104 return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...); 1105 } 1106 SWR_ASSUME(false); return nullptr; 1107 } 1108 template <uint32_t TVal, typename... TArgsT> 1109 static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs) 1110 { 1111 SWR_ASSERT(iArg.val == TVal); 1112 return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...); 1113 } 1114}; 1115 1116 1117