utils.h revision 3252fe3705376063f94a7717c07b9824b5d43f46
1/**************************************************************************** 2* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22* 23* @file utils.h 24* 25* @brief Utilities used by SWR core. 
26* 27******************************************************************************/ 28#pragma once 29 30#include <string.h> 31#include "common/os.h" 32#include "common/simdintrin.h" 33#include "common/swr_assert.h" 34 35#if defined(_WIN32) 36void SaveImageToPNGFile( 37 const WCHAR *pFilename, 38 void *pBuffer, 39 uint32_t width, 40 uint32_t height); 41 42void OpenBitmapFromFile( 43 const WCHAR *pFilename, 44 void **pBuffer, 45 uint32_t *width, 46 uint32_t *height); 47#endif 48 49#if defined(_WIN64) || defined(__x86_64__) 50#define _MM_INSERT_EPI64 _mm_insert_epi64 51#define _MM_EXTRACT_EPI64 _mm_extract_epi64 52#else 53INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) 54{ 55 OSALIGNLINE(uint32_t) elems[4]; 56 _mm_store_si128((__m128i*)elems, a); 57 if (ndx == 0) 58 { 59 uint64_t foo = elems[0]; 60 foo |= (uint64_t)elems[1] << 32; 61 return foo; 62 } 63 else 64 { 65 uint64_t foo = elems[2]; 66 foo |= (uint64_t)elems[3] << 32; 67 return foo; 68 } 69} 70 71INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) 72{ 73 OSALIGNLINE(int64_t) elems[2]; 74 _mm_store_si128((__m128i*)elems, a); 75 if (ndx == 0) 76 { 77 elems[0] = b; 78 } 79 else 80 { 81 elems[1] = b; 82 } 83 __m128i out; 84 out = _mm_load_si128((const __m128i*)elems); 85 return out; 86} 87#endif 88 89OSALIGNLINE(struct) BBOX 90{ 91 int top{ 0 }; 92 int bottom{ 0 }; 93 int left{ 0 }; 94 int right{ 0 }; 95 96 BBOX() {} 97 BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} 98 99 bool operator==(const BBOX& rhs) 100 { 101 return (this->top == rhs.top && 102 this->bottom == rhs.bottom && 103 this->left == rhs.left && 104 this->right == rhs.right); 105 } 106 107 bool operator!=(const BBOX& rhs) 108 { 109 return !(*this == rhs); 110 } 111}; 112 113struct simdBBox 114{ 115 simdscalari top; 116 simdscalari bottom; 117 simdscalari left; 118 simdscalari right; 119}; 120 121INLINE 122void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) 123{ 124 
__m128i row0i = _mm_castps_si128(row0); 125 __m128i row1i = _mm_castps_si128(row1); 126 __m128i row2i = _mm_castps_si128(row2); 127 __m128i row3i = _mm_castps_si128(row3); 128 129 __m128i vTemp = row2i; 130 row2i = _mm_unpacklo_epi32(row2i, row3i); 131 vTemp = _mm_unpackhi_epi32(vTemp, row3i); 132 133 row3i = row0i; 134 row0i = _mm_unpacklo_epi32(row0i, row1i); 135 row3i = _mm_unpackhi_epi32(row3i, row1i); 136 137 row1i = row0i; 138 row0i = _mm_unpacklo_epi64(row0i, row2i); 139 row1i = _mm_unpackhi_epi64(row1i, row2i); 140 141 row2i = row3i; 142 row2i = _mm_unpacklo_epi64(row2i, vTemp); 143 row3i = _mm_unpackhi_epi64(row3i, vTemp); 144 145 row0 = _mm_castsi128_ps(row0i); 146 row1 = _mm_castsi128_ps(row1i); 147 row2 = _mm_castsi128_ps(row2i); 148 row3 = _mm_castsi128_ps(row3i); 149} 150 151INLINE 152void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) 153{ 154 __m128i vTemp = row2; 155 row2 = _mm_unpacklo_epi32(row2, row3); 156 vTemp = _mm_unpackhi_epi32(vTemp, row3); 157 158 row3 = row0; 159 row0 = _mm_unpacklo_epi32(row0, row1); 160 row3 = _mm_unpackhi_epi32(row3, row1); 161 162 row1 = row0; 163 row0 = _mm_unpacklo_epi64(row0, row2); 164 row1 = _mm_unpackhi_epi64(row1, row2); 165 166 row2 = row3; 167 row2 = _mm_unpacklo_epi64(row2, vTemp); 168 row3 = _mm_unpackhi_epi64(row3, vTemp); 169} 170 171#define GCC_VERSION (__GNUC__ * 10000 \ 172 + __GNUC_MINOR__ * 100 \ 173 + __GNUC_PATCHLEVEL__) 174 175#if defined(__GNUC__) && (GCC_VERSION < 40900) 176#define _mm_undefined_ps _mm_setzero_ps 177#define _mm_undefined_si128 _mm_setzero_si128 178#if KNOB_SIMD_WIDTH == 8 179#define _mm256_undefined_ps _mm256_setzero_ps 180#endif 181#endif 182 183#if KNOB_SIMD_WIDTH == 8 184INLINE 185void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) 186{ 187 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 188 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 189 __m256 r02r1xlolo = 
_mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 190 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 191 192 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 193 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77 194 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 195 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 196 197 vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 198 vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 199 vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 200 vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 201 202 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 203 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 204 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 205 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 206} 207 208INLINE 209void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3) 210{ 211 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 212 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 213 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 214 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 215 216 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 217 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77 218 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 219 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 220 221 vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 222 vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 223 vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 224 vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 225 226 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 227 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 228 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 229 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 230} 231 232INLINE 233void 
vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) 234{ 235 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1); 236 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1); 237 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3); 238 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3); 239 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5); 240 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5); 241 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7); 242 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7); 243 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); 244 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); 245 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); 246 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); 247 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); 248 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); 249 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); 250 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); 251 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); 252 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); 253 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); 254 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); 255 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); 256 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); 257 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); 258 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); 259} 260 261INLINE 262void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7) 263{ 264 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), 
_mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3), 265 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7)); 266} 267#endif 268 269////////////////////////////////////////////////////////////////////////// 270/// TranposeSingleComponent 271////////////////////////////////////////////////////////////////////////// 272template<uint32_t bpp> 273struct TransposeSingleComponent 274{ 275 ////////////////////////////////////////////////////////////////////////// 276 /// @brief Pass-thru for single component. 277 /// @param pSrc - source data in SOA form 278 /// @param pDst - output data in AOS form 279 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 280 { 281 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); 282 } 283}; 284 285////////////////////////////////////////////////////////////////////////// 286/// Transpose8_8_8_8 287////////////////////////////////////////////////////////////////////////// 288struct Transpose8_8_8_8 289{ 290 ////////////////////////////////////////////////////////////////////////// 291 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 
292 /// @param pSrc - source data in SOA form 293 /// @param pDst - output data in AOS form 294 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 295 { 296 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 297#if KNOB_SIMD_WIDTH == 8 298#if KNOB_ARCH == KNOB_ARCH_AVX 299 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg 300 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa 301 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb 302 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa 303 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg 304 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa 305 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba 306 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba 307 _mm_store_si128((__m128i*)pDst, c0123lo); 308 _mm_store_si128((__m128i*)(pDst + 16), c0123hi); 309#elif KNOB_ARCH == KNOB_ARCH_AVX2 310 simdscalari dst01 = _mm256_shuffle_epi8(src, 311 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); 312 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); 313 dst23 = _mm256_shuffle_epi8(dst23, 314 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); 315 simdscalari dst = _mm256_or_si256(dst01, dst23); 316 _simd_store_si((simdscalari*)pDst, dst); 317#endif 318#else 319#error Unsupported vector width 320#endif 321 } 322}; 323 324////////////////////////////////////////////////////////////////////////// 325/// Transpose8_8_8 326////////////////////////////////////////////////////////////////////////// 327struct Transpose8_8_8 328{ 329 ////////////////////////////////////////////////////////////////////////// 330 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. 
331 /// @param pSrc - source data in SOA form 332 /// @param pDst - output data in AOS form 333 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 334}; 335 336////////////////////////////////////////////////////////////////////////// 337/// Transpose8_8 338////////////////////////////////////////////////////////////////////////// 339struct Transpose8_8 340{ 341 ////////////////////////////////////////////////////////////////////////// 342 /// @brief Performs an SOA to AOS conversion for packed 8_8 data. 343 /// @param pSrc - source data in SOA form 344 /// @param pDst - output data in AOS form 345 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 346 { 347 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 348 349#if KNOB_SIMD_WIDTH == 8 350 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg 351 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg 352 rg = _mm_unpacklo_epi8(rg, g); 353 _mm_store_si128((__m128i*)pDst, rg); 354#else 355#error Unsupported vector width 356#endif 357 } 358}; 359 360////////////////////////////////////////////////////////////////////////// 361/// Transpose32_32_32_32 362////////////////////////////////////////////////////////////////////////// 363struct Transpose32_32_32_32 364{ 365 ////////////////////////////////////////////////////////////////////////// 366 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 
367 /// @param pSrc - source data in SOA form 368 /// @param pDst - output data in AOS form 369 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 370 { 371#if KNOB_SIMD_WIDTH == 8 372 simdscalar src0 = _simd_load_ps((const float*)pSrc); 373 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 374 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 375 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); 376 377 __m128 vDst[8]; 378 vTranspose4x8(vDst, src0, src1, src2, src3); 379 _mm_store_ps((float*)pDst, vDst[0]); 380 _mm_store_ps((float*)pDst+4, vDst[1]); 381 _mm_store_ps((float*)pDst+8, vDst[2]); 382 _mm_store_ps((float*)pDst+12, vDst[3]); 383 _mm_store_ps((float*)pDst+16, vDst[4]); 384 _mm_store_ps((float*)pDst+20, vDst[5]); 385 _mm_store_ps((float*)pDst+24, vDst[6]); 386 _mm_store_ps((float*)pDst+28, vDst[7]); 387#else 388#error Unsupported vector width 389#endif 390 } 391}; 392 393////////////////////////////////////////////////////////////////////////// 394/// Transpose32_32_32 395////////////////////////////////////////////////////////////////////////// 396struct Transpose32_32_32 397{ 398 ////////////////////////////////////////////////////////////////////////// 399 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 
400 /// @param pSrc - source data in SOA form 401 /// @param pDst - output data in AOS form 402 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 403 { 404#if KNOB_SIMD_WIDTH == 8 405 simdscalar src0 = _simd_load_ps((const float*)pSrc); 406 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 407 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 408 409 __m128 vDst[8]; 410 vTranspose3x8(vDst, src0, src1, src2); 411 _mm_store_ps((float*)pDst, vDst[0]); 412 _mm_store_ps((float*)pDst + 4, vDst[1]); 413 _mm_store_ps((float*)pDst + 8, vDst[2]); 414 _mm_store_ps((float*)pDst + 12, vDst[3]); 415 _mm_store_ps((float*)pDst + 16, vDst[4]); 416 _mm_store_ps((float*)pDst + 20, vDst[5]); 417 _mm_store_ps((float*)pDst + 24, vDst[6]); 418 _mm_store_ps((float*)pDst + 28, vDst[7]); 419#else 420#error Unsupported vector width 421#endif 422 } 423}; 424 425////////////////////////////////////////////////////////////////////////// 426/// Transpose32_32 427////////////////////////////////////////////////////////////////////////// 428struct Transpose32_32 429{ 430 ////////////////////////////////////////////////////////////////////////// 431 /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 
432 /// @param pSrc - source data in SOA form 433 /// @param pDst - output data in AOS form 434 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 435 { 436 const float* pfSrc = (const float*)pSrc; 437 __m128 src_r0 = _mm_load_ps(pfSrc + 0); 438 __m128 src_r1 = _mm_load_ps(pfSrc + 4); 439 __m128 src_g0 = _mm_load_ps(pfSrc + 8); 440 __m128 src_g1 = _mm_load_ps(pfSrc + 12); 441 442 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); 443 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); 444 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); 445 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); 446 447 float* pfDst = (float*)pDst; 448 _mm_store_ps(pfDst + 0, dst0); 449 _mm_store_ps(pfDst + 4, dst1); 450 _mm_store_ps(pfDst + 8, dst2); 451 _mm_store_ps(pfDst + 12, dst3); 452 } 453}; 454 455////////////////////////////////////////////////////////////////////////// 456/// Transpose16_16_16_16 457////////////////////////////////////////////////////////////////////////// 458struct Transpose16_16_16_16 459{ 460 ////////////////////////////////////////////////////////////////////////// 461 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 
462 /// @param pSrc - source data in SOA form 463 /// @param pDst - output data in AOS form 464 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 465 { 466#if KNOB_SIMD_WIDTH == 8 467 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 468 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); 469 470 __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 471 __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 472 __m128i src_b = _mm256_extractf128_si256(src_ba, 0); 473 __m128i src_a = _mm256_extractf128_si256(src_ba, 1); 474 475 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 476 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 477 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 478 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 479 480 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 481 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 482 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 483 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 484 485 _mm_store_si128(((__m128i*)pDst) + 0, dst0); 486 _mm_store_si128(((__m128i*)pDst) + 1, dst1); 487 _mm_store_si128(((__m128i*)pDst) + 2, dst2); 488 _mm_store_si128(((__m128i*)pDst) + 3, dst3); 489#else 490#error Unsupported vector width 491#endif 492 } 493}; 494 495////////////////////////////////////////////////////////////////////////// 496/// Transpose16_16_16 497////////////////////////////////////////////////////////////////////////// 498struct Transpose16_16_16 499{ 500 ////////////////////////////////////////////////////////////////////////// 501 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 
502 /// @param pSrc - source data in SOA form 503 /// @param pDst - output data in AOS form 504 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 505 { 506#if KNOB_SIMD_WIDTH == 8 507 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 508 509 __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 510 __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 511 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); 512 __m128i src_a = _mm_undefined_si128(); 513 514 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 515 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 516 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 517 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 518 519 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 520 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 521 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 522 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 523 524 _mm_store_si128(((__m128i*)pDst) + 0, dst0); 525 _mm_store_si128(((__m128i*)pDst) + 1, dst1); 526 _mm_store_si128(((__m128i*)pDst) + 2, dst2); 527 _mm_store_si128(((__m128i*)pDst) + 3, dst3); 528#else 529#error Unsupported vector width 530#endif 531 } 532}; 533 534////////////////////////////////////////////////////////////////////////// 535/// Transpose16_16 536////////////////////////////////////////////////////////////////////////// 537struct Transpose16_16 538{ 539 ////////////////////////////////////////////////////////////////////////// 540 /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 
541 /// @param pSrc - source data in SOA form 542 /// @param pDst - output data in AOS form 543 INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) 544 { 545 simdscalar src = _simd_load_ps((const float*)pSrc); 546 547#if KNOB_SIMD_WIDTH == 8 548 __m128 comp0 = _mm256_castps256_ps128(src); 549 __m128 comp1 = _mm256_extractf128_ps(src, 1); 550 551 __m128i comp0i = _mm_castps_si128(comp0); 552 __m128i comp1i = _mm_castps_si128(comp1); 553 554 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); 555 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); 556 557 _mm_store_si128((__m128i*)pDst, resLo); 558 _mm_store_si128((__m128i*)pDst + 1, resHi); 559#else 560#error Unsupported vector width 561#endif 562 } 563}; 564 565////////////////////////////////////////////////////////////////////////// 566/// Transpose24_8 567////////////////////////////////////////////////////////////////////////// 568struct Transpose24_8 569{ 570 ////////////////////////////////////////////////////////////////////////// 571 /// @brief Performs an SOA to AOS conversion for packed 24_8 data. 572 /// @param pSrc - source data in SOA form 573 /// @param pDst - output data in AOS form 574 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 575}; 576 577////////////////////////////////////////////////////////////////////////// 578/// Transpose32_8_24 579////////////////////////////////////////////////////////////////////////// 580struct Transpose32_8_24 581{ 582 ////////////////////////////////////////////////////////////////////////// 583 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. 
584 /// @param pSrc - source data in SOA form 585 /// @param pDst - output data in AOS form 586 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 587}; 588 589 590 591////////////////////////////////////////////////////////////////////////// 592/// Transpose4_4_4_4 593////////////////////////////////////////////////////////////////////////// 594struct Transpose4_4_4_4 595{ 596 ////////////////////////////////////////////////////////////////////////// 597 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. 598 /// @param pSrc - source data in SOA form 599 /// @param pDst - output data in AOS form 600 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 601}; 602 603////////////////////////////////////////////////////////////////////////// 604/// Transpose5_6_5 605////////////////////////////////////////////////////////////////////////// 606struct Transpose5_6_5 607{ 608 ////////////////////////////////////////////////////////////////////////// 609 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 610 /// @param pSrc - source data in SOA form 611 /// @param pDst - output data in AOS form 612 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 613}; 614 615////////////////////////////////////////////////////////////////////////// 616/// Transpose9_9_9_5 617////////////////////////////////////////////////////////////////////////// 618struct Transpose9_9_9_5 619{ 620 ////////////////////////////////////////////////////////////////////////// 621 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 
622 /// @param pSrc - source data in SOA form 623 /// @param pDst - output data in AOS form 624 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 625}; 626 627////////////////////////////////////////////////////////////////////////// 628/// Transpose5_5_5_1 629////////////////////////////////////////////////////////////////////////// 630struct Transpose5_5_5_1 631{ 632 ////////////////////////////////////////////////////////////////////////// 633 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 634 /// @param pSrc - source data in SOA form 635 /// @param pDst - output data in AOS form 636 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 637}; 638 639////////////////////////////////////////////////////////////////////////// 640/// Transpose10_10_10_2 641////////////////////////////////////////////////////////////////////////// 642struct Transpose10_10_10_2 643{ 644 ////////////////////////////////////////////////////////////////////////// 645 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. 646 /// @param pSrc - source data in SOA form 647 /// @param pDst - output data in AOS form 648 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 649}; 650 651////////////////////////////////////////////////////////////////////////// 652/// Transpose11_11_10 653////////////////////////////////////////////////////////////////////////// 654struct Transpose11_11_10 655{ 656 ////////////////////////////////////////////////////////////////////////// 657 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
658 /// @param pSrc - source data in SOA form 659 /// @param pDst - output data in AOS form 660 static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; 661}; 662 663// helper function to unroll loops 664template<int Begin, int End, int Step = 1> 665struct UnrollerL { 666 template<typename Lambda> 667 INLINE static void step(Lambda& func) { 668 func(Begin); 669 UnrollerL<Begin + Step, End, Step>::step(func); 670 } 671}; 672 673template<int End, int Step> 674struct UnrollerL<End, End, Step> { 675 template<typename Lambda> 676 static void step(Lambda& func) { 677 } 678}; 679 680// general CRC compute 681INLINE 682uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) 683{ 684#if defined(_WIN64) || defined(__x86_64__) 685 uint32_t sizeInQwords = size / sizeof(uint64_t); 686 uint32_t sizeRemainderBytes = size % sizeof(uint64_t); 687 uint64_t* pDataWords = (uint64_t*)pData; 688 for (uint32_t i = 0; i < sizeInQwords; ++i) 689 { 690 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); 691 } 692#else 693 uint32_t sizeInDwords = size / sizeof(uint32_t); 694 uint32_t sizeRemainderBytes = size % sizeof(uint32_t); 695 uint32_t* pDataWords = (uint32_t*)pData; 696 for (uint32_t i = 0; i < sizeInDwords; ++i) 697 { 698 crc = _mm_crc32_u32(crc, *pDataWords++); 699 } 700#endif 701 702 BYTE* pRemainderBytes = (BYTE*)pDataWords; 703 for (uint32_t i = 0; i < sizeRemainderBytes; ++i) 704 { 705 crc = _mm_crc32_u8(crc, *pRemainderBytes++); 706 } 707 708 return crc; 709} 710 711////////////////////////////////////////////////////////////////////////// 712/// Add byte offset to any-type pointer 713////////////////////////////////////////////////////////////////////////// 714template <typename T> 715INLINE 716static T* PtrAdd(T* p, intptr_t offset) 717{ 718 intptr_t intp = reinterpret_cast<intptr_t>(p); 719 return reinterpret_cast<T*>(intp + offset); 720} 721 722////////////////////////////////////////////////////////////////////////// 723/// Is a power-of-2? 
724////////////////////////////////////////////////////////////////////////// 725template <typename T> 726INLINE 727static bool IsPow2(T value) 728{ 729 return value == (value & (0 - value)); 730} 731 732////////////////////////////////////////////////////////////////////////// 733/// Align down to specified alignment 734/// Note: IsPow2(alignment) MUST be true 735////////////////////////////////////////////////////////////////////////// 736template <typename T1, typename T2> 737INLINE 738static T1 AlignDownPow2(T1 value, T2 alignment) 739{ 740 SWR_ASSERT(IsPow2(alignment)); 741 return value & ~T1(alignment - 1); 742} 743 744////////////////////////////////////////////////////////////////////////// 745/// Align up to specified alignment 746/// Note: IsPow2(alignment) MUST be true 747////////////////////////////////////////////////////////////////////////// 748template <typename T1, typename T2> 749INLINE 750static T1 AlignUpPow2(T1 value, T2 alignment) 751{ 752 return AlignDownPow2(value + T1(alignment - 1), alignment); 753} 754 755////////////////////////////////////////////////////////////////////////// 756/// Align up ptr to specified alignment 757/// Note: IsPow2(alignment) MUST be true 758////////////////////////////////////////////////////////////////////////// 759template <typename T1, typename T2> 760INLINE 761static T1* AlignUpPow2(T1* value, T2 alignment) 762{ 763 return reinterpret_cast<T1*>( 764 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); 765} 766 767////////////////////////////////////////////////////////////////////////// 768/// Align down to specified alignment 769////////////////////////////////////////////////////////////////////////// 770template <typename T1, typename T2> 771INLINE 772static T1 AlignDown(T1 value, T2 alignment) 773{ 774 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } 775 return value - T1(value % alignment); 776} 777 
778////////////////////////////////////////////////////////////////////////// 779/// Align down to specified alignment 780////////////////////////////////////////////////////////////////////////// 781template <typename T1, typename T2> 782INLINE 783static T1* AlignDown(T1* value, T2 alignment) 784{ 785 return (T1*)AlignDown(uintptr_t(value), alignment); 786} 787 788////////////////////////////////////////////////////////////////////////// 789/// Align up to specified alignment 790/// Note: IsPow2(alignment) MUST be true 791////////////////////////////////////////////////////////////////////////// 792template <typename T1, typename T2> 793INLINE 794static T1 AlignUp(T1 value, T2 alignment) 795{ 796 return AlignDown(value + T1(alignment - 1), alignment); 797} 798 799////////////////////////////////////////////////////////////////////////// 800/// Align up to specified alignment 801/// Note: IsPow2(alignment) MUST be true 802////////////////////////////////////////////////////////////////////////// 803template <typename T1, typename T2> 804INLINE 805static T1* AlignUp(T1* value, T2 alignment) 806{ 807 return AlignDown(PtrAdd(value, alignment - 1), alignment); 808} 809 810////////////////////////////////////////////////////////////////////////// 811/// Helper structure used to access an array of elements that don't 812/// correspond to a typical word size. 
//////////////////////////////////////////////////////////////////////////
/// @tparam T - type returned by operator[]
/// @tparam BitsPerElementT - width of one element in bits; must divide the
///         bit width of size_t exactly (enforced by static_assert)
/// @tparam ArrayLenT - number of elements stored
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    // Built by shifting an all-ones word right rather than shifting 1 left:
    // the shift count stays below the word width even when one element
    // fills a whole word, where (size_t(1) << BitsPerElementT) is undefined.
    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    // Packed storage, zero-initialized.
    size_t m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one packed element.
    /// @param elementIndex - index of the element; not range-checked
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};