utils.h revision 33fa4c99f7fa68fd8c33c75c4fe66c4cca76779f
1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file utils.h
24*
25* @brief Utilities used by SWR core.
26*
27******************************************************************************/
28#pragma once
29
30#include <string.h>
31#include <type_traits>
32#include <algorithm>
33#include "common/os.h"
34#include "common/simdintrin.h"
35#include "common/swr_assert.h"
36#include "core/api.h"
37
38#if defined(_WIN64) || defined(__x86_64__)
39#define _MM_INSERT_EPI64 _mm_insert_epi64
40#define _MM_EXTRACT_EPI64 _mm_extract_epi64
41#else
42INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
43{
44    OSALIGNLINE(uint32_t) elems[4];
45    _mm_store_si128((__m128i*)elems, a);
46    if (ndx == 0)
47    {
48        uint64_t foo = elems[0];
49        foo |= (uint64_t)elems[1] << 32;
50        return foo;
51    }
52    else
53    {
54        uint64_t foo = elems[2];
55        foo |= (uint64_t)elems[3] << 32;
56        return foo;
57    }
58}
59
60INLINE __m128i  _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
61{
62    OSALIGNLINE(int64_t) elems[2];
63    _mm_store_si128((__m128i*)elems, a);
64    if (ndx == 0)
65    {
66        elems[0] = b;
67    }
68    else
69    {
70        elems[1] = b;
71    }
72    __m128i out;
73    out = _mm_load_si128((const __m128i*)elems);
74    return out;
75}
76#endif
77
78struct simdBBox
79{
80    simdscalari ymin;
81    simdscalari ymax;
82    simdscalari xmin;
83    simdscalari xmax;
84};
85
86INLINE
87void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88{
89    __m128i row0i = _mm_castps_si128(row0);
90    __m128i row1i = _mm_castps_si128(row1);
91    __m128i row2i = _mm_castps_si128(row2);
92    __m128i row3i = _mm_castps_si128(row3);
93
94    __m128i vTemp = row2i;
95    row2i = _mm_unpacklo_epi32(row2i, row3i);
96    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97
98    row3i = row0i;
99    row0i = _mm_unpacklo_epi32(row0i, row1i);
100    row3i = _mm_unpackhi_epi32(row3i, row1i);
101
102    row1i = row0i;
103    row0i = _mm_unpacklo_epi64(row0i, row2i);
104    row1i = _mm_unpackhi_epi64(row1i, row2i);
105
106    row2i = row3i;
107    row2i = _mm_unpacklo_epi64(row2i, vTemp);
108    row3i = _mm_unpackhi_epi64(row3i, vTemp);
109
110    row0 = _mm_castsi128_ps(row0i);
111    row1 = _mm_castsi128_ps(row1i);
112    row2 = _mm_castsi128_ps(row2i);
113    row3 = _mm_castsi128_ps(row3i);
114}
115
116INLINE
117void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118{
119    __m128i vTemp = row2;
120    row2 = _mm_unpacklo_epi32(row2, row3);
121    vTemp = _mm_unpackhi_epi32(vTemp, row3);
122
123    row3 = row0;
124    row0 = _mm_unpacklo_epi32(row0, row1);
125    row3 = _mm_unpackhi_epi32(row3, row1);
126
127    row1 = row0;
128    row0 = _mm_unpacklo_epi64(row0, row2);
129    row1 = _mm_unpackhi_epi64(row1, row2);
130
131    row2 = row3;
132    row2 = _mm_unpacklo_epi64(row2, vTemp);
133    row3 = _mm_unpackhi_epi64(row3, vTemp);
134}
135
// GCC version folded into one comparable integer: major * 10000 + minor * 100 + patch.
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// For clang and for GCC older than 4.9, map the "undefined value" intrinsics
// onto their zero-initialising counterparts (presumably because those
// toolchains do not provide them - confirm before narrowing the guard).
// Every _mm*_undefined_* intrinsic used in this file needs an entry here,
// including _mm256_undefined_si256 which Transpose16_16_16::Transpose_16 calls.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#define _mm256_undefined_si256 _mm256_setzero_si256
#endif
#endif
147
148#if KNOB_SIMD_WIDTH == 8
149INLINE
150void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
151{
152    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
153    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5
154    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
155    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
156
157    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
158    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6yw77
159    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
160    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
161
162    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
163    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
164    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
165    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
166
167    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
168    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
169    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
170    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
171}
172
173INLINE
174void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
175{
176    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
177    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
178    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
179    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
180
181    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
182    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
183    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
184    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
185
186    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190
191    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195}
196
197INLINE
198void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
199{
200    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
201    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
202    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
203    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
204    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
205    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
206    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
207    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
208    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
209    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
210    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
211    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
212    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
213    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
214    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
215    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
216    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
217    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
218    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
219    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
220    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
221    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
222    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
223    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
224}
225
226INLINE
227void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
228{
229    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
230        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
231}
232#endif
233
234//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
236//////////////////////////////////////////////////////////////////////////
237template<uint32_t bpp>
238struct TransposeSingleComponent
239{
240    //////////////////////////////////////////////////////////////////////////
241    /// @brief Pass-thru for single component.
242    /// @param pSrc - source data in SOA form
243    /// @param pDst - output data in AOS form
244    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
245    {
246        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
247    }
248#if ENABLE_AVX512_SIMD16
249
250    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
251    {
252        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
253    }
254#endif
255};
256
257//////////////////////////////////////////////////////////////////////////
258/// Transpose8_8_8_8
259//////////////////////////////////////////////////////////////////////////
260struct Transpose8_8_8_8
261{
262    //////////////////////////////////////////////////////////////////////////
263    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
264    /// @param pSrc - source data in SOA form
265    /// @param pDst - output data in AOS form
266    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
267    {
268        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
269
270#if KNOB_SIMD_WIDTH == 8
271#if KNOB_ARCH == KNOB_ARCH_AVX
272        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
273        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
274        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
275        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
276        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
277        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
278        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
279        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
280        _mm_store_si128((__m128i*)pDst, c0123lo);
281        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
282#elif KNOB_ARCH == KNOB_ARCH_AVX2
283        simdscalari dst01 = _mm256_shuffle_epi8(src,
284            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
285        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
286        dst23 = _mm256_shuffle_epi8(dst23,
287            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
288        simdscalari dst = _mm256_or_si256(dst01, dst23);
289        _simd_store_si((simdscalari*)pDst, dst);
290#endif
291#else
292#error Unsupported vector width
293#endif
294    }
295#if ENABLE_AVX512_SIMD16
296
297    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
298    {
299        simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
300
301        simd16scalari mask0 = _simd16_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
302
303        simd16scalari dst01 = _simd16_shuffle_epi8(src, mask0);
304
305        simd16scalari perm1 = _simd16_permute2f128_si(src, src, 1);
306
307        simd16scalari mask1 = _simd16_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
308
309        simd16scalari dst23 = _simd16_shuffle_epi8(perm1, mask1);
310
311        simd16scalari dst = _simd16_or_si(dst01, dst23);
312
313        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);
314    }
315#endif
316};
317
318//////////////////////////////////////////////////////////////////////////
319/// Transpose8_8_8
320//////////////////////////////////////////////////////////////////////////
321struct Transpose8_8_8
322{
323    //////////////////////////////////////////////////////////////////////////
324    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
325    /// @param pSrc - source data in SOA form
326    /// @param pDst - output data in AOS form
327    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
328#if ENABLE_AVX512_SIMD16
329
330    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
331#endif
332};
333
334//////////////////////////////////////////////////////////////////////////
335/// Transpose8_8
336//////////////////////////////////////////////////////////////////////////
337struct Transpose8_8
338{
339    //////////////////////////////////////////////////////////////////////////
340    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
341    /// @param pSrc - source data in SOA form
342    /// @param pDst - output data in AOS form
343    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
344    {
345#if KNOB_SIMD_WIDTH == 8
346        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
347
348        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
349        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
350        rg = _mm_unpacklo_epi8(rg, g);
351        _mm_store_si128((__m128i*)pDst, rg);
352#else
353#error Unsupported vector width
354#endif
355    }
356#if ENABLE_AVX512_SIMD16
357
358    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
359    {
360        simdscalari r = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));     // rrrrrrrrrrrrrrrrgggggggggggggggg
361
362        simdscalari g = _simd_permute2f128_si(r, r, 1);                                 // ggggggggggggggggxxxxxxxxxxxxxxxx
363
364        r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1);   // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
365
366        g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1);   // ggggggggxxxxxxxxggggggggxxxxxxxx
367
368        simdscalari dst = _simd_unpacklo_epi8(r, g);                                    // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
369
370        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
371    }
372#endif
373};
374
375//////////////////////////////////////////////////////////////////////////
376/// Transpose32_32_32_32
377//////////////////////////////////////////////////////////////////////////
378struct Transpose32_32_32_32
379{
380    //////////////////////////////////////////////////////////////////////////
381    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
382    /// @param pSrc - source data in SOA form
383    /// @param pDst - output data in AOS form
384    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
385    {
386#if KNOB_SIMD_WIDTH == 8
387        simdscalar src0 = _simd_load_ps((const float*)pSrc);
388        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
389        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
390        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
391
392        __m128 vDst[8];
393        vTranspose4x8(vDst, src0, src1, src2, src3);
394        _mm_store_ps((float*)pDst, vDst[0]);
395        _mm_store_ps((float*)pDst+4, vDst[1]);
396        _mm_store_ps((float*)pDst+8, vDst[2]);
397        _mm_store_ps((float*)pDst+12, vDst[3]);
398        _mm_store_ps((float*)pDst+16, vDst[4]);
399        _mm_store_ps((float*)pDst+20, vDst[5]);
400        _mm_store_ps((float*)pDst+24, vDst[6]);
401        _mm_store_ps((float*)pDst+28, vDst[7]);
402#else
403#error Unsupported vector width
404#endif
405    }
406#if ENABLE_AVX512_SIMD16
407
408    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
409    {
410        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
411        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
412        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
413        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
414
415        __m128 vDst[8];
416
417        vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0));
418
419        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, reinterpret_cast<simd16scalar *>(vDst)[0]);
420        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
421
422        vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1));
423
424        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
425        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
426    }
427#endif
428};
429
430//////////////////////////////////////////////////////////////////////////
431/// Transpose32_32_32
432//////////////////////////////////////////////////////////////////////////
433struct Transpose32_32_32
434{
435    //////////////////////////////////////////////////////////////////////////
436    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
437    /// @param pSrc - source data in SOA form
438    /// @param pDst - output data in AOS form
439    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
440    {
441#if KNOB_SIMD_WIDTH == 8
442        simdscalar src0 = _simd_load_ps((const float*)pSrc);
443        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
444        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
445
446        __m128 vDst[8];
447        vTranspose3x8(vDst, src0, src1, src2);
448        _mm_store_ps((float*)pDst, vDst[0]);
449        _mm_store_ps((float*)pDst + 4, vDst[1]);
450        _mm_store_ps((float*)pDst + 8, vDst[2]);
451        _mm_store_ps((float*)pDst + 12, vDst[3]);
452        _mm_store_ps((float*)pDst + 16, vDst[4]);
453        _mm_store_ps((float*)pDst + 20, vDst[5]);
454        _mm_store_ps((float*)pDst + 24, vDst[6]);
455        _mm_store_ps((float*)pDst + 28, vDst[7]);
456#else
457#error Unsupported vector width
458#endif
459    }
460#if ENABLE_AVX512_SIMD16
461
462    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
463    {
464        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
465        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
466        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
467
468        __m128 vDst[8];
469
470        vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0));
471
472        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, reinterpret_cast<simd16scalar *>(vDst)[0]);
473        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
474
475        vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));
476
477        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[0]);
478        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[1]);
479    }
480#endif
481};
482
483//////////////////////////////////////////////////////////////////////////
484/// Transpose32_32
485//////////////////////////////////////////////////////////////////////////
486struct Transpose32_32
487{
488    //////////////////////////////////////////////////////////////////////////
489    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
490    /// @param pSrc - source data in SOA form
491    /// @param pDst - output data in AOS form
492    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
493    {
494#if KNOB_SIMD_WIDTH == 8
495        const float* pfSrc = (const float*)pSrc;
496        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
497        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
498        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
499        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
500
501        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
502        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
503        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
504        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
505
506        float* pfDst = (float*)pDst;
507        _mm_store_ps(pfDst + 0, dst0);
508        _mm_store_ps(pfDst + 4, dst1);
509        _mm_store_ps(pfDst + 8, dst2);
510        _mm_store_ps(pfDst + 12, dst3);
511#else
512#error Unsupported vector width
513#endif
514    }
515#if ENABLE_AVX512_SIMD16
516
517    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
518    {
519        simdscalar src_r0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
520        simdscalar src_r1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) +  8);
521        simdscalar src_g0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
522        simdscalar src_g1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 24);
523
524        simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0);
525        simdscalar dst1 = _simd_unpacklo_ps(src_r0, src_g0);
526        simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1);
527        simdscalar dst3 = _simd_unpacklo_ps(src_r1, src_g1);
528
529        _simd_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);
530        _simd_store_ps(reinterpret_cast<float *>(pDst) +  8, dst1);
531        _simd_store_ps(reinterpret_cast<float *>(pDst) + 16, dst2);
532        _simd_store_ps(reinterpret_cast<float *>(pDst) + 24, dst3);
533    }
534#endif
535};
536
537//////////////////////////////////////////////////////////////////////////
538/// Transpose16_16_16_16
539//////////////////////////////////////////////////////////////////////////
540struct Transpose16_16_16_16
541{
542    //////////////////////////////////////////////////////////////////////////
543    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
544    /// @param pSrc - source data in SOA form
545    /// @param pDst - output data in AOS form
546    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
547    {
548#if KNOB_SIMD_WIDTH == 8
549        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
550        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
551
552        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
553        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
554        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
555        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
556
557        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
558        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
559        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
560        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
561
562        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
563        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
564        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
565        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
566
567        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
568        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
569        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
570        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
571#else
572#error Unsupported vector width
573#endif
574    }
575#if ENABLE_AVX512_SIMD16
576
577    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
578    {
579        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
580        simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari)));
581
582        simdscalari src_r = _simd16_extract_si(src_rg, 0);
583        simdscalari src_g = _simd16_extract_si(src_rg, 1);
584        simdscalari src_b = _simd16_extract_si(src_ba, 0);
585        simdscalari src_a = _simd16_extract_si(src_ba, 1);
586
587        simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
588        simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
589        simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
590        simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
591
592        simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
593        simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
594        simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
595        simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
596
597        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
598        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
599        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
600        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
601    }
602#endif
603};
604
605//////////////////////////////////////////////////////////////////////////
606/// Transpose16_16_16
607//////////////////////////////////////////////////////////////////////////
608struct Transpose16_16_16
609{
610    //////////////////////////////////////////////////////////////////////////
611    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
612    /// @param pSrc - source data in SOA form
613    /// @param pDst - output data in AOS form
614    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
615    {
616#if KNOB_SIMD_WIDTH == 8
617        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
618
619        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
620        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
621        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
622        __m128i src_a = _mm_undefined_si128();
623
624        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
625        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
626        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
627        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
628
629        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
630        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
631        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
632        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
633
634        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
635        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
636        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
637        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
638#else
639#error Unsupported vector width
640#endif
641    }
642#if ENABLE_AVX512_SIMD16
643
644    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
645    {
646        simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
647
648        simdscalari src_r = _simd16_extract_si(src_rg, 0);
649        simdscalari src_g = _simd16_extract_si(src_rg, 1);
650        simdscalari src_b = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc + sizeof(simd16scalari)));
651        simdscalari src_a = _mm256_undefined_si256();
652
653        simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
654        simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
655        simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
656        simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
657
658        simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
659        simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
660        simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
661        simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
662
663        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
664        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
665        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
666        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
667    }
668#endif
669};
670
671//////////////////////////////////////////////////////////////////////////
672/// Transpose16_16
673//////////////////////////////////////////////////////////////////////////
674struct Transpose16_16
675{
676    //////////////////////////////////////////////////////////////////////////
677    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
678    /// @param pSrc - source data in SOA form
679    /// @param pDst - output data in AOS form
680    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
681    {
682#if KNOB_SIMD_WIDTH == 8
683        simdscalar src = _simd_load_ps((const float*)pSrc);
684
685        __m128 comp0 = _mm256_castps256_ps128(src);
686        __m128 comp1 = _mm256_extractf128_ps(src, 1);
687
688        __m128i comp0i = _mm_castps_si128(comp0);
689        __m128i comp1i = _mm_castps_si128(comp1);
690
691        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
692        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
693
694        _mm_store_si128((__m128i*)pDst, resLo);
695        _mm_store_si128((__m128i*)pDst + 1, resHi);
696#else
697#error Unsupported vector width
698#endif
699    }
700#if ENABLE_AVX512_SIMD16
701
702    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
703    {
704        simd16scalari result = _simd16_setzero_si();
705
706        simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
707
708        simdscalari srclo = _simd16_extract_si(src, 0);
709        simdscalari srchi = _simd16_extract_si(src, 1);
710
711        result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0);
712        result = _simd16_insert_si(result, _simd_unpackhi_epi16(srclo, srchi), 1);
713
714        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result);
715    }
716#endif
717};
718
719//////////////////////////////////////////////////////////////////////////
720/// Transpose24_8
721//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
734
735//////////////////////////////////////////////////////////////////////////
736/// Transpose32_8_24
737//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
750
751//////////////////////////////////////////////////////////////////////////
752/// Transpose4_4_4_4
753//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
766
767//////////////////////////////////////////////////////////////////////////
768/// Transpose5_6_5
769//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
782
783//////////////////////////////////////////////////////////////////////////
784/// Transpose9_9_9_5
785//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
798
799//////////////////////////////////////////////////////////////////////////
800/// Transpose5_5_5_1
801//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
814
815//////////////////////////////////////////////////////////////////////////
816/// Transpose1_5_5_5
817//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); deleted
    /// as well, mirroring the declarations of the other packed-format structs.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
826
827//////////////////////////////////////////////////////////////////////////
828/// Transpose10_10_10_2
829//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
842
843//////////////////////////////////////////////////////////////////////////
844/// Transpose11_11_10
845//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
858
859//////////////////////////////////////////////////////////////////////////
860/// Transpose64
861//////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
874
875//////////////////////////////////////////////////////////////////////////
876/// Transpose64_64
877//////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
890
891//////////////////////////////////////////////////////////////////////////
892/// Transpose64_64_64
893//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
906
907//////////////////////////////////////////////////////////////////////////
908/// Transpose64_64_64_64
909//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion
    ///        Declared as deleted: no implementation is provided here, so
    ///        any call to it is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// SIMD16 variant (only declared when ENABLE_AVX512_SIMD16 is set); also deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
922
// Compile-time loop unroller.
// UnrollerL<Begin, End, Step>::step(func) invokes func(Begin), func(Begin + Step),
// ... and stops once the index equals End (End itself is not visited).
// Recursion only terminates on an exact match, so End must be reachable from
// Begin in whole multiples of Step.
template<int Begin, int End, int Step = 1>
struct UnrollerL
{
    template<typename FuncT>
    INLINE static void step(FuncT& func)
    {
        typedef UnrollerL<Begin + Step, End, Step> NextIteration;

        func(Begin);
        NextIteration::step(func);
    }
};

// Recursion terminator: the index has reached End, so nothing is invoked.
template<int End, int Step>
struct UnrollerL<End, End, Step>
{
    template<typename FuncT>
    static void step(FuncT&)
    {
    }
};
939
// Compile-time loop unroller with a mask to skip specific iterations.
// UnrollerLMask<Begin, End, Step, Mask>::step(func) walks the indices
// Begin, Begin + Step, ... up to (but excluding) End and invokes func(i) only
// when bit i of Mask is set.
// The recursion stays within UnrollerLMask so that Mask is honoured for every
// iteration, not just the first one.
// Note: the bit test uses (1 << i), so indices must be smaller than the bit
// width of int.
template<int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        if(Mask & (1 << Begin))
        {
            func(Begin);
        }
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

// Recursion terminator: the index has reached End, so nothing is invoked.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
959
960// general CRC compute
961INLINE
962uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
963{
964#if defined(_WIN64) || defined(__x86_64__)
965    uint32_t sizeInQwords = size / sizeof(uint64_t);
966    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
967    uint64_t* pDataWords = (uint64_t*)pData;
968    for (uint32_t i = 0; i < sizeInQwords; ++i)
969    {
970        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
971    }
972#else
973    uint32_t sizeInDwords = size / sizeof(uint32_t);
974    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
975    uint32_t* pDataWords = (uint32_t*)pData;
976    for (uint32_t i = 0; i < sizeInDwords; ++i)
977    {
978        crc = _mm_crc32_u32(crc, *pDataWords++);
979    }
980#endif
981
982    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
983    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
984    {
985        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
986    }
987
988    return crc;
989}
990
//////////////////////////////////////////////////////////////////////////
/// Offsets a pointer of any type by a (possibly negative) number of BYTES,
/// as opposed to built-in pointer arithmetic which steps in sizeof(T) units.
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static T* PtrAdd(T* p, intptr_t offset)
{
    return reinterpret_cast<T*>(reinterpret_cast<intptr_t>(p) + offset);
}
1001
//////////////////////////////////////////////////////////////////////////
/// Is a power-of-2?
/// Isolates the lowest set bit of the value and checks that nothing else
/// is set.  Note that a value of 0 also reports true.
//////////////////////////////////////////////////////////////////////////
template <typename T>
INLINE
static bool IsPow2(T value)
{
    const auto lowestSetBit = value & (0 - value);
    return lowestSetBit == value;
}
1011
1012//////////////////////////////////////////////////////////////////////////
1013/// Align down to specified alignment
1014/// Note: IsPow2(alignment) MUST be true
1015//////////////////////////////////////////////////////////////////////////
1016template <typename T1, typename T2>
1017INLINE
1018static T1 AlignDownPow2(T1 value, T2 alignment)
1019{
1020    SWR_ASSERT(IsPow2(alignment));
1021    return value & ~T1(alignment - 1);
1022}
1023
1024//////////////////////////////////////////////////////////////////////////
1025/// Align up to specified alignment
1026/// Note: IsPow2(alignment) MUST be true
1027//////////////////////////////////////////////////////////////////////////
1028template <typename T1, typename T2>
1029INLINE
1030static T1 AlignUpPow2(T1 value, T2 alignment)
1031{
1032    return AlignDownPow2(value + T1(alignment - 1), alignment);
1033}
1034
1035//////////////////////////////////////////////////////////////////////////
1036/// Align up ptr to specified alignment
1037/// Note: IsPow2(alignment) MUST be true
1038//////////////////////////////////////////////////////////////////////////
1039template <typename T1, typename T2>
1040INLINE
1041static T1* AlignUpPow2(T1* value, T2 alignment)
1042{
1043    return reinterpret_cast<T1*>(
1044        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
1045}
1046
1047//////////////////////////////////////////////////////////////////////////
1048/// Align down to specified alignment
1049//////////////////////////////////////////////////////////////////////////
1050template <typename T1, typename T2>
1051INLINE
1052static T1 AlignDown(T1 value, T2 alignment)
1053{
1054    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
1055    return value - T1(value % alignment);
1056}
1057
1058//////////////////////////////////////////////////////////////////////////
1059/// Align down to specified alignment
1060//////////////////////////////////////////////////////////////////////////
1061template <typename T1, typename T2>
1062INLINE
1063static T1* AlignDown(T1* value, T2 alignment)
1064{
1065    return (T1*)AlignDown(uintptr_t(value), alignment);
1066}
1067
1068//////////////////////////////////////////////////////////////////////////
1069/// Align up to specified alignment
1070/// Note: IsPow2(alignment) MUST be true
1071//////////////////////////////////////////////////////////////////////////
1072template <typename T1, typename T2>
1073INLINE
1074static T1 AlignUp(T1 value, T2 alignment)
1075{
1076    return AlignDown(value + T1(alignment - 1), alignment);
1077}
1078
1079//////////////////////////////////////////////////////////////////////////
1080/// Align up to specified alignment
1081/// Note: IsPow2(alignment) MUST be true
1082//////////////////////////////////////////////////////////////////////////
1083template <typename T1, typename T2>
1084INLINE
1085static T1* AlignUp(T1* value, T2 alignment)
1086{
1087    return AlignDown(PtrAdd(value, alignment - 1), alignment);
1088}
1089
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// Elements are BitsPerElementT bits wide and are packed into an array of
/// size_t words, with lower element indices occupying the less significant
/// bits of each word.  BitsPerElementT must divide the word width evenly
/// (a full-word element is allowed).
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Low BitsPerElementT bits set.  Built by shifting an all-ones word right
    // instead of shifting 1 left, so the shift count stays below the word
    // width even when an element occupies a whole word.
    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    // Backing storage, zero-initialized.
    size_t              m_words[NUM_WORDS] = {};

public:

    /// @brief Reads the element stored at the given index.
    /// @param elementIndex - index of the element; not range-checked
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
1117
// Ranged integer argument for TemplateArgUnroller.
// TMin and TMax are carried in the type; val holds the runtime value that
// TemplateArgUnroller compares against them.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value; presumably expected to lie in [TMin, TMax] - not checked here
};
1124
// Recursive template used to auto-nest conditionals.  Converts dynamic function
// arguments (booleans and ranged integers) to static template arguments.
//
// TermT must provide a nested FuncType and a static template GetFunc<...>();
// the result of TermT::GetFunc is what every overload below eventually returns.
// ArgsB accumulates the type-encoded arguments resolved so far:
//   - a bool becomes std::true_type / std::false_type
//   - an IntArg<TMin, TMax> becomes std::integral_constant<uint32_t, N>
// Each overload consumes the first runtime argument, appends its static
// encoding to ArgsB, and forwards any remaining arguments to the next level.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    // Only one argument left: append its encoding and ask TermT for the function.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    // Append the encoding of bArg and continue with the remaining arguments.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Tests the value against TMax; on a mismatch the search continues with the
    // range narrowed to [TMin, TMax - 1], i.e. candidates are tried from TMax downwards.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        // NOTE(review): presumably SWR_ASSUME marks this path as unreachable - confirm its semantics.
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range (TMin == TMax): the only remaining candidate is TVal,
    // so it is used after asserting that the runtime value matches.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    // Same downward search as above, but once the value is matched the
    // remaining arguments are forwarded to the next level.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        // NOTE(review): presumably SWR_ASSUME marks this path as unreachable - confirm its semantics.
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range (TMin == TMax) with further arguments to process.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1203
1204
1205