// utils.h revision 812b45d04958e31e7a3bfc7331308374e8b73afa
1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file utils.h
24*
25* @brief Utilities used by SWR core.
26*
27******************************************************************************/
28#pragma once
29
30#include <string.h>
31#include <type_traits>
32#include <algorithm>
33#include "common/os.h"
34#include "common/simdintrin.h"
35#include "common/swr_assert.h"
36
37#if defined(_WIN64) || defined(__x86_64__)
38#define _MM_INSERT_EPI64 _mm_insert_epi64
39#define _MM_EXTRACT_EPI64 _mm_extract_epi64
40#else
41INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
42{
43    OSALIGNLINE(uint32_t) elems[4];
44    _mm_store_si128((__m128i*)elems, a);
45    if (ndx == 0)
46    {
47        uint64_t foo = elems[0];
48        foo |= (uint64_t)elems[1] << 32;
49        return foo;
50    }
51    else
52    {
53        uint64_t foo = elems[2];
54        foo |= (uint64_t)elems[3] << 32;
55        return foo;
56    }
57}
58
59INLINE __m128i  _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
60{
61    OSALIGNLINE(int64_t) elems[2];
62    _mm_store_si128((__m128i*)elems, a);
63    if (ndx == 0)
64    {
65        elems[0] = b;
66    }
67    else
68    {
69        elems[1] = b;
70    }
71    __m128i out;
72    out = _mm_load_si128((const __m128i*)elems);
73    return out;
74}
75#endif
76
77OSALIGNLINE(struct) BBOX
78{
79    int top{ 0 };
80    int bottom{ 0 };
81    int left{ 0 };
82    int right{ 0 };
83
84    BBOX() {}
85    BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
86
87    bool operator==(const BBOX& rhs)
88    {
89        return (this->top == rhs.top &&
90            this->bottom == rhs.bottom &&
91            this->left == rhs.left &&
92            this->right == rhs.right);
93    }
94
95    bool operator!=(const BBOX& rhs)
96    {
97        return !(*this == rhs);
98    }
99
100    BBOX& Intersect(const BBOX& other)
101    {
102        this->top = std::max(this->top, other.top);
103        this->bottom = std::min(this->bottom, other.bottom);
104        this->left = std::max(this->left, other.left);
105        this->right = std::min(this->right, other.right);
106
107        if (right - left < 0 ||
108            bottom - top < 0)
109        {
110            // Zero area
111            top = bottom = left = right = 0;
112        }
113
114        return *this;
115    }
116};
117
// SIMD-wide bounding box: one 32-bit integer extent per SIMD lane for each
// of the four edges (SOA layout, simdscalari = project SIMD integer vector).
struct simdBBox
{
    simdscalari top;
    simdscalari bottom;
    simdscalari left;
    simdscalari right;
};
125
126INLINE
127void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
128{
129    __m128i row0i = _mm_castps_si128(row0);
130    __m128i row1i = _mm_castps_si128(row1);
131    __m128i row2i = _mm_castps_si128(row2);
132    __m128i row3i = _mm_castps_si128(row3);
133
134    __m128i vTemp = row2i;
135    row2i = _mm_unpacklo_epi32(row2i, row3i);
136    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
137
138    row3i = row0i;
139    row0i = _mm_unpacklo_epi32(row0i, row1i);
140    row3i = _mm_unpackhi_epi32(row3i, row1i);
141
142    row1i = row0i;
143    row0i = _mm_unpacklo_epi64(row0i, row2i);
144    row1i = _mm_unpackhi_epi64(row1i, row2i);
145
146    row2i = row3i;
147    row2i = _mm_unpacklo_epi64(row2i, vTemp);
148    row3i = _mm_unpackhi_epi64(row3i, vTemp);
149
150    row0 = _mm_castsi128_ps(row0i);
151    row1 = _mm_castsi128_ps(row1i);
152    row2 = _mm_castsi128_ps(row2i);
153    row3 = _mm_castsi128_ps(row3i);
154}
155
156INLINE
157void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
158{
159    __m128i vTemp = row2;
160    row2 = _mm_unpacklo_epi32(row2, row3);
161    vTemp = _mm_unpackhi_epi32(vTemp, row3);
162
163    row3 = row0;
164    row0 = _mm_unpacklo_epi32(row0, row1);
165    row3 = _mm_unpackhi_epi32(row3, row1);
166
167    row1 = row0;
168    row0 = _mm_unpacklo_epi64(row0, row2);
169    row1 = _mm_unpackhi_epi64(row1, row2);
170
171    row2 = row3;
172    row2 = _mm_unpacklo_epi64(row2, vTemp);
173    row3 = _mm_unpackhi_epi64(row3, vTemp);
174}
175
176#define GCC_VERSION (__GNUC__ * 10000 \
177                     + __GNUC_MINOR__ * 100 \
178                     + __GNUC_PATCHLEVEL__)
179
180#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
181#define _mm_undefined_ps _mm_setzero_ps
182#define _mm_undefined_si128 _mm_setzero_si128
183#if KNOB_SIMD_WIDTH == 8
184#define _mm256_undefined_ps _mm256_setzero_ps
185#endif
186#endif
187
188#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
INLINE
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
{
    // Transpose 3 SOA component vectors (x, y, z across 8 lanes) into 8
    // AOS __m128 outputs, one per lane.  The w slot of each output is
    // don't-care, hence the _mm256_undefined_ps() operand.
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6y7w7
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7

    // Low 128-bit halves hold lanes 0-3...
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    // ...high 128-bit halves hold lanes 4-7.
    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}
212
213INLINE
214void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
215{
216    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
217    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
218    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
219    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
220
221    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
222    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
223    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
224    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
225
226    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
227    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
228    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
229    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
230
231    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
232    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
233    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
234    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
235}
236
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
    // 8x8 transpose of 32-bit elements: vDst[i] receives element i of each
    // of the 8 input rows.  Three stages: 32-bit unpack, 64-bit shuffle,
    // then 128-bit lane permute to cross the AVX lane boundary.
    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
    // Combine 64-bit pairs within each 128-bit lane.
    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    // 0x20 selects (low(a), low(b)); 0x31 selects (high(a), high(b)).
    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}
265
266INLINE
267void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
268{
269    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
270        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
271}
272#endif
273
274//////////////////////////////////////////////////////////////////////////
275/// TranposeSingleComponent
276//////////////////////////////////////////////////////////////////////////
277template<uint32_t bpp>
278struct TransposeSingleComponent
279{
280    //////////////////////////////////////////////////////////////////////////
281    /// @brief Pass-thru for single component.
282    /// @param pSrc - source data in SOA form
283    /// @param pDst - output data in AOS form
284    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
285    {
286        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
287    }
288};
289
290//////////////////////////////////////////////////////////////////////////
291/// Transpose8_8_8_8
292//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    ///        Input: four contiguous 8-byte component planes (r, g, b, a);
    ///        output: interleaved rgba bytes, one dword per SIMD lane.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1: no 256-bit byte shuffle, so interleave with 128-bit unpacks.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: per-128-bit-lane byte shuffle (0x80 lanes produce zero),
        // cross-lane permute for the other half, then OR the halves together.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#elif KNOB_SIMD_WIDTH == 16
        // 16-wide: same shuffle/permute/OR scheme via the _simd_* wrappers.
        simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);

        simdscalari dst01 = _simd_shuffle_epi8(src, mask0);

        simdscalari perm1 = _simd_permute_128(src, src, 1);

        simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);

        simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);

        simdscalari dst = _simd_or_si(dst01, dst23);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
343
344//////////////////////////////////////////////////////////////////////////
345/// Transpose8_8_8
346//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    ///        Deleted: no SIMD transpose is implemented for this format,
    ///        so any instantiation is a compile-time error.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
355
356//////////////////////////////////////////////////////////////////////////
357/// Transpose8_8
358//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    ///        Input: two contiguous byte planes (r then g); output: rg
    ///        byte pairs, one per SIMD lane.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        // Only the low 128 bits hold data (8 r bytes + 8 g bytes).
        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);
        _mm_store_si128((__m128i*)pDst, rg);
#elif KNOB_SIMD_WIDTH == 16
        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc));   // rrrrrrrrrrrrrrrrgggggggggggggggg
        // Duplicate the r and g halves into matching 128-bit lanes so a
        // single in-lane unpack produces the full interleave.
        __m256i r = _mm256_permute4x64_epi64(src, 0x50);    // 0x50 = 01010000b     // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

        __m256i g = _mm256_permute4x64_epi64(src, 0xFA);    // 0xFA = 11111010b     // ggggggggxxxxxxxxggggggggxxxxxxxx

        __m256i dst = _mm256_unpacklo_epi8(r, g);                                   // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
#else
#error Unsupported vector width
#endif
    }
};
389
390//////////////////////////////////////////////////////////////////////////
391/// Transpose32_32_32_32
392//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    ///        Input: four contiguous float planes; output: one xyzw
    ///        float4 per SIMD lane.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);

        __m128 vDst[8];
        vTranspose4x8(vDst, src0, src1, src2, src3);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst+4, vDst[1]);
        _mm_store_ps((float*)pDst+8, vDst[2]);
        _mm_store_ps((float*)pDst+12, vDst[3]);
        _mm_store_ps((float*)pDst+16, vDst[4]);
        _mm_store_ps((float*)pDst+20, vDst[5]);
        _mm_store_ps((float*)pDst+24, vDst[6]);
        _mm_store_ps((float*)pDst+28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: each simdscalar is a lo/hi pair of __m256, so
        // run the 8-lane transpose twice, once per half.
        // NOTE(review): no non-emulation path exists here — with
        // KNOB_SIMD_WIDTH == 16 and emulation disabled this function is a
        // silent no-op; presumably native code lives elsewhere — verify.
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
        simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);

        __m128 vDst[8];

        vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
#else
#error Unsupported vector width
#endif
    }
};
453
454//////////////////////////////////////////////////////////////////////////
455/// Transpose32_32_32
456//////////////////////////////////////////////////////////////////////////
struct Transpose32_32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    ///        Input: three contiguous float planes; output: one float4 per
    ///        SIMD lane with an undefined w component (vTranspose3x8).
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src0 = _simd_load_ps((const float*)pSrc);
        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);

        __m128 vDst[8];
        vTranspose3x8(vDst, src0, src1, src2);
        _mm_store_ps((float*)pDst, vDst[0]);
        _mm_store_ps((float*)pDst + 4, vDst[1]);
        _mm_store_ps((float*)pDst + 8, vDst[2]);
        _mm_store_ps((float*)pDst + 12, vDst[3]);
        _mm_store_ps((float*)pDst + 16, vDst[4]);
        _mm_store_ps((float*)pDst + 20, vDst[5]);
        _mm_store_ps((float*)pDst + 24, vDst[6]);
        _mm_store_ps((float*)pDst + 28, vDst[7]);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: transpose the lo and hi __m256 halves
        // separately with the 8-lane helper.
        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);

        __m128 vDst[8];

        vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);

        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);

        vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);

        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
#endif
#else
#error Unsupported vector width
#endif
    }
};
515
516//////////////////////////////////////////////////////////////////////////
517/// Transpose32_32
518//////////////////////////////////////////////////////////////////////////
519struct Transpose32_32
520{
521    //////////////////////////////////////////////////////////////////////////
522    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
523    /// @param pSrc - source data in SOA form
524    /// @param pDst - output data in AOS form
525    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
526    {
527#if KNOB_SIMD_WIDTH == 8
528        const float* pfSrc = (const float*)pSrc;
529        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
530        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
531        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
532        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
533
534        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
535        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
536        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
537        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
538
539        float* pfDst = (float*)pDst;
540        _mm_store_ps(pfDst + 0, dst0);
541        _mm_store_ps(pfDst + 4, dst1);
542        _mm_store_ps(pfDst + 8, dst2);
543        _mm_store_ps(pfDst + 12, dst3);
544#elif KNOB_SIMD_WIDTH == 16
545        const float* pfSrc = (const float*)pSrc;
546        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
547        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
548        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
549        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);
550
551        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
552        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
553        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
554        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);
555
556        float* pfDst = (float*)pDst;
557        _mm256_store_ps(pfDst + 0, dst0);
558        _mm256_store_ps(pfDst + 8, dst1);
559        _mm256_store_ps(pfDst + 16, dst2);
560        _mm256_store_ps(pfDst + 24, dst3);
561#else
562#error Unsupported vector width
563#endif
564    }
565};
566
567//////////////////////////////////////////////////////////////////////////
568/// Transpose16_16_16_16
569//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    ///        Input: four 16-bit component planes packed into two SIMD
    ///        registers (r|g then b|a); output: one rgba 64-bit group per
    ///        SIMD lane.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        // Split each register into its two component halves.
        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // 16-bit interleave r/g and b/a, then 32-bit interleave the pairs.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: the lo/hi halves of each pair register are the
        // individual component planes; same interleave with __m256i.
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
        simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = src_ba.lo;
        __m256i src_a = src_ba.hi;

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
};
631
632//////////////////////////////////////////////////////////////////////////
633/// Transpose16_16_16
634//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    ///        Same layout as 16_16_16_16 but the fourth (alpha) component
    ///        source is undefined, so the a slots of the output hold
    ///        garbage — presumably callers ignore the fourth channel.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        // Don't-care lane: alpha bytes in the output are undefined.
        __m128i src_a = _mm_undefined_si128();

        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512 variant of the same interleave.
        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));

        __m256i src_r = src_rg.lo;
        __m256i src_g = src_rg.hi;
        __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
        // Don't-care lane: alpha slots of the output are undefined.
        __m256i src_a = _mm256_undefined_si256();

        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);

        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);

        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
#endif
#else
#error Unsupported vector width
#endif
    }
};
694
695//////////////////////////////////////////////////////////////////////////
696/// Transpose16_16
697//////////////////////////////////////////////////////////////////////////
struct Transpose16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    ///        Input: two 16-bit component planes packed into one SIMD
    ///        register (r in the low half, g in the high half); output:
    ///        interleaved rg pairs.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalar src = _simd_load_ps((const float*)pSrc);

        // Low 128 bits = r plane, high 128 bits = g plane.
        __m128 comp0 = _mm256_castps256_ps128(src);
        __m128 comp1 = _mm256_extractf128_ps(src, 1);

        __m128i comp0i = _mm_castps_si128(comp0);
        __m128i comp1i = _mm_castps_si128(comp1);

        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);

        _mm_store_si128((__m128i*)pDst, resLo);
        _mm_store_si128((__m128i*)pDst + 1, resHi);
#elif KNOB_SIMD_WIDTH == 16
#if ENABLE_AVX512_EMULATION
        // Emulated AVX-512: lo half = r plane, hi half = g plane.
        simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));

        simdscalari result;

        result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
        result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
#endif
#else
#error Unsupported vector width
#endif
    }
};
736
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
748
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
760
761
762
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
774
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
786
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
798
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
810
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
822
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    ///        Deleted: no conversion is implemented for this format, so any
    ///        call site referencing it fails at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
834
835// helper function to unroll loops
836template<int Begin, int End, int Step = 1>
837struct UnrollerL {
838    template<typename Lambda>
839    INLINE static void step(Lambda& func) {
840        func(Begin);
841        UnrollerL<Begin + Step, End, Step>::step(func);
842    }
843};
844
845template<int End, int Step>
846struct UnrollerL<End, End, Step> {
847    template<typename Lambda>
848    static void step(Lambda& func) {
849    }
850};
851
// helper function to unroll loops, with mask to skip specific iterations
template<int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask {
    //////////////////////////////////////////////////////////////////////////
    /// @brief Invokes func(Begin) only when bit 'Begin' is set in Mask, then
    ///        recurses over the remaining indices [Begin + Step, End).
    ///        The recursion must stay inside UnrollerLMask so that Mask is
    ///        propagated; the previous code recursed into plain UnrollerL,
    ///        which applied the mask to the first iteration only and ran
    ///        every subsequent iteration unconditionally.
    template<typename Lambda>
    inline static void step(Lambda& func) {
        if(Mask & (1 << Begin))
        {
            func(Begin);
        }
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

// Recursion terminator — selected once the running index reaches End.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
871
872// general CRC compute
873INLINE
874uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
875{
876#if defined(_WIN64) || defined(__x86_64__)
877    uint32_t sizeInQwords = size / sizeof(uint64_t);
878    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
879    uint64_t* pDataWords = (uint64_t*)pData;
880    for (uint32_t i = 0; i < sizeInQwords; ++i)
881    {
882        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
883    }
884#else
885    uint32_t sizeInDwords = size / sizeof(uint32_t);
886    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
887    uint32_t* pDataWords = (uint32_t*)pData;
888    for (uint32_t i = 0; i < sizeInDwords; ++i)
889    {
890        crc = _mm_crc32_u32(crc, *pDataWords++);
891    }
892#endif
893
894    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
895    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
896    {
897        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
898    }
899
900    return crc;
901}
902
903//////////////////////////////////////////////////////////////////////////
904/// Add byte offset to any-type pointer
905//////////////////////////////////////////////////////////////////////////
906template <typename T>
907INLINE
908static T* PtrAdd(T* p, intptr_t offset)
909{
910    intptr_t intp = reinterpret_cast<intptr_t>(p);
911    return reinterpret_cast<T*>(intp + offset);
912}
913
914//////////////////////////////////////////////////////////////////////////
915/// Is a power-of-2?
916//////////////////////////////////////////////////////////////////////////
917template <typename T>
918INLINE
919static bool IsPow2(T value)
920{
921    return value == (value & (0 - value));
922}
923
924//////////////////////////////////////////////////////////////////////////
925/// Align down to specified alignment
926/// Note: IsPow2(alignment) MUST be true
927//////////////////////////////////////////////////////////////////////////
928template <typename T1, typename T2>
929INLINE
930static T1 AlignDownPow2(T1 value, T2 alignment)
931{
932    SWR_ASSERT(IsPow2(alignment));
933    return value & ~T1(alignment - 1);
934}
935
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUpPow2(T1 value, T2 alignment)
{
    // Bump value just past the previous boundary, then snap down; the
    // pow2 requirement is asserted inside AlignDownPow2.
    return AlignDownPow2(value + T1(alignment - 1), alignment);
}
946
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: IsPow2(alignment) MUST be true
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUpPow2(T1* value, T2 alignment)
{
    // Do the arithmetic on the address as an integer, then restore the
    // pointer type; the pow2 requirement is asserted inside AlignDownPow2.
    return reinterpret_cast<T1*>(
        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
}
958
//////////////////////////////////////////////////////////////////////////
/// Align down to specified alignment
/// Note: alignment does NOT need to be a power of 2; a pow2 fast path is
/// taken when possible, otherwise modulo arithmetic is used.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignDown(T1 value, T2 alignment)
{
    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
    return value - T1(value % alignment);
}
969
970//////////////////////////////////////////////////////////////////////////
971/// Align down to specified alignment
972//////////////////////////////////////////////////////////////////////////
973template <typename T1, typename T2>
974INLINE
975static T1* AlignDown(T1* value, T2 alignment)
976{
977    return (T1*)AlignDown(uintptr_t(value), alignment);
978}
979
//////////////////////////////////////////////////////////////////////////
/// Align up to specified alignment
/// Note: alignment does NOT need to be a power of 2 — AlignDown handles
/// non-pow2 alignments via modulo arithmetic.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1 AlignUp(T1 value, T2 alignment)
{
    // Bump value just past the previous boundary, then snap down.
    return AlignDown(value + T1(alignment - 1), alignment);
}
990
//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: alignment does NOT need to be a power of 2 — AlignDown handles
/// non-pow2 alignments via modulo arithmetic.
//////////////////////////////////////////////////////////////////////////
template <typename T1, typename T2>
INLINE
static T1* AlignUp(T1* value, T2 alignment)
{
    // Advance the pointer alignment-1 bytes, then snap down.
    return AlignDown(PtrAdd(value, alignment - 1), alignment);
}
1001
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// @tparam T               - type returned when reading an element
/// @tparam BitsPerElementT - width of each packed element, in bits; must
///                           evenly divide the machine word size
/// @tparam ArrayLenT       - number of elements in the array
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    // Round up so a partially-filled trailing word still gets storage.
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t              m_words[NUM_WORDS] = {};

public:

    /// @brief Reads the element at elementIndex out of the packed storage.
    T operator[] (size_t elementIndex) const
    {
        // Fetch the word holding the element, shift the element down to
        // bit 0, mask off its neighbors, and convert to the caller's type.
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
1029
// Ranged integer argument for TemplateArgUnroller
// Carries a runtime value expected to lie within [TMin, TMax]; the unroller
// below searches that range to recover the value as a compile-time constant
// (it asserts / assumes-unreachable when the value is outside the range).
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};
1036
// Recursive template used to auto-nest conditionals.  Converts dynamic boolean function
// arguments to static template arguments.
//
// TermT supplies the terminal: TermT::template GetFunc<Args...>() must return
// a TermT::FuncType for every combination of static arguments this unroller
// can produce.  ArgsB accumulates the static types chosen so far.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    // Converts the final runtime bool into std::true_type / std::false_type
    // and requests the fully-instantiated function from TermT.
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    // Appends this bool's static equivalent to ArgsB and recurses on the
    // remaining runtime arguments.
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Linear search: compares iArg.val against TMax and walks the range
    // downward (instantiating one overload per candidate) until it matches.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        SWR_ASSUME(false); return nullptr;  // unreachable when TMin <= iArg.val <= TMax
    }
    // Single-value range: chosen by overload resolution when TMin == TMax.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    // Same linear search as above, with further runtime arguments still to
    // be converted after this one.
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        SWR_ASSUME(false); return nullptr;  // unreachable when TMin <= iArg.val <= TMax
    }
    // Single-value range with remaining args; see the overload above.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1115
1116
1117