1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file utils.h
24*
25* @brief Utilities used by SWR core.
26*
27******************************************************************************/
28#pragma once
29
30#include <string.h>
31#include <type_traits>
32#include <algorithm>
33#include "common/os.h"
34#include "common/simdintrin.h"
35#include "common/swr_assert.h"
36#include "core/api.h"
37
38#if defined(_WIN64) || defined(__x86_64__)
39#define _MM_INSERT_EPI64 _mm_insert_epi64
40#define _MM_EXTRACT_EPI64 _mm_extract_epi64
41#else
42INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
43{
44    OSALIGNLINE(uint32_t) elems[4];
45    _mm_store_si128((__m128i*)elems, a);
46    if (ndx == 0)
47    {
48        uint64_t foo = elems[0];
49        foo |= (uint64_t)elems[1] << 32;
50        return foo;
51    }
52    else
53    {
54        uint64_t foo = elems[2];
55        foo |= (uint64_t)elems[3] << 32;
56        return foo;
57    }
58}
59
60INLINE __m128i  _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
61{
62    OSALIGNLINE(int64_t) elems[2];
63    _mm_store_si128((__m128i*)elems, a);
64    if (ndx == 0)
65    {
66        elems[0] = b;
67    }
68    else
69    {
70        elems[1] = b;
71    }
72    __m128i out;
73    out = _mm_load_si128((const __m128i*)elems);
74    return out;
75}
76#endif
77
/// @brief SIMD bounding-box extents. Each member is a SIMD integer vector,
///        so this holds one y/x min/max set per SIMD lane — presumably one
///        box per lane of work (verify against users of this struct).
struct simdBBox
{
    simdscalari ymin;   // per-lane minimum y
    simdscalari ymax;   // per-lane maximum y
    simdscalari xmin;   // per-lane minimum x
    simdscalari xmax;   // per-lane maximum x
};
85
86INLINE
87void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88{
89    __m128i row0i = _mm_castps_si128(row0);
90    __m128i row1i = _mm_castps_si128(row1);
91    __m128i row2i = _mm_castps_si128(row2);
92    __m128i row3i = _mm_castps_si128(row3);
93
94    __m128i vTemp = row2i;
95    row2i = _mm_unpacklo_epi32(row2i, row3i);
96    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97
98    row3i = row0i;
99    row0i = _mm_unpacklo_epi32(row0i, row1i);
100    row3i = _mm_unpackhi_epi32(row3i, row1i);
101
102    row1i = row0i;
103    row0i = _mm_unpacklo_epi64(row0i, row2i);
104    row1i = _mm_unpackhi_epi64(row1i, row2i);
105
106    row2i = row3i;
107    row2i = _mm_unpacklo_epi64(row2i, vTemp);
108    row3i = _mm_unpackhi_epi64(row3i, vTemp);
109
110    row0 = _mm_castsi128_ps(row0i);
111    row1 = _mm_castsi128_ps(row1i);
112    row2 = _mm_castsi128_ps(row2i);
113    row3 = _mm_castsi128_ps(row3i);
114}
115
116INLINE
117void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118{
119    __m128i vTemp = row2;
120    row2 = _mm_unpacklo_epi32(row2, row3);
121    vTemp = _mm_unpackhi_epi32(vTemp, row3);
122
123    row3 = row0;
124    row0 = _mm_unpacklo_epi32(row0, row1);
125    row3 = _mm_unpackhi_epi32(row3, row1);
126
127    row1 = row0;
128    row0 = _mm_unpacklo_epi64(row0, row2);
129    row1 = _mm_unpackhi_epi64(row1, row2);
130
131    row2 = row3;
132    row2 = _mm_unpacklo_epi64(row2, vTemp);
133    row3 = _mm_unpackhi_epi64(row3, vTemp);
134}
135
136#define GCC_VERSION (__GNUC__ * 10000 \
137                     + __GNUC_MINOR__ * 100 \
138                     + __GNUC_PATCHLEVEL__)
139
140#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
141#define _mm_undefined_ps _mm_setzero_ps
142#define _mm_undefined_si128 _mm_setzero_si128
143#if KNOB_SIMD_WIDTH == 8
144#define _mm256_undefined_ps _mm256_setzero_ps
145#endif
146#endif
147
148#if KNOB_SIMD_WIDTH == 8
//////////////////////////////////////////////////////////////////////////
/// @brief SOA->AOS transpose of 3 AVX rows (x, y, z) into 8 4-wide
///        vertices, one per element of vDst. The w lane of each output
///        is undefined (its unpack partner is _mm256_undefined_ps()).
/// @param vDst  - receives the 8 transposed xyz? vectors
/// @param vSrc0 - x components of 8 vertices
/// @param vSrc1 - y components of 8 vertices
/// @param vSrc2 - z components of 8 vertices
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
{
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0?0y1?1 y4?4y5?5 (w undefined)
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2?2y3?3 y6?6y7?7 (w undefined)
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7

    // low 128-bit halves hold vertices 0..3
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    // high 128-bit halves hold vertices 4..7
    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}
172
173INLINE
174void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
175{
176    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
177    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
178    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
179    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
180
181    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
182    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
183    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
184    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
185
186    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190
191    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195}
196
197#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief SOA->AOS transpose of 4 simd16 rows (r, g, b, a) into 4
///        destination vectors, each holding 4 consecutive rgba quads.
/// @param dst  - receives the transposed data
/// @param src0..src3 - r, g, b, a components of 16 elements
INLINE
void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
{
    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking

    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a

    // interleave r/b and g/a, then r-b/g-a to produce full rgba quads
    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);

    dst[0] = _simd16_unpacklo_ps(rblo, galo);
    dst[1] = _simd16_unpackhi_ps(rblo, galo);
    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}
218
219#endif
//////////////////////////////////////////////////////////////////////////
/// @brief 8x8 transpose of eight AVX float rows: vDst[i] receives
///        element i of each input row (canonical unpack / shuffle /
///        permute2f128 AVX transpose sequence).
/// @param vDst - receives the 8 transposed rows
/// @param vMask0..vMask7 - the 8 source rows
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
{
    // stage 1: interleave adjacent row pairs at 32-bit granularity
    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
    // stage 2: combine pairs at 64-bit granularity within each 128-bit lane
    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    // stage 3: swap 128-bit lanes to finish the transpose
    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}
248
249INLINE
250void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
251{
252    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
253        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
254}
255#endif
256
257//////////////////////////////////////////////////////////////////////////
258/// TranposeSingleComponent
259//////////////////////////////////////////////////////////////////////////
260template<uint32_t bpp>
261struct TransposeSingleComponent
262{
263    //////////////////////////////////////////////////////////////////////////
264    /// @brief Pass-thru for single component.
265    /// @param pSrc - source data in SOA form
266    /// @param pDst - output data in AOS form
267    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
268    {
269        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
270    }
271#if ENABLE_AVX512_SIMD16
272
273    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
274    {
275        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
276    }
277#endif
278};
279
280//////////////////////////////////////////////////////////////////////////
281/// Transpose8_8_8_8
282//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    /// @param pSrc - source data in SOA form (consecutive r, g, b, a byte
    ///               planes, KNOB_SIMD_WIDTH bytes each)
    /// @param pDst - output data in AOS form (interleaved rgba bytes)
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 lacks a 256-bit byte shuffle: split into 128-bit halves
        // and interleave with successive epi8/epi16 unpacks.
        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: per-lane byte shuffles with 0x80 (zero) entries place each
        // half's bytes in their final positions; OR merges the two halves.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);  // swap 128-bit lanes so b/a reach both halves
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant: widen each byte plane to 32 bits, shift
    ///        each channel into its byte position, and OR them together.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);

        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);

        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
    }
#endif
};
342
343//////////////////////////////////////////////////////////////////////////
344/// Transpose8_8_8
345//////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 8_8_8 data is intentionally
    ///        unsupported: the functions are deleted so any attempted use
    ///        fails at compile time rather than at runtime.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
358
359//////////////////////////////////////////////////////////////////////////
360/// Transpose8_8
361//////////////////////////////////////////////////////////////////////////
struct Transpose8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    /// @param pSrc - source data in SOA form (r plane then g plane, 8 bytes each)
    /// @param pDst - output data in AOS form (interleaved rg bytes)
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
        rg = _mm_unpacklo_epi8(rg, g);                      // rgrgrgrg rgrgrgrg
        _mm_store_si128((__m128i*)pDst, rg);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant: widen bytes to 16 bits and merge via shift+or.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg

        simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
        simdscalari cvt1 = _simd_cvtepu8_epi16(src1);

        // epi32 shift is safe here: cvtepu8_epi16 zero-extends each byte, so
        // every 16-bit element is < 256 and no bits cross a 16-bit boundary
        // when shifted left by 8 within the 32-bit lanes.
        simdscalari shl1 = _simd_slli_epi32(cvt1, 8);

        simdscalari dst = _simd_or_si(cvt0, shl1);

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
    }
#endif
};
399
400//////////////////////////////////////////////////////////////////////////
401/// Transpose32_32_32_32
402//////////////////////////////////////////////////////////////////////////
403struct Transpose32_32_32_32
404{
405    //////////////////////////////////////////////////////////////////////////
406    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
407    /// @param pSrc - source data in SOA form
408    /// @param pDst - output data in AOS form
409    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
410    {
411#if KNOB_SIMD_WIDTH == 8
412        simdscalar src0 = _simd_load_ps((const float*)pSrc);
413        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
414        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
415        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
416
417        __m128 vDst[8];
418        vTranspose4x8(vDst, src0, src1, src2, src3);
419        _mm_store_ps((float*)pDst, vDst[0]);
420        _mm_store_ps((float*)pDst+4, vDst[1]);
421        _mm_store_ps((float*)pDst+8, vDst[2]);
422        _mm_store_ps((float*)pDst+12, vDst[3]);
423        _mm_store_ps((float*)pDst+16, vDst[4]);
424        _mm_store_ps((float*)pDst+20, vDst[5]);
425        _mm_store_ps((float*)pDst+24, vDst[6]);
426        _mm_store_ps((float*)pDst+28, vDst[7]);
427#else
428#error Unsupported vector width
429#endif
430    }
431#if ENABLE_AVX512_SIMD16
432
433    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
434    {
435        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
436        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
437        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
438        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
439
440        simd16scalar dst[4];
441
442        vTranspose4x16(dst, src0, src1, src2, src3);
443
444        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
445        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
446        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
447        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
448    }
449#endif
450};
451
452//////////////////////////////////////////////////////////////////////////
453/// Transpose32_32_32
454//////////////////////////////////////////////////////////////////////////
455struct Transpose32_32_32
456{
457    //////////////////////////////////////////////////////////////////////////
458    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
459    /// @param pSrc - source data in SOA form
460    /// @param pDst - output data in AOS form
461    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
462    {
463#if KNOB_SIMD_WIDTH == 8
464        simdscalar src0 = _simd_load_ps((const float*)pSrc);
465        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
466        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
467
468        __m128 vDst[8];
469        vTranspose3x8(vDst, src0, src1, src2);
470        _mm_store_ps((float*)pDst, vDst[0]);
471        _mm_store_ps((float*)pDst + 4, vDst[1]);
472        _mm_store_ps((float*)pDst + 8, vDst[2]);
473        _mm_store_ps((float*)pDst + 12, vDst[3]);
474        _mm_store_ps((float*)pDst + 16, vDst[4]);
475        _mm_store_ps((float*)pDst + 20, vDst[5]);
476        _mm_store_ps((float*)pDst + 24, vDst[6]);
477        _mm_store_ps((float*)pDst + 28, vDst[7]);
478#else
479#error Unsupported vector width
480#endif
481    }
482#if ENABLE_AVX512_SIMD16
483
484    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
485    {
486        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
487        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
488        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
489        simd16scalar src3 = _simd16_setzero_ps();
490
491        simd16scalar dst[4];
492
493        vTranspose4x16(dst, src0, src1, src2, src3);
494
495        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
496        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
497        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
498        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
499    }
500#endif
501};
502
503//////////////////////////////////////////////////////////////////////////
504/// Transpose32_32
505//////////////////////////////////////////////////////////////////////////
struct Transpose32_32
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    /// @param pSrc - source data in SOA form (r plane then g plane, 8 floats each)
    /// @param pDst - output data in AOS form (interleaved rg pairs)
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        const float* pfSrc = (const float*)pSrc;
        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
        __m128 src_g1 = _mm_load_ps(pfSrc + 12);

        // interleave r and g: unpacklo/hi of matching halves yields rgrg pairs
        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);

        float* pfDst = (float*)pDst;
        _mm_store_ps(pfDst + 0, dst0);
        _mm_store_ps(pfDst + 4, dst1);
        _mm_store_ps(pfDst + 8, dst2);
        _mm_store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant: unpack then fix up 128-bit lane ordering
    ///        with two rounds of permute2f128.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg

        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF

        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF

        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF

        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
    }
#endif
};
556
557//////////////////////////////////////////////////////////////////////////
558/// Transpose16_16_16_16
559//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    /// @param pSrc - source data in SOA form (r, g, b, a planes of 8
    ///               16-bit values each)
    /// @param pDst - output data in AOS form (interleaved rgba 16-bit quads)
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        // split the two 256-bit loads into the four 128-bit channel planes
        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // interleave 16-bit r/g and b/a pairs, then 32-bit rg/ba pairs
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant: unpack 16- then 32-bit pairs, then fix up
    ///        128-bit lane order with permute2f128.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa

        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF

        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rgba0 rgba1 rgba8 rgba9
        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rgba2 rgba3 rgbaA rgbaB
        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rgba4 rgba5 rgbaC rgbaD
        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rgba6 rgba7 rgbaE rgbaF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rgba0 rgba1 rgba2 rgba3
        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rgba4 rgba5 rgba6 rgba7
        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rgba8 rgba9 rgbaA rgbaB
        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rgbaC rgbaD rgbaE rgbaF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
    }
#endif
};
626
627//////////////////////////////////////////////////////////////////////////
628/// Transpose16_16_16
629//////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    /// @param pSrc - source data in SOA form (r, g planes in one simd
    ///               register's worth, then a b plane)
    /// @param pDst - output data in AOS form; the fourth 16-bit slot of
    ///               each output quad is undefined (_mm_undefined_si128)
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
        __m128i src_a = _mm_undefined_si128();   // no alpha plane; contents don't matter

        // interleave 16-bit r/g and b/a pairs, then 32-bit rg/ba pairs
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    /// @brief 16-wide variant; the missing alpha plane is zero-filled
    ///        (unlike the SIMD8 path, which leaves it undefined).
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
        simdscalari src3 = _simd_setzero_si();                                                      // zero-filled alpha

        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF

        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rgba0 rgba1 rgba8 rgba9
        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rgba2 rgba3 rgbaA rgbaB
        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rgba4 rgba5 rgbaC rgbaD
        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rgba6 rgba7 rgbaE rgbaF

        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rgba0 rgba1 rgba2 rgba3
        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rgba4 rgba5 rgba6 rgba7
        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rgba8 rgba9 rgbaA rgbaB
        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rgbaC rgbaD rgbaE rgbaF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
    }
#endif
};
695
696//////////////////////////////////////////////////////////////////////////
697/// Transpose16_16
698//////////////////////////////////////////////////////////////////////////
699struct Transpose16_16
700{
701    //////////////////////////////////////////////////////////////////////////
702    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
703    /// @param pSrc - source data in SOA form
704    /// @param pDst - output data in AOS form
705    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
706    {
707#if KNOB_SIMD_WIDTH == 8
708        simdscalar src = _simd_load_ps((const float*)pSrc);
709
710        __m128 comp0 = _mm256_castps256_ps128(src);
711        __m128 comp1 = _mm256_extractf128_ps(src, 1);
712
713        __m128i comp0i = _mm_castps_si128(comp0);
714        __m128i comp1i = _mm_castps_si128(comp1);
715
716        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
717        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
718
719        _mm_store_si128((__m128i*)pDst, resLo);
720        _mm_store_si128((__m128i*)pDst + 1, resHi);
721#else
722#error Unsupported vector width
723#endif
724    }
725#if ENABLE_AVX512_SIMD16
726
727    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
728    {
729        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
730        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
731
732        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
733        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
734
735        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
736        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
737
738        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
739        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
740    }
741#endif
742};
743
744//////////////////////////////////////////////////////////////////////////
745/// Transpose24_8
746//////////////////////////////////////////////////////////////////////////
747struct Transpose24_8
748{
749    //////////////////////////////////////////////////////////////////////////
750    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
751    /// @param pSrc - source data in SOA form
752    /// @param pDst - output data in AOS form
753    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
754#if ENABLE_AVX512_SIMD16
755
756    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
757#endif
758};
759
760//////////////////////////////////////////////////////////////////////////
761/// Transpose32_8_24
762//////////////////////////////////////////////////////////////////////////
763struct Transpose32_8_24
764{
765    //////////////////////////////////////////////////////////////////////////
766    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
767    /// @param pSrc - source data in SOA form
768    /// @param pDst - output data in AOS form
769    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
770#if ENABLE_AVX512_SIMD16
771
772    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
773#endif
774};
775
776//////////////////////////////////////////////////////////////////////////
777/// Transpose4_4_4_4
778//////////////////////////////////////////////////////////////////////////
779struct Transpose4_4_4_4
780{
781    //////////////////////////////////////////////////////////////////////////
782    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
783    /// @param pSrc - source data in SOA form
784    /// @param pDst - output data in AOS form
785    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
786#if ENABLE_AVX512_SIMD16
787
788    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
789#endif
790};
791
792//////////////////////////////////////////////////////////////////////////
793/// Transpose5_6_5
794//////////////////////////////////////////////////////////////////////////
795struct Transpose5_6_5
796{
797    //////////////////////////////////////////////////////////////////////////
798    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
799    /// @param pSrc - source data in SOA form
800    /// @param pDst - output data in AOS form
801    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
802#if ENABLE_AVX512_SIMD16
803
804    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
805#endif
806};
807
808//////////////////////////////////////////////////////////////////////////
809/// Transpose9_9_9_5
810//////////////////////////////////////////////////////////////////////////
811struct Transpose9_9_9_5
812{
813    //////////////////////////////////////////////////////////////////////////
814    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
815    /// @param pSrc - source data in SOA form
816    /// @param pDst - output data in AOS form
817    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
818#if ENABLE_AVX512_SIMD16
819
820    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
821#endif
822};
823
824//////////////////////////////////////////////////////////////////////////
825/// Transpose5_5_5_1
826//////////////////////////////////////////////////////////////////////////
827struct Transpose5_5_5_1
828{
829    //////////////////////////////////////////////////////////////////////////
830    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
831    /// @param pSrc - source data in SOA form
832    /// @param pDst - output data in AOS form
833    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
834#if ENABLE_AVX512_SIMD16
835
836    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
837#endif
838};
839
840//////////////////////////////////////////////////////////////////////////
841/// Transpose1_5_5_5
842//////////////////////////////////////////////////////////////////////////
843struct Transpose1_5_5_5
844{
845    //////////////////////////////////////////////////////////////////////////
846    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
847    /// @param pSrc - source data in SOA form
848    /// @param pDst - output data in AOS form
849    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
850};
851
852//////////////////////////////////////////////////////////////////////////
853/// Transpose10_10_10_2
854//////////////////////////////////////////////////////////////////////////
855struct Transpose10_10_10_2
856{
857    //////////////////////////////////////////////////////////////////////////
858    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
859    /// @param pSrc - source data in SOA form
860    /// @param pDst - output data in AOS form
861    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
862#if ENABLE_AVX512_SIMD16
863
864    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
865#endif
866};
867
868//////////////////////////////////////////////////////////////////////////
869/// Transpose11_11_10
870//////////////////////////////////////////////////////////////////////////
871struct Transpose11_11_10
872{
873    //////////////////////////////////////////////////////////////////////////
874    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
875    /// @param pSrc - source data in SOA form
876    /// @param pDst - output data in AOS form
877    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
878#if ENABLE_AVX512_SIMD16
879
880    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
881#endif
882};
883
884//////////////////////////////////////////////////////////////////////////
885/// Transpose64
886//////////////////////////////////////////////////////////////////////////
887struct Transpose64
888{
889    //////////////////////////////////////////////////////////////////////////
890    /// @brief Performs an SOA to AOS conversion
891    /// @param pSrc - source data in SOA form
892    /// @param pDst - output data in AOS form
893    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
894#if ENABLE_AVX512_SIMD16
895
896    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
897#endif
898};
899
900//////////////////////////////////////////////////////////////////////////
901/// Transpose64_64
902//////////////////////////////////////////////////////////////////////////
903struct Transpose64_64
904{
905    //////////////////////////////////////////////////////////////////////////
906    /// @brief Performs an SOA to AOS conversion
907    /// @param pSrc - source data in SOA form
908    /// @param pDst - output data in AOS form
909    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
910#if ENABLE_AVX512_SIMD16
911
912    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
913#endif
914};
915
916//////////////////////////////////////////////////////////////////////////
917/// Transpose64_64_64
918//////////////////////////////////////////////////////////////////////////
919struct Transpose64_64_64
920{
921    //////////////////////////////////////////////////////////////////////////
922    /// @brief Performs an SOA to AOS conversion
923    /// @param pSrc - source data in SOA form
924    /// @param pDst - output data in AOS form
925    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
926#if ENABLE_AVX512_SIMD16
927
928    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
929#endif
930};
931
932//////////////////////////////////////////////////////////////////////////
933/// Transpose64_64_64_64
934//////////////////////////////////////////////////////////////////////////
935struct Transpose64_64_64_64
936{
937    //////////////////////////////////////////////////////////////////////////
938    /// @brief Performs an SOA to AOS conversion
939    /// @param pSrc - source data in SOA form
940    /// @param pDst - output data in AOS form
941    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
942#if ENABLE_AVX512_SIMD16
943
944    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
945#endif
946};
947
948// helper function to unroll loops
949template<int Begin, int End, int Step = 1>
950struct UnrollerL {
951    template<typename Lambda>
952    INLINE static void step(Lambda& func) {
953        func(Begin);
954        UnrollerL<Begin + Step, End, Step>::step(func);
955    }
956};
957
958template<int End, int Step>
959struct UnrollerL<End, End, Step> {
960    template<typename Lambda>
961    static void step(Lambda& func) {
962    }
963};
964
965// helper function to unroll loops, with mask to skip specific iterations
966template<int Begin, int End, int Step = 1, int Mask = 0x7f>
967struct UnrollerLMask {
968    template<typename Lambda>
969    INLINE static void step(Lambda& func) {
970        if(Mask & (1 << Begin))
971        {
972            func(Begin);
973        }
974        UnrollerL<Begin + Step, End, Step>::step(func);
975    }
976};
977
978template<int End, int Step, int Mask>
979struct UnrollerLMask<End, End, Step, Mask> {
980    template<typename Lambda>
981    static void step(Lambda& func) {
982    }
983};
984
985// general CRC compute
986INLINE
987uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
988{
989#if defined(_WIN64) || defined(__x86_64__)
990    uint32_t sizeInQwords = size / sizeof(uint64_t);
991    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
992    uint64_t* pDataWords = (uint64_t*)pData;
993    for (uint32_t i = 0; i < sizeInQwords; ++i)
994    {
995        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
996    }
997#else
998    uint32_t sizeInDwords = size / sizeof(uint32_t);
999    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
1000    uint32_t* pDataWords = (uint32_t*)pData;
1001    for (uint32_t i = 0; i < sizeInDwords; ++i)
1002    {
1003        crc = _mm_crc32_u32(crc, *pDataWords++);
1004    }
1005#endif
1006
1007    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
1008    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
1009    {
1010        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
1011    }
1012
1013    return crc;
1014}
1015
1016//////////////////////////////////////////////////////////////////////////
1017/// Add byte offset to any-type pointer
1018//////////////////////////////////////////////////////////////////////////
1019template <typename T>
1020INLINE
1021static T* PtrAdd(T* p, intptr_t offset)
1022{
1023    intptr_t intp = reinterpret_cast<intptr_t>(p);
1024    return reinterpret_cast<T*>(intp + offset);
1025}
1026
1027//////////////////////////////////////////////////////////////////////////
1028/// Is a power-of-2?
1029//////////////////////////////////////////////////////////////////////////
1030template <typename T>
1031INLINE
1032static bool IsPow2(T value)
1033{
1034    return value == (value & (0 - value));
1035}
1036
1037//////////////////////////////////////////////////////////////////////////
1038/// Align down to specified alignment
1039/// Note: IsPow2(alignment) MUST be true
1040//////////////////////////////////////////////////////////////////////////
1041template <typename T1, typename T2>
1042INLINE
1043static T1 AlignDownPow2(T1 value, T2 alignment)
1044{
1045    SWR_ASSERT(IsPow2(alignment));
1046    return value & ~T1(alignment - 1);
1047}
1048
1049//////////////////////////////////////////////////////////////////////////
1050/// Align up to specified alignment
1051/// Note: IsPow2(alignment) MUST be true
1052//////////////////////////////////////////////////////////////////////////
1053template <typename T1, typename T2>
1054INLINE
1055static T1 AlignUpPow2(T1 value, T2 alignment)
1056{
1057    return AlignDownPow2(value + T1(alignment - 1), alignment);
1058}
1059
1060//////////////////////////////////////////////////////////////////////////
1061/// Align up ptr to specified alignment
1062/// Note: IsPow2(alignment) MUST be true
1063//////////////////////////////////////////////////////////////////////////
1064template <typename T1, typename T2>
1065INLINE
1066static T1* AlignUpPow2(T1* value, T2 alignment)
1067{
1068    return reinterpret_cast<T1*>(
1069        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
1070}
1071
1072//////////////////////////////////////////////////////////////////////////
1073/// Align down to specified alignment
1074//////////////////////////////////////////////////////////////////////////
1075template <typename T1, typename T2>
1076INLINE
1077static T1 AlignDown(T1 value, T2 alignment)
1078{
1079    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
1080    return value - T1(value % alignment);
1081}
1082
1083//////////////////////////////////////////////////////////////////////////
1084/// Align down to specified alignment
1085//////////////////////////////////////////////////////////////////////////
1086template <typename T1, typename T2>
1087INLINE
1088static T1* AlignDown(T1* value, T2 alignment)
1089{
1090    return (T1*)AlignDown(uintptr_t(value), alignment);
1091}
1092
1093//////////////////////////////////////////////////////////////////////////
1094/// Align up to specified alignment
1095/// Note: IsPow2(alignment) MUST be true
1096//////////////////////////////////////////////////////////////////////////
1097template <typename T1, typename T2>
1098INLINE
1099static T1 AlignUp(T1 value, T2 alignment)
1100{
1101    return AlignDown(value + T1(alignment - 1), alignment);
1102}
1103
1104//////////////////////////////////////////////////////////////////////////
1105/// Align up to specified alignment
1106/// Note: IsPow2(alignment) MUST be true
1107//////////////////////////////////////////////////////////////////////////
1108template <typename T1, typename T2>
1109INLINE
1110static T1* AlignUp(T1* value, T2 alignment)
1111{
1112    return AlignDown(PtrAdd(value, alignment - 1), alignment);
1113}
1114
1115//////////////////////////////////////////////////////////////////////////
1116/// Helper structure used to access an array of elements that don't
1117/// correspond to a typical word size.
1118//////////////////////////////////////////////////////////////////////////
1119template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
1120class BitsArray
1121{
1122private:
1123    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
1124    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
1125    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
1126    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
1127
1128    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
1129        "Element size must an integral fraction of pointer size");
1130
1131    size_t              m_words[NUM_WORDS] = {};
1132
1133public:
1134
1135    T operator[] (size_t elementIndex) const
1136    {
1137        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
1138        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
1139        return T(word & ELEMENT_MASK);
1140    }
1141};
1142
// Ranged integer argument for TemplateArgUnroller: carries a runtime value
// so it can be converted into a compile-time template argument drawn from
// the [TMin, TMax] range.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value; expected to lie within [TMin, TMax]
};
1149
// Recursive template used to auto-nest conditionals.  Converts dynamic boolean
// (and ranged-integer) function arguments to static template arguments.
//
// TermT is the terminator type; it is expected to provide:
//   - FuncType: the function-pointer type being selected, and
//   - a static, templated GetFunc<Args...>() that returns the specialization
//     for the accumulated compile-time argument list.
// ArgsB... accumulates the compile-time arguments resolved so far; each
// runtime argument peels off one level of recursion and appends either
// std::true_type/std::false_type or a std::integral_constant<uint32_t, N>.
template <typename TermT, typename... ArgsB>
struct TemplateArgUnroller
{
    //-----------------------------------------
    // Boolean value
    //-----------------------------------------

    // Last Arg Terminator
    static typename TermT::FuncType GetFunc(bool bArg)
    {
        if (bArg)
        {
            return TermT::template GetFunc<ArgsB..., std::true_type>();
        }

        return TermT::template GetFunc<ArgsB..., std::false_type>();
    }

    // Recursively parse args
    template <typename... TArgsT>
    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
    {
        if (bArg)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
        }

        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
    }

    //-----------------------------------------
    // Integer value (within specified range)
    //-----------------------------------------

    // Last Arg Terminator
    // Searches [TMin, TMax] from the top down — one template instantiation per
    // candidate value — until the runtime value is matched.
    template <uint32_t TMin, uint32_t TMax>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
    {
        if (iArg.val == TMax)
        {
            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
        }
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range: more specialized overload, selected by partial
    // ordering when the recursion narrows to TMin == TMax.
    template <uint32_t TVal>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
    }

    // Recursively parse args
    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
    {
        if (iArg.val == TMax)
        {
            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
        }
        if (TMax > TMin)
        {
            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
        }
        SWR_ASSUME(false); return nullptr;
    }
    // Single-value range with further arguments still to process.
    template <uint32_t TVal, typename... TArgsT>
    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
    {
        SWR_ASSERT(iArg.val == TVal);
        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
    }
};
1228
1229
1230