utils.h revision 1b86c050adcb9c166c2aab2f4c6e41cc07686bf3
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file utils.h
*
* @brief Utilities used by SWR core.
*
******************************************************************************/
28c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#pragma once
29c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
30c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include <string.h>
3127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley#include <type_traits>
32812b45d04958e31e7a3bfc7331308374e8b73afaTim Rowley#include <algorithm>
33c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/os.h"
34c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/simdintrin.h"
35c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/swr_assert.h"
360ff57446e3786243c6d752c91be2108595f2663eTim Rowley#include "core/api.h"
37c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// 64-bit lane insert / extract helpers for __m128i.
/// On 64-bit x86 targets the names map straight onto the compiler
/// intrinsics; everywhere else they are emulated by spilling the vector
/// to an aligned scratch array.
//////////////////////////////////////////////////////////////////////////
#if !defined(_WIN64) && !defined(__x86_64__)

/// @brief Emulated extract of one 64-bit lane from a 128-bit integer vector.
/// @param a - source vector
/// @param ndx - 0 selects the low 64 bits; any other value selects the high 64 bits
/// @return the selected 64-bit lane
INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(uint32_t) dwords[4];
    _mm_store_si128((__m128i*)dwords, a);

    // Each 64-bit lane is rebuilt from its two consecutive 32-bit halves.
    const uint32_t first = (ndx == 0) ? 0 : 2;
    const uint64_t lower = dwords[first];
    const uint64_t upper = dwords[first + 1];
    return (upper << 32) | lower;
}

/// @brief Emulated insert of a 64-bit value into one lane of a 128-bit integer vector.
/// @param a - source vector
/// @param b - value to place into the selected lane
/// @param ndx - 0 replaces the low 64 bits; any other value replaces the high 64 bits
/// @return copy of a with the selected lane replaced by b
INLINE __m128i  _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) qwords[2];
    _mm_store_si128((__m128i*)qwords, a);

    qwords[(ndx == 0) ? 0 : 1] = b;

    return _mm_load_si128((const __m128i*)qwords);
}

#else
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#endif
77c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
78c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct simdBBox
79c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
800ff57446e3786243c6d752c91be2108595f2663eTim Rowley    simdscalari ymin;
810ff57446e3786243c6d752c91be2108595f2663eTim Rowley    simdscalari ymax;
820ff57446e3786243c6d752c91be2108595f2663eTim Rowley    simdscalari xmin;
830ff57446e3786243c6d752c91be2108595f2663eTim Rowley    simdscalari xmax;
84c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
85c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
86c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
87c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
89c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row0i = _mm_castps_si128(row0);
90c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row1i = _mm_castps_si128(row1);
91c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row2i = _mm_castps_si128(row2);
92c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row3i = _mm_castps_si128(row3);
93c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
94c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i vTemp = row2i;
95c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2i = _mm_unpacklo_epi32(row2i, row3i);
96c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
98c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3i = row0i;
99c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0i = _mm_unpacklo_epi32(row0i, row1i);
100c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3i = _mm_unpackhi_epi32(row3i, row1i);
101c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
102c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1i = row0i;
103c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0i = _mm_unpacklo_epi64(row0i, row2i);
104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1i = _mm_unpackhi_epi64(row1i, row2i);
105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
106c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2i = row3i;
107c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2i = _mm_unpacklo_epi64(row2i, vTemp);
108c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3i = _mm_unpackhi_epi64(row3i, vTemp);
109c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
110c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0 = _mm_castsi128_ps(row0i);
111c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1 = _mm_castsi128_ps(row1i);
112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = _mm_castsi128_ps(row2i);
113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = _mm_castsi128_ps(row3i);
114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
115c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
116c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
117c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
119c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i vTemp = row2;
120c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = _mm_unpacklo_epi32(row2, row3);
121c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vTemp = _mm_unpackhi_epi32(vTemp, row3);
122c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
123c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = row0;
124c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0 = _mm_unpacklo_epi32(row0, row1);
125c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = _mm_unpackhi_epi32(row3, row1);
126c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
127c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1 = row0;
128c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0 = _mm_unpacklo_epi64(row0, row2);
129c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1 = _mm_unpackhi_epi64(row1, row2);
130c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
131c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = row3;
132c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = _mm_unpacklo_epi64(row2, vTemp);
133c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = _mm_unpackhi_epi64(row3, vTemp);
134c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
135c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
// GCC version folded into one comparable integer
// (major * 10000 + minor * 100 + patchlevel, e.g. 4.9.2 -> 40902).
#define GCC_VERSION ((__GNUC__ * 10000) + (__GNUC_MINOR__ * 100) + __GNUC_PATCHLEVEL__)

// For clang, and for GCC older than 4.9, route the _mm*_undefined_*
// names to the corresponding zeroing intrinsics.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
147c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1481d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16
149c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
150c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
151c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
152c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
153c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5
154c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
155c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
156c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
157c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
158c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6yw77
159c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
160c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
161c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
162c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
163c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
164c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
165c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
166c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
167c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
168c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
169c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
170c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
171c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
172c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
173c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
174c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
175c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
176c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
177c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
178c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
179c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
180c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
181c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
182c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
183c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
184c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
185c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
186c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
191c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
196c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
197c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
198c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
199c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
200c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
201c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
202c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
203c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
204c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
205c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
206c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
207c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
208c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
209c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
210c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
211c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
212c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
213c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
214c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
215c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
216c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
217c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
218c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
219c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
220c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
221c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
222c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
223c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
224c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
225c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
226c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
227c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
228c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
229c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
230c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
231c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
232c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
233c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
234c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
235c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// TranposeSingleComponent
236c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
237c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<uint32_t bpp>
238c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct TransposeSingleComponent
239c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
240c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
241c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Pass-thru for single component.
242c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
243c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
244542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
245c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
246c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
247c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
248c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
249c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
250c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
251c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8_8
252c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
253c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8_8
254c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
255c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
256c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
257c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
258c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
259542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
260c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
261c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
2621d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
263c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
264c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX
265c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
266c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
267c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
268c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
269c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
270c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
271c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
272c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
273c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst, c0123lo);
274c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
275c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#elif KNOB_ARCH == KNOB_ARCH_AVX2
276c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari dst01 = _mm256_shuffle_epi8(src,
277c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
278c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
279c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dst23 = _mm256_shuffle_epi8(dst23,
280c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
281c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari dst = _mm256_or_si256(dst01, dst23);
282c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _simd_store_si((simdscalari*)pDst, dst);
283c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
2841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
2851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800);
2861d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
2871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari dst01 = _simd_shuffle_epi8(src, mask0);
2881d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
2891d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari perm1 = _simd_permute_128(src, src, 1);
2901d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
2911d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080);
2921d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
2931d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1);
2941d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
2951d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari dst = _simd_or_si(dst01, dst23);
2961d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
2971d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
298c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
299c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
300c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
301c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
302c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
303c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
304c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
305c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8
306c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
307c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8
308c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
309c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
310c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
311c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
312c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
313542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
314c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
315c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
316c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
317c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8
318c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
319c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8
320c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
321c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
322c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
323c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
324c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
325542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
326c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
3271d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8
328c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
329c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
330c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
331c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
332c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        rg = _mm_unpacklo_epi8(rg, g);
333c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst, rg);
3341d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
3351d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc));   // rrrrrrrrrrrrrrrrgggggggggggggggg
3361d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3371d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i r = _mm256_permute4x64_epi64(src, 0x50);    // 0x50 = 01010000b     // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
3381d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3391d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i g = _mm256_permute4x64_epi64(src, 0xFA);    // 0xFA = 11111010b     // ggggggggxxxxxxxxggggggggxxxxxxxx
3401d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3411d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst = _mm256_unpacklo_epi8(r, g);                                   // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
3421d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3431d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
344c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
345c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
346c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
347c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
348c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
349c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
350c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
351c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32_32
352c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
353c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32_32
354c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
355c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
356c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
357c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
358c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
359542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
360c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
361c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
362c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src0 = _simd_load_ps((const float*)pSrc);
363c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
364c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
365c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
366c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
367c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 vDst[8];
368c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        vTranspose4x8(vDst, src0, src1, src2, src3);
369c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst, vDst[0]);
370c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+4, vDst[1]);
371c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+8, vDst[2]);
372c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+12, vDst[3]);
373c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+16, vDst[4]);
374c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+20, vDst[5]);
375c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+24, vDst[6]);
376c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+28, vDst[7]);
3771d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
3781d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION
3791d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
3801d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
3811d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
3821d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
3831d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m128 vDst[8];
3851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3861d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo);
3871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3881d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
3891d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
3901d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
3911d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
3921d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
3931d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
3941d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
3951d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
3961d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3971d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi);
3981d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
3991d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
4001d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
4011d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
4021d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
4031d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
4041d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
4051d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
4061d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
4071d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif
408c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
409c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
410c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
411c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
412c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
413c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
414c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
415c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32
416c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
417c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32
418c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
419c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
420c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
421c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
422c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
423542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
424c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
425c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
426c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src0 = _simd_load_ps((const float*)pSrc);
427c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
428c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
429c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
430c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 vDst[8];
431c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        vTranspose3x8(vDst, src0, src1, src2);
432c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst, vDst[0]);
433c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 4, vDst[1]);
434c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 8, vDst[2]);
435c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 12, vDst[3]);
436c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 16, vDst[4]);
437c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 20, vDst[5]);
438c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 24, vDst[6]);
439c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 28, vDst[7]);
4401d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
4411d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION
4421d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
4431d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
4441d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
4451d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
4461d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m128 vDst[8];
4471d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
4481d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo);
4491d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
4501d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]);
4511d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]);
4521d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]);
4531d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]);
4541d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]);
4551d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]);
4561d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]);
4571d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]);
4581d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
4591d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi);
4601d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
4611d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]);
4621d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]);
4631d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]);
4641d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]);
4651d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]);
4661d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]);
4671d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]);
4681d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]);
4691d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif
470c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
471c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
472c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
473c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
474c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
475c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
476c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
477c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32
478c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
479c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32
480c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
481c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
482c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
483c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
484c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
485542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
486c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
4871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8
488c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        const float* pfSrc = (const float*)pSrc;
489c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
490c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
491c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
492c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
493c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
494c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
495c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
496c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
497c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
498c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
499c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        float* pfDst = (float*)pDst;
500c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 0, dst0);
501c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 4, dst1);
502c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 8, dst2);
503c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 12, dst3);
5041d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
5051d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        const float* pfSrc = (const float*)pSrc;
5061d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
5071d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
5081d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
5091d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 src_g1 = _mm256_load_ps(pfSrc + 24);
5101d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
5111d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
5121d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
5131d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
5141d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);
5151d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
5161d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        float* pfDst = (float*)pDst;
5171d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_ps(pfDst + 0, dst0);
5181d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_ps(pfDst + 8, dst1);
5191d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_ps(pfDst + 16, dst2);
5201d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_ps(pfDst + 24, dst3);
5211d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#else
5221d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#error Unsupported vector width
5231d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif
524c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
525c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
526c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
527c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
528c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16_16
529c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
530c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16_16
531c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
532c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
533c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
534c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
535c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
536542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
537c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
538c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
539c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
540c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
541c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
542c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
543c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
544c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
545c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
546c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
547c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
548c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
549c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
550c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
551c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
552c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
553c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
554c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
555c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
556c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
557c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
558c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
559c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
560c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
5611d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
5621d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION
5631d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
5641d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari)));
5651d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
5661d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_r = src_rg.lo;
5671d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_g = src_rg.hi;
5681d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_b = src_ba.lo;
5691d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_a = src_ba.hi;
5701d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
5711d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
5721d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
5731d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
5741d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
5751d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
5761d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
5771d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
5781d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
5791d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
5801d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
5811d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
5821d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
5831d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
5841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
5851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif
586c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
587c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
588c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
589c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
590c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
591c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
592c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
593c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16
594c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
595c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16
596c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
597c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
598c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
599c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
600c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
601542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
602c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
603c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
604c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
605c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
606c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
607c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
608c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
609c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_a = _mm_undefined_si128();
610c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
611c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
612c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
613c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
614c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
615c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
616c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
617c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
618c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
619c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
620c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
621c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
622c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
623c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
624c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
6251d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
6261d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION
6271d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc));
6281d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
6291d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_r = src_rg.lo;
6301d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_g = src_rg.hi;
6311d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari)));
6321d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i src_a = _mm256_undefined_si256();
6331d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
6341d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
6351d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
6361d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
6371d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
6381d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
6391d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
6401d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
6411d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
6421d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
6431d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
6441d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
6451d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
6461d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
6471d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
6481d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif
649c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
650c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
651c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
652c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
653c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
654c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
655c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
656c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16
657c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
658c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16
659c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
660c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
661c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
662c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
663c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
664542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
665c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
6661d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8
667c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src = _simd_load_ps((const float*)pSrc);
668c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
669c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 comp0 = _mm256_castps256_ps128(src);
670c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 comp1 = _mm256_extractf128_ps(src, 1);
671c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
672c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i comp0i = _mm_castps_si128(comp0);
673c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i comp1i = _mm_castps_si128(comp1);
674c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
675c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
676c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
677c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
678c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst, resLo);
679c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst + 1, resHi);
6801d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16
6811d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION
6821d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc)));
6831d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
6841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        simdscalari result;
6851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
6861d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        result.lo = _mm256_unpacklo_epi16(src.lo, src.hi);
6871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        result.hi = _mm256_unpackhi_epi16(src.lo, src.hi);
6881d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
6891d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result);
6901d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif
691c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
692c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
693c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
694c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
695c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
696c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
708c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
720c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
721c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
722c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
734c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
746c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
758c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
7701b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
782c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data. The function
    ///        is explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
794c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data. The function is
    ///        explicitly deleted, so any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
806c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
807c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley// helper function to unroll loops
808c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<int Begin, int End, int Step = 1>
809c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct UnrollerL {
810c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    template<typename Lambda>
811c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    INLINE static void step(Lambda& func) {
812c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        func(Begin);
813c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        UnrollerL<Begin + Step, End, Step>::step(func);
814c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
815c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
816c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
817c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<int End, int Step>
818c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct UnrollerL<End, End, Step> {
819c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    template<typename Lambda>
820c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    static void step(Lambda& func) {
821c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
822c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
823c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
8249f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley// helper function to unroll loops, with mask to skip specific iterations
8259f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleytemplate<int Begin, int End, int Step = 1, int Mask = 0x7f>
8269f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleystruct UnrollerLMask {
8279f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley    template<typename Lambda>
8289f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley    INLINE static void step(Lambda& func) {
8299f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley        if(Mask & (1 << Begin))
8309f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley        {
8319f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley            func(Begin);
8329f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley        }
8339f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley        UnrollerL<Begin + Step, End, Step>::step(func);
8349f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley    }
8359f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley};
8369f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley
8379f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleytemplate<int End, int Step, int Mask>
8389f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleystruct UnrollerLMask<End, End, Step, Mask> {
8399f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley    template<typename Lambda>
8409f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley    static void step(Lambda& func) {
8419f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley    }
8429f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley};
8439f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley
844c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley// general CRC compute
845c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
846c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyuint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
847c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
84890f9df3210b5b66585007ec4836bfca498fd45f0Tim Rowley#if defined(_WIN64) || defined(__x86_64__)
849c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeInQwords = size / sizeof(uint64_t);
850c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
851c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint64_t* pDataWords = (uint64_t*)pData;
852c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < sizeInQwords; ++i)
853c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
854c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
855c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
856c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
857c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeInDwords = size / sizeof(uint32_t);
858c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
859c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t* pDataWords = (uint32_t*)pData;
860c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < sizeInDwords; ++i)
861c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
862c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        crc = _mm_crc32_u32(crc, *pDataWords++);
863c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
864c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
865c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
866542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
867c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
868c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
869c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
870c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
871c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
872c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return crc;
873c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
874c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
875c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
876c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Add byte offset to any-type pointer
877c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
878c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T>
879c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
880c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T* PtrAdd(T* p, intptr_t offset)
881c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
882c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    intptr_t intp = reinterpret_cast<intptr_t>(p);
883c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return reinterpret_cast<T*>(intp + offset);
884c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
885c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
886c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
887c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Is a power-of-2?
888c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
889c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T>
890c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
891c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic bool IsPow2(T value)
892c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
893c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return value == (value & (0 - value));
894c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
895c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
896c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
897c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment
898c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
899c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
900c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
901c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
902c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDownPow2(T1 value, T2 alignment)
903c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
904c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(IsPow2(alignment));
905c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return value & ~T1(alignment - 1);
906c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
907c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
908c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
909c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment
910c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
911c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
912c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
913c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
914c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUpPow2(T1 value, T2 alignment)
915c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
916c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return AlignDownPow2(value + T1(alignment - 1), alignment);
917c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
918c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
919c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
920c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up ptr to specified alignment
921c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
922c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
923c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
924c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
925c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUpPow2(T1* value, T2 alignment)
926c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
927c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return reinterpret_cast<T1*>(
928c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
929c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
930c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
931c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
932c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment
933c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
934c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
935c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
936c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDown(T1 value, T2 alignment)
937c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
938c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
939c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return value - T1(value % alignment);
940c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
941c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
942c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
943c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment
944c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
945c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
946c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
947c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignDown(T1* value, T2 alignment)
948c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
949c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return (T1*)AlignDown(uintptr_t(value), alignment);
950c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
951c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
952c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
953c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment
954c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
955c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
956c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
957c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
958c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUp(T1 value, T2 alignment)
959c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
960c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return AlignDown(value + T1(alignment - 1), alignment);
961c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
962c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
963c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
964c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment
965c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
966c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
967c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
968c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
969c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUp(T1* value, T2 alignment)
970c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
971c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return AlignDown(PtrAdd(value, alignment - 1), alignment);
972c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
973c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
/// Elements are BitsPerElementT bits wide and packed into size_t words,
/// lowest element in the lowest bits. BitsPerElementT must divide the
/// word size evenly (a full-word element is allowed). Storage is
/// zero-initialized and this class only exposes read access.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Low BitsPerElementT bits set. Built by shifting an all-ones word
    // right so that a full-word element never shifts by the word width.
    static const size_t ELEMENT_MASK = ~size_t(0) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t              m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one packed element.
    /// @param elementIndex - index of the element; not range checked
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        const size_t wordIndex = elementIndex / ELEMENTS_PER_WORD;
        const size_t bitOffset = (elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT;
        return T((m_words[wordIndex] >> bitOffset) & ELEMENT_MASK);
    }
};
100127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
// Ranged integer argument for TemplateArgUnroller.
// TMin and TMax are the inclusive bounds that val is expected to lie in;
// the struct itself only carries the runtime value and does no checking.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   // runtime value to be matched against [TMin, TMax]
};
1008e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley
100927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// Recursive template used to auto-nest conditionals.  Converts dynamic boolean function
101027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// arguments to static template arguments.
101127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleytemplate <typename TermT, typename... ArgsB>
101227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleystruct TemplateArgUnroller
101327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley{
1014e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1015e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    // Boolean value
1016e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1017e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley
101827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    // Last Arg Terminator
101927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    static typename TermT::FuncType GetFunc(bool bArg)
102027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    {
102127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        if (bArg)
102227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        {
102327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley            return TermT::template GetFunc<ArgsB..., std::true_type>();
102427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        }
102527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
102627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        return TermT::template GetFunc<ArgsB..., std::false_type>();
102727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    }
102827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
102927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    // Recursively parse args
103027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    template <typename... TArgsT>
103127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
103227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    {
103327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        if (bArg)
103427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        {
103527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
103627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        }
103727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
103827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
103927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    }
1040c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley
1041e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1042e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    // Integer value (within specified range)
1043e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1044e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley
1045c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    // Last Arg Terminator
1046e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TMin, uint32_t TMax>
1047e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
1048c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    {
1049e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (iArg.val == TMax)
1050e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        {
1051e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
1052e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        }
1053e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (TMax > TMin)
1054c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        {
1055e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
1056c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        }
1057e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSUME(false); return nullptr;
1058e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    }
1059e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TVal>
1060e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
1061e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    {
1062e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSERT(iArg.val == TVal);
1063e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
1064c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    }
1065c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley
1066c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    // Recursively parse args
1067e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
1068e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
1069c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    {
1070e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (iArg.val == TMax)
1071c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        {
1072e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
1073c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        }
1074e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (TMax > TMin)
1075e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        {
1076e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
1077e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        }
1078e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSUME(false); return nullptr;
1079e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    }
1080e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TVal, typename... TArgsT>
1081e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
1082e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    {
1083e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSERT(iArg.val == TVal);
1084e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
1085c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    }
108627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley};
108727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
108865c2abf6fdd51b0a80a72caa0c52cf3f4578e743Tim Rowley
1089