1c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/****************************************************************************
2c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
4c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Permission is hereby granted, free of charge, to any person obtaining a
5c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* copy of this software and associated documentation files (the "Software"),
6c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* to deal in the Software without restriction, including without limitation
7c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* and/or sell copies of the Software, and to permit persons to whom the
9c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software is furnished to do so, subject to the following conditions:
10c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
11c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* The above copyright notice and this permission notice (including the next
12c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* paragraph) shall be included in all copies or substantial portions of the
13c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software.
14c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
15c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IN THE SOFTWARE.
22c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
23c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @file utils.h
24c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
25c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @brief Utilities used by SWR core.
26c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
27c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley******************************************************************************/
28c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#pragma once
29c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
30c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include <string.h>
3127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley#include <type_traits>
32812b45d04958e31e7a3bfc7331308374e8b73afaTim Rowley#include <algorithm>
33c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/os.h"
34c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/simdintrin.h"
35c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/swr_assert.h"
360ff57446e3786243c6d752c91be2108595f2663eTim Rowley#include "core/api.h"
37c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
381da9c8a970207b5aac96b3161706041e781124f6Tim Rowley#if defined(_WIN64) || defined(__x86_64__)
39c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _MM_INSERT_EPI64 _mm_insert_epi64
40c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _MM_EXTRACT_EPI64 _mm_extract_epi64
41c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
42550503e776aaca0207184a6454de6aedc8c88aacTim RowleyINLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
43c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
44c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    OSALIGNLINE(uint32_t) elems[4];
45c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    _mm_store_si128((__m128i*)elems, a);
46c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (ndx == 0)
47c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
48c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint64_t foo = elems[0];
49c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        foo |= (uint64_t)elems[1] << 32;
50c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        return foo;
51c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
52c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    else
53c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
54c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint64_t foo = elems[2];
55c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        foo |= (uint64_t)elems[3] << 32;
56c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        return foo;
57c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
58c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
59c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
60550503e776aaca0207184a6454de6aedc8c88aacTim RowleyINLINE __m128i  _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
61c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
62c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    OSALIGNLINE(int64_t) elems[2];
63c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    _mm_store_si128((__m128i*)elems, a);
64c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (ndx == 0)
65c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
66c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        elems[0] = b;
67c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
68c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    else
69c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
70c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        elems[1] = b;
71c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
72c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i out;
73c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    out = _mm_load_si128((const __m128i*)elems);
74c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return out;
75c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
76c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
77c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// @brief Bounding-box extents stored as SIMD integer vectors, one
///        simdscalari per edge.
/// NOTE(review): presumably each SIMD lane holds the box of a different
/// primitive - confirm against the code that fills this struct.
struct simdBBox
{
    simdscalari ymin;   ///< minimum y extent
    simdscalari ymax;   ///< maximum y extent
    simdscalari xmin;   ///< minimum x extent
    simdscalari xmax;   ///< maximum x extent
};
85c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
86c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
87c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
88c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
89c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row0i = _mm_castps_si128(row0);
90c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row1i = _mm_castps_si128(row1);
91c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row2i = _mm_castps_si128(row2);
92c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i row3i = _mm_castps_si128(row3);
93c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
94c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i vTemp = row2i;
95c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2i = _mm_unpacklo_epi32(row2i, row3i);
96c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
97c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
98c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3i = row0i;
99c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0i = _mm_unpacklo_epi32(row0i, row1i);
100c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3i = _mm_unpackhi_epi32(row3i, row1i);
101c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
102c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1i = row0i;
103c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0i = _mm_unpacklo_epi64(row0i, row2i);
104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1i = _mm_unpackhi_epi64(row1i, row2i);
105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
106c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2i = row3i;
107c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2i = _mm_unpacklo_epi64(row2i, vTemp);
108c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3i = _mm_unpackhi_epi64(row3i, vTemp);
109c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
110c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0 = _mm_castsi128_ps(row0i);
111c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1 = _mm_castsi128_ps(row1i);
112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = _mm_castsi128_ps(row2i);
113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = _mm_castsi128_ps(row3i);
114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
115c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
116c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
117c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
118c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
119c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m128i vTemp = row2;
120c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = _mm_unpacklo_epi32(row2, row3);
121c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vTemp = _mm_unpackhi_epi32(vTemp, row3);
122c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
123c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = row0;
124c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0 = _mm_unpacklo_epi32(row0, row1);
125c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = _mm_unpackhi_epi32(row3, row1);
126c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
127c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1 = row0;
128c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row0 = _mm_unpacklo_epi64(row0, row2);
129c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row1 = _mm_unpackhi_epi64(row1, row2);
130c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
131c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = row3;
132c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row2 = _mm_unpacklo_epi64(row2, vTemp);
133c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    row3 = _mm_unpackhi_epi64(row3, vTemp);
134c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
135c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
// GCC version folded into one comparable integer:
// major * 10000 + minor * 100 + patchlevel (e.g. 4.9.0 -> 40900).
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// On clang, and on GCC older than 4.9.0, redirect the _mm*_undefined_*
// intrinsics to their zero-initialising counterparts.
// NOTE(review): presumably these toolchains did not ship the _undefined_
// intrinsics when this was written - confirm before narrowing the condition.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
147c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
14878a0a09e48baaef6369fd9034ed693896195cf57George Kyriazis#if KNOB_SIMD_WIDTH == 8
149c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
150937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleyvoid vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
151c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
152c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
153c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5
154c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
155c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
156c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
157c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
158c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6yw77
159c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
160c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
161c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
162c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
163c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
164c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
165c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
166c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
167c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
168c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
169c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
170c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
171c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
172c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
173c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
174937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleyvoid vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
175c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
176c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
177c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
178c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
179c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
180c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
181c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
182c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
183c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
184c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
185c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
186c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
187c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
188c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
189c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
190c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
191c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
192c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
193c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
194c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
195c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
196c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
197bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley#if ENABLE_AVX512_SIMD16
198bd22c3d41151ce265e61d64f9034928f83d3c959Tim RowleyINLINE
199bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowleyvoid vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
200bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley{
201bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
202bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
203bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
204bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
205bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
206bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
207bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
208bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
209bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
210bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
211bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
212bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
213bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    dst[0] = _simd16_unpacklo_ps(rblo, galo);
214bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    dst[1] = _simd16_unpackhi_ps(rblo, galo);
215bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
216bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley    dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
217bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley}
218bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
219bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley#endif
220c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
221c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
222c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
223c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
224c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
225c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
226c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
227c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
228c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
229c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
230c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
231c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
232c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
233c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
234c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
235c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
236c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
237c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
238c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
239c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
240c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
241c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
242c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
243c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
244c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
245c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
246c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
247c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
248c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
249c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
250c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
251c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
252c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
253c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
254c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
255c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
256c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
257c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
258c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// TranposeSingleComponent
259c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
260c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<uint32_t bpp>
261c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct TransposeSingleComponent
262c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
263c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
264c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Pass-thru for single component.
265c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
266c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
267542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
268c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
269c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
270c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
271488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
272488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
273488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
274488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
275488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
276488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
277488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
278c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
279c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
280c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
281c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8_8
282c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
283c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8_8
284c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
285c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
286c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
287c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
288c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
289542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
290c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
291c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
2921d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley
293c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
294c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX
295c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
296c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
297c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
298c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
299c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
300c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
301c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
302c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
303c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst, c0123lo);
304c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
305c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#elif KNOB_ARCH == KNOB_ARCH_AVX2
306c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari dst01 = _mm256_shuffle_epi8(src,
307c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
308c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
309c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dst23 = _mm256_shuffle_epi8(dst23,
310c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
311c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari dst = _mm256_or_si256(dst01, dst23);
312c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _simd_store_si((simdscalari*)pDst, dst);
313c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
314c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
315c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
316c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
317c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
318488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
319488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
320488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
321488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
322bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
323bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
324bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
325bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
326488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
327bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
328bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
329bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
330bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
331488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
332bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
333bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
334bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
335488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
336bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
337488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
338bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
339488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
340488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
341c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
342c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
343c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
344c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8
345c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
346c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8
347c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
348c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
349c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
350c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
351c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
352542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
353488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
354488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
355488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
356488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
357c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
358c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
359c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
360c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8
361c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
362c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8
363c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
364c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
365c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
366c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
367c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
368542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
369c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
3701d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8
371c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
372c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
373c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
374c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
375c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        rg = _mm_unpacklo_epi8(rg, g);
376c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst, rg);
377c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
378c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
379c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
380c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
381488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
382488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
383488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
384488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
385bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
386bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
387488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
388bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
389bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
390488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
391bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
392488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
393bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst = _simd_or_si(cvt0, shl1);
394488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
395bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
396488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
397488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
398c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
399c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
400c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
401c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32_32
402c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
403c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32_32
404c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
405c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
406c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
407c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
408c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
409542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
410c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
411c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
412c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src0 = _simd_load_ps((const float*)pSrc);
413c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
414c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
415c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
416c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
417c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 vDst[8];
418c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        vTranspose4x8(vDst, src0, src1, src2, src3);
419c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst, vDst[0]);
420c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+4, vDst[1]);
421c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+8, vDst[2]);
422c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+12, vDst[3]);
423c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+16, vDst[4]);
424c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+20, vDst[5]);
425c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+24, vDst[6]);
426c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst+28, vDst[7]);
427c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
428c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
429c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
430c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
431488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
432488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
433488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
434488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
435488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
436488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
437488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
438488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
439488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
440bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar dst[4];
441488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
442bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        vTranspose4x16(dst, src0, src1, src2, src3);
443488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
444bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
445bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
446bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
447bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
448488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
449488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
450c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
451c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
452c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
453c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32
454c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
455c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32
456c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
457c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
458c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
459c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
460c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
461542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
462c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
463c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
464c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src0 = _simd_load_ps((const float*)pSrc);
465c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
466c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
467c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
468c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 vDst[8];
469c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        vTranspose3x8(vDst, src0, src1, src2);
470c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst, vDst[0]);
471c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 4, vDst[1]);
472c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 8, vDst[2]);
473c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 12, vDst[3]);
474c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 16, vDst[4]);
475c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 20, vDst[5]);
476c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 24, vDst[6]);
477c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps((float*)pDst + 28, vDst[7]);
478c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
479c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
480c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
481c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
482488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
483488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
484488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
485488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
486488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
487488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
488488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
489bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar src3 = _simd16_setzero_ps();
490488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
491bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar dst[4];
492488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
493bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        vTranspose4x16(dst, src0, src1, src2, src3);
494488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
495bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
496bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
497bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
498bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
499488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
500488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
501c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
502c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
503c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
504c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32
505c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
506c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32
507c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
508c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
509c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
510c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
511c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
512542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
513c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
5141d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8
515c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        const float* pfSrc = (const float*)pSrc;
516c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
517c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
518c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
519c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
520c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
521c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
522c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
523c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
524c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
525c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
526c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        float* pfDst = (float*)pDst;
527c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 0, dst0);
528c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 4, dst1);
529c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 8, dst2);
530c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_ps(pfDst + 12, dst3);
5311d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#else
5321d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#error Unsupported vector width
5331d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif
534c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
535488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
536488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
537488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
538488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
539bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
540bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
541bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
542bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
543bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
544bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
545bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
546bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
547bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
548bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
549bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
550bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
551bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
552bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
553488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
554488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
555c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
556c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
557c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
558c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16_16
559c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
560c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16_16
561c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
562c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
563c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
564c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
565c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
566542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
567c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
568c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
569c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
570c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
571c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
572c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
573c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
574c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
575c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
576c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
577c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
578c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
579c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
580c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
581c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
582c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
583c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
584c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
585c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
586c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
587c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
588c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
589c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
590c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
591c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
592c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
593c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
594c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
595488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
596488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
597488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
598488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
599bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
600bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
601bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
602bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa
603bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
604bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
605bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
606bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
607bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
608bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
609bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
610bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
611bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
612bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
613bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
614bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
615bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
616bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
617bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
618bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
619bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
620bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
621bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
622bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
623488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
624488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
625c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
626c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
627c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
628c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16
629c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
630c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16
631c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
632c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
633c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
634c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
635c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
636542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
637c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
638c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8
639c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
640c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
641c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
642c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
643c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
644c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i src_a = _mm_undefined_si128();
645c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
646c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
647c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
648c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
649c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
650c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
651c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
652c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
653c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
654c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
655c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
656c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
657c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
658c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
659c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
660c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
661c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
662c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
663c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
664488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
665488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
666488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
667488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
668bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
669bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
670bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
671bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src3 = _simd_setzero_si();                                                      // aaaaaaaaaaaaaaaa
672bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
673bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
674bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
675bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
676bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
677bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
678bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
679bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
680bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
681bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
682bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
683bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
684bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
685bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
686bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
687bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley
688bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
689bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
690bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
691bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
692488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
693488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
694c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
695c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
696c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
697c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16
698c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
699c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16
700c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
701c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    //////////////////////////////////////////////////////////////////////////
702c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
703c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pSrc - source data in SOA form
704c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @param pDst - output data in AOS form
705542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
706c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
7071d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8
708c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalar src = _simd_load_ps((const float*)pSrc);
709c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
710c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 comp0 = _mm256_castps256_ps128(src);
711c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128 comp1 = _mm256_extractf128_ps(src, 1);
712c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
713c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i comp0i = _mm_castps_si128(comp0);
714c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i comp1i = _mm_castps_si128(comp1);
715c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
716c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
717c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
718c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
719c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst, resLo);
720c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        _mm_store_si128((__m128i*)pDst + 1, resHi);
721c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
722c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width
723c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
724c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
725488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16
726488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
727488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
728488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
729bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
730bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
731488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
732bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
733bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
734488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
735fa7c5e242f5aa54223bc30012c2023db7834c1e0Tim Rowley        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
736fa7c5e242f5aa54223bc30012c2023db7834c1e0Tim Rowley        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
737488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
738bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
739bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
740488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
741488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
742c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
743c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose24_8
//////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
759c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose32_8_24
//////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
775c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose4_4_4_4
//////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
791c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose5_6_5
//////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
807c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose9_9_9_5
//////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
823c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose5_5_5_1
//////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
8391b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose1_5_5_5
//////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 1_5_5_5 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    // Declared (and deleted) so this struct offers the same pair of entry
    // points as the other packed-format Transpose structs in this file.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
851c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose10_10_10_2
//////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
867c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose11_11_10
//////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
88333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose64
//////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
89933fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
91533fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
93133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley
//////////////////////////////////////////////////////////////////////////
/// Transpose64_64_64_64
//////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion entry point.
    /// @note  Declared as deleted: any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// Variant only declared when ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
947c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
// Compile-time loop unroller: UnrollerL<Begin, End, Step>::step(func) expands to
// func(Begin), func(Begin + Step), ... and stops once the counter equals End.
// The counter has to land exactly on End, otherwise the recursion never
// reaches the terminating specialization below.
template<int Begin, int End, int Step = 1>
struct UnrollerL
{
    template<typename Callable>
    INLINE static void step(Callable& func)
    {
        typedef UnrollerL<Begin + Step, End, Step> Rest;

        func(Begin);
        Rest::step(func);
    }
};

// Terminator (Begin == End): nothing left to invoke.
template<int End, int Step>
struct UnrollerL<End, End, Step>
{
    template<typename Callable>
    static void step(Callable& func)
    {
    }
};
964c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
// helper function to unroll loops, with mask to skip specific iterations
//
// UnrollerLMask<Begin, End, Step, Mask>::step(func) visits Begin, Begin + Step,
// ... up to (not including) End and calls func(i) only when bit i of Mask is
// set. The counter has to land exactly on End to reach the terminating
// specialization below.
template<int Begin, int End, int Step = 1, int Mask = 0x7f>
struct UnrollerLMask {
    template<typename Lambda>
    INLINE static void step(Lambda& func) {
        if(Mask & (1 << Begin))
        {
            func(Begin);
        }
        // Recurse into UnrollerLMask (not the unmasked UnrollerL) so that
        // Mask keeps being applied to every remaining iteration, not just
        // to the first one.
        UnrollerLMask<Begin + Step, End, Step, Mask>::step(func);
    }
};

// Terminator (Begin == End): nothing left to invoke.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
9849f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley
985c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley// general CRC compute
986c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
987c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyuint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
988c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
98990f9df3210b5b66585007ec4836bfca498fd45f0Tim Rowley#if defined(_WIN64) || defined(__x86_64__)
990c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeInQwords = size / sizeof(uint64_t);
991c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
992c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint64_t* pDataWords = (uint64_t*)pData;
993c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < sizeInQwords; ++i)
994c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
995c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
996c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
997c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else
998c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeInDwords = size / sizeof(uint32_t);
999c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
1000c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t* pDataWords = (uint32_t*)pData;
1001c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < sizeInDwords; ++i)
1002c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1003c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        crc = _mm_crc32_u32(crc, *pDataWords++);
1004c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1005c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1006c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1007542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
1008c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
1009c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1010c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
1011c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1012c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1013c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return crc;
1014c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1015c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1016c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1017c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Add byte offset to any-type pointer
1018c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1019c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T>
1020c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1021c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T* PtrAdd(T* p, intptr_t offset)
1022c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1023c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    intptr_t intp = reinterpret_cast<intptr_t>(p);
1024c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return reinterpret_cast<T*>(intp + offset);
1025c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1026c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1027c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1028c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Is a power-of-2?
1029c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1030c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T>
1031c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1032c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic bool IsPow2(T value)
1033c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1034c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return value == (value & (0 - value));
1035c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1036c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1037c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1038c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment
1039c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
1040c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1041c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
1042c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1043c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDownPow2(T1 value, T2 alignment)
1044c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1045c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(IsPow2(alignment));
1046c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return value & ~T1(alignment - 1);
1047c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1048c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1049c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1050c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment
1051c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
1052c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1053c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
1054c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1055c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUpPow2(T1 value, T2 alignment)
1056c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1057c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return AlignDownPow2(value + T1(alignment - 1), alignment);
1058c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1059c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1060c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1061c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up ptr to specified alignment
1062c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
1063c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1064c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
1065c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1066c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUpPow2(T1* value, T2 alignment)
1067c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1068c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return reinterpret_cast<T1*>(
1069c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
1070c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1071c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1072c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1073c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment
1074c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1075c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
1076c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1077c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDown(T1 value, T2 alignment)
1078c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1079c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
1080c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return value - T1(value % alignment);
1081c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1082c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1083c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1084c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment
1085c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1086c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
1087c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1088c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignDown(T1* value, T2 alignment)
1089c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1090c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return (T1*)AlignDown(uintptr_t(value), alignment);
1091c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1092c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1093c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1094c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment
1095c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
1096c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1097c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
1098c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1099c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUp(T1 value, T2 alignment)
1100c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1101c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return AlignDown(value + T1(alignment - 1), alignment);
1102c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1103c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment
1106c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true
1107c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1108c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2>
1109c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
1110c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUp(T1* value, T2 alignment)
1111c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return AlignDown(PtrAdd(value, alignment - 1), alignment);
1113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// Helper structure used to access an array of elements that don't
/// correspond to a typical word size.
///
/// ArrayLenT elements of BitsPerElementT bits each are packed into an
/// array of size_t words. Within a word, element 0 occupies the
/// least-significant bits and later elements follow in ascending bit
/// order. BitsPerElementT must divide the number of bits in size_t
/// exactly (a full-width element, BitsPerElementT == bits in size_t, is
/// allowed). Storage is zero-initialized.
///
/// T is the type elements are converted to when read.
//////////////////////////////////////////////////////////////////////////
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;

    static_assert(BitsPerElementT > 0 && BitsPerElementT <= BITS_PER_WORD,
        "Element size must be between 1 bit and the pointer size");

    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Built by shifting an all-ones word right rather than shifting 1 left:
    // a left shift by BITS_PER_WORD (full-width elements) is out of range,
    // whereas the right-shift amount here stays within [0, BITS_PER_WORD).
    static const size_t ELEMENT_MASK = (~size_t(0)) >> (BITS_PER_WORD - BitsPerElementT);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must be an integral fraction of pointer size");

    size_t              m_words[NUM_WORDS] = {};

public:

    /// Read the element at elementIndex, converted to T.
    /// No bounds checking is performed on elementIndex.
    T operator[] (size_t elementIndex) const
    {
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        // Move the requested element down to bit 0, then strip its neighbours.
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
114227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
//////////////////////////////////////////////////////////////////////////
/// Ranged integer argument for TemplateArgUnroller.
///
/// Carries a runtime value together with the compile-time bounds
/// TMin and TMax (both inclusive) that TemplateArgUnroller walks when
/// turning the value into a template argument. The struct itself does
/// not validate that 'val' lies within those bounds.
//////////////////////////////////////////////////////////////////////////
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;   ///< runtime value to be matched against [TMin, TMax]
};
1149e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley
115027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// Recursive template used to auto-nest conditionals.  Converts dynamic boolean function
115127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// arguments to static template arguments.
115227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleytemplate <typename TermT, typename... ArgsB>
115327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleystruct TemplateArgUnroller
115427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley{
1155e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1156e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    // Boolean value
1157e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1158e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley
115927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    // Last Arg Terminator
116027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    static typename TermT::FuncType GetFunc(bool bArg)
116127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    {
116227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        if (bArg)
116327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        {
116427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley            return TermT::template GetFunc<ArgsB..., std::true_type>();
116527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        }
116627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
116727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        return TermT::template GetFunc<ArgsB..., std::false_type>();
116827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    }
116927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
117027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    // Recursively parse args
117127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    template <typename... TArgsT>
117227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
117327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    {
117427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        if (bArg)
117527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        {
117627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley            return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
117727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        }
117827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
117927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
118027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    }
1181c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley
1182e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1183e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    // Integer value (within specified range)
1184e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    //-----------------------------------------
1185e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley
1186c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    // Last Arg Terminator
1187e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TMin, uint32_t TMax>
1188e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
1189c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    {
1190e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (iArg.val == TMax)
1191e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        {
1192e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
1193e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        }
1194e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (TMax > TMin)
1195c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        {
1196e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
1197c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        }
1198e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSUME(false); return nullptr;
1199e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    }
1200e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TVal>
1201e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
1202e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    {
1203e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSERT(iArg.val == TVal);
1204e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
1205c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    }
1206c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley
1207c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    // Recursively parse args
1208e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
1209e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
1210c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    {
1211e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (iArg.val == TMax)
1212c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        {
1213e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
1214c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley        }
1215e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        if (TMax > TMin)
1216e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        {
1217e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
1218e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        }
1219e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSUME(false); return nullptr;
1220e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    }
1221e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    template <uint32_t TVal, typename... TArgsT>
1222e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
1223e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley    {
1224e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        SWR_ASSERT(iArg.val == TVal);
1225e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
1226c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley    }
122727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley};
122827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
122965c2abf6fdd51b0a80a72caa0c52cf3f4578e743Tim Rowley
1230