1c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/**************************************************************************** 2c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 4c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Permission is hereby granted, free of charge, to any person obtaining a 5c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* copy of this software and associated documentation files (the "Software"), 6c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* to deal in the Software without restriction, including without limitation 7c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* and/or sell copies of the Software, and to permit persons to whom the 9c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software is furnished to do so, subject to the following conditions: 10c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 11c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* The above copyright notice and this permission notice (including the next 12c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* paragraph) shall be included in all copies or substantial portions of the 13c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software. 14c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 15c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IN THE SOFTWARE. 22c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 23c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @file utils.h 24c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 25c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @brief Utilities used by SWR core. 26c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 27c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley******************************************************************************/ 28c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#pragma once 29c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 30c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include <string.h> 3127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley#include <type_traits> 32812b45d04958e31e7a3bfc7331308374e8b73afaTim Rowley#include <algorithm> 33c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/os.h" 34c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/simdintrin.h" 35c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/swr_assert.h" 360ff57446e3786243c6d752c91be2108595f2663eTim Rowley#include "core/api.h" 37c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 381da9c8a970207b5aac96b3161706041e781124f6Tim Rowley#if defined(_WIN64) || defined(__x86_64__) 39c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _MM_INSERT_EPI64 _mm_insert_epi64 40c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _MM_EXTRACT_EPI64 _mm_extract_epi64 41c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 42550503e776aaca0207184a6454de6aedc8c88aacTim RowleyINLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) 43c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 44c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley OSALIGNLINE(uint32_t) elems[4]; 45c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)elems, a); 46c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley if (ndx == 0) 47c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 48c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint64_t foo = elems[0]; 49c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley foo |= (uint64_t)elems[1] << 32; 50c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return foo; 51c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 52c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley else 53c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 54c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint64_t foo = elems[2]; 55c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley foo |= (uint64_t)elems[3] << 32; 56c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return foo; 57c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 58c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 59c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 60550503e776aaca0207184a6454de6aedc8c88aacTim RowleyINLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx) 61c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 62c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley OSALIGNLINE(int64_t) elems[2]; 63c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)elems, a); 64c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley if (ndx == 0) 65c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 66c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley elems[0] = b; 67c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 68c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley else 69c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 70c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley elems[1] = b; 71c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 72c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i out; 73c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley out = _mm_load_si128((const __m128i*)elems); 74c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return out; 75c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 76c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 77c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 78c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct simdBBox 79c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 800ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari ymin; 810ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari ymax; 820ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari xmin; 830ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari xmax; 84c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 85c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 86c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 87c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) 88c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 89c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row0i = _mm_castps_si128(row0); 90c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row1i = _mm_castps_si128(row1); 91c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row2i = _mm_castps_si128(row2); 92c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row3i = _mm_castps_si128(row3); 93c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 94c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i vTemp = row2i; 95c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2i = _mm_unpacklo_epi32(row2i, row3i); 96c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTemp = _mm_unpackhi_epi32(vTemp, row3i); 97c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 98c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3i = row0i; 99c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0i = _mm_unpacklo_epi32(row0i, row1i); 100c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3i = _mm_unpackhi_epi32(row3i, row1i); 101c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 102c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1i = row0i; 103c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0i = _mm_unpacklo_epi64(row0i, row2i); 104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1i = _mm_unpackhi_epi64(row1i, row2i); 105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 106c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2i = row3i; 107c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2i = _mm_unpacklo_epi64(row2i, vTemp); 108c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3i = _mm_unpackhi_epi64(row3i, vTemp); 109c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 110c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0 = _mm_castsi128_ps(row0i); 111c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1 = _mm_castsi128_ps(row1i); 112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = _mm_castsi128_ps(row2i); 113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = _mm_castsi128_ps(row3i); 114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 115c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 116c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 117c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) 118c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 119c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i vTemp = row2; 120c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = _mm_unpacklo_epi32(row2, row3); 121c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTemp = _mm_unpackhi_epi32(vTemp, row3); 122c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 123c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = row0; 124c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0 = _mm_unpacklo_epi32(row0, row1); 125c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = _mm_unpackhi_epi32(row3, row1); 126c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 127c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1 = row0; 128c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0 = _mm_unpacklo_epi64(row0, row2); 129c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1 = _mm_unpackhi_epi64(row1, row2); 130c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 131c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = row3; 132c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = _mm_unpacklo_epi64(row2, vTemp); 133c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = _mm_unpackhi_epi64(row3, vTemp); 134c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 135c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 136c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define GCC_VERSION (__GNUC__ * 10000 \ 137c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley + __GNUC_MINOR__ * 100 \ 138c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley + __GNUC_PATCHLEVEL__) 139c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 140c969ef2d427dfb3940a0d6d2b853454e65f1119fTim Rowley#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900)) 141c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _mm_undefined_ps _mm_setzero_ps 142c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _mm_undefined_si128 _mm_setzero_si128 143c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 144c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _mm256_undefined_ps _mm256_setzero_ps 145c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 146c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 147c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 14878a0a09e48baaef6369fd9034ed693896195cf57George Kyriazis#if KNOB_SIMD_WIDTH == 8 149c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 150937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleyvoid vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2) 151c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 152c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 153c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 154c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 155c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 156c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 157c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 158c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77 159c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 160c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 161c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 162c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 163c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 164c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 165c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 166c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 167c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 168c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 169c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 170c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 171c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 172c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 173c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 174937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleyvoid vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3) 175c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 176c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 177c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 178c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 179c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 180c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 181c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 182c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77 183c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 184c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 185c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 186c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 187c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 188c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 189c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 190c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 191c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 192c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 193c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 194c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 195c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 196c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 197bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley#if ENABLE_AVX512_SIMD16 198bd22c3d41151ce265e61d64f9034928f83d3c959Tim RowleyINLINE 199bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowleyvoid vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3) 200bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley{ 201bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking 202bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 203bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r 204bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g 205bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b 206bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a 207bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 208bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); 209bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); 210bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); 211bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); 212bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 213bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley dst[0] = _simd16_unpacklo_ps(rblo, galo); 214bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley dst[1] = _simd16_unpackhi_ps(rblo, galo); 215bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley dst[2] = _simd16_unpacklo_ps(rbhi, gahi); 216bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley dst[3] = _simd16_unpackhi_ps(rbhi, gahi); 217bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley} 218bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 219bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley#endif 220c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 221c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) 222c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 223c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1); 224c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1); 225c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3); 226c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3); 227c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5); 228c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5); 229c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7); 230c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7); 231c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); 232c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); 233c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); 234c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); 235c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); 236c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); 237c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); 238c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); 239c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); 240c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); 241c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); 242c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); 243c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); 244c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); 245c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); 246c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); 247c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 248c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 249c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 250c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7) 251c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 252c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3), 253c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7)); 254c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 255c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 256c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 257c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 258c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// TranposeSingleComponent 259c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 260c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<uint32_t bpp> 261c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct TransposeSingleComponent 262c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 263c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 264c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Pass-thru for single component. 265c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 266c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 267542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 268c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 269c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); 270c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 271488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 272488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 273488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 274488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 275488992221056edaf7111f9290afdf216c5e98d62Tim Rowley memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); 276488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 277488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 278c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 279c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 280c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 281c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8_8 282c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 283c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8_8 284c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 285c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 286c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 287c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 288c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 289542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 290c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 291c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src = _simd_load_si((const simdscalari*)pSrc); 2921d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 293c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 294c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX 295c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg 296c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa 297c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb 298c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa 299c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg 300c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa 301c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba 302c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba 303c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst, c0123lo); 304c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)(pDst + 16), c0123hi); 305c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#elif KNOB_ARCH == KNOB_ARCH_AVX2 306c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari dst01 = _mm256_shuffle_epi8(src, 307c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); 308c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); 309c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley dst23 = _mm256_shuffle_epi8(dst23, 310c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); 311c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari dst = _mm256_or_si256(dst01, dst23); 312c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _simd_store_si((simdscalari*)pDst, dst); 313c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 314c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 315c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 316c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 317c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 318488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 319488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 320488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 321488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 322bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr 323bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg 324bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 325bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa 326488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 327bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); 328bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); 329bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); 330bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); 331488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 332bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); 333bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); 334bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); 335488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 336bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); 337488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 338bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba 339488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 340488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 341c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 342c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 343c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 344c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8 345c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 346c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8 347c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 348c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 349c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. 350c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 351c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 352542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 353488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 354488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 355488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 356488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 357c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 358c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 359c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 360c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8 361c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 362c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8 363c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 364c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 365c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 8_8 data. 366c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 367c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 368542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 369c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 3701d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 371c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src = _simd_load_si((const simdscalari*)pSrc); 372c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 373c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg 374c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg 375c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley rg = _mm_unpacklo_epi8(rg, g); 376c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst, rg); 377c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 378c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 379c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 380c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 381488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 382488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 383488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 384488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 385bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr 386bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg 387488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 388bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari cvt0 = _simd_cvtepu8_epi16(src0); 389bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari cvt1 = _simd_cvtepu8_epi16(src1); 390488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 391bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari shl1 = _simd_slli_epi32(cvt1, 8); 392488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 393bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst = _simd_or_si(cvt0, shl1); 394488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 395bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg 396488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 397488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 398c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 399c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 400c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 401c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32_32 402c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 403c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32_32 404c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 405c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 406c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 407c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 408c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 409542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 410c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 411c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 412c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src0 = _simd_load_ps((const float*)pSrc); 413c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 414c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 415c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); 416c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 417c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 vDst[8]; 418c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTranspose4x8(vDst, src0, src1, src2, src3); 419c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst, vDst[0]); 420c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+4, vDst[1]); 421c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+8, vDst[2]); 422c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+12, vDst[3]); 423c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+16, vDst[4]); 424c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+20, vDst[5]); 425c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+24, vDst[6]); 426c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+28, vDst[7]); 427c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 428c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 429c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 430c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 431488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 432488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 433488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 434488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 435488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 436488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); 437488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); 438488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48); 439488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 440bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar dst[4]; 441488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 442bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley vTranspose4x16(dst, src0, src1, src2, src3); 443488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 444bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); 445bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); 446bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); 447bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); 448488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 449488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 450c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 451c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 452c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 453c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32 454c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 455c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32 456c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 457c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 458c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 459c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 460c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 461542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 462c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 463c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 464c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src0 = _simd_load_ps((const float*)pSrc); 465c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 466c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 467c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 468c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 vDst[8]; 469c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTranspose3x8(vDst, src0, src1, src2); 470c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst, vDst[0]); 471c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 4, vDst[1]); 472c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 8, vDst[2]); 473c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 12, vDst[3]); 474c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 16, vDst[4]); 475c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 20, vDst[5]); 476c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 24, vDst[6]); 477c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 28, vDst[7]); 478c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 479c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 480c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 481c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 482488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 483488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 484488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 485488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 486488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 487488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); 488488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); 489bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar src3 = _simd16_setzero_ps(); 490488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 491bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar dst[4]; 492488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 493bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley vTranspose4x16(dst, src0, src1, src2, src3); 494488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 495bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); 496bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); 497bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); 498bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); 499488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 500488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 501c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 502c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 503c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 504c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32 505c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 506c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32 507c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 508c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 509c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 510c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 511c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 512542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 513c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 5141d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 515c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley const float* pfSrc = (const float*)pSrc; 516c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_r0 = _mm_load_ps(pfSrc + 0); 517c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_r1 = _mm_load_ps(pfSrc + 4); 518c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_g0 = _mm_load_ps(pfSrc + 8); 519c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_g1 = _mm_load_ps(pfSrc + 12); 520c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 521c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); 522c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); 523c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); 524c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); 525c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 526c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley float* pfDst = (float*)pDst; 527c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 0, dst0); 528c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 4, dst1); 529c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 8, dst2); 530c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 12, dst3); 5311d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#else 5321d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#error Unsupported vector width 5331d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif 534c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 535488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 536488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 537488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 538488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 539bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); // rrrrrrrrrrrrrrrr 540bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg 541bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 542bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD 543bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF 544bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 545bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 546bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF 547bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 548bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 549bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF 550bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 551bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg 552bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg 553488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 554488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 555c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 556c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 557c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 558c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16_16 559c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 560c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16_16 561c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 562c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 563c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 564c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 565c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 566542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 567c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 568c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 569c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 570c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); 571c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 572c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 573c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 574c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_b = _mm256_extractf128_si256(src_ba, 0); 575c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_a = _mm256_extractf128_si256(src_ba, 1); 576c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 577c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 578c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 579c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 580c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 581c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 582c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 583c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 584c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 585c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 586c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 587c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 0, dst0); 588c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 1, dst1); 589c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 2, dst2); 590c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 3, dst3); 591c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 592c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 593c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 594c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 595488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 596488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 597488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 598488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 599bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 600bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 601bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 602bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa 603bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 604bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 605bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 606bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB 607bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF 608bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 609bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 610bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB 611bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD 612bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF 613bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 614bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 615bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 616bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB 617bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF 618bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 619bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba 620bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba 621bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba 622bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba 623488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 624488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 625c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 626c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 627c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 628c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16 629c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 630c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16 631c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 632c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 633c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 634c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 635c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 636542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 637c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 638c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 639c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 640c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 641c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 642c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 643c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); 644c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_a = _mm_undefined_si128(); 645c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 646c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 647c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 648c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 649c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 650c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 651c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 652c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 653c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 654c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 655c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 656c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 0, dst0); 657c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 1, dst1); 658c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 2, dst2); 659c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 3, dst3); 660c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 661c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 662c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 663c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 664488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 665488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 666488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 667488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 668bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 669bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 670bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 671bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa 672bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 673bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 674bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 675bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB 676bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF 677bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 678bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 679bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB 680bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD 681bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF 682bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 683bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 684bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 685bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB 686bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF 687bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley 688bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba 689bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba 690bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba 691bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba 692488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 693488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 694c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 695c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 696c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 697c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16 698c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 699c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16 700c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 701c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 702c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 703c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 704c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 705542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 706c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 7071d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 708c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src = _simd_load_ps((const float*)pSrc); 709c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 710c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 comp0 = _mm256_castps256_ps128(src); 711c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 comp1 = _mm256_extractf128_ps(src, 1); 712c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 713c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i comp0i = _mm_castps_si128(comp0); 714c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i comp1i = _mm_castps_si128(comp1); 715c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 716c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); 717c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); 718c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 719c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst, resLo); 720c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst + 1, resHi); 721c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 722c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 723c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 724c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 725488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 726488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 727488992221056edaf7111f9290afdf216c5e98d62Tim Rowley INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 728488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 729bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 730bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 731488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 732bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 733bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 734488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 735fa7c5e242f5aa54223bc30012c2023db7834c1e0Tim Rowley simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 736fa7c5e242f5aa54223bc30012c2023db7834c1e0Tim Rowley simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF 737488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 738bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg 739bd22c3d41151ce265e61d64f9034928f83d3c959Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg 740488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 741488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 742c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 743c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 744c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 745c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose24_8 746c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 747c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose24_8 748c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 749c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 750c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 24_8 data. 751c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 752c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 753542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 754488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 755488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 756488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 757488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 758c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 759c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 760c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 761c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_8_24 762c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 763c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_8_24 764c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 765c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 766c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. 767c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 768c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 769542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 770488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 771c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 772488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 773488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 774488992221056edaf7111f9290afdf216c5e98d62Tim Rowley}; 775c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 776c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 777c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose4_4_4_4 778c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 779c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose4_4_4_4 780c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 781c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 782c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. 783c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 784c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 785542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 786488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 787488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 788488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 789488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 790c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 791c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 792c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 793c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose5_6_5 794c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 795c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose5_6_5 796c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 797c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 798c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 799c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 800c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 801542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 802488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 803488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 804488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 805488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 806c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 807c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 808c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 809c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose9_9_9_5 810c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 811c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose9_9_9_5 812c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 813c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 814c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 815c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 816c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 817542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 818488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 819488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 820488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 821488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 822c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 823c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 824c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 825c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose5_5_5_1 826c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 827c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose5_5_5_1 828c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 829c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 830c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 831c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 832c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 833542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 834488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 835488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 836488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 837488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 8381b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley}; 8391b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley 8401b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley////////////////////////////////////////////////////////////////////////// 8411b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley/// Transpose1_5_5_5 8421b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley////////////////////////////////////////////////////////////////////////// 8431b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct Transpose1_5_5_5 8441b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley{ 8451b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley ////////////////////////////////////////////////////////////////////////// 8461b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 8471b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley /// @param pSrc - source data in SOA form 8481b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley /// @param pDst - output data in AOS form 8491b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 850c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 851c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 852c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 853c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose10_10_10_2 854c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 855c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose10_10_10_2 856c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 857c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 858c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. 859c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 860c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 861542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 862488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 863488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 864488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 865488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 866c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 867c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 868c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 869c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose11_11_10 870c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 871c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose11_11_10 872c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 873c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 874c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 875c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 876c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 877542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 878488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if ENABLE_AVX512_SIMD16 879488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 880488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 88133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#endif 88233fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley}; 88333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 88433fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 88533fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley/// Transpose64 88633fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 88733fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowleystruct Transpose64 88833fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley{ 88933fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley ////////////////////////////////////////////////////////////////////////// 89033fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @brief Performs an SOA to AOS conversion 89133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pSrc - source data in SOA form 89233fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pDst - output data in AOS form 89333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 89433fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#if ENABLE_AVX512_SIMD16 89533fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 89633fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 89733fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#endif 89833fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley}; 89933fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 90033fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 90133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley/// Transpose64_64 90233fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 90333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowleystruct Transpose64_64 90433fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley{ 90533fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley ////////////////////////////////////////////////////////////////////////// 90633fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @brief Performs an SOA to AOS conversion 90733fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pSrc - source data in SOA form 90833fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pDst - output data in AOS form 90933fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 91033fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#if ENABLE_AVX512_SIMD16 91133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 91233fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 91333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#endif 91433fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley}; 91533fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 91633fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 91733fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley/// Transpose64_64_64 91833fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 91933fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowleystruct Transpose64_64_64 92033fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley{ 92133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley ////////////////////////////////////////////////////////////////////////// 92233fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @brief Performs an SOA to AOS conversion 92333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pSrc - source data in SOA form 92433fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pDst - output data in AOS form 92533fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 92633fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#if ENABLE_AVX512_SIMD16 92733fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 92833fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 92933fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#endif 93033fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley}; 93133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 93233fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 93333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley/// Transpose64_64_64_64 93433fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley////////////////////////////////////////////////////////////////////////// 93533fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowleystruct Transpose64_64_64_64 93633fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley{ 93733fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley ////////////////////////////////////////////////////////////////////////// 93833fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @brief Performs an SOA to AOS conversion 93933fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pSrc - source data in SOA form 94033fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley /// @param pDst - output data in AOS form 94133fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 94233fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley#if ENABLE_AVX512_SIMD16 94333fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley 94433fa4c99f7fa68fd8c33c75c4fe66c4cca76779fTim Rowley static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 945488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 946c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 947c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 948c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley// helper function to unroll loops 949c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<int Begin, int End, int Step = 1> 950c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct UnrollerL { 951c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley template<typename Lambda> 952c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley INLINE static void step(Lambda& func) { 953c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley func(Begin); 954c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley UnrollerL<Begin + Step, End, Step>::step(func); 955c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 956c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 957c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 958c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<int End, int Step> 959c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct UnrollerL<End, End, Step> { 960c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley template<typename Lambda> 961c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static void step(Lambda& func) { 962c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 963c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 964c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 9659f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley// helper function to unroll loops, with mask to skip specific iterations 9669f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleytemplate<int Begin, int End, int Step = 1, int Mask = 0x7f> 9679f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleystruct UnrollerLMask { 9689f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley template<typename Lambda> 9699f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley INLINE static void step(Lambda& func) { 9709f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley if(Mask & (1 << Begin)) 9719f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley { 9729f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley func(Begin); 9739f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley } 9749f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley UnrollerL<Begin + Step, End, Step>::step(func); 9759f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley } 9769f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley}; 9779f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley 9789f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleytemplate<int End, int Step, int Mask> 9799f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleystruct UnrollerLMask<End, End, Step, Mask> { 9809f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley template<typename Lambda> 9819f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley static void step(Lambda& func) { 9829f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley } 9839f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley}; 9849f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley 985c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley// general CRC compute 986c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 987c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyuint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) 988c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 98990f9df3210b5b66585007ec4836bfca498fd45f0Tim Rowley#if defined(_WIN64) || defined(__x86_64__) 990c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeInQwords = size / sizeof(uint64_t); 991c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeRemainderBytes = size % sizeof(uint64_t); 992c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint64_t* pDataWords = (uint64_t*)pData; 993c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley for (uint32_t i = 0; i < sizeInQwords; ++i) 994c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 995c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); 996c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 997c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 998c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeInDwords = size / sizeof(uint32_t); 999c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeRemainderBytes = size % sizeof(uint32_t); 1000c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t* pDataWords = (uint32_t*)pData; 1001c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley for (uint32_t i = 0; i < sizeInDwords; ++i) 1002c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 1003c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley crc = _mm_crc32_u32(crc, *pDataWords++); 1004c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 1005c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 1006c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1007542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley uint8_t* pRemainderBytes = (uint8_t*)pDataWords; 1008c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley for (uint32_t i = 0; i < sizeRemainderBytes; ++i) 1009c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 1010c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley crc = _mm_crc32_u8(crc, *pRemainderBytes++); 1011c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 1012c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1013c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return crc; 1014c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1015c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1016c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1017c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Add byte offset to any-type pointer 1018c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1019c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T> 1020c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1021c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T* PtrAdd(T* p, intptr_t offset) 1022c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1023c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley intptr_t intp = reinterpret_cast<intptr_t>(p); 1024c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return reinterpret_cast<T*>(intp + offset); 1025c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1026c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1027c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1028c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Is a power-of-2? 1029c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1030c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T> 1031c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1032c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic bool IsPow2(T value) 1033c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1034c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return value == (value & (0 - value)); 1035c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1036c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1037c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1038c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment 1039c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 1040c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1041c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 1042c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1043c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDownPow2(T1 value, T2 alignment) 1044c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1045c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley SWR_ASSERT(IsPow2(alignment)); 1046c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return value & ~T1(alignment - 1); 1047c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1048c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1049c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1050c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment 1051c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 1052c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1053c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 1054c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1055c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUpPow2(T1 value, T2 alignment) 1056c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1057c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return AlignDownPow2(value + T1(alignment - 1), alignment); 1058c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1059c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1060c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1061c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up ptr to specified alignment 1062c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 1063c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1064c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 1065c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1066c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUpPow2(T1* value, T2 alignment) 1067c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1068c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return reinterpret_cast<T1*>( 1069c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); 1070c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1071c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1072c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1073c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment 1074c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1075c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 1076c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1077c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDown(T1 value, T2 alignment) 1078c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1079c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } 1080c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return value - T1(value % alignment); 1081c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1082c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1083c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1084c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment 1085c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1086c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 1087c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1088c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignDown(T1* value, T2 alignment) 1089c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1090c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return (T1*)AlignDown(uintptr_t(value), alignment); 1091c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1092c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1093c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1094c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment 1095c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 1096c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1097c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 1098c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1099c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUp(T1 value, T2 alignment) 1100c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1101c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return AlignDown(value + T1(alignment - 1), alignment); 1102c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1103c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment 1106c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 1107c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1108c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 1109c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 1110c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUp(T1* value, T2 alignment) 1111c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return AlignDown(PtrAdd(value, alignment - 1), alignment); 1113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 1114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1115c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1116c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Helper structure used to access an array of elements that don't 1117c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// correspond to a typical word size. 1118c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 1119c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<typename T, size_t BitsPerElementT, size_t ArrayLenT> 1120c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyclass BitsArray 1121c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 1122c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyprivate: 1123c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t BITS_PER_WORD = sizeof(size_t) * 8; 1124c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT; 1125c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD; 1126c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1; 1127c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1128c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD, 1129c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley "Element size must an integral fraction of pointer size"); 1130c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1131c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley size_t m_words[NUM_WORDS] = {}; 1132c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1133c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleypublic: 1134c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1135c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley T operator[] (size_t elementIndex) const 1136c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 1137c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley size_t word = m_words[elementIndex / ELEMENTS_PER_WORD]; 1138c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT); 1139c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return T(word & ELEMENT_MASK); 1140c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 1141c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 114227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 1143e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley// Ranged integer argument for TemplateArgUnroller 1144e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowleytemplate <uint32_t TMin, uint32_t TMax> 1145e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowleystruct IntArg 1146e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley{ 1147e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley uint32_t val; 1148e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley}; 1149e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley 115027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// Recursive template used to auto-nest conditionals. Converts dynamic boolean function 115127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// arguments to static template arguments. 115227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleytemplate <typename TermT, typename... ArgsB> 115327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleystruct TemplateArgUnroller 115427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley{ 1155e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1156e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley // Boolean value 1157e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1158e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley 115927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley // Last Arg Terminator 116027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley static typename TermT::FuncType GetFunc(bool bArg) 116127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 116227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley if (bArg) 116327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 116427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TermT::template GetFunc<ArgsB..., std::true_type>(); 116527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 116627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 116727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TermT::template GetFunc<ArgsB..., std::false_type>(); 116827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 116927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 117027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley // Recursively parse args 117127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley template <typename... TArgsT> 117227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs) 117327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 117427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley if (bArg) 117527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 117627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...); 117727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 117827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 117927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...); 118027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 1181c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley 1182e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1183e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley // Integer value (within specified range) 1184e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1185e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley 1186c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley // Last Arg Terminator 1187e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TMin, uint32_t TMax> 1188e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg) 1189c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1190e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (iArg.val == TMax) 1191e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1192e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>(); 1193e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1194e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (TMax > TMin) 1195c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1196e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val}); 1197c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 1198e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSUME(false); return nullptr; 1199e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1200e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TVal> 1201e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg) 1202e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1203e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSERT(iArg.val == TVal); 1204e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>(); 1205c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 1206c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley 1207c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley // Recursively parse args 1208e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TMin, uint32_t TMax, typename... TArgsT> 1209e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs) 1210c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1211e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (iArg.val == TMax) 1212c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1213e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...); 1214c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 1215e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (TMax > TMin) 1216e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1217e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...); 1218e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1219e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSUME(false); return nullptr; 1220e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1221e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TVal, typename... TArgsT> 1222e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs) 1223e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1224e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSERT(iArg.val == TVal); 1225e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...); 1226c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 122727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley}; 122827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 122965c2abf6fdd51b0a80a72caa0c52cf3f4578e743Tim Rowley 1230