utils.h revision 1b86c050adcb9c166c2aab2f4c6e41cc07686bf3
1c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/**************************************************************************** 2c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 4c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Permission is hereby granted, free of charge, to any person obtaining a 5c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* copy of this software and associated documentation files (the "Software"), 6c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* to deal in the Software without restriction, including without limitation 7c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* and/or sell copies of the Software, and to permit persons to whom the 9c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software is furnished to do so, subject to the following conditions: 10c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 11c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* The above copyright notice and this permission notice (including the next 12c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* paragraph) shall be included in all copies or substantial portions of the 13c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software. 14c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 15c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IN THE SOFTWARE. 22c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 23c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @file utils.h 24c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 25c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @brief Utilities used by SWR core. 26c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* 27c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley******************************************************************************/ 28c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#pragma once 29c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 30c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include <string.h> 3127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley#include <type_traits> 32812b45d04958e31e7a3bfc7331308374e8b73afaTim Rowley#include <algorithm> 33c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/os.h" 34c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/simdintrin.h" 35c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "common/swr_assert.h" 360ff57446e3786243c6d752c91be2108595f2663eTim Rowley#include "core/api.h" 37c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 381da9c8a970207b5aac96b3161706041e781124f6Tim Rowley#if defined(_WIN64) || defined(__x86_64__) 39c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _MM_INSERT_EPI64 _mm_insert_epi64 40c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _MM_EXTRACT_EPI64 _mm_extract_epi64 41c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 42550503e776aaca0207184a6454de6aedc8c88aacTim RowleyINLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) 43c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 44c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley OSALIGNLINE(uint32_t) elems[4]; 45c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)elems, a); 46c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley if (ndx == 0) 47c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 48c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint64_t foo = elems[0]; 49c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley foo |= (uint64_t)elems[1] << 32; 50c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return foo; 51c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 52c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley else 53c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 54c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint64_t foo = elems[2]; 55c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley foo |= (uint64_t)elems[3] << 32; 56c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return foo; 57c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 58c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 59c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 60550503e776aaca0207184a6454de6aedc8c88aacTim RowleyINLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx) 61c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 62c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley OSALIGNLINE(int64_t) elems[2]; 63c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)elems, a); 64c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley if (ndx == 0) 65c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 66c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley elems[0] = b; 67c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 68c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley else 69c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 70c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley elems[1] = b; 71c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 72c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i out; 73c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley out = _mm_load_si128((const __m128i*)elems); 74c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return out; 75c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 76c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 77c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 78c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct simdBBox 79c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 800ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari ymin; 810ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari ymax; 820ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari xmin; 830ff57446e3786243c6d752c91be2108595f2663eTim Rowley simdscalari xmax; 84c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 85c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 86c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 87c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) 88c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 89c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row0i = _mm_castps_si128(row0); 90c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row1i = _mm_castps_si128(row1); 91c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row2i = _mm_castps_si128(row2); 92c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i row3i = _mm_castps_si128(row3); 93c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 94c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i vTemp = row2i; 95c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2i = _mm_unpacklo_epi32(row2i, row3i); 96c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTemp = _mm_unpackhi_epi32(vTemp, row3i); 97c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 98c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3i = row0i; 99c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0i = _mm_unpacklo_epi32(row0i, row1i); 100c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3i = _mm_unpackhi_epi32(row3i, row1i); 101c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 102c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1i = row0i; 103c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0i = _mm_unpacklo_epi64(row0i, row2i); 104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1i = _mm_unpackhi_epi64(row1i, row2i); 105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 106c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2i = row3i; 107c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2i = _mm_unpacklo_epi64(row2i, vTemp); 108c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3i = _mm_unpackhi_epi64(row3i, vTemp); 109c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 110c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0 = _mm_castsi128_ps(row0i); 111c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1 = _mm_castsi128_ps(row1i); 112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = _mm_castsi128_ps(row2i); 113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = _mm_castsi128_ps(row3i); 114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 115c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 116c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 117c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) 118c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 119c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i vTemp = row2; 120c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = _mm_unpacklo_epi32(row2, row3); 121c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTemp = _mm_unpackhi_epi32(vTemp, row3); 122c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 123c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = row0; 124c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0 = _mm_unpacklo_epi32(row0, row1); 125c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = _mm_unpackhi_epi32(row3, row1); 126c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 127c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1 = row0; 128c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row0 = _mm_unpacklo_epi64(row0, row2); 129c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row1 = _mm_unpackhi_epi64(row1, row2); 130c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 131c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = row3; 132c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row2 = _mm_unpacklo_epi64(row2, vTemp); 133c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley row3 = _mm_unpackhi_epi64(row3, vTemp); 134c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 135c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 136c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define GCC_VERSION (__GNUC__ * 10000 \ 137c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley + __GNUC_MINOR__ * 100 \ 138c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley + __GNUC_PATCHLEVEL__) 139c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 140c969ef2d427dfb3940a0d6d2b853454e65f1119fTim Rowley#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900)) 141c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _mm_undefined_ps _mm_setzero_ps 142c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _mm_undefined_si128 _mm_setzero_si128 143c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 144c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#define _mm256_undefined_ps _mm256_setzero_ps 145c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 146c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 147c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 1481d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 || KNOB_SIMD_WIDTH == 16 149c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 150c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) 151c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 152c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 153c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 154c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 155c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 156c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 157c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 158c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77 159c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 160c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 161c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 162c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 163c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 164c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 165c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 166c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 167c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 168c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 169c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 170c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 171c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 172c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 173c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 174c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3) 175c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 176c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 177c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 178c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 179c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 180c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 181c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 182c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77 183c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 184c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 185c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 186c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 187c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 188c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 189c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 190c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 191c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 192c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 193c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 194c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 195c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 196c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 197c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 198c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) 199c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 200c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1); 201c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1); 202c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3); 203c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3); 204c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5); 205c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5); 206c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7); 207c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7); 208c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); 209c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); 210c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); 211c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); 212c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); 213c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); 214c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); 215c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); 216c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); 217c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); 218c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); 219c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); 220c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); 221c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); 222c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); 223c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); 224c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 225c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 226c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 227c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7) 228c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 229c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3), 230c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7)); 231c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 232c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 233c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 234c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 235c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// TranposeSingleComponent 236c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 237c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<uint32_t bpp> 238c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct TransposeSingleComponent 239c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 240c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 241c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Pass-thru for single component. 242c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 243c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 244542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 245c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 246c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); 247c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 248c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 249c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 250c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 251c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8_8 252c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 253c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8_8 254c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 255c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 256c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 257c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 258c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 259542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 260c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 261c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src = _simd_load_si((const simdscalari*)pSrc); 2621d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 263c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 264c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX 265c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg 266c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa 267c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb 268c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa 269c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg 270c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa 271c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba 272c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba 273c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst, c0123lo); 274c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)(pDst + 16), c0123hi); 275c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#elif KNOB_ARCH == KNOB_ARCH_AVX2 276c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari dst01 = _mm256_shuffle_epi8(src, 277c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); 278c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); 279c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley dst23 = _mm256_shuffle_epi8(dst23, 280c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); 281c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari dst = _mm256_or_si256(dst01, dst23); 282c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _simd_store_si((simdscalari*)pDst, dst); 283c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 2841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 2851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari mask0 = _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800); 2861d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 2871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari dst01 = _simd_shuffle_epi8(src, mask0); 2881d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 2891d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari perm1 = _simd_permute_128(src, src, 1); 2901d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 2911d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari mask1 = _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080); 2921d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 2931d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari dst23 = _simd_shuffle_epi8(perm1, mask1); 2941d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 2951d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari dst = _simd_or_si(dst01, dst23); 2961d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 2971d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); 298c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 299c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 300c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 301c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 302c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 303c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 304c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 305c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8_8 306c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 307c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8_8 308c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 309c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 310c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. 311c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 312c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 313542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 314c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 315c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 316c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 317c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose8_8 318c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 319c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose8_8 320c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 321c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 322c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 8_8 data. 323c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 324c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 325542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 326c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 3271d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 328c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src = _simd_load_si((const simdscalari*)pSrc); 329c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 330c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg 331c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg 332c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley rg = _mm_unpacklo_epi8(rg, g); 333c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst, rg); 3341d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 3351d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg 3361d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3371d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx 3381d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3391d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx 3401d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3411d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg 3421d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3431d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst); 344c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 345c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 346c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 347c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 348c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 349c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 350c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 351c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32_32 352c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 353c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32_32 354c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 355c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 356c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 357c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 358c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 359542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 360c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 361c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 362c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src0 = _simd_load_ps((const float*)pSrc); 363c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 364c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 365c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); 366c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 367c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 vDst[8]; 368c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTranspose4x8(vDst, src0, src1, src2, src3); 369c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst, vDst[0]); 370c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+4, vDst[1]); 371c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+8, vDst[2]); 372c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+12, vDst[3]); 373c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+16, vDst[4]); 374c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+20, vDst[5]); 375c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+24, vDst[6]); 376c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst+28, vDst[7]); 3771d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 3781d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION 3791d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); 3801d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16); 3811d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32); 3821d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalar src3 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 48); 3831d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m128 vDst[8]; 3851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3861d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley vTranspose4x8(vDst, src0.lo, src1.lo, src2.lo, src3.lo); 3871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3881d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]); 3891d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]); 3901d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]); 3911d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]); 3921d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]); 3931d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]); 3941d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]); 3951d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]); 3961d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3971d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley vTranspose4x8(vDst, src0.hi, src1.hi, src2.hi, src3.hi); 3981d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 3991d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]); 4001d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]); 4011d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]); 4021d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]); 4031d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]); 4041d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]); 4051d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]); 4061d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]); 4071d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif 408c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 409c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 410c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 411c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 412c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 413c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 414c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 415c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32_32 416c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 417c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32_32 418c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 419c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 420c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 421c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 422c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 423542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 424c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 425c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 426c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src0 = _simd_load_ps((const float*)pSrc); 427c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 428c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 429c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 430c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 vDst[8]; 431c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley vTranspose3x8(vDst, src0, src1, src2); 432c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst, vDst[0]); 433c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 4, vDst[1]); 434c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 8, vDst[2]); 435c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 12, vDst[3]); 436c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 16, vDst[4]); 437c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 20, vDst[5]); 438c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 24, vDst[6]); 439c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps((float*)pDst + 28, vDst[7]); 4401d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 4411d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION 4421d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalar src0 = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); 4431d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalar src1 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 16); 4441d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalar src2 = _simd_load_ps(reinterpret_cast<const float*>(pSrc) + 32); 4451d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 4461d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m128 vDst[8]; 4471d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 4481d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley vTranspose3x8(vDst, src0.lo, src1.lo, src2.lo); 4491d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 4501d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst), vDst[0]); 4511d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 4, vDst[1]); 4521d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 8, vDst[2]); 4531d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 12, vDst[3]); 4541d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 16, vDst[4]); 4551d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 20, vDst[5]); 4561d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 24, vDst[6]); 4571d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 28, vDst[7]); 4581d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 4591d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley vTranspose3x8(vDst, src0.hi, src1.hi, src2.hi); 4601d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 4611d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 32, vDst[0]); 4621d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 36, vDst[1]); 4631d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 40, vDst[2]); 4641d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 44, vDst[3]); 4651d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 48, vDst[4]); 4661d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 52, vDst[5]); 4671d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 56, vDst[6]); 4681d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm_store_ps(reinterpret_cast<float*>(pDst) + 60, vDst[7]); 4691d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif 470c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 471c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 472c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 473c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 474c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 475c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 476c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 477c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_32 478c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 479c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_32 480c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 481c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 482c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 483c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 484c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 485542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 486c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 4871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 488c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley const float* pfSrc = (const float*)pSrc; 489c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_r0 = _mm_load_ps(pfSrc + 0); 490c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_r1 = _mm_load_ps(pfSrc + 4); 491c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_g0 = _mm_load_ps(pfSrc + 8); 492c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 src_g1 = _mm_load_ps(pfSrc + 12); 493c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 494c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); 495c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); 496c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); 497c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); 498c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 499c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley float* pfDst = (float*)pDst; 500c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 0, dst0); 501c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 4, dst1); 502c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 8, dst2); 503c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_ps(pfDst + 12, dst3); 5041d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 5051d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley const float* pfSrc = (const float*)pSrc; 5061d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 src_r0 = _mm256_load_ps(pfSrc + 0); 5071d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 src_r1 = _mm256_load_ps(pfSrc + 8); 5081d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 src_g0 = _mm256_load_ps(pfSrc + 16); 5091d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 src_g1 = _mm256_load_ps(pfSrc + 24); 5101d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 5111d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0); 5121d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0); 5131d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1); 5141d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1); 5151d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 5161d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley float* pfDst = (float*)pDst; 5171d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_ps(pfDst + 0, dst0); 5181d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_ps(pfDst + 8, dst1); 5191d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_ps(pfDst + 16, dst2); 5201d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_ps(pfDst + 24, dst3); 5211d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#else 5221d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#error Unsupported vector width 5231d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif 524c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 525c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 526c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 527c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 528c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16_16 529c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 530c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16_16 531c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 532c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 533c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 534c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 535c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 536542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 537c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 538c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 539c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 540c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); 541c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 542c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 543c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 544c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_b = _mm256_extractf128_si256(src_ba, 0); 545c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_a = _mm256_extractf128_si256(src_ba, 1); 546c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 547c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 548c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 549c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 550c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 551c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 552c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 553c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 554c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 555c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 556c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 557c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 0, dst0); 558c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 1, dst1); 559c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 2, dst2); 560c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 3, dst3); 5611d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 5621d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION 5631d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); 5641d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari src_ba = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc + sizeof(simdscalari))); 5651d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 5661d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_r = src_rg.lo; 5671d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_g = src_rg.hi; 5681d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_b = src_ba.lo; 5691d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_a = src_ba.hi; 5701d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 5711d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); 5721d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); 5731d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); 5741d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); 5751d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 5761d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); 5771d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); 5781d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); 5791d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); 5801d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 5811d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); 5821d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1); 5831d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); 5841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); 5851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif 586c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 587c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 588c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 589c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 590c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 591c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 592c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 593c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16_16 594c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 595c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16_16 596c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 597c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 598c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 599c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 600c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 601542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 602c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 603c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_SIMD_WIDTH == 8 604c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 605c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 606c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 607c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 608c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); 609c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i src_a = _mm_undefined_si128(); 610c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 611c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 612c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 613c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 614c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 615c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 616c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 617c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 618c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 619c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 620c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 621c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 0, dst0); 622c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 1, dst1); 623c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 2, dst2); 624c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128(((__m128i*)pDst) + 3, dst3); 6251d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 6261d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION 6271d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari src_rg = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); 6281d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 6291d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_r = src_rg.lo; 6301d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_g = src_rg.hi; 6311d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i*>(pSrc + sizeof(simdscalari))); 6321d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i src_a = _mm256_undefined_si256(); 6331d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 6341d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); 6351d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); 6361d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); 6371d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); 6381d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 6391d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); 6401d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); 6411d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); 6421d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); 6431d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 6441d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); 6451d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1); 6461d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); 6471d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); 6481d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif 649c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 650c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 651c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 652c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 653c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 654c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 655c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 656c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose16_16 657c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 658c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose16_16 659c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 660c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 661c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 662c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 663c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 664542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 665c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 6661d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if KNOB_SIMD_WIDTH == 8 667c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley simdscalar src = _simd_load_ps((const float*)pSrc); 668c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 669c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 comp0 = _mm256_castps256_ps128(src); 670c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128 comp1 = _mm256_extractf128_ps(src, 1); 671c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 672c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i comp0i = _mm_castps_si128(comp0); 673c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i comp1i = _mm_castps_si128(comp1); 674c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 675c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); 676c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); 677c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 678c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst, resLo); 679c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley _mm_store_si128((__m128i*)pDst + 1, resHi); 6801d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#elif KNOB_SIMD_WIDTH == 16 6811d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#if ENABLE_AVX512_EMULATION 6821d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari src = _simd_castps_si(_simd_load_ps(reinterpret_cast<const float*>(pSrc))); 6831d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 6841d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley simdscalari result; 6851d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 6861d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley result.lo = _mm256_unpacklo_epi16(src.lo, src.hi); 6871d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley result.hi = _mm256_unpackhi_epi16(src.lo, src.hi); 6881d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley 6891d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley _simd_store_si(reinterpret_cast<simdscalari *>(pDst), result); 6901d09b3971aed8f86aa28e52b1dcec393ee5debc9Tim Rowley#endif 691c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 692c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#error Unsupported vector width 693c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 694c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 695c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 696c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 697c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 698c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose24_8 699c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 700c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose24_8 701c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 702c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 703c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 24_8 data. 704c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 705c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 706542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 707c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 708c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 709c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 710c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose32_8_24 711c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 712c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose32_8_24 713c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 714c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 715c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. 716c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 717c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 718542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 719c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 720c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 721c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 722c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 723c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 724c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose4_4_4_4 725c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 726c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose4_4_4_4 727c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 728c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 729c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. 730c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 731c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 732542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 733c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 734c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 735c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 736c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose5_6_5 737c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 738c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose5_6_5 739c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 740c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 741c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 742c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 743c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 744542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 745c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 746c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 747c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 748c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose9_9_9_5 749c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 750c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose9_9_9_5 751c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 752c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 753c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 754c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 755c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 756542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 757c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 758c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 759c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 760c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose5_5_5_1 761c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 762c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose5_5_5_1 763c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 764c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 765c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 766c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 767c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 768542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 7691b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley}; 7701b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley 7711b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley////////////////////////////////////////////////////////////////////////// 7721b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley/// Transpose1_5_5_5 7731b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley////////////////////////////////////////////////////////////////////////// 7741b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct Transpose1_5_5_5 7751b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley{ 7761b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley ////////////////////////////////////////////////////////////////////////// 7771b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 7781b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley /// @param pSrc - source data in SOA form 7791b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley /// @param pDst - output data in AOS form 7801b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 781c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 782c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 783c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 784c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose10_10_10_2 785c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 786c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose10_10_10_2 787c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 788c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 789c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. 790c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 791c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 792542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 793c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 794c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 795c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 796c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Transpose11_11_10 797c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 798c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct Transpose11_11_10 799c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 800c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley ////////////////////////////////////////////////////////////////////////// 801c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 802c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pSrc - source data in SOA form 803c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley /// @param pDst - output data in AOS form 804542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 805c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 806c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 807c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley// helper function to unroll loops 808c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<int Begin, int End, int Step = 1> 809c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct UnrollerL { 810c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley template<typename Lambda> 811c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley INLINE static void step(Lambda& func) { 812c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley func(Begin); 813c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley UnrollerL<Begin + Step, End, Step>::step(func); 814c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 815c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 816c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 817c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<int End, int Step> 818c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct UnrollerL<End, End, Step> { 819c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley template<typename Lambda> 820c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static void step(Lambda& func) { 821c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 822c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 823c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 8249f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley// helper function to unroll loops, with mask to skip specific iterations 8259f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleytemplate<int Begin, int End, int Step = 1, int Mask = 0x7f> 8269f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleystruct UnrollerLMask { 8279f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley template<typename Lambda> 8289f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley INLINE static void step(Lambda& func) { 8299f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley if(Mask & (1 << Begin)) 8309f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley { 8319f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley func(Begin); 8329f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley } 8339f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley UnrollerL<Begin + Step, End, Step>::step(func); 8349f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley } 8359f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley}; 8369f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley 8379f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleytemplate<int End, int Step, int Mask> 8389f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowleystruct UnrollerLMask<End, End, Step, Mask> { 8399f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley template<typename Lambda> 8409f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley static void step(Lambda& func) { 8419f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley } 8429f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley}; 8439f7d99fcfecb7bb613855d25c10bb7908850c483Tim Rowley 844c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley// general CRC compute 845c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 846c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyuint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) 847c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 84890f9df3210b5b66585007ec4836bfca498fd45f0Tim Rowley#if defined(_WIN64) || defined(__x86_64__) 849c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeInQwords = size / sizeof(uint64_t); 850c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeRemainderBytes = size % sizeof(uint64_t); 851c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint64_t* pDataWords = (uint64_t*)pData; 852c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley for (uint32_t i = 0; i < sizeInQwords; ++i) 853c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 854c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); 855c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 856c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#else 857c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeInDwords = size / sizeof(uint32_t); 858c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t sizeRemainderBytes = size % sizeof(uint32_t); 859c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley uint32_t* pDataWords = (uint32_t*)pData; 860c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley for (uint32_t i = 0; i < sizeInDwords; ++i) 861c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 862c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley crc = _mm_crc32_u32(crc, *pDataWords++); 863c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 864c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif 865c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 866542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley uint8_t* pRemainderBytes = (uint8_t*)pDataWords; 867c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley for (uint32_t i = 0; i < sizeRemainderBytes; ++i) 868c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 869c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley crc = _mm_crc32_u8(crc, *pRemainderBytes++); 870c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 871c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 872c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return crc; 873c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 874c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 875c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 876c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Add byte offset to any-type pointer 877c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 878c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T> 879c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 880c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T* PtrAdd(T* p, intptr_t offset) 881c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 882c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley intptr_t intp = reinterpret_cast<intptr_t>(p); 883c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return reinterpret_cast<T*>(intp + offset); 884c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 885c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 886c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 887c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Is a power-of-2? 888c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 889c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T> 890c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 891c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic bool IsPow2(T value) 892c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 893c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return value == (value & (0 - value)); 894c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 895c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 896c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 897c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment 898c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 899c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 900c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 901c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 902c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDownPow2(T1 value, T2 alignment) 903c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 904c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley SWR_ASSERT(IsPow2(alignment)); 905c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return value & ~T1(alignment - 1); 906c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 907c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 908c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 909c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment 910c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 911c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 912c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 913c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 914c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUpPow2(T1 value, T2 alignment) 915c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 916c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return AlignDownPow2(value + T1(alignment - 1), alignment); 917c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 918c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 919c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 920c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up ptr to specified alignment 921c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 922c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 923c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 924c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 925c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUpPow2(T1* value, T2 alignment) 926c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 927c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return reinterpret_cast<T1*>( 928c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); 929c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 930c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 931c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 932c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment 933c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 934c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 935c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 936c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignDown(T1 value, T2 alignment) 937c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 938c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } 939c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return value - T1(value % alignment); 940c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 941c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 942c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 943c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align down to specified alignment 944c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 945c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 946c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 947c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignDown(T1* value, T2 alignment) 948c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 949c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return (T1*)AlignDown(uintptr_t(value), alignment); 950c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 951c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 952c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 953c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment 954c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 955c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 956c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 957c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 958c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1 AlignUp(T1 value, T2 alignment) 959c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 960c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return AlignDown(value + T1(alignment - 1), alignment); 961c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 962c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 963c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 964c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Align up to specified alignment 965c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Note: IsPow2(alignment) MUST be true 966c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 967c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <typename T1, typename T2> 968c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE 969c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic T1* AlignUp(T1* value, T2 alignment) 970c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 971c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return AlignDown(PtrAdd(value, alignment - 1), alignment); 972c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley} 973c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 974c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 975c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// Helper structure used to access an array of elements that don't 976c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// correspond to a typical word size. 977c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley////////////////////////////////////////////////////////////////////////// 978c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate<typename T, size_t BitsPerElementT, size_t ArrayLenT> 979c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyclass BitsArray 980c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{ 981c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyprivate: 982c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t BITS_PER_WORD = sizeof(size_t) * 8; 983c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT; 984c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD; 985c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1; 986c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 987c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD, 988c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley "Element size must an integral fraction of pointer size"); 989c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 990c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley size_t m_words[NUM_WORDS] = {}; 991c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 992c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleypublic: 993c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley 994c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley T operator[] (size_t elementIndex) const 995c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley { 996c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley size_t word = m_words[elementIndex / ELEMENTS_PER_WORD]; 997c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT); 998c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley return T(word & ELEMENT_MASK); 999c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley } 1000c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}; 100127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 1002e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley// Ranged integer argument for TemplateArgUnroller 1003e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowleytemplate <uint32_t TMin, uint32_t TMax> 1004e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowleystruct IntArg 1005e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley{ 1006e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley uint32_t val; 1007e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley}; 1008e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley 100927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// Recursive template used to auto-nest conditionals. Converts dynamic boolean function 101027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// arguments to static template arguments. 101127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleytemplate <typename TermT, typename... ArgsB> 101227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleystruct TemplateArgUnroller 101327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley{ 1014e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1015e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley // Boolean value 1016e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1017e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley 101827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley // Last Arg Terminator 101927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley static typename TermT::FuncType GetFunc(bool bArg) 102027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 102127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley if (bArg) 102227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 102327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TermT::template GetFunc<ArgsB..., std::true_type>(); 102427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 102527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 102627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TermT::template GetFunc<ArgsB..., std::false_type>(); 102727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 102827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 102927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley // Recursively parse args 103027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley template <typename... TArgsT> 103127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs) 103227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 103327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley if (bArg) 103427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley { 103527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...); 103627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 103727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 103827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...); 103927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley } 1040c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley 1041e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1042e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley // Integer value (within specified range) 1043e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley //----------------------------------------- 1044e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley 1045c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley // Last Arg Terminator 1046e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TMin, uint32_t TMax> 1047e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg) 1048c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1049e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (iArg.val == TMax) 1050e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1051e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>(); 1052e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1053e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (TMax > TMin) 1054c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1055e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val}); 1056c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 1057e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSUME(false); return nullptr; 1058e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1059e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TVal> 1060e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg) 1061e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1062e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSERT(iArg.val == TVal); 1063e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>(); 1064c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 1065c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley 1066c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley // Recursively parse args 1067e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TMin, uint32_t TMax, typename... TArgsT> 1068e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs) 1069c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1070e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (iArg.val == TMax) 1071c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley { 1072e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...); 1073c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 1074e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley if (TMax > TMin) 1075e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1076e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...); 1077e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1078e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSUME(false); return nullptr; 1079e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley } 1080e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley template <uint32_t TVal, typename... TArgsT> 1081e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs) 1082e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley { 1083e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley SWR_ASSERT(iArg.val == TVal); 1084e0529a4668c7f50fb51b2c4ae8bc5954934db55fTim Rowley return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...); 1085c7cd33b605f0238464a3250a11f7134e4b7d22a6Tim Rowley } 108627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley}; 108727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley 108865c2abf6fdd51b0a80a72caa0c52cf3f4578e743Tim Rowley 1089