utils.h revision 3252fe3705376063f94a7717c07b9824b5d43f46
1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file utils.h
24*
25* @brief Utilities used by SWR core.
26*
27******************************************************************************/
28#pragma once
29
30#include <string.h>
31#include "common/os.h"
32#include "common/simdintrin.h"
33#include "common/swr_assert.h"
34
35#if defined(_WIN32)
36void SaveImageToPNGFile(
37    const WCHAR *pFilename,
38    void *pBuffer,
39    uint32_t width,
40    uint32_t height);
41
42void OpenBitmapFromFile(
43    const WCHAR *pFilename,
44    void **pBuffer,
45    uint32_t *width,
46    uint32_t *height);
47#endif
48
#if defined(_WIN64) || defined(__x86_64__)
// 64-bit builds: map straight onto the compiler intrinsics.
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
//////////////////////////////////////////////////////////////////////////
/// @brief Fallback for _mm_extract_epi64 on non-64-bit builds: spills
///        the register to an aligned array and rebuilds the 64-bit value
///        from two 32-bit halves.
/// @param a - source vector
/// @param ndx - 0 selects the low qword, any other value the high qword
/// @return selected 64-bit element
INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
{
    OSALIGNLINE(uint32_t) dwords[4];
    _mm_store_si128((__m128i*)dwords, a);

    // dwords {0,1} make up qword 0, dwords {2,3} make up qword 1
    const uint32_t first = (ndx == 0) ? 0 : 2;
    const uint64_t lowHalf = dwords[first];
    const uint64_t highHalf = dwords[first + 1];
    return lowHalf | (highHalf << 32);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Fallback for _mm_insert_epi64 on non-64-bit builds: spills
///        the register, overwrites one qword and reloads it.
/// @param a - source vector
/// @param b - value to insert
/// @param ndx - 0 replaces the low qword, any other value the high qword
/// @return copy of a with the selected qword replaced by b
INLINE __m128i  _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
{
    OSALIGNLINE(int64_t) qwords[2];
    _mm_store_si128((__m128i*)qwords, a);
    qwords[(ndx == 0) ? 0 : 1] = b;
    return _mm_load_si128((const __m128i*)qwords);
}
#endif
88
89OSALIGNLINE(struct) BBOX
90{
91    int top{ 0 };
92    int bottom{ 0 };
93    int left{ 0 };
94    int right{ 0 };
95
96    BBOX() {}
97    BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
98
99    bool operator==(const BBOX& rhs)
100    {
101        return (this->top == rhs.top &&
102            this->bottom == rhs.bottom &&
103            this->left == rhs.left &&
104            this->right == rhs.right);
105    }
106
107    bool operator!=(const BBOX& rhs)
108    {
109        return !(*this == rhs);
110    }
111};
112
113struct simdBBox
114{
115    simdscalari top;
116    simdscalari bottom;
117    simdscalari left;
118    simdscalari right;
119};
120
121INLINE
122void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
123{
124    __m128i row0i = _mm_castps_si128(row0);
125    __m128i row1i = _mm_castps_si128(row1);
126    __m128i row2i = _mm_castps_si128(row2);
127    __m128i row3i = _mm_castps_si128(row3);
128
129    __m128i vTemp = row2i;
130    row2i = _mm_unpacklo_epi32(row2i, row3i);
131    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
132
133    row3i = row0i;
134    row0i = _mm_unpacklo_epi32(row0i, row1i);
135    row3i = _mm_unpackhi_epi32(row3i, row1i);
136
137    row1i = row0i;
138    row0i = _mm_unpacklo_epi64(row0i, row2i);
139    row1i = _mm_unpackhi_epi64(row1i, row2i);
140
141    row2i = row3i;
142    row2i = _mm_unpacklo_epi64(row2i, vTemp);
143    row3i = _mm_unpackhi_epi64(row3i, vTemp);
144
145    row0 = _mm_castsi128_ps(row0i);
146    row1 = _mm_castsi128_ps(row1i);
147    row2 = _mm_castsi128_ps(row2i);
148    row3 = _mm_castsi128_ps(row3i);
149}
150
151INLINE
152void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
153{
154    __m128i vTemp = row2;
155    row2 = _mm_unpacklo_epi32(row2, row3);
156    vTemp = _mm_unpackhi_epi32(vTemp, row3);
157
158    row3 = row0;
159    row0 = _mm_unpacklo_epi32(row0, row1);
160    row3 = _mm_unpackhi_epi32(row3, row1);
161
162    row1 = row0;
163    row0 = _mm_unpacklo_epi64(row0, row2);
164    row1 = _mm_unpackhi_epi64(row1, row2);
165
166    row2 = row3;
167    row2 = _mm_unpacklo_epi64(row2, vTemp);
168    row3 = _mm_unpackhi_epi64(row3, vTemp);
169}
170
// GCC version folded into one comparable integer: major * 10000 +
// minor * 100 + patchlevel (4.8.2 becomes 40802).
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)

// For compilers reporting a GCC version below 4.9 the *_undefined_*
// intrinsics are replaced by their zero-initialising counterparts
// (presumably because those releases do not ship them).
#if defined(__GNUC__) && (GCC_VERSION < 40900)
#define _mm_undefined_ps        _mm_setzero_ps
#define _mm_undefined_si128     _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps     _mm256_setzero_ps
#endif
#endif
182
183#if KNOB_SIMD_WIDTH == 8
184INLINE
185void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
186{
187    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
188    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5
189    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
190    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
191
192    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
193    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6yw77
194    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
195    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
196
197    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
198    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
199    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
200    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
201
202    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
203    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
204    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
205    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
206}
207
208INLINE
209void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
210{
211    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
212    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
213    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
214    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
215
216    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
217    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
218    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
219    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
220
221    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
222    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
223    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
224    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
225
226    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
227    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
228    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
229    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
230}
231
232INLINE
233void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
234{
235    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
236    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
237    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
238    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
239    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
240    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
241    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
242    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
243    __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
244    __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
245    __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
246    __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
247    __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
248    __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
249    __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
250    __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
251    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
252    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
253    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
254    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
255    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
256    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
257    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
258    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
259}
260
261INLINE
262void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
263{
264    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
265        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
266}
267#endif
268
269//////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
271//////////////////////////////////////////////////////////////////////////
272template<uint32_t bpp>
273struct TransposeSingleComponent
274{
275    //////////////////////////////////////////////////////////////////////////
276    /// @brief Pass-thru for single component.
277    /// @param pSrc - source data in SOA form
278    /// @param pDst - output data in AOS form
279    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
280    {
281        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
282    }
283};
284
285//////////////////////////////////////////////////////////////////////////
286/// Transpose8_8_8_8
287//////////////////////////////////////////////////////////////////////////
288struct Transpose8_8_8_8
289{
290    //////////////////////////////////////////////////////////////////////////
291    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
292    /// @param pSrc - source data in SOA form
293    /// @param pDst - output data in AOS form
294    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
295    {
296        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
297#if KNOB_SIMD_WIDTH == 8
298#if KNOB_ARCH == KNOB_ARCH_AVX
299        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
300        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
301        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
302        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
303        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
304        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
305        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
306        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
307        _mm_store_si128((__m128i*)pDst, c0123lo);
308        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
309#elif KNOB_ARCH == KNOB_ARCH_AVX2
310        simdscalari dst01 = _mm256_shuffle_epi8(src,
311            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
312        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
313        dst23 = _mm256_shuffle_epi8(dst23,
314            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
315        simdscalari dst = _mm256_or_si256(dst01, dst23);
316        _simd_store_si((simdscalari*)pDst, dst);
317#endif
318#else
319#error Unsupported vector width
320#endif
321    }
322};
323
324//////////////////////////////////////////////////////////////////////////
325/// Transpose8_8_8
326//////////////////////////////////////////////////////////////////////////
327struct Transpose8_8_8
328{
329    //////////////////////////////////////////////////////////////////////////
330    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
331    /// @param pSrc - source data in SOA form
332    /// @param pDst - output data in AOS form
333    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
334};
335
336//////////////////////////////////////////////////////////////////////////
337/// Transpose8_8
338//////////////////////////////////////////////////////////////////////////
339struct Transpose8_8
340{
341    //////////////////////////////////////////////////////////////////////////
342    /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
343    /// @param pSrc - source data in SOA form
344    /// @param pDst - output data in AOS form
345    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
346    {
347        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
348
349#if KNOB_SIMD_WIDTH == 8
350        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
351        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
352        rg = _mm_unpacklo_epi8(rg, g);
353        _mm_store_si128((__m128i*)pDst, rg);
354#else
355#error Unsupported vector width
356#endif
357    }
358};
359
360//////////////////////////////////////////////////////////////////////////
361/// Transpose32_32_32_32
362//////////////////////////////////////////////////////////////////////////
363struct Transpose32_32_32_32
364{
365    //////////////////////////////////////////////////////////////////////////
366    /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
367    /// @param pSrc - source data in SOA form
368    /// @param pDst - output data in AOS form
369    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
370    {
371#if KNOB_SIMD_WIDTH == 8
372        simdscalar src0 = _simd_load_ps((const float*)pSrc);
373        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
374        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
375        simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
376
377        __m128 vDst[8];
378        vTranspose4x8(vDst, src0, src1, src2, src3);
379        _mm_store_ps((float*)pDst, vDst[0]);
380        _mm_store_ps((float*)pDst+4, vDst[1]);
381        _mm_store_ps((float*)pDst+8, vDst[2]);
382        _mm_store_ps((float*)pDst+12, vDst[3]);
383        _mm_store_ps((float*)pDst+16, vDst[4]);
384        _mm_store_ps((float*)pDst+20, vDst[5]);
385        _mm_store_ps((float*)pDst+24, vDst[6]);
386        _mm_store_ps((float*)pDst+28, vDst[7]);
387#else
388#error Unsupported vector width
389#endif
390    }
391};
392
393//////////////////////////////////////////////////////////////////////////
394/// Transpose32_32_32
395//////////////////////////////////////////////////////////////////////////
396struct Transpose32_32_32
397{
398    //////////////////////////////////////////////////////////////////////////
399    /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
400    /// @param pSrc - source data in SOA form
401    /// @param pDst - output data in AOS form
402    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
403    {
404#if KNOB_SIMD_WIDTH == 8
405        simdscalar src0 = _simd_load_ps((const float*)pSrc);
406        simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
407        simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
408
409        __m128 vDst[8];
410        vTranspose3x8(vDst, src0, src1, src2);
411        _mm_store_ps((float*)pDst, vDst[0]);
412        _mm_store_ps((float*)pDst + 4, vDst[1]);
413        _mm_store_ps((float*)pDst + 8, vDst[2]);
414        _mm_store_ps((float*)pDst + 12, vDst[3]);
415        _mm_store_ps((float*)pDst + 16, vDst[4]);
416        _mm_store_ps((float*)pDst + 20, vDst[5]);
417        _mm_store_ps((float*)pDst + 24, vDst[6]);
418        _mm_store_ps((float*)pDst + 28, vDst[7]);
419#else
420#error Unsupported vector width
421#endif
422    }
423};
424
425//////////////////////////////////////////////////////////////////////////
426/// Transpose32_32
427//////////////////////////////////////////////////////////////////////////
428struct Transpose32_32
429{
430    //////////////////////////////////////////////////////////////////////////
431    /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
432    /// @param pSrc - source data in SOA form
433    /// @param pDst - output data in AOS form
434    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
435    {
436        const float* pfSrc = (const float*)pSrc;
437        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
438        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
439        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
440        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
441
442        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
443        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
444        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
445        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
446
447        float* pfDst = (float*)pDst;
448        _mm_store_ps(pfDst + 0, dst0);
449        _mm_store_ps(pfDst + 4, dst1);
450        _mm_store_ps(pfDst + 8, dst2);
451        _mm_store_ps(pfDst + 12, dst3);
452    }
453};
454
455//////////////////////////////////////////////////////////////////////////
456/// Transpose16_16_16_16
457//////////////////////////////////////////////////////////////////////////
458struct Transpose16_16_16_16
459{
460    //////////////////////////////////////////////////////////////////////////
461    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
462    /// @param pSrc - source data in SOA form
463    /// @param pDst - output data in AOS form
464    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
465    {
466#if KNOB_SIMD_WIDTH == 8
467        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
468        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
469
470        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
471        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
472        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
473        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
474
475        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
476        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
477        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
478        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
479
480        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
481        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
482        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
483        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
484
485        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
486        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
487        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
488        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
489#else
490#error Unsupported vector width
491#endif
492    }
493};
494
495//////////////////////////////////////////////////////////////////////////
496/// Transpose16_16_16
497//////////////////////////////////////////////////////////////////////////
498struct Transpose16_16_16
499{
500    //////////////////////////////////////////////////////////////////////////
501    /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
502    /// @param pSrc - source data in SOA form
503    /// @param pDst - output data in AOS form
504    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
505    {
506#if KNOB_SIMD_WIDTH == 8
507        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
508
509        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
510        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
511        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
512        __m128i src_a = _mm_undefined_si128();
513
514        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
515        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
516        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
517        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
518
519        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
520        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
521        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
522        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
523
524        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
525        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
526        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
527        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
528#else
529#error Unsupported vector width
530#endif
531    }
532};
533
534//////////////////////////////////////////////////////////////////////////
535/// Transpose16_16
536//////////////////////////////////////////////////////////////////////////
537struct Transpose16_16
538{
539    //////////////////////////////////////////////////////////////////////////
540    /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
541    /// @param pSrc - source data in SOA form
542    /// @param pDst - output data in AOS form
543    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
544    {
545        simdscalar src = _simd_load_ps((const float*)pSrc);
546
547#if KNOB_SIMD_WIDTH == 8
548        __m128 comp0 = _mm256_castps256_ps128(src);
549        __m128 comp1 = _mm256_extractf128_ps(src, 1);
550
551        __m128i comp0i = _mm_castps_si128(comp0);
552        __m128i comp1i = _mm_castps_si128(comp1);
553
554        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
555        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
556
557        _mm_store_si128((__m128i*)pDst, resLo);
558        _mm_store_si128((__m128i*)pDst + 1, resHi);
559#else
560#error Unsupported vector width
561#endif
562    }
563};
564
565//////////////////////////////////////////////////////////////////////////
566/// Transpose24_8
567//////////////////////////////////////////////////////////////////////////
568struct Transpose24_8
569{
570    //////////////////////////////////////////////////////////////////////////
571    /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
572    /// @param pSrc - source data in SOA form
573    /// @param pDst - output data in AOS form
574    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
575};
576
577//////////////////////////////////////////////////////////////////////////
578/// Transpose32_8_24
579//////////////////////////////////////////////////////////////////////////
580struct Transpose32_8_24
581{
582    //////////////////////////////////////////////////////////////////////////
583    /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
584    /// @param pSrc - source data in SOA form
585    /// @param pDst - output data in AOS form
586    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
587};
588
589
590
591//////////////////////////////////////////////////////////////////////////
592/// Transpose4_4_4_4
593//////////////////////////////////////////////////////////////////////////
594struct Transpose4_4_4_4
595{
596    //////////////////////////////////////////////////////////////////////////
597    /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
598    /// @param pSrc - source data in SOA form
599    /// @param pDst - output data in AOS form
600    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
601};
602
603//////////////////////////////////////////////////////////////////////////
604/// Transpose5_6_5
605//////////////////////////////////////////////////////////////////////////
606struct Transpose5_6_5
607{
608    //////////////////////////////////////////////////////////////////////////
609    /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
610    /// @param pSrc - source data in SOA form
611    /// @param pDst - output data in AOS form
612    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
613};
614
615//////////////////////////////////////////////////////////////////////////
616/// Transpose9_9_9_5
617//////////////////////////////////////////////////////////////////////////
618struct Transpose9_9_9_5
619{
620    //////////////////////////////////////////////////////////////////////////
621    /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
622    /// @param pSrc - source data in SOA form
623    /// @param pDst - output data in AOS form
624    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
625};
626
627//////////////////////////////////////////////////////////////////////////
628/// Transpose5_5_5_1
629//////////////////////////////////////////////////////////////////////////
630struct Transpose5_5_5_1
631{
632    //////////////////////////////////////////////////////////////////////////
633    /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
634    /// @param pSrc - source data in SOA form
635    /// @param pDst - output data in AOS form
636    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
637};
638
639//////////////////////////////////////////////////////////////////////////
640/// Transpose10_10_10_2
641//////////////////////////////////////////////////////////////////////////
642struct Transpose10_10_10_2
643{
644    //////////////////////////////////////////////////////////////////////////
645    /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
646    /// @param pSrc - source data in SOA form
647    /// @param pDst - output data in AOS form
648    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
649};
650
651//////////////////////////////////////////////////////////////////////////
652/// Transpose11_11_10
653//////////////////////////////////////////////////////////////////////////
654struct Transpose11_11_10
655{
656    //////////////////////////////////////////////////////////////////////////
657    /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
658    /// @param pSrc - source data in SOA form
659    /// @param pDst - output data in AOS form
660    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
661};
662
663// helper function to unroll loops
664template<int Begin, int End, int Step = 1>
665struct UnrollerL {
666    template<typename Lambda>
667    INLINE static void step(Lambda& func) {
668        func(Begin);
669        UnrollerL<Begin + Step, End, Step>::step(func);
670    }
671};
672
673template<int End, int Step>
674struct UnrollerL<End, End, Step> {
675    template<typename Lambda>
676    static void step(Lambda& func) {
677    }
678};
679
680// general CRC compute
681INLINE
682uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
683{
684#if defined(_WIN64) || defined(__x86_64__)
685    uint32_t sizeInQwords = size / sizeof(uint64_t);
686    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
687    uint64_t* pDataWords = (uint64_t*)pData;
688    for (uint32_t i = 0; i < sizeInQwords; ++i)
689    {
690        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
691    }
692#else
693    uint32_t sizeInDwords = size / sizeof(uint32_t);
694    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
695    uint32_t* pDataWords = (uint32_t*)pData;
696    for (uint32_t i = 0; i < sizeInDwords; ++i)
697    {
698        crc = _mm_crc32_u32(crc, *pDataWords++);
699    }
700#endif
701
702    BYTE* pRemainderBytes = (BYTE*)pDataWords;
703    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
704    {
705        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
706    }
707
708    return crc;
709}
710
711//////////////////////////////////////////////////////////////////////////
712/// Add byte offset to any-type pointer
713//////////////////////////////////////////////////////////////////////////
714template <typename T>
715INLINE
716static T* PtrAdd(T* p, intptr_t offset)
717{
718    intptr_t intp = reinterpret_cast<intptr_t>(p);
719    return reinterpret_cast<T*>(intp + offset);
720}
721
722//////////////////////////////////////////////////////////////////////////
723/// Is a power-of-2?
724//////////////////////////////////////////////////////////////////////////
725template <typename T>
726INLINE
727static bool IsPow2(T value)
728{
729    return value == (value & (0 - value));
730}
731
732//////////////////////////////////////////////////////////////////////////
733/// Align down to specified alignment
734/// Note: IsPow2(alignment) MUST be true
735//////////////////////////////////////////////////////////////////////////
736template <typename T1, typename T2>
737INLINE
738static T1 AlignDownPow2(T1 value, T2 alignment)
739{
740    SWR_ASSERT(IsPow2(alignment));
741    return value & ~T1(alignment - 1);
742}
743
744//////////////////////////////////////////////////////////////////////////
745/// Align up to specified alignment
746/// Note: IsPow2(alignment) MUST be true
747//////////////////////////////////////////////////////////////////////////
748template <typename T1, typename T2>
749INLINE
750static T1 AlignUpPow2(T1 value, T2 alignment)
751{
752    return AlignDownPow2(value + T1(alignment - 1), alignment);
753}
754
755//////////////////////////////////////////////////////////////////////////
756/// Align up ptr to specified alignment
757/// Note: IsPow2(alignment) MUST be true
758//////////////////////////////////////////////////////////////////////////
759template <typename T1, typename T2>
760INLINE
761static T1* AlignUpPow2(T1* value, T2 alignment)
762{
763    return reinterpret_cast<T1*>(
764        AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
765}
766
767//////////////////////////////////////////////////////////////////////////
768/// Align down to specified alignment
769//////////////////////////////////////////////////////////////////////////
770template <typename T1, typename T2>
771INLINE
772static T1 AlignDown(T1 value, T2 alignment)
773{
774    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
775    return value - T1(value % alignment);
776}
777
778//////////////////////////////////////////////////////////////////////////
779/// Align down to specified alignment
780//////////////////////////////////////////////////////////////////////////
781template <typename T1, typename T2>
782INLINE
783static T1* AlignDown(T1* value, T2 alignment)
784{
785    return (T1*)AlignDown(uintptr_t(value), alignment);
786}
787
788//////////////////////////////////////////////////////////////////////////
789/// Align up to specified alignment
/// Note: alignment need not be a power of 2 (AlignDown falls back to a modulo)
791//////////////////////////////////////////////////////////////////////////
792template <typename T1, typename T2>
793INLINE
794static T1 AlignUp(T1 value, T2 alignment)
795{
796    return AlignDown(value + T1(alignment - 1), alignment);
797}
798
799//////////////////////////////////////////////////////////////////////////
/// Align up ptr to specified alignment
/// Note: alignment need not be a power of 2 (AlignDown falls back to a modulo)
802//////////////////////////////////////////////////////////////////////////
803template <typename T1, typename T2>
804INLINE
805static T1* AlignUp(T1* value, T2 alignment)
806{
807    return AlignDown(PtrAdd(value, alignment - 1), alignment);
808}
809
810//////////////////////////////////////////////////////////////////////////
811/// Helper structure used to access an array of elements that don't
812/// correspond to a typical word size.
813//////////////////////////////////////////////////////////////////////////
/// Fixed-length array of ArrayLenT elements of BitsPerElementT bits each,
/// packed into size_t words. Storage is zero-initialised and this class
/// only exposes read access.
/// @tparam T - type returned for an element
/// @tparam BitsPerElementT - element width in bits; must divide the word
///         size exactly (a full word per element is allowed)
/// @tparam ArrayLenT - number of elements
template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
class BitsArray
{
private:
    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;

    // Shifting a size_t by its full width is undefined, so an element that
    // fills the whole word gets an all-ones mask directly. The modulo keeps
    // the shift count in range even in the branch that is not selected.
    static const size_t ELEMENT_MASK = (BitsPerElementT < BITS_PER_WORD)
        ? ((size_t(1) << (BitsPerElementT % BITS_PER_WORD)) - 1)
        : ~size_t(0);

    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
        "Element size must an integral fraction of pointer size");

    size_t              m_words[NUM_WORDS] = {};

public:

    /// @brief Reads one element.
    /// @param elementIndex - element to read; not range-checked
    /// @return the element's bits converted to T
    T operator[] (size_t elementIndex) const
    {
        // locate the word, shift the element down to bit 0, drop the rest
        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
        return T(word & ELEMENT_MASK);
    }
};
837