1/****************************************************************************
2* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file StoreTile.h
24*
25* @brief Functionality for Store.
26*
27******************************************************************************/
28#pragma once
29
30#include "common/os.h"
31#include "common/formats.h"
32#include "core/context.h"
33#include "core/rdtsc_core.h"
34#include "core/format_conversion.h"
35
36#include "memory/TilingFunctions.h"
37#include "memory/Convert.h"
38#include "core/multisample.h"
39
40#include <array>
41#include <sstream>
42
43// Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
44typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
45
46//////////////////////////////////////////////////////////////////////////
47/// Store Raster Tile Function Tables.
48//////////////////////////////////////////////////////////////////////////
49extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
50extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
51extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
52
53void InitStoreTilesTable_Linear_1();
54void InitStoreTilesTable_Linear_2();
55void InitStoreTilesTable_TileX_1();
56void InitStoreTilesTable_TileX_2();
57void InitStoreTilesTable_TileY_1();
58void InitStoreTilesTable_TileY_2();
59void InitStoreTilesTable_TileW();
60void InitStoreTilesTable();
61
//////////////////////////////////////////////////////////////////////////
/// StorePixels (primary template)
/// @brief Copies one SIMD raster tile, already in AOS form, out to a set
///        of destination row pointers.  The primary template only
///        declares the interface; the real work is done by the explicit
///        specializations that follow.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam PixelSize - Pixel size selector; the converters in this file
///                     pass FormatTraits<...>::bpp.
/// @tparam NumDests - Number of destination pointers.  Each pair of
///                    pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Deleted on purpose: a PixelSize/NumDests pair without an explicit
    // specialization fails at compile time instead of storing nothing.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
76
77//////////////////////////////////////////////////////////////////////////
78/// StorePixels (32-bit pixel specialization)
79/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
80/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
81/// @param ppDsts   - Array of destination pointers.  Each pointer is
82///                   to a single row of at most 16B.
83/// @tparam NumDests - Number of destination pointers.  Each pair of
84///                    pointers is for a 16-byte column of two rows.
85//////////////////////////////////////////////////////////////////////////
86template <>
87struct StorePixels<8, 2>
88{
89    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
90    {
91        // Each 4-pixel row is 4 bytes.
92        const uint16_t* pPixSrc = (const uint16_t*)pSrc;
93
94        // Unswizzle from SWR-Z order
95        uint16_t* pRow = (uint16_t*)ppDsts[0];
96        pRow[0] = pPixSrc[0];
97        pRow[1] = pPixSrc[2];
98
99        pRow = (uint16_t*)ppDsts[1];
100        pRow[0] = pPixSrc[1];
101        pRow[1] = pPixSrc[3];
102    }
103};
104
105#if USE_8x2_TILE_BACKEND
106template <>
107struct StorePixels<8, 4>
108{
109    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
110    {
111        // 8 x 2 bytes = 16 bytes, 16 pixels
112        const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
113
114        uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
115
116        // Unswizzle from SWR-Z order
117        ppDsts16[0][0] = pSrc16[0];     // 0 1
118        ppDsts16[0][1] = pSrc16[2];     // 4 5
119
120        ppDsts16[1][0] = pSrc16[1];     // 2 3
121        ppDsts16[1][1] = pSrc16[3];     // 6 7
122
123        ppDsts16[2][0] = pSrc16[4];     // 8 9
124        ppDsts16[2][1] = pSrc16[6];     // C D
125
126        ppDsts16[3][0] = pSrc16[5];     // A B
127        ppDsts16[3][1] = pSrc16[7];     // E F
128    }
129};
130
131#endif
132//////////////////////////////////////////////////////////////////////////
133/// StorePixels (32-bit pixel specialization)
134/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
135/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
136/// @param ppDsts   - Array of destination pointers.  Each pointer is
137///                   to a single row of at most 16B.
138/// @tparam NumDests - Number of destination pointers.  Each pair of
139///                    pointers is for a 16-byte column of two rows.
140//////////////////////////////////////////////////////////////////////////
141template <>
142struct StorePixels<16, 2>
143{
144    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
145    {
146        // Each 4-pixel row is 8 bytes.
147        const uint32_t* pPixSrc = (const uint32_t*)pSrc;
148
149        // Unswizzle from SWR-Z order
150        uint32_t* pRow = (uint32_t*)ppDsts[0];
151        pRow[0] = pPixSrc[0];
152        pRow[1] = pPixSrc[2];
153
154        pRow = (uint32_t*)ppDsts[1];
155        pRow[0] = pPixSrc[1];
156        pRow[1] = pPixSrc[3];
157    }
158};
159
160#if USE_8x2_TILE_BACKEND
161template <>
162struct StorePixels<16, 4>
163{
164    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
165    {
166        // 8 x 4 bytes = 32 bytes, 16 pixels
167        const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
168
169        uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
170
171        // Unswizzle from SWR-Z order
172        ppDsts32[0][0] = pSrc32[0];     // 0 1
173        ppDsts32[0][1] = pSrc32[2];     // 4 5
174
175        ppDsts32[1][0] = pSrc32[1];     // 2 3
176        ppDsts32[1][1] = pSrc32[3];     // 6 7
177
178        ppDsts32[2][0] = pSrc32[4];     // 8 9
179        ppDsts32[2][1] = pSrc32[6];     // C D
180
181        ppDsts32[3][0] = pSrc32[5];     // A B
182        ppDsts32[3][1] = pSrc32[7];     // E F
183    }
184};
185
186#endif
187//////////////////////////////////////////////////////////////////////////
188/// StorePixels (32-bit pixel specialization)
189/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
190/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
191/// @param ppDsts   - Array of destination pointers.  Each pointer is
192///                   to a single row of at most 16B.
193/// @tparam NumDests - Number of destination pointers.  Each pair of
194///                    pointers is for a 16-byte column of two rows.
195//////////////////////////////////////////////////////////////////////////
196template <>
197struct StorePixels<32, 2>
198{
199    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
200    {
201        // Each 4-pixel row is 16-bytes
202        __m128i *pZRow01 = (__m128i*)pSrc;
203        __m128i vQuad00 = _mm_load_si128(pZRow01);
204        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
205
206        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
207        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
208
209        _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
210        _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
211    }
212};
213
214#if USE_8x2_TILE_BACKEND
215template <>
216struct StorePixels<32, 4>
217{
218    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
219    {
220        // 4 x 16 bytes = 64 bytes, 16 pixels
221        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
222
223        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
224
225        // Unswizzle from SWR-Z order
226        __m128i quad0 = _mm_load_si128(&pSrc128[0]);                        // 0 1 2 3
227        __m128i quad1 = _mm_load_si128(&pSrc128[1]);                        // 4 5 6 7
228        __m128i quad2 = _mm_load_si128(&pSrc128[2]);                        // 8 9 A B
229        __m128i quad3 = _mm_load_si128(&pSrc128[3]);                        // C D E F
230
231        _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1));   // 0 1 4 5
232        _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1));   // 2 3 6 7
233        _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3));   // 8 9 C D
234        _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3));   // A B E F
235    }
236};
237
238#endif
239//////////////////////////////////////////////////////////////////////////
240/// StorePixels (32-bit pixel specialization)
241/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
242/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
243/// @param ppDsts   - Array of destination pointers.  Each pointer is
244///                   to a single row of at most 16B.
245/// @tparam NumDests - Number of destination pointers.  Each pair of
246///                    pointers is for a 16-byte column of two rows.
247//////////////////////////////////////////////////////////////////////////
248template <>
249struct StorePixels<64, 4>
250{
251    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
252    {
253        // Each 4-pixel row is 32 bytes.
254        const __m128i* pPixSrc = (const __m128i*)pSrc;
255
256        // order of pointers match SWR-Z layout
257        __m128i** pvDsts = (__m128i**)&ppDsts[0];
258        *pvDsts[0] = pPixSrc[0];
259        *pvDsts[1] = pPixSrc[1];
260        *pvDsts[2] = pPixSrc[2];
261        *pvDsts[3] = pPixSrc[3];
262    }
263};
264
265#if USE_8x2_TILE_BACKEND
266template <>
267struct StorePixels<64, 8>
268{
269    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
270    {
271        // 8 x 16 bytes = 128 bytes, 16 pixels
272        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
273
274        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
275
276        // order of pointers match SWR-Z layout
277        *ppDsts128[0] = pSrc128[0];     // 0 1
278        *ppDsts128[1] = pSrc128[1];     // 2 3
279        *ppDsts128[2] = pSrc128[2];     // 4 5
280        *ppDsts128[3] = pSrc128[3];     // 6 7
281        *ppDsts128[4] = pSrc128[4];     // 8 9
282        *ppDsts128[5] = pSrc128[5];     // A B
283        *ppDsts128[6] = pSrc128[6];     // C D
284        *ppDsts128[7] = pSrc128[7];     // E F
285    }
286};
287
288#endif
289//////////////////////////////////////////////////////////////////////////
290/// StorePixels (32-bit pixel specialization)
291/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
292/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
293/// @param ppDsts   - Array of destination pointers.  Each pointer is
294///                   to a single row of at most 16B.
295/// @tparam NumDests - Number of destination pointers.  Each pair of
296///                    pointers is for a 16-byte column of two rows.
297//////////////////////////////////////////////////////////////////////////
298template <>
299struct StorePixels<128, 8>
300{
301    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
302    {
303        // Each 4-pixel row is 64 bytes.
304        const __m128i* pPixSrc = (const __m128i*)pSrc;
305
306        // Unswizzle from SWR-Z order
307        __m128i** pvDsts = (__m128i**)&ppDsts[0];
308        *pvDsts[0] = pPixSrc[0];
309        *pvDsts[1] = pPixSrc[2];
310        *pvDsts[2] = pPixSrc[1];
311        *pvDsts[3] = pPixSrc[3];
312        *pvDsts[4] = pPixSrc[4];
313        *pvDsts[5] = pPixSrc[6];
314        *pvDsts[6] = pPixSrc[5];
315        *pvDsts[7] = pPixSrc[7];
316    }
317};
318
319#if USE_8x2_TILE_BACKEND
320template <>
321struct StorePixels<128, 16>
322{
323    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
324    {
325        // 16 x 16 bytes = 256 bytes, 16 pixels
326        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
327
328        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
329
330        for (uint32_t i = 0; i < 16; i += 4)
331        {
332            *ppDsts128[i + 0] = pSrc128[i + 0];
333            *ppDsts128[i + 1] = pSrc128[i + 2];
334            *ppDsts128[i + 2] = pSrc128[i + 1];
335            *ppDsts128[i + 3] = pSrc128[i + 3];
336        }
337    }
338};
339
340#endif
341//////////////////////////////////////////////////////////////////////////
342/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
343//////////////////////////////////////////////////////////////////////////
344template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
345struct ConvertPixelsSOAtoAOS
346{
347    //////////////////////////////////////////////////////////////////////////
348    /// @brief Converts a SIMD from the Hot Tile to the destination format
349    ///        and converts from SOA to AOS.
350    /// @param pSrc - Pointer to raster tile.
351    /// @param pDst - Pointer to destination surface or deswizzling buffer.
352    template <size_t NumDests>
353    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
354    {
355#if USE_8x2_TILE_BACKEND
356        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
357
358        OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
359        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
360
361        // Convert from SrcFormat --> DstFormat
362        simd16vector src;
363        LoadSOA<SrcFormat>(pSrc, src);
364        StoreSOA<DstFormat>(src, soaTile);
365
366        // Convert from SOA --> AOS
367        FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
368
369#else
370        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
371
372        OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
373        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
374
375        // Convert from SrcFormat --> DstFormat
376        simdvector src;
377        LoadSOA<SrcFormat>(pSrc, src);
378        StoreSOA<DstFormat>(src, soaTile);
379
380        // Convert from SOA --> AOS
381        FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
382
383#endif
384        // Store data into destination
385        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
386    }
387};
388
389//////////////////////////////////////////////////////////////////////////
390/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
391/// Specialization for no format conversion
392//////////////////////////////////////////////////////////////////////////
393template<SWR_FORMAT Format>
394struct ConvertPixelsSOAtoAOS<Format, Format>
395{
396    //////////////////////////////////////////////////////////////////////////
397    /// @brief Converts a SIMD from the Hot Tile to the destination format
398    ///        and converts from SOA to AOS.
399    /// @param pSrc - Pointer to raster tile.
400    /// @param pDst - Pointer to destination surface or deswizzling buffer.
401    template <size_t NumDests>
402    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
403    {
404#if USE_8x2_TILE_BACKEND
405        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
406
407        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
408
409        // Convert from SOA --> AOS
410        FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
411
412#else
413        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
414
415        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
416
417        // Convert from SOA --> AOS
418        FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
419
420#endif
421        // Store data into destination
422        StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
423    }
424};
425
426//////////////////////////////////////////////////////////////////////////
427/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
428//////////////////////////////////////////////////////////////////////////
429template<>
430struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
431{
432    //////////////////////////////////////////////////////////////////////////
433    /// @brief Converts a SIMD from the Hot Tile to the destination format
434    ///        and converts from SOA to AOS.
435    /// @param pSrc - Pointer to raster tile.
436    /// @param pDst - Pointer to destination surface or deswizzling buffer.
437    template <size_t NumDests>
438    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
439    {
440#if USE_8x2_TILE_BACKEND
441        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
442        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
443
444        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
445
446        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
447
448        // Load hot-tile
449        simd16vector src, dst;
450        LoadSOA<SrcFormat>(pSrc, src);
451
452        // deswizzle
453        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
454        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
455        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
456
457        // clamp
458        dst.x = Clamp<DstFormat>(dst.x, 0);
459        dst.y = Clamp<DstFormat>(dst.y, 1);
460        dst.z = Clamp<DstFormat>(dst.z, 2);
461
462        // normalize
463        dst.x = Normalize<DstFormat>(dst.x, 0);
464        dst.y = Normalize<DstFormat>(dst.y, 1);
465        dst.z = Normalize<DstFormat>(dst.z, 2);
466
467        // pack
468        simd16scalari packed = _simd16_castps_si(dst.x);
469
470        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
471        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
472
473        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
474        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
475
476        // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477        uint32_t *pPacked = (uint32_t*)&packed;
478        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
479        for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
480        {
481            *pAosTile++ = *pPacked++;
482        }
483
484#else
485        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
486        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
487        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
488
489        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
490
491        // Load hot-tile
492        simdvector src, dst;
493        LoadSOA<SrcFormat>(pSrc, src);
494
495        // deswizzle
496        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
497        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
498        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
499
500        // clamp
501        dst.x = Clamp<DstFormat>(dst.x, 0);
502        dst.y = Clamp<DstFormat>(dst.y, 1);
503        dst.z = Clamp<DstFormat>(dst.z, 2);
504
505        // normalize
506        dst.x = Normalize<DstFormat>(dst.x, 0);
507        dst.y = Normalize<DstFormat>(dst.y, 1);
508        dst.z = Normalize<DstFormat>(dst.z, 2);
509
510        // pack
511        simdscalari packed = _simd_castps_si(dst.x);
512        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
513        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
514                                                                              FormatTraits<DstFormat>::GetBPC(1)));
515
516        // pack low 16 bits of each 32 bit lane to low 128 bits of dst
517        uint32_t *pPacked = (uint32_t*)&packed;
518        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
519        for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
520        {
521            *pAosTile++ = *pPacked++;
522        }
523
524#endif
525        // Store data into destination
526        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
527    }
528};
529
530//////////////////////////////////////////////////////////////////////////
531/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
532//////////////////////////////////////////////////////////////////////////
533template<>
534struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
535{
536    static const SWR_FORMAT SrcFormat = R32_FLOAT;
537    static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
538
539    //////////////////////////////////////////////////////////////////////////
540    /// @brief Converts a SIMD from the Hot Tile to the destination format
541    ///        and converts from SOA to AOS.
542    /// @param pSrc - Pointer to raster tile.
543    /// @param pDst - Pointer to destination surface or deswizzling buffer.
544    template <size_t NumDests>
545    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
546    {
547#if USE_8x2_TILE_BACKEND
548        simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
549
550        // clamp
551        const simd16scalar zero = _simd16_setzero_ps();
552        const simd16scalar ones = _simd16_set1_ps(1.0f);
553
554        comp = _simd16_max_ps(comp, zero);
555        comp = _simd16_min_ps(comp, ones);
556
557        // normalize
558        comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
559
560        simd16scalari temp = _simd16_cvtps_epi32(comp);
561
562        // swizzle
563        temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
564
565        // merge/store data into destination but don't overwrite the X8 bits
566        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
567        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
568
569        simd16scalari dest = _simd16_setzero_si();
570
571        dest = _simd16_insert_si(dest, destlo, 0);
572        dest = _simd16_insert_si(dest, desthi, 1);
573
574        simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
575
576        dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
577
578        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
579        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
580#else
581        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
582
583        OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
584        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
585
586        // Convert from SrcFormat --> DstFormat
587        simdvector src;
588        LoadSOA<SrcFormat>(pSrc, src);
589        StoreSOA<DstFormat>(src, soaTile);
590
591        // Convert from SOA --> AOS
592        FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
593
594        // Store data into destination but don't overwrite the X8 bits
595        // Each 4-pixel row is 16-bytes
596        __m128i *pZRow01 = (__m128i*)aosTile;
597        __m128i vQuad00 = _mm_load_si128(pZRow01);
598        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
599
600        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
601        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
602
603        __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
604        __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
605
606        __m128i vMask = _mm_set1_epi32(0xFFFFFF);
607
608        vDst0 = _mm_andnot_si128(vMask, vDst0);
609        vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
610        vDst1 = _mm_andnot_si128(vMask, vDst1);
611        vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
612
613        _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
614        _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
615#endif
616    }
617};
618
619#if USE_8x2_TILE_BACKEND
620template<SWR_FORMAT DstFormat>
621INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
622{
623    // swizzle rgba -> bgra while we load
624    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
625    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
626    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
627    simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
628
629    // clamp
630    const simd16scalar zero = _simd16_setzero_ps();
631    const simd16scalar ones = _simd16_set1_ps(1.0f);
632
633    comp0 = _simd16_max_ps(comp0, zero);
634    comp0 = _simd16_min_ps(comp0, ones);
635
636    comp1 = _simd16_max_ps(comp1, zero);
637    comp1 = _simd16_min_ps(comp1, ones);
638
639    comp2 = _simd16_max_ps(comp2, zero);
640    comp2 = _simd16_min_ps(comp2, ones);
641
642    comp3 = _simd16_max_ps(comp3, zero);
643    comp3 = _simd16_min_ps(comp3, ones);
644
645    // gamma-correct only rgb
646    if (FormatTraits<DstFormat>::isSRGB)
647    {
648        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
649        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
650        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
651    }
652
653    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
654    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
655    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
656    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
657    comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
658
659    // moving to 16 wide integer vector types
660    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
661    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
662    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
663    simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
664
665    // SOA to AOS conversion
666    src1 = _simd16_slli_epi32(src1,  8);
667    src2 = _simd16_slli_epi32(src2, 16);
668    src3 = _simd16_slli_epi32(src3, 24);
669
670    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
671
672    // de-swizzle conversion
673#if 1
674    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
675    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
676
677    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
678
679#else
680    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
681
682#endif
683    // store 8x2 memory order:
684    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
685    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
686    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
687    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
688}
689
690#endif
691template<SWR_FORMAT DstFormat>
692INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
693{
694    static const uint32_t offset = sizeof(simdscalar);
695
696    // swizzle rgba -> bgra while we load
697    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
698    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
699    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
700    simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
701
702    // clamp
703    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
704    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
705
706    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
707    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
708
709    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
710    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
711
712    vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
713    vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
714
715    if (FormatTraits<DstFormat>::isSRGB)
716    {
717        // Gamma-correct only rgb
718        vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
719        vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
720        vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
721    }
722
723    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
724    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
725    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
726    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
727    vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
728
729    // moving to 8 wide integer vector types
730    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
731    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
732    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
733    __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
734
735#if KNOB_ARCH == KNOB_ARCH_AVX
736
737    // splitting into two sets of 4 wide integer vector types
738    // because AVX doesn't have instructions to support this operation at 8 wide
739    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
740    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
741    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
742    __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
743
744    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
745    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
746    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
747    __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
748
749    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
750    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
751    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
752    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
753    srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
754    srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
755
756    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
757    srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
758
759    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
760    srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
761
762    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
763    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
764
765    // unpack into rows that get the tiling order correct
766    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
767    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
768
769    __m256i final = _mm256_castsi128_si256(vRow00);
770    final = _mm256_insertf128_si256(final, vRow10, 1);
771
772#elif KNOB_ARCH >= KNOB_ARCH_AVX2
773
774    // logic is as above, only wider
775    src1 = _mm256_slli_si256(src1, 1);
776    src2 = _mm256_slli_si256(src2, 2);
777    src3 = _mm256_slli_si256(src3, 3);
778
779    src0 = _mm256_or_si256(src0, src1);
780    src2 = _mm256_or_si256(src2, src3);
781
782    __m256i final = _mm256_or_si256(src0, src2);
783#if 0
784
785    __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
786
787    final = _mm256_permutevar8x32_epi32(final, perm);
788#else
789
790    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
791    final = _mm256_permute4x64_epi64(final, 0xD8);
792#endif
793#endif
794
795    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
796}
797
798#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief Converts one simd16 group of SOA float color into packed AOS pixels
///        with 8 bits per component and no alpha (8x2 tile backend variant).
///
/// Three float component planes are loaded, clamped to [0, 1], optionally
/// sRGB-converted, scaled by FormatTraits<DstFormat>::fromFloat, converted to
/// integers and merged into one 32-bit value per pixel. Only three components
/// are loaded and OR'ed together; nothing is merged into bits 24..31.
///
/// @param pSrc  - Source SOA data; component planes are sizeof(simd16scalar) bytes apart.
///                NOTE(review): read with _simd16_load_ps, which presumably requires
///                simd16 alignment - confirm against the caller.
/// @param pDst0 - Destination for pixels { 0 1 4 5 } (row 0, first half), per the store diagram below.
/// @param pDst1 - Destination for pixels { 2 3 6 7 } (row 1, first half).
/// @param pDst2 - Destination for pixels { 8 9 C D } (row 0, second half).
/// @param pDst3 - Destination for pixels { A B E F } (row 1, second half).
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
{
    // Load the three component planes. The plane used for output component i is chosen by
    // FormatTraits<DstFormat>::swizzle(i), so any channel reordering (e.g. rgba -> bgra) happens here.
    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb

    // clamp every component to the [0.0f, 1.0f] range
    const simd16scalar zero = _simd16_setzero_ps();
    const simd16scalar ones = _simd16_set1_ps(1.0f);

    comp0 = _simd16_max_ps(comp0, zero);
    comp0 = _simd16_min_ps(comp0, ones);

    comp1 = _simd16_max_ps(comp1, zero);
    comp1 = _simd16_min_ps(comp1, ones);

    comp2 = _simd16_max_ps(comp2, zero);
    comp2 = _simd16_min_ps(comp2, ones);

    // gamma-correct only rgb (applied only when the destination format is flagged as sRGB)
    if (FormatTraits<DstFormat>::isSRGB)
    {
        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
    }

    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));

    // moving to 16 wide integer vector types
    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb

    // SOA to AOS conversion: move component 1 up by 8 bits and component 2 up by 16 bits inside
    // each 32-bit lane, then OR the three together (assumes each converted value fits in 8 bits).
    src1 = _simd16_slli_epi32(src1,  8);
    src2 = _simd16_slli_epi32(src2, 16);

    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F

    // de-swizzle conversion: reorder pixels into the order expected by the stores below.
    // The disabled #else branch is an alternative single-permute formulation of the same reordering.
#if 1
    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F

    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F

#else
    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));

#endif
    // store 8x2 memory order:
    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
    // NOTE(review): _simd_storeu2_si presumably takes (hi address, lo address, value) like
    // _mm256_storeu2_m128i - confirm against its definition.
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
}
861
862#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Converts one simd8 group of SOA float color into packed AOS pixels
///        with 8 bits per component and no alpha.
///
/// Three float component planes are loaded, clamped to [0, 1], optionally
/// sRGB-converted, scaled by FormatTraits<DstFormat>::fromFloat, converted to
/// integers and merged into one 32-bit value per pixel. Only three components
/// are loaded and OR'ed together; nothing is merged into the fourth byte.
///
/// @param pSrc  - Source SOA data; component planes are sizeof(simdscalar) bytes apart.
///                NOTE(review): read with _simd_load_ps, which presumably requires
///                simd alignment - confirm against the caller.
/// @param pDst  - Destination for the first 128 bits of the packed result (four pixels).
/// @param pDst1 - Destination for the second 128 bits of the packed result (four pixels).
///                NOTE(review): the pDst/pDst1 split assumes _simd_storeu2_si takes
///                (hi address, lo address, value) like _mm256_storeu2_m128i - confirm.
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
{
    // byte distance between consecutive component planes in the source
    static const uint32_t offset = sizeof(simdscalar);

    // Load the three component planes. The plane used for output component i is chosen by
    // FormatTraits<DstFormat>::swizzle(i), so any channel reordering (e.g. rgba -> bgra) happens here.
    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb

    // clamp every component to the [0.0f, 1.0f] range
    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));

    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));

    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));

    if (FormatTraits<DstFormat>::isSRGB)
    {
        // Gamma-correct only rgb
        vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
        vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
        vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
    }

    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));

    // moving to 8 wide integer vector types
    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb

#if KNOB_ARCH == KNOB_ARCH_AVX

    // splitting into two sets of 4 wide integer vector types
    // because AVX doesn't have instructions to support this operation at 8 wide
    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b

    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b

    // byte-shift each component into its slot within the 32-bit pixel
    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00

    // merge the components of each half
    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr

    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr

    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr

    // unpack into rows that get the tiling order correct
    // (vRow00 = pixels 0 1 4 5, vRow10 = pixels 2 3 6 7)
    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);

    // rebuild a 256-bit value: vRow00 in the low half, vRow10 in the high half
    __m256i final = _mm256_castsi128_si256(vRow00);
    final = _mm256_insertf128_si256(final, vRow10, 1);

#elif KNOB_ARCH >= KNOB_ARCH_AVX2

    // logic is as above, only wider
    src1 = _mm256_slli_si256(src1, 1);
    src2 = _mm256_slli_si256(src2, 2);

    src0 = _mm256_or_si256(src0, src1);

    __m256i final = _mm256_or_si256(src0, src2);

    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
    final = _mm256_permute4x64_epi64(final, 0xD8);

#endif

    // write the two 128-bit halves of the packed result to the two destinations
    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
}
948
949template<>
950struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
951{
952    template <size_t NumDests>
953    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
954    {
955#if USE_8x2_TILE_BACKEND
956        FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
957#else
958        FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
959#endif
960    }
961};
962
963template<>
964struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
965{
966    template <size_t NumDests>
967    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
968    {
969#if USE_8x2_TILE_BACKEND
970        FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
971#else
972        FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
973#endif
974    }
975};
976
977template<>
978struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
979{
980    template <size_t NumDests>
981    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
982    {
983#if USE_8x2_TILE_BACKEND
984        FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
985#else
986        FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
987#endif
988    }
989};
990
991template<>
992struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
993{
994    template <size_t NumDests>
995    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
996    {
997#if USE_8x2_TILE_BACKEND
998        FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
999#else
1000        FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1001#endif
1002    }
1003};
1004
1005template<>
1006struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
1007{
1008    template <size_t NumDests>
1009    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1010    {
1011#if USE_8x2_TILE_BACKEND
1012        FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1013#else
1014        FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1015#endif
1016    }
1017};
1018
1019template<>
1020struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
1021{
1022    template <size_t NumDests>
1023    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1024    {
1025#if USE_8x2_TILE_BACKEND
1026        FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1027#else
1028        FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1029#endif
1030    }
1031};
1032
1033template<>
1034struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
1035{
1036    template <size_t NumDests>
1037    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1038    {
1039#if USE_8x2_TILE_BACKEND
1040        FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1041#else
1042        FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1043#endif
1044    }
1045};
1046
1047template<>
1048struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
1049{
1050    template <size_t NumDests>
1051    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
1052    {
1053#if USE_8x2_TILE_BACKEND
1054        FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1055#else
1056        FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1057#endif
1058    }
1059};
1060
1061//////////////////////////////////////////////////////////////////////////
1062/// StoreRasterTile
1063//////////////////////////////////////////////////////////////////////////
1064template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1065struct StoreRasterTile
1066{
1067    //////////////////////////////////////////////////////////////////////////
1068    /// @brief Retrieve color from hot tile source which is always float.
1069    /// @param pSrc - Pointer to raster tile.
1070    /// @param x, y - Coordinates to raster tile.
1071    /// @param output - output color
1072    INLINE static void GetSwizzledSrcColor(
1073        uint8_t* pSrc,
1074        uint32_t x, uint32_t y,
1075        float outputColor[4])
1076    {
1077#if USE_8x2_TILE_BACKEND
1078        typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1079
1080        SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1081
1082        // Compute which simd tile we're accessing within 8x8 tile.
1083        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1084        uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1085
1086        SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1087
1088        uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1089
1090        pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1091#else
1092        typedef SimdTile<SrcFormat, DstFormat> SimdT;
1093
1094        SimdT* pSrcSimdTiles = (SimdT*)pSrc;
1095
1096        // Compute which simd tile we're accessing within 8x8 tile.
1097        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1098        uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
1099
1100        SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
1101
1102        uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
1103
1104        pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1105#endif
1106    }
1107
1108    //////////////////////////////////////////////////////////////////////////
1109    /// @brief Stores an 8x8 raster tile to the destination surface.
1110    /// @param pSrc - Pointer to raster tile.
1111    /// @param pDstSurface - Destination surface state
1112    /// @param x, y - Coordinates to raster tile.
1113    INLINE static void Store(
1114        uint8_t *pSrc,
1115        SWR_SURFACE_STATE* pDstSurface,
1116        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
1117    {
1118        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1119        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1120
1121        // For each raster tile pixel (rx, ry)
1122        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
1123        {
1124            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
1125            {
1126                // Perform bounds checking.
1127                if (((x + rx) < lodWidth) &&
1128                    ((y + ry) < lodHeight))
1129                {
1130                    float srcColor[4];
1131                    GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
1132
1133                    uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1134                        pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
1135                        sampleNum, pDstSurface->lod, pDstSurface);
1136                    {
1137                        ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
1138                    }
1139                }
1140            }
1141        }
1142    }
1143};
1144
//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - Primary template. It adds no members of its own and
/// inherits everything from StoreRasterTile; partial specializations of
/// this template supply dedicated Store() implementations.
//////////////////////////////////////////////////////////////////////////
template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
{};
1148
1149//////////////////////////////////////////////////////////////////////////
1150/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1151//////////////////////////////////////////////////////////////////////////
1152template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1153struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1154{
1155    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1156    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1157    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1158
1159    //////////////////////////////////////////////////////////////////////////
1160    /// @brief Stores an 8x8 raster tile to the destination surface.
1161    /// @param pSrc - Pointer to raster tile.
1162    /// @param pDstSurface - Destination surface state
1163    /// @param x, y - Coordinates to raster tile.
1164    INLINE static void Store(
1165        uint8_t *pSrc,
1166        SWR_SURFACE_STATE* pDstSurface,
1167        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1168    {
1169        // Punt non-full tiles to generic store
1170        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1171        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1172
1173        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1174        {
1175            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1176        }
1177
1178        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1179            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1180#if USE_8x2_TILE_BACKEND
1181
1182        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1183        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1184
1185        uint8_t* ppDsts[] =
1186        {
1187            pDst,                                           // row 0, col 0
1188            pDst + pDstSurface->pitch,                      // row 1, col 0
1189            pDst + dx / 2,                                  // row 0, col 1
1190            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1191        };
1192
1193        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1194        {
1195            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1196            {
1197                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1198
1199                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1200
1201                ppDsts[0] += dx;
1202                ppDsts[1] += dx;
1203                ppDsts[2] += dx;
1204                ppDsts[3] += dx;
1205            }
1206
1207            ppDsts[0] += dy;
1208            ppDsts[1] += dy;
1209            ppDsts[2] += dy;
1210            ppDsts[3] += dy;
1211        }
1212#else
1213        uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1214
1215        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1216        {
1217            uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1218
1219            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1220            {
1221                // Format conversion and convert from SOA to AOS, and store the rows.
1222                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1223
1224                ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1225                ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1226                pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1227            }
1228
1229            ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1230            ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1231        }
1232#endif
1233    }
1234};
1235
1236//////////////////////////////////////////////////////////////////////////
1237/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1238//////////////////////////////////////////////////////////////////////////
1239template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1240struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1241{
1242    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1243    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1244    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1245
1246    //////////////////////////////////////////////////////////////////////////
1247    /// @brief Stores an 8x8 raster tile to the destination surface.
1248    /// @param pSrc - Pointer to raster tile.
1249    /// @param pDstSurface - Destination surface state
1250    /// @param x, y - Coordinates to raster tile.
1251    INLINE static void Store(
1252        uint8_t *pSrc,
1253        SWR_SURFACE_STATE* pDstSurface,
1254        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1255    {
1256        // Punt non-full tiles to generic store
1257        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1258        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1259
1260        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1261        {
1262            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1263        }
1264
1265        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1266            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1267#if USE_8x2_TILE_BACKEND
1268
1269        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1270        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1271
1272        uint8_t* ppDsts[] =
1273        {
1274            pDst,                                           // row 0, col 0
1275            pDst + pDstSurface->pitch,                      // row 1, col 0
1276            pDst + dx / 2,                                  // row 0, col 1
1277            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1278        };
1279
1280        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1281        {
1282            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1283            {
1284                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1285
1286                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1287
1288                ppDsts[0] += dx;
1289                ppDsts[1] += dx;
1290                ppDsts[2] += dx;
1291                ppDsts[3] += dx;
1292            }
1293
1294            ppDsts[0] += dy;
1295            ppDsts[1] += dy;
1296            ppDsts[2] += dy;
1297            ppDsts[3] += dy;
1298        }
1299#else
1300        uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1301
1302        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1303        {
1304            uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1305
1306            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1307            {
1308                // Format conversion and convert from SOA to AOS, and store the rows.
1309                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1310
1311                ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1312                ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1313                pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1314            }
1315
1316            ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1317            ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1318        }
1319#endif
1320    }
1321};
1322
1323//////////////////////////////////////////////////////////////////////////
1324/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1325//////////////////////////////////////////////////////////////////////////
1326template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1327struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1328{
1329    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1330    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1331    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1332
1333    //////////////////////////////////////////////////////////////////////////
1334    /// @brief Stores an 8x8 raster tile to the destination surface.
1335    /// @param pSrc - Pointer to raster tile.
1336    /// @param pDstSurface - Destination surface state
1337    /// @param x, y - Coordinates to raster tile.
1338    INLINE static void Store(
1339        uint8_t *pSrc,
1340        SWR_SURFACE_STATE* pDstSurface,
1341        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1342    {
1343        // Punt non-full tiles to generic store
1344        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1345        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1346
1347        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1348        {
1349            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1350        }
1351
1352        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1353            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1354#if USE_8x2_TILE_BACKEND
1355
1356        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1357        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1358
1359        uint8_t* ppDsts[] =
1360        {
1361            pDst,                                           // row 0, col 0
1362            pDst + pDstSurface->pitch,                      // row 1, col 0
1363            pDst + dx / 2,                                  // row 0, col 1
1364            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1365        };
1366
1367        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1368        {
1369            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1370            {
1371                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1372
1373                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1374
1375                ppDsts[0] += dx;
1376                ppDsts[1] += dx;
1377                ppDsts[2] += dx;
1378                ppDsts[3] += dx;
1379            }
1380
1381            ppDsts[0] += dy;
1382            ppDsts[1] += dy;
1383            ppDsts[2] += dy;
1384            ppDsts[3] += dy;
1385        }
1386#else
1387        uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1388
1389        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1390        {
1391            uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
1392
1393            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1394            {
1395                // Format conversion and convert from SOA to AOS, and store the rows.
1396                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
1397
1398                ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1399                ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1400                pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
1401            }
1402
1403            ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1404            ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1405        }
1406#endif
1407    }
1408};
1409
1410//////////////////////////////////////////////////////////////////////////
1411/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1412//////////////////////////////////////////////////////////////////////////
1413template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1414struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1415{
1416    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1417    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1418    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1419    static const size_t MAX_DST_COLUMN_BYTES = 16;
1420#if !USE_8x2_TILE_BACKEND
1421    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1422    static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1423#endif
1424
1425    //////////////////////////////////////////////////////////////////////////
1426    /// @brief Stores an 8x8 raster tile to the destination surface.
1427    /// @param pSrc - Pointer to raster tile.
1428    /// @param pDstSurface - Destination surface state
1429    /// @param x, y - Coordinates to raster tile.
1430    INLINE static void Store(
1431        uint8_t *pSrc,
1432        SWR_SURFACE_STATE* pDstSurface,
1433        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1434    {
1435        // Punt non-full tiles to generic store
1436        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1437        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1438
1439        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1440        {
1441            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1442        }
1443
1444        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1445            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1446#if USE_8x2_TILE_BACKEND
1447
1448        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1449        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1450
1451        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1452        static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1453
1454        uint8_t *ppDsts[] =
1455        {
1456            pDst,                                                               // row 0, col 0
1457            pDst + pDstSurface->pitch,                                          // row 1, col 0
1458            pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1459            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1460            pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1461            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1462            pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1463            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3                // row 1, col 3
1464        };
1465
1466        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1467        {
1468            // Raster tile width is same as simd16 tile width
1469            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1470
1471            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1472
1473            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1474
1475            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1476            {
1477                ppDsts[i] += dy;
1478            }
1479        }
1480#else
1481        uint8_t* ppDsts[] =
1482        {
1483            pDst,                                               // row 0, col 0
1484            pDst + pDstSurface->pitch,                          // row 1, col 0
1485            pDst + MAX_DST_COLUMN_BYTES,                        // row 0, col 1
1486            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,   // row 1, col 1
1487        };
1488
1489        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1490        {
1491            uint8_t* ppStartRows[] =
1492            {
1493                ppDsts[0],
1494                ppDsts[1],
1495                ppDsts[2],
1496                ppDsts[3],
1497            };
1498
1499            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1500            {
1501                // Format conversion and convert from SOA to AOS, and store the rows.
1502                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1503
1504                ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1505                ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1506                ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1507                ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1508                pSrc += SRC_COLUMN_BYTES;
1509            }
1510
1511            ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
1512            ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
1513            ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
1514            ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
1515        }
1516#endif
1517    }
1518};
1519
1520//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_NONE specialization for 128bpp
1522//////////////////////////////////////////////////////////////////////////
1523template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1524struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1525{
1526    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1527    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1528    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1529    static const size_t MAX_DST_COLUMN_BYTES = 16;
1530#if !USE_8x2_TILE_BACKEND
1531    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
1532    static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1533#endif
1534
1535    //////////////////////////////////////////////////////////////////////////
1536    /// @brief Stores an 8x8 raster tile to the destination surface.
1537    /// @param pSrc - Pointer to raster tile.
1538    /// @param pDstSurface - Destination surface state
1539    /// @param x, y - Coordinates to raster tile.
1540    INLINE static void Store(
1541        uint8_t *pSrc,
1542        SWR_SURFACE_STATE* pDstSurface,
1543        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1544    {
1545        // Punt non-full tiles to generic store
1546        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1547        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1548
1549        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1550        {
1551            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1552        }
1553
1554        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1555            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1556#if USE_8x2_TILE_BACKEND
1557
1558        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1559        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1560
1561        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1562        static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1563
1564        uint8_t* ppDsts[] =
1565        {
1566            pDst,                                                               // row 0, col 0
1567            pDst + pDstSurface->pitch,                                          // row 1, col 0
1568            pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1569            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1570            pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1571            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1572            pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1573            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,               // row 1, col 3
1574            pDst + MAX_DST_COLUMN_BYTES * 4,                                    // row 0, col 4
1575            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,               // row 1, col 4
1576            pDst + MAX_DST_COLUMN_BYTES * 5,                                    // row 0, col 5
1577            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,               // row 1, col 5
1578            pDst + MAX_DST_COLUMN_BYTES * 6,                                    // row 0, col 6
1579            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,               // row 1, col 6
1580            pDst + MAX_DST_COLUMN_BYTES * 7,                                    // row 0, col 7
1581            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,               // row 1, col 7
1582        };
1583
1584        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1585        {
1586            // Raster tile width is same as simd16 tile width
1587            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1588
1589            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1590
1591            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1592
1593            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1594            {
1595                ppDsts[i] += dy;
1596            }
1597        }
1598#else
1599        struct DstPtrs
1600        {
1601            uint8_t* ppDsts[8];
1602        } ptrs;
1603
1604        // Need 8 pointers, 4 columns of 2 rows each
1605        for (uint32_t y = 0; y < 2; ++y)
1606        {
1607            for (uint32_t x = 0; x < 4; ++x)
1608            {
1609                ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
1610            }
1611        }
1612
1613        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
1614        {
1615            DstPtrs startPtrs = ptrs;
1616
1617            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
1618            {
1619                // Format conversion and convert from SOA to AOS, and store the rows.
1620                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
1621
1622                ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
1623                ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
1624                ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
1625                ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
1626                ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
1627                ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
1628                ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
1629                ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
1630                pSrc += SRC_COLUMN_BYTES;
1631            }
1632
1633            ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
1634            ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
1635            ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
1636            ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
1637            ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
1638            ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
1639            ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
1640            ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
1641        }
1642#endif
1643    }
1644};
1645
1646//////////////////////////////////////////////////////////////////////////
1647/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1648//////////////////////////////////////////////////////////////////////////
1649template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1650struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1651{
1652    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1653    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1654
1655    //////////////////////////////////////////////////////////////////////////
1656    /// @brief Stores an 8x8 raster tile to the destination surface.
1657    /// @param pSrc - Pointer to raster tile.
1658    /// @param pDstSurface - Destination surface state
1659    /// @param x, y - Coordinates to raster tile.
1660    INLINE static void Store(
1661        uint8_t *pSrc,
1662        SWR_SURFACE_STATE* pDstSurface,
1663        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1664    {
1665        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1666
1667        // Punt non-full tiles to generic store
1668        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1669        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1670
1671        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1672        {
1673            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1674        }
1675
1676        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1677        // We can compute the offsets to each column within the raster tile once and increment from these.
1678#if USE_8x2_TILE_BACKEND
1679        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1680        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1681            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1682
1683        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1684
1685        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1686        uint8_t *ppDsts[] =
1687        {
1688            pDst,
1689            pDst + DestRowWidthBytes,
1690            pDst + DestRowWidthBytes / 4,
1691            pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1692        };
1693
1694        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1695        {
1696            // Raster tile width is same as simd16 tile width
1697            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1698
1699            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1700
1701            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1702
1703            ppDsts[0] += dy;
1704            ppDsts[1] += dy;
1705            ppDsts[2] += dy;
1706            ppDsts[3] += dy;
1707        }
1708#else
1709        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1710        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1711            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1712
1713        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1714        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1715
1716        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1717        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1718        {
1719            uint32_t rowOffset = row * DestRowWidthBytes;
1720
1721            uint8_t* pRow = pCol0 + rowOffset;
1722            uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1723
1724            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1725            pSrc += pSrcInc;
1726
1727            ppDsts[0] += DestRowWidthBytes / 4;
1728            ppDsts[1] += DestRowWidthBytes / 4;
1729
1730            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1731            pSrc += pSrcInc;
1732        }
1733#endif
1734    }
1735};
1736
1737//////////////////////////////////////////////////////////////////////////
1738/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1739//////////////////////////////////////////////////////////////////////////
1740template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1741struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1742{
1743    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1744    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1745
1746    //////////////////////////////////////////////////////////////////////////
1747    /// @brief Stores an 8x8 raster tile to the destination surface.
1748    /// @param pSrc - Pointer to raster tile.
1749    /// @param pDstSurface - Destination surface state
1750    /// @param x, y - Coordinates to raster tile.
1751    INLINE static void Store(
1752        uint8_t *pSrc,
1753        SWR_SURFACE_STATE* pDstSurface,
1754        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1755    {
1756        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1757
1758        // Punt non-full tiles to generic store
1759        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1760        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1761
1762        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1763        {
1764            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1765        }
1766
1767        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1768        // We can compute the offsets to each column within the raster tile once and increment from these.
1769#if USE_8x2_TILE_BACKEND
1770        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1771        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1772            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1773
1774        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1775
1776        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1777        uint8_t *ppDsts[] =
1778        {
1779            pDst,
1780            pDst + DestRowWidthBytes,
1781            pDst + DestRowWidthBytes / 2,
1782            pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1783        };
1784
1785        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1786        {
1787            // Raster tile width is same as simd16 tile width
1788            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1789
1790            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1791
1792            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1793
1794            ppDsts[0] += dy;
1795            ppDsts[1] += dy;
1796            ppDsts[2] += dy;
1797            ppDsts[3] += dy;
1798        }
1799#else
1800        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1801        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1802            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1803
1804        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1805        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1806
1807        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1808        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1809        {
1810            uint32_t rowOffset = row * DestRowWidthBytes;
1811
1812            uint8_t* pRow = pCol0 + rowOffset;
1813            uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1814
1815            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1816            pSrc += pSrcInc;
1817
1818            ppDsts[0] += DestRowWidthBytes / 2;
1819            ppDsts[1] += DestRowWidthBytes / 2;
1820
1821            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1822            pSrc += pSrcInc;
1823        }
1824#endif
1825    }
1826};
1827
1828//////////////////////////////////////////////////////////////////////////
1829/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1830//////////////////////////////////////////////////////////////////////////
1831template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1832struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1833{
1834    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1835    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1836    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1837
1838    //////////////////////////////////////////////////////////////////////////
1839    /// @brief Stores an 8x8 raster tile to the destination surface.
1840    /// @param pSrc - Pointer to raster tile.
1841    /// @param pDstSurface - Destination surface state
1842    /// @param x, y - Coordinates to raster tile.
1843    INLINE static void Store(
1844        uint8_t *pSrc,
1845        SWR_SURFACE_STATE* pDstSurface,
1846        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1847    {
1848        static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
1849
1850        // Punt non-full tiles to generic store
1851        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1852        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1853
1854        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1855        {
1856            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1857        }
1858
1859        // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1860        // We can compute the offsets to each column within the raster tile once and increment from these.
1861#if USE_8x2_TILE_BACKEND
1862        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1863            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1864
1865        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1866        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1867
1868        uint8_t* ppDsts[] =
1869        {
1870            pDst,                                           // row 0, col 0
1871            pDst + DestRowWidthBytes,                       // row 1, col 0
1872            pDst + dx / 2,                                  // row 0, col 1
1873            pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
1874        };
1875
1876        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1877        {
1878            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1879            {
1880                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1881
1882                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1883
1884                ppDsts[0] += dx;
1885                ppDsts[1] += dx;
1886                ppDsts[2] += dx;
1887                ppDsts[3] += dx;
1888            }
1889
1890            ppDsts[0] += dy;
1891            ppDsts[1] += dy;
1892            ppDsts[2] += dy;
1893            ppDsts[3] += dy;
1894        }
1895#else
1896        uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1897            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1898        uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
1899
1900        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1901        {
1902            for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
1903            {
1904                uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
1905
1906                uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
1907                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1908
1909                // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1910                pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1911            }
1912
1913            pRow0 += (DestRowWidthBytes * 2);
1914            pRow1 += (DestRowWidthBytes * 2);
1915        }
1916#endif
1917    }
1918};
1919
1920//////////////////////////////////////////////////////////////////////////
1921/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1922//////////////////////////////////////////////////////////////////////////
1923template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1924struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1925{
1926    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1927    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1928
1929    //////////////////////////////////////////////////////////////////////////
1930    /// @brief Stores an 8x8 raster tile to the destination surface.
1931    /// @param pSrc - Pointer to raster tile.
1932    /// @param pDstSurface - Destination surface state
1933    /// @param x, y - Coordinates to raster tile.
1934    INLINE static void Store(
1935        uint8_t *pSrc,
1936        SWR_SURFACE_STATE* pDstSurface,
1937        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1938    {
1939        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1940        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
1941
1942        // Punt non-full tiles to generic store
1943        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1944        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1945
1946        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1947        {
1948            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1949        }
1950
1951        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1952        // We can compute the offsets to each column within the raster tile once and increment from these.
1953#if USE_8x2_TILE_BACKEND
1954        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1955        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1956            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1957
1958        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1959        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1960
1961        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1962        uint8_t *ppDsts[] =
1963        {
1964            pDst,                                           // row 0, col 0
1965            pDst + DestRowWidthBytes,                       // row 1, col 0
1966            pDst + DestColumnBytes,                         // row 0, col 1
1967            pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
1968        };
1969
1970        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1971        {
1972            // Raster tile width is same as simd16 tile width
1973            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1974
1975            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1976
1977            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1978
1979            ppDsts[0] += dy;
1980            ppDsts[1] += dy;
1981            ppDsts[2] += dy;
1982            ppDsts[3] += dy;
1983        }
1984#else
1985        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1986        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1987            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1988
1989        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
1990        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
1991
1992        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1993        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
1994        {
1995            uint32_t rowOffset = row * DestRowWidthBytes;
1996
1997            uint8_t* pRow = pCol0 + rowOffset;
1998            uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
1999
2000            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2001            pSrc += pSrcInc;
2002
2003            ppDsts[0] += DestColumnBytes;
2004            ppDsts[1] += DestColumnBytes;
2005
2006            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2007            pSrc += pSrcInc;
2008        }
2009#endif
2010    }
2011};
2012
2013//////////////////////////////////////////////////////////////////////////
2014/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
2015//////////////////////////////////////////////////////////////////////////
2016template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2017struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
2018{
2019    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2020    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2021
2022    //////////////////////////////////////////////////////////////////////////
2023    /// @brief Stores an 8x8 raster tile to the destination surface.
2024    /// @param pSrc - Pointer to raster tile.
2025    /// @param pDstSurface - Destination surface state
2026    /// @param x, y - Coordinates to raster tile.
2027    INLINE static void Store(
2028        uint8_t *pSrc,
2029        SWR_SURFACE_STATE* pDstSurface,
2030        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2031    {
2032        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
2033        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
2034
2035        // Punt non-full tiles to generic store
2036        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2037        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2038
2039        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2040        {
2041            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2042        }
2043
2044        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2045        // We can compute the offsets to each column within the raster tile once and increment from these.
2046#if USE_8x2_TILE_BACKEND
2047        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2048        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2049            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2050
2051        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2052        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2053
2054        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2055        uint8_t *ppDsts[] =
2056        {
2057            pDst,                                           // row 0, col 0
2058            pDst + DestRowWidthBytes,                       // row 1, col 0
2059            pDst + DestColumnBytes,                         // row 0, col 1
2060            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
2061            pDst + DestColumnBytes * 2,                     // row 0, col 2
2062            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2063            pDst + DestColumnBytes * 3,                     // row 0, col 3
2064            pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
2065        };
2066
2067        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2068        {
2069            // Raster tile width is same as simd16 tile width
2070            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2071
2072            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2073
2074            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2075
2076            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2077            {
2078                ppDsts[i] += dy;
2079            }
2080        }
2081#else
2082        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2083        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2084            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2085        uint8_t* pCol1 = pCol0 + DestColumnBytes;
2086
2087        // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
2088        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
2089        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
2090
2091        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2092        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
2093        {
2094            uint32_t rowOffset = row * DestRowWidthBytes;
2095            uint8_t* ppDsts[] =
2096            {
2097                pCol0 + rowOffset,
2098                pCol0 + rowOffset + DestRowWidthBytes,
2099                pCol1 + rowOffset,
2100                pCol1 + rowOffset + DestRowWidthBytes,
2101            };
2102
2103            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2104            pSrc += pSrcInc;
2105
2106            ppDsts[0] += DestColumnBytes * 2;
2107            ppDsts[1] += DestColumnBytes * 2;
2108            ppDsts[2] += DestColumnBytes * 2;
2109            ppDsts[3] += DestColumnBytes * 2;
2110
2111            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2112            pSrc += pSrcInc;
2113        }
2114#endif
2115    }
2116};
2117
2118//////////////////////////////////////////////////////////////////////////
2119/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
2120//////////////////////////////////////////////////////////////////////////
2121template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2122struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
2123{
2124    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2125#if USE_8x2_TILE_BACKEND
2126    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2127
2128#else
2129    static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2130    static const size_t TILE_Y_ROWS = 32;
2131    static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2132
2133    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2134    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2135    static const size_t MAX_DST_COLUMN_BYTES = 16;
2136
2137    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
2138    static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
2139
2140#endif
2141    //////////////////////////////////////////////////////////////////////////
2142    /// @brief Stores an 8x8 raster tile to the destination surface.
2143    /// @param pSrc - Pointer to raster tile.
2144    /// @param pDstSurface - Destination surface state
2145    /// @param x, y - Coordinates to raster tile.
2146    INLINE static void Store(
2147        uint8_t *pSrc,
2148        SWR_SURFACE_STATE* pDstSurface,
2149        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
2150    {
2151#if USE_8x2_TILE_BACKEND
2152        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
2153        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
2154#endif
2155
2156        // Punt non-full tiles to generic store
2157        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
2158        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
2159
2160        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
2161        {
2162            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
2163        }
2164
2165        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
2166        // We can compute the offsets to each column within the raster tile once and increment from these.
2167#if USE_8x2_TILE_BACKEND
2168        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2169        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2170            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2171
2172        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
2173        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2174
2175        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2176        uint8_t *ppDsts[] =
2177        {
2178            pDst,                                           // row 0, col 0
2179            pDst + DestRowWidthBytes,                       // row 1, col 0
2180            pDst + DestColumnBytes,                         // row 0, col 1
2181            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
2182            pDst + DestColumnBytes * 2,                     // row 0, col 2
2183            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
2184            pDst + DestColumnBytes * 3,                     // row 0, col 3
2185            pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
2186            pDst + DestColumnBytes * 4,                     // row 0, col 4
2187            pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
2188            pDst + DestColumnBytes * 5,                     // row 0, col 5
2189            pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
2190            pDst + DestColumnBytes * 6,                     // row 0, col 6
2191            pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
2192            pDst + DestColumnBytes * 7,                     // row 0, col 7
2193            pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
2194        };
2195
2196        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2197        {
2198            // Raster tile width is same as simd16 tile width
2199            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2200
2201            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2202
2203            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2204
2205            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2206            {
2207                ppDsts[i] += dy;
2208            }
2209        }
2210#else
2211        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2212        uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2213            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2214        struct DstPtrs
2215        {
2216            uint8_t* ppDsts[8];
2217        } ptrs;
2218
2219        // Need 8 pointers, 4 columns of 2 rows each
2220        for (uint32_t y = 0; y < 2; ++y)
2221        {
2222            for (uint32_t x = 0; x < 4; ++x)
2223            {
2224                ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
2225            }
2226        }
2227
2228        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
2229        {
2230            DstPtrs startPtrs = ptrs;
2231
2232            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
2233            {
2234                // Format conversion and convert from SOA to AOS, and store the rows.
2235                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
2236
2237                ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
2238                ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
2239                ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
2240                ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
2241                ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
2242                ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
2243                ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
2244                ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
2245                pSrc += SRC_COLUMN_BYTES;
2246            }
2247
2248            ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
2249            ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
2250            ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
2251            ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
2252            ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
2253            ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
2254            ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
2255            ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
2256        }
2257#endif
2258    }
2259};
2260
2261//////////////////////////////////////////////////////////////////////////
2262/// StoreMacroTile - Stores a macro tile which consists of raster tiles.
2263//////////////////////////////////////////////////////////////////////////
2264template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
2265struct StoreMacroTile
2266{
2267    //////////////////////////////////////////////////////////////////////////
2268    /// @brief Stores a macrotile to the destination surface using safe implementation.
2269    /// @param pSrc - Pointer to macro tile.
2270    /// @param pDstSurface - Destination surface state
2271    /// @param x, y - Coordinates to macro tile
2272    static void StoreGeneric(
2273        uint8_t *pSrcHotTile,
2274        SWR_SURFACE_STATE* pDstSurface,
2275        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2276    {
2277        PFN_STORE_TILES_INTERNAL pfnStore;
2278        pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2279
2280        // Store each raster tile from the hot tile to the destination surface.
2281        for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2282        {
2283            for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2284            {
2285                for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2286                {
2287                    pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2288                    pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2289                }
2290            }
2291        }
2292
2293    }
2294
2295    typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
2296    //////////////////////////////////////////////////////////////////////////
2297    /// @brief Stores a macrotile to the destination surface.
2298    /// @param pSrc - Pointer to macro tile.
2299    /// @param pDstSurface - Destination surface state
2300    /// @param x, y - Coordinates to macro tile
2301    static void Store(
2302        uint8_t *pSrcHotTile,
2303        SWR_SURFACE_STATE* pDstSurface,
2304        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
2305    {
2306        PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
2307
2308        for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2309        {
2310            size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
2311                0,
2312                0,
2313                pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
2314                pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
2315                sampleNum,
2316                pDstSurface->lod,
2317                pDstSurface);
2318
2319            // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
2320            bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
2321                (pDstSurface->bInterleavedSamples);
2322
2323            pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
2324        }
2325
2326        // Store each raster tile from the hot tile to the destination surface.
2327        for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
2328        {
2329            for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
2330            {
2331                for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
2332                {
2333                    pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
2334                    pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
2335                }
2336            }
2337        }
2338    }
2339};
2340
2341//////////////////////////////////////////////////////////////////////////
2342/// InitStoreTilesTable - Helper for setting up the tables.
2343template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2344void InitStoreTilesTableColor_Half1(
2345    PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
2346{
2347    table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
2348    table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
2349    table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
2350    table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
2351    table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
2352    table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
2353    table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
2354    table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
2355    table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
2356    table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
2357    table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
2358    table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
2359    table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
2360    table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
2361    table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
2362    table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
2363    table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
2364    table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
2365    table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
2366    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2367    table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
2368    table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
2369    table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
2370    table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
2371    table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
2372    table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
2373    table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
2374    table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
2375    table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
2376    table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
2377    table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
2378    table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
2379    table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
2380    table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
2381    table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
2382    table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
2383    table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
2384    table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
2385    table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
2386    table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
2387    table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
2388    table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
2389    table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
2390    table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
2391    table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
2392    table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
2393    table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
2394    table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
2395    table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
2396    table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
2397    table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
2398    table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
2399    table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
2400    table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
2401    table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
2402    table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
2403}
2404
2405template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
2406void InitStoreTilesTableColor_Half2(
2407    PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
2408{
2409    table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
2410    table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
2411    table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
2412    table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
2413    table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
2414    table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
2415    table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
2416    table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
2417    table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
2418    table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
2419    table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
2420    table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
2421    table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
2422    table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
2423    table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
2424    table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
2425    table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
2426    table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
2427    table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
2428    table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
2429    table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
2430    table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
2431    table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
2432    table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
2433    table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
2434    table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
2435    table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
2436    table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
2437    table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
2438    table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
2439    table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
2440    table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
2441    table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
2442    table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
2443    table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
2444    table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
2445    table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
2446    table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
2447    table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
2448    table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
2449    table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
2450    table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
2451    table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
2452    table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
2453    table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
2454    table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
2455    table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
2456    table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
2457    table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2458    table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2459    table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2460    table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2461    table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2462    table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2463    table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2464    table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2465    table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2466    table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2467    table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2468    table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2469    table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2470    table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2471    table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2472    table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2473}
2474
2475//////////////////////////////////////////////////////////////////////////
/// InitStoreTilesTableDepth - Helper function template for setting up the depth entries of the store tiles table.
2477template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2478void InitStoreTilesTableDepth(
2479    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2480{
2481   table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2482   table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2483   table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2484   table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2485}
2486
2487template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2488void InitStoreTilesTableStencil(
2489    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2490{
2491    table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2492}
2493