12550b04179614da4c71dbef195d06a7f53273438Tim Rowley/****************************************************************************
22550b04179614da4c71dbef195d06a7f53273438Tim Rowley* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
32550b04179614da4c71dbef195d06a7f53273438Tim Rowley*
42550b04179614da4c71dbef195d06a7f53273438Tim Rowley* Permission is hereby granted, free of charge, to any person obtaining a
52550b04179614da4c71dbef195d06a7f53273438Tim Rowley* copy of this software and associated documentation files (the "Software"),
62550b04179614da4c71dbef195d06a7f53273438Tim Rowley* to deal in the Software without restriction, including without limitation
72550b04179614da4c71dbef195d06a7f53273438Tim Rowley* the rights to use, copy, modify, merge, publish, distribute, sublicense,
82550b04179614da4c71dbef195d06a7f53273438Tim Rowley* and/or sell copies of the Software, and to permit persons to whom the
92550b04179614da4c71dbef195d06a7f53273438Tim Rowley* Software is furnished to do so, subject to the following conditions:
102550b04179614da4c71dbef195d06a7f53273438Tim Rowley*
112550b04179614da4c71dbef195d06a7f53273438Tim Rowley* The above copyright notice and this permission notice (including the next
122550b04179614da4c71dbef195d06a7f53273438Tim Rowley* paragraph) shall be included in all copies or substantial portions of the
132550b04179614da4c71dbef195d06a7f53273438Tim Rowley* Software.
142550b04179614da4c71dbef195d06a7f53273438Tim Rowley*
152550b04179614da4c71dbef195d06a7f53273438Tim Rowley* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
162550b04179614da4c71dbef195d06a7f53273438Tim Rowley* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
172550b04179614da4c71dbef195d06a7f53273438Tim Rowley* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
182550b04179614da4c71dbef195d06a7f53273438Tim Rowley* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
192550b04179614da4c71dbef195d06a7f53273438Tim Rowley* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
202550b04179614da4c71dbef195d06a7f53273438Tim Rowley* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
212550b04179614da4c71dbef195d06a7f53273438Tim Rowley* IN THE SOFTWARE.
222550b04179614da4c71dbef195d06a7f53273438Tim Rowley*
232550b04179614da4c71dbef195d06a7f53273438Tim Rowley* @file StoreTile.h
242550b04179614da4c71dbef195d06a7f53273438Tim Rowley*
252550b04179614da4c71dbef195d06a7f53273438Tim Rowley* @brief Functionality for Store.
262550b04179614da4c71dbef195d06a7f53273438Tim Rowley*
272550b04179614da4c71dbef195d06a7f53273438Tim Rowley******************************************************************************/
282550b04179614da4c71dbef195d06a7f53273438Tim Rowley#pragma once
292550b04179614da4c71dbef195d06a7f53273438Tim Rowley
302550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "common/os.h"
312550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "common/formats.h"
322550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "core/context.h"
332550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "core/rdtsc_core.h"
342550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "core/format_conversion.h"
352550b04179614da4c71dbef195d06a7f53273438Tim Rowley
362550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "memory/TilingFunctions.h"
372550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "memory/Convert.h"
382550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include "core/multisample.h"
392550b04179614da4c71dbef195d06a7f53273438Tim Rowley
402550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include <array>
412550b04179614da4c71dbef195d06a7f53273438Tim Rowley#include <sstream>
422550b04179614da4c71dbef195d06a7f53273438Tim Rowley
432550b04179614da4c71dbef195d06a7f53273438Tim Rowley// Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
442550b04179614da4c71dbef195d06a7f53273438Tim Rowleytypedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
452550b04179614da4c71dbef195d06a7f53273438Tim Rowley
462550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
472550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// Store Raster Tile Function Tables.
482550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
492550b04179614da4c71dbef195d06a7f53273438Tim Rowleyextern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
502550b04179614da4c71dbef195d06a7f53273438Tim Rowleyextern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
512550b04179614da4c71dbef195d06a7f53273438Tim Rowleyextern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
522550b04179614da4c71dbef195d06a7f53273438Tim Rowley
532550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_Linear_1();
542550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_Linear_2();
552550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileX_1();
562550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileX_2();
572550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileY_1();
582550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileY_2();
592550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileW();
602550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable();
612550b04179614da4c71dbef195d06a7f53273438Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Primary template for storing a 4x2 (AVX) raster-tile to its
///        destination rows.  Store() is deleted here, so only the explicit
///        specializations for supported (PixelSize, NumDests) pairs can
///        actually be called.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam PixelSize - Selects the specialization; the specializations use
///                     the pixel size in bits (8, 16, 32, 64, 128).
/// @tparam NumDests  - Number of destination pointers.  Each pair of
///                     pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Intentionally unusable: an unsupported combination fails at compile
    // time instead of silently doing nothing.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
762550b04179614da4c71dbef195d06a7f53273438Tim Rowley
772550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
782550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization)
792550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
802550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
812550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts   - Array of destination pointers.  Each pointer is
822550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                   to a single row of at most 16B.
832550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers.  Each pair of
842550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                    pointers is for a 16-byte column of two rows.
852550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
862550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <>
872550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<8, 2>
882550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
892550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
902550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
912550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Each 4-pixel row is 4 bytes.
922550b04179614da4c71dbef195d06a7f53273438Tim Rowley        const uint16_t* pPixSrc = (const uint16_t*)pSrc;
932550b04179614da4c71dbef195d06a7f53273438Tim Rowley
942550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Unswizzle from SWR-Z order
952550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint16_t* pRow = (uint16_t*)ppDsts[0];
962550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[0] = pPixSrc[0];
972550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[1] = pPixSrc[2];
982550b04179614da4c71dbef195d06a7f53273438Tim Rowley
992550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow = (uint16_t*)ppDsts[1];
1002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[0] = pPixSrc[1];
1012550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[1] = pPixSrc[3];
1022550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
1032550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
1042550b04179614da4c71dbef195d06a7f53273438Tim Rowley
105937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
106937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate <>
107937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleystruct StorePixels<8, 4>
108937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{
109937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
110937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    {
111937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // 8 x 2 bytes = 16 bytes, 16 pixels
112937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
113937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
114937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
115937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
116937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // Unswizzle from SWR-Z order
117937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[0][0] = pSrc16[0];     // 0 1
118937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[0][1] = pSrc16[2];     // 4 5
119937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
120937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[1][0] = pSrc16[1];     // 2 3
121937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[1][1] = pSrc16[3];     // 6 7
122937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
123937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[2][0] = pSrc16[4];     // 8 9
124937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[2][1] = pSrc16[6];     // C D
125937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
126937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[3][0] = pSrc16[5];     // A B
127937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts16[3][1] = pSrc16[7];     // E F
128937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    }
129937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley};
130937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
131937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
1322550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
1332550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization)
1342550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
1352550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
1362550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts   - Array of destination pointers.  Each pointer is
1372550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                   to a single row of at most 16B.
1382550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers.  Each pair of
1392550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                    pointers is for a 16-byte column of two rows.
1402550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
1412550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <>
1422550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<16, 2>
1432550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
1442550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
1452550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
1462550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Each 4-pixel row is 8 bytes.
1472550b04179614da4c71dbef195d06a7f53273438Tim Rowley        const uint32_t* pPixSrc = (const uint32_t*)pSrc;
1482550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1492550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Unswizzle from SWR-Z order
1502550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t* pRow = (uint32_t*)ppDsts[0];
1512550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[0] = pPixSrc[0];
1522550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[1] = pPixSrc[2];
1532550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1542550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow = (uint32_t*)ppDsts[1];
1552550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[0] = pPixSrc[1];
1562550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pRow[1] = pPixSrc[3];
1572550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
1582550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
1592550b04179614da4c71dbef195d06a7f53273438Tim Rowley
160937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
161937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate <>
162937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleystruct StorePixels<16, 4>
163937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{
164937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
165937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    {
166937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // 8 x 4 bytes = 32 bytes, 16 pixels
167937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
168937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
169937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
170937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
171937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // Unswizzle from SWR-Z order
172937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[0][0] = pSrc32[0];     // 0 1
173937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[0][1] = pSrc32[2];     // 4 5
174937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
175937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[1][0] = pSrc32[1];     // 2 3
176937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[1][1] = pSrc32[3];     // 6 7
177937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
178937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[2][0] = pSrc32[4];     // 8 9
179937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[2][1] = pSrc32[6];     // C D
180937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
181937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[3][0] = pSrc32[5];     // A B
182937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        ppDsts32[3][1] = pSrc32[7];     // E F
183937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    }
184937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley};
185937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
186937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
1872550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
1882550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization)
1892550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
1902550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
1912550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts   - Array of destination pointers.  Each pointer is
1922550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                   to a single row of at most 16B.
1932550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers.  Each pair of
1942550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                    pointers is for a 16-byte column of two rows.
1952550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
1962550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <>
1972550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<32, 2>
1982550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
1992550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
2002550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
2012550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Each 4-pixel row is 16-bytes
2022550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i *pZRow01 = (__m128i*)pSrc;
2032550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vQuad00 = _mm_load_si128(pZRow01);
2042550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
2052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
2062550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
2072550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
2082550b04179614da4c71dbef195d06a7f53273438Tim Rowley
2092550b04179614da4c71dbef195d06a7f53273438Tim Rowley        _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
2102550b04179614da4c71dbef195d06a7f53273438Tim Rowley        _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
2112550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
2122550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
2132550b04179614da4c71dbef195d06a7f53273438Tim Rowley
214488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
215488992221056edaf7111f9290afdf216c5e98d62Tim Rowleytemplate <>
216488992221056edaf7111f9290afdf216c5e98d62Tim Rowleystruct StorePixels<32, 4>
217488992221056edaf7111f9290afdf216c5e98d62Tim Rowley{
218488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
219488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
220937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // 4 x 16 bytes = 64 bytes, 16 pixels
221937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
222937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
223937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
224937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
225937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // Unswizzle from SWR-Z order
226937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        __m128i quad0 = _mm_load_si128(&pSrc128[0]);                        // 0 1 2 3
227937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        __m128i quad1 = _mm_load_si128(&pSrc128[1]);                        // 4 5 6 7
228937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        __m128i quad2 = _mm_load_si128(&pSrc128[2]);                        // 8 9 A B
229937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        __m128i quad3 = _mm_load_si128(&pSrc128[3]);                        // C D E F
230937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
231937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1));   // 0 1 4 5
232937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1));   // 2 3 6 7
233937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3));   // 8 9 C D
234937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3));   // A B E F
235488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
236488992221056edaf7111f9290afdf216c5e98d62Tim Rowley};
237488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
238488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
2392550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
2402550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization)
2412550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
2422550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
2432550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts   - Array of destination pointers.  Each pointer is
2442550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                   to a single row of at most 16B.
2452550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers.  Each pair of
2462550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                    pointers is for a 16-byte column of two rows.
2472550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
2482550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <>
2492550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<64, 4>
2502550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
2512550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
2522550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
2532550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Each 4-pixel row is 32 bytes.
2542550b04179614da4c71dbef195d06a7f53273438Tim Rowley        const __m128i* pPixSrc = (const __m128i*)pSrc;
2552550b04179614da4c71dbef195d06a7f53273438Tim Rowley
2562550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // order of pointers match SWR-Z layout
2572550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i** pvDsts = (__m128i**)&ppDsts[0];
2582550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[0] = pPixSrc[0];
2592550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[1] = pPixSrc[1];
2602550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[2] = pPixSrc[2];
2612550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[3] = pPixSrc[3];
2622550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
2632550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
2642550b04179614da4c71dbef195d06a7f53273438Tim Rowley
265937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
266937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate <>
267937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleystruct StorePixels<64, 8>
268937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{
269937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
270937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    {
271937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // 8 x 16 bytes = 128 bytes, 16 pixels
272937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
273937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
274937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
275937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
276937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // order of pointers match SWR-Z layout
277937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[0] = pSrc128[0];     // 0 1
278937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[1] = pSrc128[1];     // 2 3
279937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[2] = pSrc128[2];     // 4 5
280937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[3] = pSrc128[3];     // 6 7
281937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[4] = pSrc128[4];     // 8 9
282937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[5] = pSrc128[5];     // A B
283937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[6] = pSrc128[6];     // C D
284937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        *ppDsts128[7] = pSrc128[7];     // E F
285937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    }
286937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley};
287937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
288937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
2892550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
2902550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization)
2912550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
2922550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
2932550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts   - Array of destination pointers.  Each pointer is
2942550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                   to a single row of at most 16B.
2952550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers.  Each pair of
2962550b04179614da4c71dbef195d06a7f53273438Tim Rowley///                    pointers is for a 16-byte column of two rows.
2972550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
2982550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <>
2992550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<128, 8>
3002550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
3012550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
3022550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
3032550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Each 4-pixel row is 64 bytes.
3042550b04179614da4c71dbef195d06a7f53273438Tim Rowley        const __m128i* pPixSrc = (const __m128i*)pSrc;
3052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
3062550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Unswizzle from SWR-Z order
3072550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i** pvDsts = (__m128i**)&ppDsts[0];
3082550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[0] = pPixSrc[0];
3092550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[1] = pPixSrc[2];
3102550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[2] = pPixSrc[1];
3112550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[3] = pPixSrc[3];
3122550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[4] = pPixSrc[4];
3132550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[5] = pPixSrc[6];
3142550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[6] = pPixSrc[5];
3152550b04179614da4c71dbef195d06a7f53273438Tim Rowley        *pvDsts[7] = pPixSrc[7];
3162550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
3172550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
3182550b04179614da4c71dbef195d06a7f53273438Tim Rowley
319937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
320937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate <>
321937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleystruct StorePixels<128, 16>
322937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{
323937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
324937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    {
325937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // 16 x 16 bytes = 256 bytes, 16 pixels
326937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
327937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
328937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
329937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
330937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t i = 0; i < 16; i += 4)
331937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
332937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            *ppDsts128[i + 0] = pSrc128[i + 0];
333937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            *ppDsts128[i + 1] = pSrc128[i + 2];
334937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            *ppDsts128[i + 2] = pSrc128[i + 1];
335937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            *ppDsts128[i + 3] = pSrc128[i + 3];
336937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
337937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    }
338937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley};
339937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
340937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
3412550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
3422550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
3432550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
3442550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
3452550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS
3462550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
3472550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
3482550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Converts a SIMD from the Hot Tile to the destination format
3492550b04179614da4c71dbef195d06a7f53273438Tim Rowley    ///        and converts from SOA to AOS.
3502550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
3512550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDst - Pointer to destination surface or deswizzling buffer.
3522550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
3532550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
3542550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
355488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
356488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
357488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
358488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
359488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
360488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
361488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        // Convert from SrcFormat --> DstFormat
362488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        simd16vector src;
363488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        LoadSOA<SrcFormat>(pSrc, src);
364488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        StoreSOA<DstFormat>(src, soaTile);
365488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
366488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        // Convert from SOA --> AOS
367488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
368488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
369488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
3702550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
3712550b04179614da4c71dbef195d06a7f53273438Tim Rowley
3722550b04179614da4c71dbef195d06a7f53273438Tim Rowley        OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
3732550b04179614da4c71dbef195d06a7f53273438Tim Rowley        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
3742550b04179614da4c71dbef195d06a7f53273438Tim Rowley
3752550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Convert from SrcFormat --> DstFormat
3762550b04179614da4c71dbef195d06a7f53273438Tim Rowley        simdvector src;
3772550b04179614da4c71dbef195d06a7f53273438Tim Rowley        LoadSOA<SrcFormat>(pSrc, src);
3782550b04179614da4c71dbef195d06a7f53273438Tim Rowley        StoreSOA<DstFormat>(src, soaTile);
3792550b04179614da4c71dbef195d06a7f53273438Tim Rowley
3802550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Convert from SOA --> AOS
3812550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
3822550b04179614da4c71dbef195d06a7f53273438Tim Rowley
383488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
3842550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Store data into destination
3852550b04179614da4c71dbef195d06a7f53273438Tim Rowley        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
3862550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
3872550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
3882550b04179614da4c71dbef195d06a7f53273438Tim Rowley
3892550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
3902550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
3912550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// Specialization for no format conversion
3922550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
3932550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT Format>
3942550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<Format, Format>
3952550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
3962550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
3972550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Converts a SIMD from the Hot Tile to the destination format
3982550b04179614da4c71dbef195d06a7f53273438Tim Rowley    ///        and converts from SOA to AOS.
3992550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
4002550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDst - Pointer to destination surface or deswizzling buffer.
4012550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
4022550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
4032550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
404488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
405488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
406488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
407488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
408488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
409488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        // Convert from SOA --> AOS
410488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
411488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
412488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
4132550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
4142550b04179614da4c71dbef195d06a7f53273438Tim Rowley
4152550b04179614da4c71dbef195d06a7f53273438Tim Rowley        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
4162550b04179614da4c71dbef195d06a7f53273438Tim Rowley
4172550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Convert from SOA --> AOS
4182550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile);
4192550b04179614da4c71dbef195d06a7f53273438Tim Rowley
420488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
4212550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Store data into destination
4222550b04179614da4c71dbef195d06a7f53273438Tim Rowley        StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
4232550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
4242550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
4252550b04179614da4c71dbef195d06a7f53273438Tim Rowley
4262550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R5_UNORM
4282550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
4292550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
4302550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
4312550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
4322550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
4332550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Converts a SIMD from the Hot Tile to the destination format
4342550b04179614da4c71dbef195d06a7f53273438Tim Rowley    ///        and converts from SOA to AOS.
4352550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
4362550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDst - Pointer to destination surface or deswizzling buffer.
4372550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
4382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
4392550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
440937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
441937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
442937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
443937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
444937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
445937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
446937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
447937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
448937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // Load hot-tile
449937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        simd16vector src, dst;
450937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        LoadSOA<SrcFormat>(pSrc, src);
451937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
452937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // deswizzle
453937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
454937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
455937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
456937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
457937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // clamp
458937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.x = Clamp<DstFormat>(dst.x, 0);
459937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.y = Clamp<DstFormat>(dst.y, 1);
460937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.z = Clamp<DstFormat>(dst.z, 2);
461937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
462937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // normalize
463937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.x = Normalize<DstFormat>(dst.x, 0);
464937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.y = Normalize<DstFormat>(dst.y, 1);
465937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        dst.z = Normalize<DstFormat>(dst.z, 2);
466937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
467937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // pack
468937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        simd16scalari packed = _simd16_castps_si(dst.x);
469937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
470937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
471937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
472937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
473937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
474937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
475937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
476937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // pack low 16 bits of each 32 bit lane to low 128 bits of dst
477937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint32_t *pPacked = (uint32_t*)&packed;
478937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
479937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
480937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
481937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            *pAosTile++ = *pPacked++;
482937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
483937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
484937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
4852550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
4862550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
4872550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
4882550b04179614da4c71dbef195d06a7f53273438Tim Rowley
4892550b04179614da4c71dbef195d06a7f53273438Tim Rowley        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
4902550b04179614da4c71dbef195d06a7f53273438Tim Rowley
4912550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Load hot-tile
4922550b04179614da4c71dbef195d06a7f53273438Tim Rowley        simdvector src, dst;
4932550b04179614da4c71dbef195d06a7f53273438Tim Rowley        LoadSOA<SrcFormat>(pSrc, src);
4942550b04179614da4c71dbef195d06a7f53273438Tim Rowley
4952550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // deswizzle
4962550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
4972550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
4982550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
4992550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // clamp
5012550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.x = Clamp<DstFormat>(dst.x, 0);
5022550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.y = Clamp<DstFormat>(dst.y, 1);
5032550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.z = Clamp<DstFormat>(dst.z, 2);
5042550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5052550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // normalize
5062550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.x = Normalize<DstFormat>(dst.x, 0);
5072550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.y = Normalize<DstFormat>(dst.y, 1);
5082550b04179614da4c71dbef195d06a7f53273438Tim Rowley        dst.z = Normalize<DstFormat>(dst.z, 2);
5092550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5102550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // pack
5112550b04179614da4c71dbef195d06a7f53273438Tim Rowley        simdscalari packed = _simd_castps_si(dst.x);
5122550b04179614da4c71dbef195d06a7f53273438Tim Rowley        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
5132550b04179614da4c71dbef195d06a7f53273438Tim Rowley        packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
5142550b04179614da4c71dbef195d06a7f53273438Tim Rowley                                                                              FormatTraits<DstFormat>::GetBPC(1)));
5152550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5162550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // pack low 16 bits of each 32 bit lane to low 128 bits of dst
5172550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t *pPacked = (uint32_t*)&packed;
5182550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint16_t *pAosTile = (uint16_t*)&aosTile[0];
5192550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
5202550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
5212550b04179614da4c71dbef195d06a7f53273438Tim Rowley            *pAosTile++ = *pPacked++;
5222550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
5232550b04179614da4c71dbef195d06a7f53273438Tim Rowley
524937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
5252550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Store data into destination
5262550b04179614da4c71dbef195d06a7f53273438Tim Rowley        StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
5272550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
5282550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
5292550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5302550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Specialization conversion for R32_FLOAT to R24_UNORM_X8_TYPELESS
5322550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
5332550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
5342550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
5352550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
5362550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const SWR_FORMAT SrcFormat = R32_FLOAT;
5372550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
5382550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5392550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
5402550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Converts a SIMD from the Hot Tile to the destination format
5412550b04179614da4c71dbef195d06a7f53273438Tim Rowley    ///        and converts from SOA to AOS.
5422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
5432550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDst - Pointer to destination surface or deswizzling buffer.
5442550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
5452550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
5462550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
547488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
5487aea08667c673713e1f419539e788eedeea047cbTim Rowley        simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
549488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5507aea08667c673713e1f419539e788eedeea047cbTim Rowley        // clamp
5517aea08667c673713e1f419539e788eedeea047cbTim Rowley        const simd16scalar zero = _simd16_setzero_ps();
5527aea08667c673713e1f419539e788eedeea047cbTim Rowley        const simd16scalar ones = _simd16_set1_ps(1.0f);
553488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5547aea08667c673713e1f419539e788eedeea047cbTim Rowley        comp = _simd16_max_ps(comp, zero);
5557aea08667c673713e1f419539e788eedeea047cbTim Rowley        comp = _simd16_min_ps(comp, ones);
556488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5577aea08667c673713e1f419539e788eedeea047cbTim Rowley        // normalize
5587aea08667c673713e1f419539e788eedeea047cbTim Rowley        comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
559937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
5607aea08667c673713e1f419539e788eedeea047cbTim Rowley        simd16scalari temp = _simd16_cvtps_epi32(comp);
561488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5627aea08667c673713e1f419539e788eedeea047cbTim Rowley        // swizzle
5637aea08667c673713e1f419539e788eedeea047cbTim Rowley        temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
564488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5657aea08667c673713e1f419539e788eedeea047cbTim Rowley        // merge/store data into destination but don't overwrite the X8 bits
566937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
567937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
568488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5697aea08667c673713e1f419539e788eedeea047cbTim Rowley        simd16scalari dest = _simd16_setzero_si();
5707aea08667c673713e1f419539e788eedeea047cbTim Rowley
5717aea08667c673713e1f419539e788eedeea047cbTim Rowley        dest = _simd16_insert_si(dest, destlo, 0);
5727aea08667c673713e1f419539e788eedeea047cbTim Rowley        dest = _simd16_insert_si(dest, desthi, 1);
5737aea08667c673713e1f419539e788eedeea047cbTim Rowley
5747aea08667c673713e1f419539e788eedeea047cbTim Rowley        simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
575488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5767aea08667c673713e1f419539e788eedeea047cbTim Rowley        dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
577488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
5787aea08667c673713e1f419539e788eedeea047cbTim Rowley        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
5797aea08667c673713e1f419539e788eedeea047cbTim Rowley        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
580488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
5812550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
5822550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5832550b04179614da4c71dbef195d06a7f53273438Tim Rowley        OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
5842550b04179614da4c71dbef195d06a7f53273438Tim Rowley        OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
5852550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5862550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Convert from SrcFormat --> DstFormat
5872550b04179614da4c71dbef195d06a7f53273438Tim Rowley        simdvector src;
5882550b04179614da4c71dbef195d06a7f53273438Tim Rowley        LoadSOA<SrcFormat>(pSrc, src);
5892550b04179614da4c71dbef195d06a7f53273438Tim Rowley        StoreSOA<DstFormat>(src, soaTile);
5902550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5912550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Convert from SOA --> AOS
5922550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
5932550b04179614da4c71dbef195d06a7f53273438Tim Rowley
5942550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Store data into destination but don't overwrite the X8 bits
5952550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Each 4-pixel row is 16-bytes
5962550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i *pZRow01 = (__m128i*)aosTile;
5972550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vQuad00 = _mm_load_si128(pZRow01);
5982550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
5992550b04179614da4c71dbef195d06a7f53273438Tim Rowley
6002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
6012550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
6022550b04179614da4c71dbef195d06a7f53273438Tim Rowley
6032550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
6042550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
6052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
6062550b04179614da4c71dbef195d06a7f53273438Tim Rowley        __m128i vMask = _mm_set1_epi32(0xFFFFFF);
6072550b04179614da4c71dbef195d06a7f53273438Tim Rowley
6082550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vDst0 = _mm_andnot_si128(vMask, vDst0);
6092550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
6102550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vDst1 = _mm_andnot_si128(vMask, vDst1);
6112550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
6122550b04179614da4c71dbef195d06a7f53273438Tim Rowley
6132550b04179614da4c71dbef195d06a7f53273438Tim Rowley        _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
6142550b04179614da4c71dbef195d06a7f53273438Tim Rowley        _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
615488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
6162550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
6172550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
6182550b04179614da4c71dbef195d06a7f53273438Tim Rowley
619488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
620488992221056edaf7111f9290afdf216c5e98d62Tim Rowleytemplate<SWR_FORMAT DstFormat>
621488992221056edaf7111f9290afdf216c5e98d62Tim RowleyINLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
622488992221056edaf7111f9290afdf216c5e98d62Tim Rowley{
623488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    // swizzle rgba -> bgra while we load
624488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
625488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
626488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
627488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
628488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
629937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // clamp
630488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    const simd16scalar zero = _simd16_setzero_ps();
631488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    const simd16scalar ones = _simd16_set1_ps(1.0f);
632488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
633488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp0 = _simd16_max_ps(comp0, zero);
634488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp0 = _simd16_min_ps(comp0, ones);
635488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
636488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp1 = _simd16_max_ps(comp1, zero);
637488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp1 = _simd16_min_ps(comp1, ones);
638488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
639488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp2 = _simd16_max_ps(comp2, zero);
640488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp2 = _simd16_min_ps(comp2, ones);
641488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
642488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp3 = _simd16_max_ps(comp3, zero);
643488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp3 = _simd16_min_ps(comp3, ones);
644488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
645937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // gamma-correct only rgb
646488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    if (FormatTraits<DstFormat>::isSRGB)
647488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    {
648488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
649488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
650488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
651488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    }
652488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
653937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
654488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
655488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
656488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
657488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
658488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
659488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    // moving to 16 wide integer vector types
660488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
661488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
662488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
663488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
664488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
665937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // SOA to AOS conversion
6667aea08667c673713e1f419539e788eedeea047cbTim Rowley    src1 = _simd16_slli_epi32(src1,  8);
667488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    src2 = _simd16_slli_epi32(src2, 16);
668488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    src3 = _simd16_slli_epi32(src3, 24);
669488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
670488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
671488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
672937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // de-swizzle conversion
673488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if 1
674488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
675488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
676488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
677488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
678488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
679488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
680488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
681488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
682488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
683937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // store 8x2 memory order:
684937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
685937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
686937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
687937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
688488992221056edaf7111f9290afdf216c5e98d62Tim Rowley}
689488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
690488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
6912550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT DstFormat>
6922550b04179614da4c71dbef195d06a7f53273438Tim RowleyINLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
6932550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
6942550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const uint32_t offset = sizeof(simdscalar);
6952550b04179614da4c71dbef195d06a7f53273438Tim Rowley
6962550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // swizzle rgba -> bgra while we load
6972550b04179614da4c71dbef195d06a7f53273438Tim Rowley    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
6982550b04179614da4c71dbef195d06a7f53273438Tim Rowley    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
6992550b04179614da4c71dbef195d06a7f53273438Tim Rowley    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
7002550b04179614da4c71dbef195d06a7f53273438Tim Rowley    simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
7012550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7022550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // clamp
7032550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
7042550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
7052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7062550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
7072550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
7082550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7092550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
7102550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
7112550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7122550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
7132550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
7142550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7152550b04179614da4c71dbef195d06a7f53273438Tim Rowley    if (FormatTraits<DstFormat>::isSRGB)
7162550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
7172550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Gamma-correct only rgb
7182550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
7192550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
7202550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
7212550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
7222550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7232550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
7242550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
7252550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
7262550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
7272550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
7282550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7292550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // moving to 8 wide integer vector types
7302550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
7312550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
7322550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
7332550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
7342550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7352550b04179614da4c71dbef195d06a7f53273438Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX
7362550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7372550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // splitting into two sets of 4 wide integer vector types
7382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // because AVX doesn't have instructions to support this operation at 8 wide
7392550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
7402550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
7412550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
7422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
7432550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7442550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
7452550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
7462550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
7472550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
7482550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7492550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
7502550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
7512550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
7522550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
7532550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
7542550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
7552550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7562550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
7572550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
7582550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7592550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
7602550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
7612550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7622550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
7632550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
7642550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7652550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // unpack into rows that get the tiling order correct
7662550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
7672550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
7682550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7692550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i final = _mm256_castsi128_si256(vRow00);
7702550b04179614da4c71dbef195d06a7f53273438Tim Rowley    final = _mm256_insertf128_si256(final, vRow10, 1);
7712550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7722550b04179614da4c71dbef195d06a7f53273438Tim Rowley#elif KNOB_ARCH >= KNOB_ARCH_AVX2
7732550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7742550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // logic is as above, only wider
7752550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src1 = _mm256_slli_si256(src1, 1);
7762550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src2 = _mm256_slli_si256(src2, 2);
7772550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src3 = _mm256_slli_si256(src3, 3);
7782550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7792550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src0 = _mm256_or_si256(src0, src1);
7802550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src2 = _mm256_or_si256(src2, src3);
7812550b04179614da4c71dbef195d06a7f53273438Tim Rowley
7822550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i final = _mm256_or_si256(src0, src2);
783488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if 0
784488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
785488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
786488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
787488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    final = _mm256_permutevar8x32_epi32(final, perm);
788488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
789488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
7902550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
7912550b04179614da4c71dbef195d06a7f53273438Tim Rowley    final = _mm256_permute4x64_epi64(final, 0xD8);
792488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
7932550b04179614da4c71dbef195d06a7f53273438Tim Rowley#endif
7942550b04179614da4c71dbef195d06a7f53273438Tim Rowley
795937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
796937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley}
797937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
798937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
799937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate<SWR_FORMAT DstFormat>
800937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim RowleyINLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
801937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{
802937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // swizzle rgba -> bgra while we load
803937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
804937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
805937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
806937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
807937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // clamp
808937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    const simd16scalar zero = _simd16_setzero_ps();
809937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    const simd16scalar ones = _simd16_set1_ps(1.0f);
810937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
811937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp0 = _simd16_max_ps(comp0, zero);
812937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp0 = _simd16_min_ps(comp0, ones);
813937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
814937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp1 = _simd16_max_ps(comp1, zero);
815937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp1 = _simd16_min_ps(comp1, ones);
816937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
817937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp2 = _simd16_max_ps(comp2, zero);
818937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp2 = _simd16_min_ps(comp2, ones);
819937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
820937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // gamma-correct only rgb
821937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    if (FormatTraits<DstFormat>::isSRGB)
822937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    {
823937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
824937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
825937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
826937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    }
827937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
828937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
829937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
830937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
831937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
832937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
833937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // moving to 16 wide integer vector types
834937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
835937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
836937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
837937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
838937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // SOA to AOS conversion
8397aea08667c673713e1f419539e788eedeea047cbTim Rowley    src1 = _simd16_slli_epi32(src1,  8);
840937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    src2 = _simd16_slli_epi32(src2, 16);
841937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
842937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F
843937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
844937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // de-swizzle conversion
845937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if 1
846937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
847937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
848937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
849937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
850937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
851937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
852937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
853937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
854937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
855937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    // store 8x2 memory order:
856937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
857937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
858937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
859937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
8602550b04179614da4c71dbef195d06a7f53273438Tim Rowley}
8612550b04179614da4c71dbef195d06a7f53273438Tim Rowley
862937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
8632550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT DstFormat>
8642550b04179614da4c71dbef195d06a7f53273438Tim RowleyINLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
8652550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
8662550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const uint32_t offset = sizeof(simdscalar);
8672550b04179614da4c71dbef195d06a7f53273438Tim Rowley
8682550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // swizzle rgba -> bgra while we load
8692550b04179614da4c71dbef195d06a7f53273438Tim Rowley    simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
8702550b04179614da4c71dbef195d06a7f53273438Tim Rowley    simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
8712550b04179614da4c71dbef195d06a7f53273438Tim Rowley    simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
8722550b04179614da4c71dbef195d06a7f53273438Tim Rowley                                                                                                            // clamp
8732550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
8742550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
8752550b04179614da4c71dbef195d06a7f53273438Tim Rowley
8762550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
8772550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
8782550b04179614da4c71dbef195d06a7f53273438Tim Rowley
8792550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
8802550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
8812550b04179614da4c71dbef195d06a7f53273438Tim Rowley
8822550b04179614da4c71dbef195d06a7f53273438Tim Rowley    if (FormatTraits<DstFormat>::isSRGB)
8832550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
8842550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Gamma-correct only rgb
8852550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
8862550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
8872550b04179614da4c71dbef195d06a7f53273438Tim Rowley        vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
8882550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
8892550b04179614da4c71dbef195d06a7f53273438Tim Rowley
8902550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
8912550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
8922550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
8932550b04179614da4c71dbef195d06a7f53273438Tim Rowley    vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
8942550b04179614da4c71dbef195d06a7f53273438Tim Rowley
8952550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // moving to 8 wide integer vector types
8962550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
8972550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
8982550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
8992550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9002550b04179614da4c71dbef195d06a7f53273438Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX
9012550b04179614da4c71dbef195d06a7f53273438Tim Rowley
90275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley    // splitting into two sets of 4 wide integer vector types
90375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley    // because AVX doesn't have instructions to support this operation at 8 wide
9042550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
9052550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
9062550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
9072550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9082550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
9092550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
9102550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
9112550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9122550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
9132550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
9142550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
9152550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
9162550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9172550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
9182550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9192550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
9202550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9212550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
9222550b04179614da4c71dbef195d06a7f53273438Tim Rowley    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
9232550b04179614da4c71dbef195d06a7f53273438Tim Rowley
92475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley    // unpack into rows that get the tiling order correct
9252550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
9262550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
9272550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9282550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i final = _mm256_castsi128_si256(vRow00);
9292550b04179614da4c71dbef195d06a7f53273438Tim Rowley    final = _mm256_insertf128_si256(final, vRow10, 1);
9302550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9312550b04179614da4c71dbef195d06a7f53273438Tim Rowley#elif KNOB_ARCH >= KNOB_ARCH_AVX2
9322550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9332550b04179614da4c71dbef195d06a7f53273438Tim Rowley                                              // logic is as above, only wider
9342550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src1 = _mm256_slli_si256(src1, 1);
9352550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src2 = _mm256_slli_si256(src2, 2);
9362550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9372550b04179614da4c71dbef195d06a7f53273438Tim Rowley    src0 = _mm256_or_si256(src0, src1);
9382550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9392550b04179614da4c71dbef195d06a7f53273438Tim Rowley    __m256i final = _mm256_or_si256(src0, src2);
9402550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9412550b04179614da4c71dbef195d06a7f53273438Tim Rowley    // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
9422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    final = _mm256_permute4x64_epi64(final, 0xD8);
9432550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9442550b04179614da4c71dbef195d06a7f53273438Tim Rowley#endif
9452550b04179614da4c71dbef195d06a7f53273438Tim Rowley
946937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
9472550b04179614da4c71dbef195d06a7f53273438Tim Rowley}
9482550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9492550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
9502550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
9512550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
9522550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
9532550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
9542550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
955937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
956937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
957937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
9582550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
959937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
9602550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
9612550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
9622550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9632550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
9642550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
9652550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
9662550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
9672550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
9682550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
969937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
970937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
971937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
9722550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
973937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
9742550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
9752550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
9762550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9772550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
9782550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
9792550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
9802550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
9812550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
9822550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
983937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
984937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
985937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
9862550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
987937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
9882550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
9892550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
9902550b04179614da4c71dbef195d06a7f53273438Tim Rowley
9912550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
9922550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
9932550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
9942550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
9952550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
9962550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
997937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
998937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
999937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
10002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1001937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
10022550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
10032550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
10042550b04179614da4c71dbef195d06a7f53273438Tim Rowley
10052550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
10062550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
10072550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
10082550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
10092550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
10102550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
1011488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
1012488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1013488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
10142550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1015488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
10162550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
10172550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
10182550b04179614da4c71dbef195d06a7f53273438Tim Rowley
10192550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
10202550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
10212550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
10222550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
10232550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
10242550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
1025937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
1026937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1027937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
10282550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
1029937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
10302550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
10312550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
10322550b04179614da4c71dbef195d06a7f53273438Tim Rowley
10332550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
10342550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
10352550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
10362550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
10372550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
10382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
1039488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
1040488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1041488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
10422550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1043488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
10442550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
10452550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
10462550b04179614da4c71dbef195d06a7f53273438Tim Rowley
10472550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<>
10482550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
10492550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
10502550b04179614da4c71dbef195d06a7f53273438Tim Rowley    template <size_t NumDests>
10512550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
10522550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
1053937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
1054937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
1055937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
10562550b04179614da4c71dbef195d06a7f53273438Tim Rowley        FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
1057937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
10582550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
10592550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
10602550b04179614da4c71dbef195d06a7f53273438Tim Rowley
10612550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
10622550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StoreRasterTile
10632550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
10642550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
10652550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StoreRasterTile
10662550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
10672550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
10682550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Retrieve color from hot tile source which is always float.
10692550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
10702550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
10712550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param output - output color
10722550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void GetSwizzledSrcColor(
10732550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* pSrc,
10742550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y,
10752550b04179614da4c71dbef195d06a7f53273438Tim Rowley        float outputColor[4])
10762550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
1077488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
1078488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
1079488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1080937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
1081488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1082488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        // Compute which simd tile we're accessing within 8x8 tile.
1083488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
1084488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
1085488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1086937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
1087488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1088488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
1089488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1090488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1091488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
10922550b04179614da4c71dbef195d06a7f53273438Tim Rowley        typedef SimdTile<SrcFormat, DstFormat> SimdT;
10932550b04179614da4c71dbef195d06a7f53273438Tim Rowley
10942550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SimdT* pSrcSimdTiles = (SimdT*)pSrc;
10952550b04179614da4c71dbef195d06a7f53273438Tim Rowley
10962550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Compute which simd tile we're accessing within 8x8 tile.
10972550b04179614da4c71dbef195d06a7f53273438Tim Rowley        //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
10982550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM);
10992550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
11012550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11022550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM);
11032550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11042550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pSimdTile->GetSwizzledColor(simdOffset, outputColor);
1105488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
11062550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
11072550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11082550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
11092550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
11102550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
11112550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
11122550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
11132550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
11142550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
11152550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
11162550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
11172550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
11182550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
11192550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
11202550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11212550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // For each raster tile pixel (rx, ry)
11222550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
11232550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
11242550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
11252550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
11262550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Perform bounds checking.
11272550b04179614da4c71dbef195d06a7f53273438Tim Rowley                if (((x + rx) < lodWidth) &&
11282550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    ((y + ry) < lodHeight))
11292550b04179614da4c71dbef195d06a7f53273438Tim Rowley                {
11302550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    float srcColor[4];
11312550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
11322550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11332550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
11342550b04179614da4c71dbef195d06a7f53273438Tim Rowley                        pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
11352550b04179614da4c71dbef195d06a7f53273438Tim Rowley                        sampleNum, pDstSurface->lod, pDstSurface);
11362550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    {
11372550b04179614da4c71dbef195d06a7f53273438Tim Rowley                        ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
11382550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    }
11392550b04179614da4c71dbef195d06a7f53273438Tim Rowley                }
11402550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
11412550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
11422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
11432550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
11442550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11452550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
11462550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
11472550b04179614da4c71dbef195d06a7f53273438Tim Rowley{};
11482550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11492550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
11502550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
11512550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
11522550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
11531b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
11542550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
11552550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
11562550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
11572550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
11582550b04179614da4c71dbef195d06a7f53273438Tim Rowley
11592550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
11602550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
11612550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
11622550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
11632550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
11642550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
11652550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
11662550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
11672550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
11682550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
11692550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
11702550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
11712550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
117275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
117375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
11742550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
11752550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
11762550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
11772550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1178937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
11792550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1180937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
11812550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1182937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1183937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1184937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1185937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* ppDsts[] =
1186937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1187937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst,                                           // row 0, col 0
1188937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + pDstSurface->pitch,                      // row 1, col 0
1189937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + dx / 2,                                  // row 0, col 1
1190937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1191937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1192937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1193937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1194937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1195937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1196937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            {
1197937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1198937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1199937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1200937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1201937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[0] += dx;
1202937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[1] += dx;
1203937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[2] += dx;
1204937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[3] += dx;
1205937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            }
1206937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1207937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[0] += dy;
1208937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[1] += dy;
1209937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[2] += dy;
1210937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[3] += dy;
1211937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
1212937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
1213937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
1214937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1215937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
12162550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
12172550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
12182550b04179614da4c71dbef195d06a7f53273438Tim Rowley
12192550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
12202550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
12212550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Format conversion and convert from SOA to AOS, and store the rows.
12222550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
12232550b04179614da4c71dbef195d06a7f53273438Tim Rowley
12242550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
12252550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
12262550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
12272550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
12282550b04179614da4c71dbef195d06a7f53273438Tim Rowley
12292550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
12302550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
12312550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1232937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
12332550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
12342550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
12352550b04179614da4c71dbef195d06a7f53273438Tim Rowley
12362550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
12372550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
12382550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
12392550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
12401b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
12412550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
12422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
12432550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
12442550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
12452550b04179614da4c71dbef195d06a7f53273438Tim Rowley
12462550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
12472550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
12482550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
12492550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
12502550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
12512550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
12522550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
12532550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
12542550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
12552550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
12562550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
12572550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
12582550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
125975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
126075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
12612550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
12622550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
12632550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
12642550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1265937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
12662550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1267937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
1268937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1269937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1270937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1271937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1272937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* ppDsts[] =
1273937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1274937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst,                                           // row 0, col 0
1275937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + pDstSurface->pitch,                      // row 1, col 0
1276937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + dx / 2,                                  // row 0, col 1
1277937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1278937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1279937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1280937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1281937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1282937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1283937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            {
1284937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1285937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1286937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1287937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1288937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[0] += dx;
1289937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[1] += dx;
1290937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[2] += dx;
1291937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[3] += dx;
1292937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            }
1293937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1294937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[0] += dy;
1295937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[1] += dy;
1296937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[2] += dy;
1297937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[3] += dy;
1298937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
1299937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
13002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
13012550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13022550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
13032550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
13042550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
13052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13062550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
13072550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
13082550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Format conversion and convert from SOA to AOS, and store the rows.
13092550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
13102550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13112550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
13122550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
13132550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
13142550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
13152550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13162550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
13172550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
13182550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1319937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
13202550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
13212550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
13222550b04179614da4c71dbef195d06a7f53273438Tim Rowley
//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_NONE specialization for 32bpp
//////////////////////////////////////////////////////////////////////////
13262550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
13271b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
13282550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
13292550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
13302550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
13312550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
13322550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13332550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
13342550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
13352550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
13362550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
13372550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
13382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
13392550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
13402550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
13412550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
13422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
13432550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
13442550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
13452550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
134675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
134775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
13482550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
13492550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
13502550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
13512550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1352937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
13532550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1354488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND
1355488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1356937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1357937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1358937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1359937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* ppDsts[] =
1360488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        {
1361937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst,                                           // row 0, col 0
1362937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + pDstSurface->pitch,                      // row 1, col 0
1363937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + dx / 2,                                  // row 0, col 1
1364937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1365937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1366488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1367937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1368937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1369937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1370488992221056edaf7111f9290afdf216c5e98d62Tim Rowley            {
1371937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1372488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1373488992221056edaf7111f9290afdf216c5e98d62Tim Rowley                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1374937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1375937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[0] += dx;
1376937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[1] += dx;
1377937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[2] += dx;
1378937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[3] += dx;
1379488992221056edaf7111f9290afdf216c5e98d62Tim Rowley            }
1380488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1381937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[0] += dy;
1382937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[1] += dy;
1383937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[2] += dy;
1384937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[3] += dy;
1385488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        }
1386488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
13872550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
13882550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13892550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
13902550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
13912550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
13922550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13932550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
13942550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
13952550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Format conversion and convert from SOA to AOS, and store the rows.
13962550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
13972550b04179614da4c71dbef195d06a7f53273438Tim Rowley
13982550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
13992550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
14002550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
14012550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
14022550b04179614da4c71dbef195d06a7f53273438Tim Rowley
14032550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
14042550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
14052550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1406488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
14072550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
14082550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
14092550b04179614da4c71dbef195d06a7f53273438Tim Rowley
14102550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_NONE specialization for 64bpp
14122550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
141375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
14142550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
14152550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
14162550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
14172550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1418937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
14192550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t MAX_DST_COLUMN_BYTES = 16;
1420937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if !USE_8x2_TILE_BACKEND
14212550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
14222550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1423937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
14242550b04179614da4c71dbef195d06a7f53273438Tim Rowley
14252550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
14262550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
14272550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
14282550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
14292550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
14302550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
14312550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
14322550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
14332550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
14342550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
14352550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
14362550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
14372550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
143875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
143975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
14402550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
14412550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
14422550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
14432550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1444937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
14452550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1446937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
1447937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1448937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1449937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1450937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1451937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1452937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1453937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1454937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *ppDsts[] =
1455937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
145675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst,                                                               // row 0, col 0
145775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch,                                          // row 1, col 0
145875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
145975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
146075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
146175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
146275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
146375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3                // row 1, col 3
1464937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1465937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1466937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1467937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1468937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            // Raster tile width is same as simd16 tile width
1469937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1470937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1471937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1472937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1473937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1474937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1475937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1476937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            {
1477937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[i] += dy;
1478937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            }
1479937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
1480937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
14812550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* ppDsts[] =
14822550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
14832550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDst,                                               // row 0, col 0
14842550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDst + pDstSurface->pitch,                          // row 1, col 0
14852550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDst + MAX_DST_COLUMN_BYTES,                        // row 0, col 1
14862550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,   // row 1, col 1
14872550b04179614da4c71dbef195d06a7f53273438Tim Rowley        };
14882550b04179614da4c71dbef195d06a7f53273438Tim Rowley
14892550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
14902550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
14912550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppStartRows[] =
14922550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
14932550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[0],
14942550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[1],
14952550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[2],
14962550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[3],
14972550b04179614da4c71dbef195d06a7f53273438Tim Rowley            };
14982550b04179614da4c71dbef195d06a7f53273438Tim Rowley
14992550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
15002550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
15012550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Format conversion and convert from SOA to AOS, and store the rows.
15022550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
15032550b04179614da4c71dbef195d06a7f53273438Tim Rowley
15042550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
15052550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
15062550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
15072550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
15082550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pSrc += SRC_COLUMN_BYTES;
15092550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
15102550b04179614da4c71dbef195d06a7f53273438Tim Rowley
15112550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
15122550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
15132550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
15142550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
15152550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1516937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
15172550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
15182550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
15192550b04179614da4c71dbef195d06a7f53273438Tim Rowley
15202550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_NONE specialization for 128bpp
15222550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
15232550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
15242550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
15252550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
15262550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
15272550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1528937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
15292550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t MAX_DST_COLUMN_BYTES = 16;
1530937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if !USE_8x2_TILE_BACKEND
15312550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
15322550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
1533937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
15342550b04179614da4c71dbef195d06a7f53273438Tim Rowley
15352550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
15362550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
15372550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
15382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
15392550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
15402550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
15412550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
15422550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
15432550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
15442550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
15452550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
15462550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
15472550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
154875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
154975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
15502550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
15512550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
15522550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
15532550b04179614da4c71dbef195d06a7f53273438Tim Rowley
1554937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
15552550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1556937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
1557937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1558937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
155975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1560937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1561937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1562937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1563937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1564937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* ppDsts[] =
1565937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
156675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst,                                                               // row 0, col 0
156775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch,                                          // row 1, col 0
156875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
156975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
157075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
157175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
157275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
157375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,               // row 1, col 3
157475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 4,                                    // row 0, col 4
157575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,               // row 1, col 4
157675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 5,                                    // row 0, col 5
157775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,               // row 1, col 5
157875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 6,                                    // row 0, col 6
157975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,               // row 1, col 6
158075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + MAX_DST_COLUMN_BYTES * 7,                                    // row 0, col 7
158175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,               // row 1, col 7
1582937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1583937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
158475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1585937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1586937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            // Raster tile width is same as simd16 tile width
158775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1588937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1589937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1590937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1591937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1592937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1593937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
1594937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            {
1595937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[i] += dy;
1596937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            }
1597937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
1598937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
15992550b04179614da4c71dbef195d06a7f53273438Tim Rowley        struct DstPtrs
16002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
16012550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppDsts[8];
16022550b04179614da4c71dbef195d06a7f53273438Tim Rowley        } ptrs;
16032550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16042550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Need 8 pointers, 4 columns of 2 rows each
16052550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t y = 0; y < 2; ++y)
16062550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
16072550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t x = 0; x < 4; ++x)
16082550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
16092550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
16102550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
16112550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
16122550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16132550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
16142550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
16152550b04179614da4c71dbef195d06a7f53273438Tim Rowley            DstPtrs startPtrs = ptrs;
16162550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16172550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
16182550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
16192550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Format conversion and convert from SOA to AOS, and store the rows.
16202550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
16212550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16222550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
16232550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
16242550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
16252550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
16262550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
16272550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
16282550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
16292550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
16302550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pSrc += SRC_COLUMN_BYTES;
16312550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
16322550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16332550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch;
16342550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch;
16352550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch;
16362550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch;
16372550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch;
16382550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch;
16392550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
16402550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
16412550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1642937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
16432550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
16442550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
16452550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16462550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 8bpp
16482550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
16492550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
16502550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
16512550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
16522550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1653937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
16542550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16552550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
16562550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
16572550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
16582550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
16592550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
16602550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
16612550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
16622550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
16632550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
16642550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
16652550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
16662550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16672550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
16682550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
16692550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
167075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
167175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
16722550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
16732550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
16742550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
16752550b04179614da4c71dbef195d06a7f53273438Tim Rowley
16762550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
16772550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // We can compute the offsets to each column within the raster tile once and increment from these.
1678937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
167975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1680937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1681937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1682937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1683937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1684937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
168575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1686937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *ppDsts[] =
1687937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1688937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst,
1689937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes,
1690937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes / 4,
1691937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1692937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1693937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1694937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1695937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
169675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            // Raster tile width is same as simd16 tile width
169775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
169875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
1699937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1700937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1701937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1702937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1703937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[0] += dy;
1704937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[1] += dy;
1705937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[2] += dy;
1706937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[3] += dy;
1707937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
1708937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
170975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
17102550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
17112550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
17122550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17132550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
17142550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
17152550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17162550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
17172550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
17182550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
17192550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint32_t rowOffset = row * DestRowWidthBytes;
17202550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17212550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* pRow = pCol0 + rowOffset;
17222550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
17232550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17242550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
17252550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
17262550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17272550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[0] += DestRowWidthBytes / 4;
17282550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[1] += DestRowWidthBytes / 4;
17292550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17302550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
17312550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
17322550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1733937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
17342550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
17352550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
17362550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17372550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
17382550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
17392550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
17402550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
17412550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
17422550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
17432550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1744937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
17452550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17462550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
17472550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
17482550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
17492550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
17502550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
17512550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
17522550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
17532550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
17542550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
17552550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
17562550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
17572550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17582550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
17592550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
17602550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
176175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
176275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
17632550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
17642550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
17652550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
17662550b04179614da4c71dbef195d06a7f53273438Tim Rowley
17672550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
17682550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // We can compute the offsets to each column within the raster tile once and increment from these.
1769937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
177075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1771937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1772937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1773937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1774937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1775937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
177675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1777937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *ppDsts[] =
1778937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1779937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst,
1780937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes,
1781937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes / 2,
1782937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1783937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1784937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1785937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1786937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
178775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            // Raster tile width is same as simd16 tile width
178875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
178975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
1790937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1791937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1792937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1793937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1794937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[0] += dy;
1795937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[1] += dy;
1796937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[2] += dy;
1797937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[3] += dy;
1798937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
1799937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
180075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
18012550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
18022550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
18032550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18042550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
18052550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
18062550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18072550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
18082550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
18092550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
18102550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint32_t rowOffset = row * DestRowWidthBytes;
18112550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18122550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* pRow = pCol0 + rowOffset;
18132550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
18142550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18152550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
18162550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
18172550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18182550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[0] += DestRowWidthBytes / 2;
18192550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[1] += DestRowWidthBytes / 2;
18202550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18212550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
18222550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
18232550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1824937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
18252550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
18262550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
18272550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18282550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
18292550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
18302550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
18312550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
18322550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
18332550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
18342550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1835937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1836937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
18372550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
18392550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
18402550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
18412550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
18422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
18432550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
18442550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
18452550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
18462550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
18472550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
18482550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
18492550b04179614da4c71dbef195d06a7f53273438Tim Rowley
185075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // Punt non-full tiles to generic store
18512550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
18522550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
185375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
185475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
18552550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
18562550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
18572550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
18582550b04179614da4c71dbef195d06a7f53273438Tim Rowley
18592550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
18602550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // We can compute the offsets to each column within the raster tile once and increment from these.
1861937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
1862937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1863937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1864937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1865937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1866937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1867937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1868937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* ppDsts[] =
1869937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1870937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst,                                           // row 0, col 0
1871937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes,                       // row 1, col 0
1872937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + dx / 2,                                  // row 0, col 1
1873937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
1874937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1875937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1876937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1877937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
1878937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1879937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            {
1880937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1881937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1882937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1883937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1884937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[0] += dx;
1885937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[1] += dx;
1886937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[2] += dx;
1887937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[3] += dx;
1888937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            }
1889937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1890937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[0] += dy;
1891937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[1] += dy;
1892937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[2] += dy;
1893937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[3] += dy;
1894937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
1895937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
1896937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
18972550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
18982550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
18992550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19002550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
19012550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
19022550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM)
19032550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
19042550b04179614da4c71dbef195d06a7f53273438Tim Rowley                uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8);
19052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19062550b04179614da4c71dbef195d06a7f53273438Tim Rowley                uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset };
19072550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
19082550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19092550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
19102550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
19112550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
19122550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19132550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pRow0 += (DestRowWidthBytes * 2);
19142550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pRow1 += (DestRowWidthBytes * 2);
19152550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
1916937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
19172550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
19182550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
19192550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19202550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
19212550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
19222550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
19232550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
19242550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
19252550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
19262550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1927488992221056edaf7111f9290afdf216c5e98d62Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
19282550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19292550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
19302550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
19312550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
19322550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
19332550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
19342550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
19352550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
19362550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
19372550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
19382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
19392550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
19402550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
19412550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19422550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
19432550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
19442550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
194575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
194675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
19472550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
19482550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
19492550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
19502550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19512550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
19522550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // We can compute the offsets to each column within the raster tile once and increment from these.
1953937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
195475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1955937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
19562550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
19572550b04179614da4c71dbef195d06a7f53273438Tim Rowley
195875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1959937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1960488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
196175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
196275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        uint8_t *ppDsts[] =
1963937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
196475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst,                                           // row 0, col 0
196575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes,                       // row 1, col 0
196675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes,                         // row 0, col 1
196775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
1968937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
1969488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1970937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1971937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
197275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            // Raster tile width is same as simd16 tile width
197375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
197475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
1975488992221056edaf7111f9290afdf216c5e98d62Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1976488992221056edaf7111f9290afdf216c5e98d62Tim Rowley
1977488992221056edaf7111f9290afdf216c5e98d62Tim Rowley            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1978937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
1979937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[0] += dy;
1980937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[1] += dy;
1981937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[2] += dy;
1982937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ppDsts[3] += dy;
1983488992221056edaf7111f9290afdf216c5e98d62Tim Rowley        }
1984488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else
198575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
1986937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1987937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1988937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
19892550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
19902550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
19912550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19922550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
19932550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
19942550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
19952550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint32_t rowOffset = row * DestRowWidthBytes;
19962550b04179614da4c71dbef195d06a7f53273438Tim Rowley
19972550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* pRow = pCol0 + rowOffset;
19982550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes };
19992550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20002550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
20012550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
20022550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20032550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[0] += DestColumnBytes;
20042550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[1] += DestColumnBytes;
20052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20062550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
20072550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
20082550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
2009488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif
20102550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
20112550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
20122550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20132550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
20142550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
20152550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
20162550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
20172550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
20182550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
20192550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
2020937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
20212550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20222550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
20232550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
20242550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
20252550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
20262550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
20272550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
20282550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
20292550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
20302550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
20312550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
20322550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
20332550b04179614da4c71dbef195d06a7f53273438Tim Rowley        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
20342550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20352550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
20362550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
20372550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
203875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
203975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
20402550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
20412550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
20422550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
20432550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20442550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
20452550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // We can compute the offsets to each column within the raster tile once and increment from these.
2046937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
204775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2048937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2049937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2050937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2051937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
205275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2053937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
205475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2055937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *ppDsts[] =
2056937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
205775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst,                                           // row 0, col 0
205875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes,                       // row 1, col 0
205975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes,                         // row 0, col 1
206075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
206175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 2,                     // row 0, col 2
206275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
206375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 3,                     // row 0, col 3
206475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
2065937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        };
2066937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2067937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2068937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
206975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            // Raster tile width is same as simd16 tile width
207075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
207175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
2072937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2073937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2074937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2075937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2076937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2077937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            {
2078937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[i] += dy;
2079937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            }
2080937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
2081937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
208275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2083937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
20842550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
20852550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t* pCol1 = pCol0 + DestColumnBytes;
20862550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20872550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // There are 4 columns, each 2 pixels wide when we have 64bpp pixels.
20882550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
20892550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
20902550b04179614da4c71dbef195d06a7f53273438Tim Rowley
20912550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
20922550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM)
20932550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
20942550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint32_t rowOffset = row * DestRowWidthBytes;
20952550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppDsts[] =
20962550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
20972550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pCol0 + rowOffset,
20982550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pCol0 + rowOffset + DestRowWidthBytes,
20992550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pCol1 + rowOffset,
21002550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pCol1 + rowOffset + DestRowWidthBytes,
21012550b04179614da4c71dbef195d06a7f53273438Tim Rowley            };
21022550b04179614da4c71dbef195d06a7f53273438Tim Rowley
21032550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
21042550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
21052550b04179614da4c71dbef195d06a7f53273438Tim Rowley
21062550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[0] += DestColumnBytes * 2;
21072550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[1] += DestColumnBytes * 2;
21082550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[2] += DestColumnBytes * 2;
21092550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ppDsts[3] += DestColumnBytes * 2;
21102550b04179614da4c71dbef195d06a7f53273438Tim Rowley
21112550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
21122550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pSrc += pSrcInc;
21132550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
2114937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
21152550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
21162550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
21172550b04179614da4c71dbef195d06a7f53273438Tim Rowley
21182550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
21192550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
21202550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
21212550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
21222550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
21232550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
21245dd0b8d3c635b67d8274c64653d825b8855b8167Ilia Mirkin    typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
2125937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
2126937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2127937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2128937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
2129937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
2130937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t TILE_Y_ROWS = 32;
2131937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
2132937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2133937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
2134937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
2135937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t MAX_DST_COLUMN_BYTES = 16;
2136937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2137937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley    static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
21382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
21392550b04179614da4c71dbef195d06a7f53273438Tim Rowley
2140937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
21412550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
21422550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores an 8x8 raster tile to the destination surface.
21432550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to raster tile.
21442550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
21452550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to raster tile.
21462550b04179614da4c71dbef195d06a7f53273438Tim Rowley    INLINE static void Store(
21472550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrc,
21482550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
21492550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
21502550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
2151937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
2152937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
2153937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
2154937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
2155937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
21562550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Punt non-full tiles to generic store
21572550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
21582550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
215975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
216075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
21612550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
21622550b04179614da4c71dbef195d06a7f53273438Tim Rowley            return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
21632550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
21642550b04179614da4c71dbef195d06a7f53273438Tim Rowley
216575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
216675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // We can compute the offsets to each column within the raster tile once and increment from these.
2167937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND
216875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 4 8x2 simd tiles in an 8x8 raster tile.
2169937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
2170937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
2171937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2172937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
217375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
2174937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
217575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
2176937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t *ppDsts[] =
2177937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
217875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst,                                           // row 0, col 0
217975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes,                       // row 1, col 0
218075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes,                         // row 0, col 1
218175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
218275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 2,                     // row 0, col 2
218375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
218475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 3,                     // row 0, col 3
218575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
218675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 4,                     // row 0, col 4
218775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
218875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 5,                     // row 0, col 5
218975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
219075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 6,                     // row 0, col 6
219175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
219275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestColumnBytes * 7,                     // row 0, col 7
219375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
219475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        };
2195937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
219675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
2197937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        {
219875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            // Raster tile width is same as simd16 tile width
219975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
2200937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2201937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
2202937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2203937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
2204937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley
2205937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
2206937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            {
2207937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley                ppDsts[i] += dy;
2208937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley            }
2209937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        }
2210937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else
221175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley        // There will be 8 4x2 simd tiles in an 8x8 raster tile.
2212937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley        uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
22132550b04179614da4c71dbef195d06a7f53273438Tim Rowley            pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
22142550b04179614da4c71dbef195d06a7f53273438Tim Rowley        struct DstPtrs
22152550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
22162550b04179614da4c71dbef195d06a7f53273438Tim Rowley            uint8_t* ppDsts[8];
22172550b04179614da4c71dbef195d06a7f53273438Tim Rowley        } ptrs;
22182550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22192550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Need 8 pointers, 4 columns of 2 rows each
22202550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t y = 0; y < 2; ++y)
22212550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
22222550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t x = 0; x < 4; ++x)
22232550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
22242550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES;
22252550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
22262550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
22272550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22282550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
22292550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
22302550b04179614da4c71dbef195d06a7f53273438Tim Rowley            DstPtrs startPtrs = ptrs;
22312550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22322550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
22332550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
22342550b04179614da4c71dbef195d06a7f53273438Tim Rowley                // Format conversion and convert from SOA to AOS, and store the rows.
22352550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts);
22362550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22372550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC;
22382550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC;
22392550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC;
22402550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC;
22412550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC;
22422550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC;
22432550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC;
22442550b04179614da4c71dbef195d06a7f53273438Tim Rowley                ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC;
22452550b04179614da4c71dbef195d06a7f53273438Tim Rowley                pSrc += SRC_COLUMN_BYTES;
22462550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
22472550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22482550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES;
22492550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES;
22502550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES;
22512550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES;
22522550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES;
22532550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES;
22542550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
22552550b04179614da4c71dbef195d06a7f53273438Tim Rowley            ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
22562550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
2257937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif
22582550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
22592550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
22602550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22612550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
22622550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StoreMacroTile - Stores a macro tile which consists of raster tiles.
22632550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
22642550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
22652550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StoreMacroTile
22662550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
22672550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
22682550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores a macrotile to the destination surface using safe implementation.
22692550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to macro tile.
22702550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
22712550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to macro tile
22722550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static void StoreGeneric(
22732550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrcHotTile,
22742550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
22752550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
22762550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
22772550b04179614da4c71dbef195d06a7f53273438Tim Rowley        PFN_STORE_TILES_INTERNAL pfnStore;
22782550b04179614da4c71dbef195d06a7f53273438Tim Rowley        pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
22792550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22802550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Store each raster tile from the hot tile to the destination surface.
22812550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
22822550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
22832550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
22842550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
22852550b04179614da4c71dbef195d06a7f53273438Tim Rowley                for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
22862550b04179614da4c71dbef195d06a7f53273438Tim Rowley                {
22872550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
22882550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
22892550b04179614da4c71dbef195d06a7f53273438Tim Rowley                }
22902550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
22912550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
22922550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22932550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
22942550b04179614da4c71dbef195d06a7f53273438Tim Rowley
22952550b04179614da4c71dbef195d06a7f53273438Tim Rowley    typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
22962550b04179614da4c71dbef195d06a7f53273438Tim Rowley    //////////////////////////////////////////////////////////////////////////
22972550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @brief Stores a macrotile to the destination surface.
22982550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pSrc - Pointer to macro tile.
22992550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param pDstSurface - Destination surface state
23002550b04179614da4c71dbef195d06a7f53273438Tim Rowley    /// @param x, y - Coordinates to macro tile
23012550b04179614da4c71dbef195d06a7f53273438Tim Rowley    static void Store(
23022550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint8_t *pSrcHotTile,
23032550b04179614da4c71dbef195d06a7f53273438Tim Rowley        SWR_SURFACE_STATE* pDstSurface,
23042550b04179614da4c71dbef195d06a7f53273438Tim Rowley        uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
23052550b04179614da4c71dbef195d06a7f53273438Tim Rowley    {
23062550b04179614da4c71dbef195d06a7f53273438Tim Rowley        PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
230775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
23082550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
23092550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
231075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
231175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                0,
231275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                0,
231375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
231475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
231575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                sampleNum,
231675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                pDstSurface->lod,
231775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                pDstSurface);
231875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
231975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
232075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
232175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley                (pDstSurface->bInterleavedSamples);
232275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley
232375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley            pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
23242550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
23252550b04179614da4c71dbef195d06a7f53273438Tim Rowley
23262550b04179614da4c71dbef195d06a7f53273438Tim Rowley        // Store each raster tile from the hot tile to the destination surface.
23272550b04179614da4c71dbef195d06a7f53273438Tim Rowley        for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
23282550b04179614da4c71dbef195d06a7f53273438Tim Rowley        {
23292550b04179614da4c71dbef195d06a7f53273438Tim Rowley            for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
23302550b04179614da4c71dbef195d06a7f53273438Tim Rowley            {
23312550b04179614da4c71dbef195d06a7f53273438Tim Rowley                for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
23322550b04179614da4c71dbef195d06a7f53273438Tim Rowley                {
23332550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
23342550b04179614da4c71dbef195d06a7f53273438Tim Rowley                    pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
23352550b04179614da4c71dbef195d06a7f53273438Tim Rowley                }
23362550b04179614da4c71dbef195d06a7f53273438Tim Rowley            }
23372550b04179614da4c71dbef195d06a7f53273438Tim Rowley        }
23382550b04179614da4c71dbef195d06a7f53273438Tim Rowley    }
23392550b04179614da4c71dbef195d06a7f53273438Tim Rowley};
23402550b04179614da4c71dbef195d06a7f53273438Tim Rowley
23412550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
23422550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// InitStoreTilesTable - Helper for setting up the tables.
23431b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
23442550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableColor_Half1(
23452550b04179614da4c71dbef195d06a7f53273438Tim Rowley    PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
23462550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
23471b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
23481b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
23491b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
23501b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
23511b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
23521b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
23531b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
23541b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
23551b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
23561b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
23571b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
23581b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
23591b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
23601b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
23611b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
23621b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
23631b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
23641b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
23651b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
23661b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
23671b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
23681b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
23691b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
23701b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
23711b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
23721b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
23731b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
23741b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
23751b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
23761b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
23771b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
23781b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
23791b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
23801b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
23811b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
23821b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
23831b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
23841b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
23851b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
23861b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
23871b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
23881b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
23891b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
23901b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
23911b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
23921b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
23931b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
23941b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
23951b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
23961b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
23971b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
23981b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
23991b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
24001b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
24011b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
24021b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
24032550b04179614da4c71dbef195d06a7f53273438Tim Rowley}
24042550b04179614da4c71dbef195d06a7f53273438Tim Rowley
24051b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
24062550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableColor_Half2(
24072550b04179614da4c71dbef195d06a7f53273438Tim Rowley    PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
24082550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
24091b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
24101b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
24111b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
24121b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
24131b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
24141b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
24151b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
24161b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
24171b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
24181b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
24191b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
24201b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
24211b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
24221b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
24231b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
24241b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
24251b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
24261b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
24271b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
24281b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
24291b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
24301b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
24311b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
24321b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
24331b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
24341b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
24351b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
24361b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
24371b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
24381b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
24391b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
24401b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
24411b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
24421b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
24431b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
24441b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
24451b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
24461b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
24471b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
24481b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
24491b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
24501b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
24511b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
24521b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
24531b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
24541b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
24551b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
24561b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
24571b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
24581b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
24591b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
24601b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
24611b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
24621b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
24631b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
24641b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
24651b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
24661b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
24671b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
24681b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
24691b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
24701b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
24711b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
24721b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
24732550b04179614da4c71dbef195d06a7f53273438Tim Rowley}
24742550b04179614da4c71dbef195d06a7f53273438Tim Rowley
24752550b04179614da4c71dbef195d06a7f53273438Tim Rowley//////////////////////////////////////////////////////////////////////////
24762550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
24771b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
24782550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableDepth(
24792550b04179614da4c71dbef195d06a7f53273438Tim Rowley    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
24802550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
24811b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley   table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
248245d9cd36fe9a3132e32f3efda0fbcbade2c71d21Ilia Mirkin   table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
24831b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley   table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
24841b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley   table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
24852550b04179614da4c71dbef195d06a7f53273438Tim Rowley}
24862550b04179614da4c71dbef195d06a7f53273438Tim Rowley
24871b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
24882550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableStencil(
24892550b04179614da4c71dbef195d06a7f53273438Tim Rowley    PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
24902550b04179614da4c71dbef195d06a7f53273438Tim Rowley{
24911b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley    table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
24922550b04179614da4c71dbef195d06a7f53273438Tim Rowley}
2493