/****************************************************************************
* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file StoreTile.h
*
* @brief Functionality for Store.
*
******************************************************************************/
#pragma once

#include "common/os.h"
#include "common/formats.h"
#include "core/context.h"
#include "core/rdtsc_core.h"
#include "core/format_conversion.h"

#include "memory/TilingFunctions.h"
#include "memory/Convert.h"
#include "core/multisample.h"

#include <array>
#include <sstream>

432550b04179614da4c71dbef195d06a7f53273438Tim Rowley// Function pointer to different storing functions for color, depth, and stencil based on incoming formats. 442550b04179614da4c71dbef195d06a7f53273438Tim Rowleytypedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); 452550b04179614da4c71dbef195d06a7f53273438Tim Rowley 462550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 472550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// Store Raster Tile Function Tables. 482550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 492550b04179614da4c71dbef195d06a7f53273438Tim Rowleyextern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 502550b04179614da4c71dbef195d06a7f53273438Tim Rowleyextern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 512550b04179614da4c71dbef195d06a7f53273438Tim Rowleyextern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS]; 522550b04179614da4c71dbef195d06a7f53273438Tim Rowley 532550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_Linear_1(); 542550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_Linear_2(); 552550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileX_1(); 562550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileX_2(); 572550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileY_1(); 582550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileY_2(); 592550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable_TileW(); 602550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTable(); 612550b04179614da4c71dbef195d06a7f53273438Tim Rowley 622550b04179614da4c71dbef195d06a7f53273438Tim 
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam PixelSize - Size of one pixel in bits (the specializations in
///                     this file cover 8, 16, 32, 64 and 128).
/// @tparam NumDests - Number of destination pointers.  Each pair of
///                    pointers is for a 16-byte column of two rows.
/// @note The primary template is deleted on purpose: only the explicit
///       specializations are callable, so an unsupported combination of
///       PixelSize / NumDests fails at compile time.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};

//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam NumDests - Number of destination pointers.  Each pair of
///                    pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 2>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
    {
        // Each 4-pixel row is 4 bytes; pixels are moved two at a time as
        // 16-bit units.
        const uint16_t* pPixSrc = reinterpret_cast<const uint16_t*>(pSrc);

        // Unswizzle from SWR-Z order
        uint16_t* pRow = reinterpret_cast<uint16_t*>(ppDsts[0]);
        pRow[0] = pPixSrc[0]; // 0 1
        pRow[1] = pPixSrc[2]; // 4 5

        pRow = reinterpret_cast<uint16_t*>(ppDsts[1]);
        pRow[0] = pPixSrc[1]; // 2 3
        pRow[1] = pPixSrc[3]; // 6 7
    }
};

#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// StorePixels (8-bit pixel specialization, 8x2 tile backend)
/// @brief Stores 16 pixels through four destination row pointers.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of four destination pointers; 4 bytes are
///                   written through each.
//////////////////////////////////////////////////////////////////////////
template <>
struct StorePixels<8, 4>
{
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
    {
        // 8 x 2 bytes = 16 bytes, 16 pixels
        const uint16_t* pSrc16 = reinterpret_cast<const uint16_t*>(pSrc);

        // Each destination pointer is converted on its own instead of
        // reinterpreting the whole uint8_t* array as uint16_t**.
        // Unswizzle from SWR-Z order
        uint16_t* pRow = reinterpret_cast<uint16_t*>(ppDsts[0]);
        pRow[0] = pSrc16[0]; // 0 1
        pRow[1] = pSrc16[2]; // 4 5

        pRow = reinterpret_cast<uint16_t*>(ppDsts[1]);
        pRow[0] = pSrc16[1]; // 2 3
        pRow[1] = pSrc16[3]; // 6 7

        pRow = reinterpret_cast<uint16_t*>(ppDsts[2]);
        pRow[0] = pSrc16[4]; // 8 9
        pRow[1] = pSrc16[6]; // C D

        pRow = reinterpret_cast<uint16_t*>(ppDsts[3]);
        pRow[0] = pSrc16[5]; // A B
        pRow[1] = pSrc16[7]; // E F
    }
};

#endif
Rowley////////////////////////////////////////////////////////////////////////// 1332550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization) 1342550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows. 1352550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc - Pointer to source raster tile in SWRZ pixel order 1362550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts - Array of destination pointers. Each pointer is 1372550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// to a single row of at most 16B. 1382550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers. Each pair of 1392550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// pointers is for a 16-byte column of two rows. 1402550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 1412550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <> 1422550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<16, 2> 1432550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 1442550b04179614da4c71dbef195d06a7f53273438Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 1452550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 1462550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Each 4-pixel row is 8 bytes. 
1472550b04179614da4c71dbef195d06a7f53273438Tim Rowley const uint32_t* pPixSrc = (const uint32_t*)pSrc; 1482550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1492550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Unswizzle from SWR-Z order 1502550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t* pRow = (uint32_t*)ppDsts[0]; 1512550b04179614da4c71dbef195d06a7f53273438Tim Rowley pRow[0] = pPixSrc[0]; 1522550b04179614da4c71dbef195d06a7f53273438Tim Rowley pRow[1] = pPixSrc[2]; 1532550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1542550b04179614da4c71dbef195d06a7f53273438Tim Rowley pRow = (uint32_t*)ppDsts[1]; 1552550b04179614da4c71dbef195d06a7f53273438Tim Rowley pRow[0] = pPixSrc[1]; 1562550b04179614da4c71dbef195d06a7f53273438Tim Rowley pRow[1] = pPixSrc[3]; 1572550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1582550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 1592550b04179614da4c71dbef195d06a7f53273438Tim Rowley 160937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 161937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate <> 162937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleystruct StorePixels<16, 4> 163937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{ 164937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 165937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 166937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // 8 x 4 bytes = 32 bytes, 16 pixels 167937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc); 168937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 169937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts); 170937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 171937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // Unswizzle from SWR-Z order 172937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[0][0] = pSrc32[0]; // 0 1 
173937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[0][1] = pSrc32[2]; // 4 5 174937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 175937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[1][0] = pSrc32[1]; // 2 3 176937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[1][1] = pSrc32[3]; // 6 7 177937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 178937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[2][0] = pSrc32[4]; // 8 9 179937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[2][1] = pSrc32[6]; // C D 180937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 181937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[3][0] = pSrc32[5]; // A B 182937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts32[3][1] = pSrc32[7]; // E F 183937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 184937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley}; 185937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 186937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 1872550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 1882550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization) 1892550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows. 1902550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc - Pointer to source raster tile in SWRZ pixel order 1912550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts - Array of destination pointers. Each pointer is 1922550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// to a single row of at most 16B. 1932550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers. Each pair of 1942550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// pointers is for a 16-byte column of two rows. 
1952550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 1962550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <> 1972550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<32, 2> 1982550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 1992550b04179614da4c71dbef195d06a7f53273438Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) 2002550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 2012550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Each 4-pixel row is 16-bytes 2022550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i *pZRow01 = (__m128i*)pSrc; 2032550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vQuad00 = _mm_load_si128(pZRow01); 2042550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); 2052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 2062550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); 2072550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); 2082550b04179614da4c71dbef195d06a7f53273438Tim Rowley 2092550b04179614da4c71dbef195d06a7f53273438Tim Rowley _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); 2102550b04179614da4c71dbef195d06a7f53273438Tim Rowley _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); 2112550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 2122550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 2132550b04179614da4c71dbef195d06a7f53273438Tim Rowley 214488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 215488992221056edaf7111f9290afdf216c5e98d62Tim Rowleytemplate <> 216488992221056edaf7111f9290afdf216c5e98d62Tim Rowleystruct StorePixels<32, 4> 217488992221056edaf7111f9290afdf216c5e98d62Tim Rowley{ 218488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 
219488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 220937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // 4 x 16 bytes = 64 bytes, 16 pixels 221937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); 222937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 223937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); 224937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 225937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // Unswizzle from SWR-Z order 226937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3 227937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7 228937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B 229937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F 230937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 231937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5 232937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7 233937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D 234937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F 235488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 236488992221056edaf7111f9290afdf216c5e98d62Tim Rowley}; 237488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 238488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 2392550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 2402550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley/// StorePixels (32-bit pixel specialization) 2412550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows. 2422550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc - Pointer to source raster tile in SWRZ pixel order 2432550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts - Array of destination pointers. Each pointer is 2442550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// to a single row of at most 16B. 2452550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers. Each pair of 2462550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// pointers is for a 16-byte column of two rows. 2472550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 2482550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <> 2492550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<64, 4> 2502550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 2512550b04179614da4c71dbef195d06a7f53273438Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) 2522550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 2532550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Each 4-pixel row is 32 bytes. 
2542550b04179614da4c71dbef195d06a7f53273438Tim Rowley const __m128i* pPixSrc = (const __m128i*)pSrc; 2552550b04179614da4c71dbef195d06a7f53273438Tim Rowley 2562550b04179614da4c71dbef195d06a7f53273438Tim Rowley // order of pointers match SWR-Z layout 2572550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i** pvDsts = (__m128i**)&ppDsts[0]; 2582550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[0] = pPixSrc[0]; 2592550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[1] = pPixSrc[1]; 2602550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[2] = pPixSrc[2]; 2612550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[3] = pPixSrc[3]; 2622550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 2632550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 2642550b04179614da4c71dbef195d06a7f53273438Tim Rowley 265937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 266937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate <> 267937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleystruct StorePixels<64, 8> 268937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{ 269937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 270937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 271937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // 8 x 16 bytes = 128 bytes, 16 pixels 272937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); 273937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 274937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); 275937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 276937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // order of pointers match SWR-Z layout 277937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[0] = pSrc128[0]; // 0 1 278937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[1] = pSrc128[1]; // 2 3 
279937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[2] = pSrc128[2]; // 4 5 280937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[3] = pSrc128[3]; // 6 7 281937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[4] = pSrc128[4]; // 8 9 282937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[5] = pSrc128[5]; // A B 283937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[6] = pSrc128[6]; // C D 284937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[7] = pSrc128[7]; // E F 285937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 286937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley}; 287937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 288937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 2892550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 2902550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StorePixels (32-bit pixel specialization) 2912550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @brief Stores a 4x2 (AVX) raster-tile to two rows. 2922550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param pSrc - Pointer to source raster tile in SWRZ pixel order 2932550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @param ppDsts - Array of destination pointers. Each pointer is 2942550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// to a single row of at most 16B. 2952550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// @tparam NumDests - Number of destination pointers. Each pair of 2962550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// pointers is for a 16-byte column of two rows. 
2972550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 2982550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate <> 2992550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StorePixels<128, 8> 3002550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 3012550b04179614da4c71dbef195d06a7f53273438Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) 3022550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 3032550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Each 4-pixel row is 64 bytes. 3042550b04179614da4c71dbef195d06a7f53273438Tim Rowley const __m128i* pPixSrc = (const __m128i*)pSrc; 3052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 3062550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Unswizzle from SWR-Z order 3072550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i** pvDsts = (__m128i**)&ppDsts[0]; 3082550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[0] = pPixSrc[0]; 3092550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[1] = pPixSrc[2]; 3102550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[2] = pPixSrc[1]; 3112550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[3] = pPixSrc[3]; 3122550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[4] = pPixSrc[4]; 3132550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[5] = pPixSrc[6]; 3142550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[6] = pPixSrc[5]; 3152550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pvDsts[7] = pPixSrc[7]; 3162550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 3172550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 3182550b04179614da4c71dbef195d06a7f53273438Tim Rowley 319937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 320937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate <> 321937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleystruct StorePixels<128, 16> 322937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{ 
323937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16]) 324937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 325937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // 16 x 16 bytes = 256 bytes, 16 pixels 326937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); 327937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 328937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); 329937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 330937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t i = 0; i < 16; i += 4) 331937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 332937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[i + 0] = pSrc128[i + 0]; 333937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[i + 1] = pSrc128[i + 2]; 334937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[i + 2] = pSrc128[i + 1]; 335937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *ppDsts128[i + 3] = pSrc128[i + 3]; 336937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 337937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 338937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley}; 339937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 340937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 3412550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 3422550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 3432550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 3442550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 3452550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS 
3462550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 3472550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 3482550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Converts a SIMD from the Hot Tile to the destination format 3492550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// and converts from SOA to AOS. 3502550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 3512550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDst - Pointer to destination surface or deswizzling buffer. 3522550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 3532550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 3542550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 355488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 356488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 357488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 358488992221056edaf7111f9290afdf216c5e98d62Tim Rowley OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 359488992221056edaf7111f9290afdf216c5e98d62Tim Rowley OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 360488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 361488992221056edaf7111f9290afdf216c5e98d62Tim Rowley // Convert from SrcFormat --> DstFormat 362488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16vector src; 363488992221056edaf7111f9290afdf216c5e98d62Tim Rowley LoadSOA<SrcFormat>(pSrc, src); 364488992221056edaf7111f9290afdf216c5e98d62Tim Rowley StoreSOA<DstFormat>(src, soaTile); 365488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 366488992221056edaf7111f9290afdf216c5e98d62Tim Rowley // Convert from SOA --> AOS 367488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 
FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile); 368488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 369488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 3702550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 3712550b04179614da4c71dbef195d06a7f53273438Tim Rowley 3722550b04179614da4c71dbef195d06a7f53273438Tim Rowley OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 3732550b04179614da4c71dbef195d06a7f53273438Tim Rowley OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 3742550b04179614da4c71dbef195d06a7f53273438Tim Rowley 3752550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Convert from SrcFormat --> DstFormat 3762550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdvector src; 3772550b04179614da4c71dbef195d06a7f53273438Tim Rowley LoadSOA<SrcFormat>(pSrc, src); 3782550b04179614da4c71dbef195d06a7f53273438Tim Rowley StoreSOA<DstFormat>(src, soaTile); 3792550b04179614da4c71dbef195d06a7f53273438Tim Rowley 3802550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Convert from SOA --> AOS 3812550b04179614da4c71dbef195d06a7f53273438Tim Rowley FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); 3822550b04179614da4c71dbef195d06a7f53273438Tim Rowley 383488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 3842550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Store data into destination 3852550b04179614da4c71dbef195d06a7f53273438Tim Rowley StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); 3862550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 3872550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 3882550b04179614da4c71dbef195d06a7f53273438Tim Rowley 3892550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 3902550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 
3912550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// Specialization for no format conversion 3922550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 3932550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT Format> 3942550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<Format, Format> 3952550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 3962550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 3972550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Converts a SIMD from the Hot Tile to the destination format 3982550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// and converts from SOA to AOS. 3992550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 4002550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDst - Pointer to destination surface or deswizzling buffer. 
4012550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 4022550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 4032550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 404488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 405488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 406488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 407488992221056edaf7111f9290afdf216c5e98d62Tim Rowley OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 408488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 409488992221056edaf7111f9290afdf216c5e98d62Tim Rowley // Convert from SOA --> AOS 410488992221056edaf7111f9290afdf216c5e98d62Tim Rowley FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile); 411488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 412488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 4132550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 4142550b04179614da4c71dbef195d06a7f53273438Tim Rowley 4152550b04179614da4c71dbef195d06a7f53273438Tim Rowley OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 4162550b04179614da4c71dbef195d06a7f53273438Tim Rowley 4172550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Convert from SOA --> AOS 4182550b04179614da4c71dbef195d06a7f53273438Tim Rowley FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile); 4192550b04179614da4c71dbef195d06a7f53273438Tim Rowley 420488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 4212550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Store data into destination 4222550b04179614da4c71dbef195d06a7f53273438Tim Rowley StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); 4232550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 4242550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley}; 4252550b04179614da4c71dbef195d06a7f53273438Tim Rowley 4262550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 4272550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R5_UNORM 4282550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 4292550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 4302550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > 4312550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 4322550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 4332550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Converts a SIMD from the Hot Tile to the destination format 4342550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// and converts from SOA to AOS. 4352550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 4362550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDst - Pointer to destination surface or deswizzling buffer. 
4372550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 4382550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 4392550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 440937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 441937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; 442937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const SWR_FORMAT DstFormat = B5G6R5_UNORM; 443937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 444937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel 445937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 446937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 447937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 448937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // Load hot-tile 449937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16vector src, dst; 450937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley LoadSOA<SrcFormat>(pSrc, src); 451937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 452937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // deswizzle 453937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.x = src[FormatTraits<DstFormat>::swizzle(0)]; 454937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.y = src[FormatTraits<DstFormat>::swizzle(1)]; 455937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.z = src[FormatTraits<DstFormat>::swizzle(2)]; 456937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 457937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // clamp 458937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.x = Clamp<DstFormat>(dst.x, 0); 459937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.y = Clamp<DstFormat>(dst.y, 1); 460937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.z = 
Clamp<DstFormat>(dst.z, 2); 461937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 462937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // normalize 463937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.x = Normalize<DstFormat>(dst.x, 0); 464937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.y = Normalize<DstFormat>(dst.y, 1); 465937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley dst.z = Normalize<DstFormat>(dst.z, 2); 466937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 467937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // pack 468937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalari packed = _simd16_castps_si(dst.x); 469937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 470937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5); 471937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6); 472937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 473937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5)); 474937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6)); 475937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 476937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // pack low 16 bits of each 32 bit lane to low 128 bits of dst 477937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint32_t *pPacked = (uint32_t*)&packed; 478937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint16_t *pAosTile = (uint16_t*)&aosTile[0]; 479937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t) 480937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 481937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley *pAosTile++ = *pPacked++; 482937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 483937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
484937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 4852550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; 4862550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const SWR_FORMAT DstFormat = B5G6R5_UNORM; 4872550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 4882550b04179614da4c71dbef195d06a7f53273438Tim Rowley 4892550b04179614da4c71dbef195d06a7f53273438Tim Rowley OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 4902550b04179614da4c71dbef195d06a7f53273438Tim Rowley 4912550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Load hot-tile 4922550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdvector src, dst; 4932550b04179614da4c71dbef195d06a7f53273438Tim Rowley LoadSOA<SrcFormat>(pSrc, src); 4942550b04179614da4c71dbef195d06a7f53273438Tim Rowley 4952550b04179614da4c71dbef195d06a7f53273438Tim Rowley // deswizzle 4962550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.x = src[FormatTraits<DstFormat>::swizzle(0)]; 4972550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.y = src[FormatTraits<DstFormat>::swizzle(1)]; 4982550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.z = src[FormatTraits<DstFormat>::swizzle(2)]; 4992550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5002550b04179614da4c71dbef195d06a7f53273438Tim Rowley // clamp 5012550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.x = Clamp<DstFormat>(dst.x, 0); 5022550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.y = Clamp<DstFormat>(dst.y, 1); 5032550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.z = Clamp<DstFormat>(dst.z, 2); 5042550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5052550b04179614da4c71dbef195d06a7f53273438Tim Rowley // normalize 5062550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.x = Normalize<DstFormat>(dst.x, 0); 5072550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.y = Normalize<DstFormat>(dst.y, 1); 
5082550b04179614da4c71dbef195d06a7f53273438Tim Rowley dst.z = Normalize<DstFormat>(dst.z, 2); 5092550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5102550b04179614da4c71dbef195d06a7f53273438Tim Rowley // pack 5112550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalari packed = _simd_castps_si(dst.x); 5122550b04179614da4c71dbef195d06a7f53273438Tim Rowley packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0))); 5132550b04179614da4c71dbef195d06a7f53273438Tim Rowley packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) + 5142550b04179614da4c71dbef195d06a7f53273438Tim Rowley FormatTraits<DstFormat>::GetBPC(1))); 5152550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5162550b04179614da4c71dbef195d06a7f53273438Tim Rowley // pack low 16 bits of each 32 bit lane to low 128 bits of dst 5172550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t *pPacked = (uint32_t*)&packed; 5182550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint16_t *pAosTile = (uint16_t*)&aosTile[0]; 5192550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t) 5202550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 5212550b04179614da4c71dbef195d06a7f53273438Tim Rowley *pAosTile++ = *pPacked++; 5222550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 5232550b04179614da4c71dbef195d06a7f53273438Tim Rowley 524937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 5252550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Store data into destination 5262550b04179614da4c71dbef195d06a7f53273438Tim Rowley StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); 5272550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 5282550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 5292550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5302550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley////////////////////////////////////////////////////////////////////////// 5312550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) 5322550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 5332550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 5342550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS> 5352550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 5362550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const SWR_FORMAT SrcFormat = R32_FLOAT; 5372550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS; 5382550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5392550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 5402550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Converts a SIMD from the Hot Tile to the destination format 5412550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// and converts from SOA to AOS. 5422550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 5432550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDst - Pointer to destination surface or deswizzling buffer. 
5442550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 5452550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 5462550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 547488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 5487aea08667c673713e1f419539e788eedeea047cbTim Rowley simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 549488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5507aea08667c673713e1f419539e788eedeea047cbTim Rowley // clamp 5517aea08667c673713e1f419539e788eedeea047cbTim Rowley const simd16scalar zero = _simd16_setzero_ps(); 5527aea08667c673713e1f419539e788eedeea047cbTim Rowley const simd16scalar ones = _simd16_set1_ps(1.0f); 553488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5547aea08667c673713e1f419539e788eedeea047cbTim Rowley comp = _simd16_max_ps(comp, zero); 5557aea08667c673713e1f419539e788eedeea047cbTim Rowley comp = _simd16_min_ps(comp, ones); 556488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5577aea08667c673713e1f419539e788eedeea047cbTim Rowley // normalize 5587aea08667c673713e1f419539e788eedeea047cbTim Rowley comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 559937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 5607aea08667c673713e1f419539e788eedeea047cbTim Rowley simd16scalari temp = _simd16_cvtps_epi32(comp); 561488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5627aea08667c673713e1f419539e788eedeea047cbTim Rowley // swizzle 5637aea08667c673713e1f419539e788eedeea047cbTim Rowley temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 564488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5657aea08667c673713e1f419539e788eedeea047cbTim Rowley // merge/store data into destination but don't overwrite the X8 bits 566937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simdscalari destlo = 
_simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0])); 567937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2])); 568488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5697aea08667c673713e1f419539e788eedeea047cbTim Rowley simd16scalari dest = _simd16_setzero_si(); 5707aea08667c673713e1f419539e788eedeea047cbTim Rowley 5717aea08667c673713e1f419539e788eedeea047cbTim Rowley dest = _simd16_insert_si(dest, destlo, 0); 5727aea08667c673713e1f419539e788eedeea047cbTim Rowley dest = _simd16_insert_si(dest, desthi, 1); 5737aea08667c673713e1f419539e788eedeea047cbTim Rowley 5747aea08667c673713e1f419539e788eedeea047cbTim Rowley simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF); 575488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5767aea08667c673713e1f419539e788eedeea047cbTim Rowley dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp)); 577488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 5787aea08667c673713e1f419539e788eedeea047cbTim Rowley _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0)); 5797aea08667c673713e1f419539e788eedeea047cbTim Rowley _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1)); 580488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 5812550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel 5822550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5832550b04179614da4c71dbef195d06a7f53273438Tim Rowley OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; 5842550b04179614da4c71dbef195d06a7f53273438Tim Rowley OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; 5852550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5862550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley // Convert from SrcFormat --> DstFormat 5872550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdvector src; 5882550b04179614da4c71dbef195d06a7f53273438Tim Rowley LoadSOA<SrcFormat>(pSrc, src); 5892550b04179614da4c71dbef195d06a7f53273438Tim Rowley StoreSOA<DstFormat>(src, soaTile); 5902550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5912550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Convert from SOA --> AOS 5922550b04179614da4c71dbef195d06a7f53273438Tim Rowley FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); 5932550b04179614da4c71dbef195d06a7f53273438Tim Rowley 5942550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Store data into destination but don't overwrite the X8 bits 5952550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Each 4-pixel row is 16-bytes 5962550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i *pZRow01 = (__m128i*)aosTile; 5972550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vQuad00 = _mm_load_si128(pZRow01); 5982550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); 5992550b04179614da4c71dbef195d06a7f53273438Tim Rowley 6002550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); 6012550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); 6022550b04179614da4c71dbef195d06a7f53273438Tim Rowley 6032550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); 6042550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); 6052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 6062550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vMask = _mm_set1_epi32(0xFFFFFF); 6072550b04179614da4c71dbef195d06a7f53273438Tim Rowley 6082550b04179614da4c71dbef195d06a7f53273438Tim Rowley vDst0 = _mm_andnot_si128(vMask, vDst0); 6092550b04179614da4c71dbef195d06a7f53273438Tim Rowley 
vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); 6102550b04179614da4c71dbef195d06a7f53273438Tim Rowley vDst1 = _mm_andnot_si128(vMask, vDst1); 6112550b04179614da4c71dbef195d06a7f53273438Tim Rowley vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask)); 6122550b04179614da4c71dbef195d06a7f53273438Tim Rowley 6132550b04179614da4c71dbef195d06a7f53273438Tim Rowley _mm_storeu_si128((__m128i*)ppDsts[0], vDst0); 6142550b04179614da4c71dbef195d06a7f53273438Tim Rowley _mm_storeu_si128((__m128i*)ppDsts[1], vDst1); 615488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 6162550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 6172550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 6182550b04179614da4c71dbef195d06a7f53273438Tim Rowley 619488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 620488992221056edaf7111f9290afdf216c5e98d62Tim Rowleytemplate<SWR_FORMAT DstFormat> 621488992221056edaf7111f9290afdf216c5e98d62Tim RowleyINLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) 622488992221056edaf7111f9290afdf216c5e98d62Tim Rowley{ 623488992221056edaf7111f9290afdf216c5e98d62Tim Rowley // swizzle rgba -> bgra while we load 624488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr 625488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg 626488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb 627488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const 
float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa 628488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 629937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // clamp 630488992221056edaf7111f9290afdf216c5e98d62Tim Rowley const simd16scalar zero = _simd16_setzero_ps(); 631488992221056edaf7111f9290afdf216c5e98d62Tim Rowley const simd16scalar ones = _simd16_set1_ps(1.0f); 632488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 633488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp0 = _simd16_max_ps(comp0, zero); 634488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp0 = _simd16_min_ps(comp0, ones); 635488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 636488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp1 = _simd16_max_ps(comp1, zero); 637488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp1 = _simd16_min_ps(comp1, ones); 638488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 639488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp2 = _simd16_max_ps(comp2, zero); 640488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp2 = _simd16_min_ps(comp2, ones); 641488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 642488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp3 = _simd16_max_ps(comp3, zero); 643488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp3 = _simd16_min_ps(comp3, ones); 644488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 645937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // gamma-correct only rgb 646488992221056edaf7111f9290afdf216c5e98d62Tim Rowley if (FormatTraits<DstFormat>::isSRGB) 647488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 648488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); 649488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); 650488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp2 = 
FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); 651488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 652488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 653937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format 654488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 655488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 656488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 657488992221056edaf7111f9290afdf216c5e98d62Tim Rowley comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 658488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 659488992221056edaf7111f9290afdf216c5e98d62Tim Rowley // moving to 16 wide integer vector types 660488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr 661488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg 662488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb 663488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa 664488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 665937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // SOA to AOS conversion 6667aea08667c673713e1f419539e788eedeea047cbTim Rowley src1 = _simd16_slli_epi32(src1, 8); 667488992221056edaf7111f9290afdf216c5e98d62Tim Rowley src2 = _simd16_slli_epi32(src2, 16); 668488992221056edaf7111f9290afdf216c5e98d62Tim Rowley src3 = _simd16_slli_epi32(src3, 24); 
669488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 670488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 671488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 672937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // de-swizzle conversion 673488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if 1 674488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 675488992221056edaf7111f9290afdf216c5e98d62Tim Rowley simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 676488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 677488992221056edaf7111f9290afdf216c5e98d62Tim Rowley final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 678488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 679488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 680488992221056edaf7111f9290afdf216c5e98d62Tim Rowley final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 681488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 682488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 683937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // store 8x2 memory order: 684937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 685937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 686937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0)); 687937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), 
_simd16_extract_si(final, 1)); 688488992221056edaf7111f9290afdf216c5e98d62Tim Rowley} 689488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 690488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 6912550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT DstFormat> 6922550b04179614da4c71dbef195d06a7f53273438Tim RowleyINLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 6932550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 6942550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t offset = sizeof(simdscalar); 6952550b04179614da4c71dbef195d06a7f53273438Tim Rowley 6962550b04179614da4c71dbef195d06a7f53273438Tim Rowley // swizzle rgba -> bgra while we load 6972550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 6982550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 6992550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 7002550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa 7012550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7022550b04179614da4c71dbef195d06a7f53273438Tim Rowley // clamp 7032550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 7042550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 7052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7062550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 7072550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = 
_simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 7082550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7092550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 7102550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 7112550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7122550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps()); 7132550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f)); 7142550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7152550b04179614da4c71dbef195d06a7f53273438Tim Rowley if (FormatTraits<DstFormat>::isSRGB) 7162550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 7172550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Gamma-correct only rgb 7182550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 7192550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 7202550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 7212550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 7222550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7232550b04179614da4c71dbef195d06a7f53273438Tim Rowley // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 
255 dest format 7242550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 7252550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 7262550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 7272550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3))); 7282550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7292550b04179614da4c71dbef195d06a7f53273438Tim Rowley // moving to 8 wide integer vector types 7302550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 7312550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 7322550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 7332550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa 7342550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7352550b04179614da4c71dbef195d06a7f53273438Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX 7362550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7372550b04179614da4c71dbef195d06a7f53273438Tim Rowley // splitting into two sets of 4 wide integer vector types 7382550b04179614da4c71dbef195d06a7f53273438Tim Rowley // because AVX doesn't have instructions to support this operation at 8 wide 7392550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 7402550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 7412550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b 
7422550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a 7432550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7442550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 7452550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 7462550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 7472550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a 7482550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7492550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 7502550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 7512550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 7522550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 7532550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 7542550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 7552550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7562550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr 7572550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00 7582550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7592550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr 7602550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00 7612550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley 7622550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr 7632550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr 7642550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7652550b04179614da4c71dbef195d06a7f53273438Tim Rowley // unpack into rows that get the tiling order correct 7662550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr 7672550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); 7682550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7692550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i final = _mm256_castsi128_si256(vRow00); 7702550b04179614da4c71dbef195d06a7f53273438Tim Rowley final = _mm256_insertf128_si256(final, vRow10, 1); 7712550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7722550b04179614da4c71dbef195d06a7f53273438Tim Rowley#elif KNOB_ARCH >= KNOB_ARCH_AVX2 7732550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7742550b04179614da4c71dbef195d06a7f53273438Tim Rowley // logic is as above, only wider 7752550b04179614da4c71dbef195d06a7f53273438Tim Rowley src1 = _mm256_slli_si256(src1, 1); 7762550b04179614da4c71dbef195d06a7f53273438Tim Rowley src2 = _mm256_slli_si256(src2, 2); 7772550b04179614da4c71dbef195d06a7f53273438Tim Rowley src3 = _mm256_slli_si256(src3, 3); 7782550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7792550b04179614da4c71dbef195d06a7f53273438Tim Rowley src0 = _mm256_or_si256(src0, src1); 7802550b04179614da4c71dbef195d06a7f53273438Tim Rowley src2 = _mm256_or_si256(src2, src3); 7812550b04179614da4c71dbef195d06a7f53273438Tim Rowley 7822550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i final = _mm256_or_si256(src0, src2); 783488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if 0 784488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 
785488992221056edaf7111f9290afdf216c5e98d62Tim Rowley __m256i perm = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); 786488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 787488992221056edaf7111f9290afdf216c5e98d62Tim Rowley final = _mm256_permutevar8x32_epi32(final, perm); 788488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 789488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 7902550b04179614da4c71dbef195d06a7f53273438Tim Rowley // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 7912550b04179614da4c71dbef195d06a7f53273438Tim Rowley final = _mm256_permute4x64_epi64(final, 0xD8); 792488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 7932550b04179614da4c71dbef195d06a7f53273438Tim Rowley#endif 7942550b04179614da4c71dbef195d06a7f53273438Tim Rowley 795937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final); 796937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley} 797937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 798937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 799937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowleytemplate<SWR_FORMAT DstFormat> 800937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim RowleyINLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) 801937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley{ 802937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // swizzle rgba -> bgra while we load 803937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr 804937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg 805937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalar comp2 = 
_simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb 806937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 807937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // clamp 808937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const simd16scalar zero = _simd16_setzero_ps(); 809937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const simd16scalar ones = _simd16_set1_ps(1.0f); 810937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 811937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp0 = _simd16_max_ps(comp0, zero); 812937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp0 = _simd16_min_ps(comp0, ones); 813937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 814937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp1 = _simd16_max_ps(comp1, zero); 815937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp1 = _simd16_min_ps(comp1, ones); 816937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 817937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp2 = _simd16_max_ps(comp2, zero); 818937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp2 = _simd16_min_ps(comp2, ones); 819937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 820937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // gamma-correct only rgb 821937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley if (FormatTraits<DstFormat>::isSRGB) 822937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 823937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); 824937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); 825937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); 826937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 827937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 828937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // convert float 
components from 0.0f..1.0f to correct scale for 0..255 dest format 829937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 830937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 831937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 832937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 833937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // moving to 16 wide integer vector types 834937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr 835937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg 836937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb 837937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 838937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // SOA to AOS conversion 8397aea08667c673713e1f419539e788eedeea047cbTim Rowley src1 = _simd16_slli_epi32(src1, 8); 840937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley src2 = _simd16_slli_epi32(src2, 16); 841937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 842937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F 843937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 844937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // de-swizzle conversion 845937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if 1 846937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B 847937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F 848937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 849937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F 850937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 851937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 852937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); 853937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 854937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 855937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // store 8x2 memory order: 856937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } 857937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } 858937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0)); 859937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1)); 8602550b04179614da4c71dbef195d06a7f53273438Tim Rowley} 8612550b04179614da4c71dbef195d06a7f53273438Tim Rowley 862937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 8632550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT DstFormat> 8642550b04179614da4c71dbef195d06a7f53273438Tim RowleyINLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) 8652550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 8662550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t offset = sizeof(simdscalar); 8672550b04179614da4c71dbef195d06a7f53273438Tim Rowley 
8682550b04179614da4c71dbef195d06a7f53273438Tim Rowley // swizzle rgba -> bgra while we load 8692550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr 8702550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg 8712550b04179614da4c71dbef195d06a7f53273438Tim Rowley simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb 8722550b04179614da4c71dbef195d06a7f53273438Tim Rowley // clamp 8732550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); 8742550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); 8752550b04179614da4c71dbef195d06a7f53273438Tim Rowley 8762550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); 8772550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); 8782550b04179614da4c71dbef195d06a7f53273438Tim Rowley 8792550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); 8802550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); 8812550b04179614da4c71dbef195d06a7f53273438Tim Rowley 8822550b04179614da4c71dbef195d06a7f53273438Tim Rowley if (FormatTraits<DstFormat>::isSRGB) 8832550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 8842550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Gamma-correct only rgb 8852550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); 8862550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); 8872550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); 8882550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 8892550b04179614da4c71dbef195d06a7f53273438Tim Rowley 8902550b04179614da4c71dbef195d06a7f53273438Tim Rowley // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format 8912550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); 8922550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); 8932550b04179614da4c71dbef195d06a7f53273438Tim Rowley vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); 8942550b04179614da4c71dbef195d06a7f53273438Tim Rowley 8952550b04179614da4c71dbef195d06a7f53273438Tim Rowley // moving to 8 wide integer vector types 8962550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr 8972550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg 8982550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb 8992550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9002550b04179614da4c71dbef195d06a7f53273438Tim Rowley#if KNOB_ARCH == KNOB_ARCH_AVX 9012550b04179614da4c71dbef195d06a7f53273438Tim Rowley 90275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // splitting into two sets of 4 wide integer vector types 90375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // because AVX doesn't have instructions to support this operation at 8 wide 9042550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r 9052550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g 9062550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcLo2 = 
_mm256_castsi256_si128(src2); // 000b000b000b000b 9072550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9082550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r 9092550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g 9102550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b 9112550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9122550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 9132550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 9142550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 9152550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 9162550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9172550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr 9182550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9192550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr 9202550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9212550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr 9222550b04179614da4c71dbef195d06a7f53273438Tim Rowley srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr 9232550b04179614da4c71dbef195d06a7f53273438Tim Rowley 92475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // unpack into rows that get the tiling order correct 9252550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr 9262550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, 
srcHi0); 9272550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9282550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i final = _mm256_castsi128_si256(vRow00); 9292550b04179614da4c71dbef195d06a7f53273438Tim Rowley final = _mm256_insertf128_si256(final, vRow10, 1); 9302550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9312550b04179614da4c71dbef195d06a7f53273438Tim Rowley#elif KNOB_ARCH >= KNOB_ARCH_AVX2 9322550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9332550b04179614da4c71dbef195d06a7f53273438Tim Rowley // logic is as above, only wider 9342550b04179614da4c71dbef195d06a7f53273438Tim Rowley src1 = _mm256_slli_si256(src1, 1); 9352550b04179614da4c71dbef195d06a7f53273438Tim Rowley src2 = _mm256_slli_si256(src2, 2); 9362550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9372550b04179614da4c71dbef195d06a7f53273438Tim Rowley src0 = _mm256_or_si256(src0, src1); 9382550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9392550b04179614da4c71dbef195d06a7f53273438Tim Rowley __m256i final = _mm256_or_si256(src0, src2); 9402550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9412550b04179614da4c71dbef195d06a7f53273438Tim Rowley // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 9422550b04179614da4c71dbef195d06a7f53273438Tim Rowley final = _mm256_permute4x64_epi64(final, 0xD8); 9432550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9442550b04179614da4c71dbef195d06a7f53273438Tim Rowley#endif 9452550b04179614da4c71dbef195d06a7f53273438Tim Rowley 946937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final); 9472550b04179614da4c71dbef195d06a7f53273438Tim Rowley} 9482550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9492550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 9502550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> 9512550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 9522550b04179614da4c71dbef195d06a7f53273438Tim Rowley template 
<size_t NumDests> 9532550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 9542550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 955937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 956937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 957937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 9582550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 959937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 9602550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 9612550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 9622550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9632550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 9642550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> 9652550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 9662550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 9672550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 9682550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 969937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 970937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 971937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 9722550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 973937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 9742550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 9752550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 9762550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9772550b04179614da4c71dbef195d06a7f53273438Tim 
Rowleytemplate<> 9782550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > 9792550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 9802550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 9812550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 9822550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 983937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 984937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 985937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 9862550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 987937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 9882550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 9892550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 9902550b04179614da4c71dbef195d06a7f53273438Tim Rowley 9912550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 9922550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > 9932550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 9942550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 9952550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 9962550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 997937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 998937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 999937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 10002550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], 
ppDsts[1]); 1001937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 10022550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 10032550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 10042550b04179614da4c71dbef195d06a7f53273438Tim Rowley 10052550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 10062550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > 10072550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 10082550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 10092550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 10102550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 1011488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 1012488992221056edaf7111f9290afdf216c5e98d62Tim Rowley FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1013488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 10142550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 1015488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 10162550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 10172550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 10182550b04179614da4c71dbef195d06a7f53273438Tim Rowley 10192550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 10202550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > 10212550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 10222550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 10232550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 10242550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 1025937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 
1026937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1027937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 10282550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); 1029937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 10302550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 10312550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 10322550b04179614da4c71dbef195d06a7f53273438Tim Rowley 10332550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 10342550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > 10352550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 10362550b04179614da4c71dbef195d06a7f53273438Tim Rowley template <size_t NumDests> 10372550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 10382550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 1039488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 1040488992221056edaf7111f9290afdf216c5e98d62Tim Rowley FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1041488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 10422550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 1043488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 10442550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 10452550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 10462550b04179614da4c71dbef195d06a7f53273438Tim Rowley 10472550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<> 10482550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > 10492550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 10502550b04179614da4c71dbef195d06a7f53273438Tim Rowley 
template <size_t NumDests> 10512550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) 10522550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 1053937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 1054937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); 1055937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 10562550b04179614da4c71dbef195d06a7f53273438Tim Rowley FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); 1057937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 10582550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 10592550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 10602550b04179614da4c71dbef195d06a7f53273438Tim Rowley 10612550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 10622550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StoreRasterTile 10632550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 10642550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 10652550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StoreRasterTile 10662550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 10672550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 10682550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Retrieve color from hot tile source which is always float. 10692550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 10702550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 
10712550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param output - output color 10722550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void GetSwizzledSrcColor( 10732550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pSrc, 10742550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, 10752550b04179614da4c71dbef195d06a7f53273438Tim Rowley float outputColor[4]) 10762550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 1077488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 1078488992221056edaf7111f9290afdf216c5e98d62Tim Rowley typedef SimdTile_16<SrcFormat, DstFormat> SimdT; 1079488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1080937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc); 1081488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1082488992221056edaf7111f9290afdf216c5e98d62Tim Rowley // Compute which simd tile we're accessing within 8x8 tile. 1083488992221056edaf7111f9290afdf216c5e98d62Tim Rowley // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 
1084488992221056edaf7111f9290afdf216c5e98d62Tim Rowley uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM); 1085488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1086937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley SimdT *pSimdTile = &pSrcSimdTiles[simdIndex]; 1087488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1088488992221056edaf7111f9290afdf216c5e98d62Tim Rowley uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); 1089488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1090488992221056edaf7111f9290afdf216c5e98d62Tim Rowley pSimdTile->GetSwizzledColor(simdOffset, outputColor); 1091488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 10922550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef SimdTile<SrcFormat, DstFormat> SimdT; 10932550b04179614da4c71dbef195d06a7f53273438Tim Rowley 10942550b04179614da4c71dbef195d06a7f53273438Tim Rowley SimdT* pSrcSimdTiles = (SimdT*)pSrc; 10952550b04179614da4c71dbef195d06a7f53273438Tim Rowley 10962550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Compute which simd tile we're accessing within 8x8 tile. 10972550b04179614da4c71dbef195d06a7f53273438Tim Rowley // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. 
10982550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); 10992550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11002550b04179614da4c71dbef195d06a7f53273438Tim Rowley SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; 11012550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11022550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); 11032550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11042550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSimdTile->GetSwizzledColor(simdOffset, outputColor); 1105488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 11062550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 11072550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11082550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 11092550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 11102550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 11112550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 11122550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 11132550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 11142550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 11152550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 11162550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 
11172550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 11182550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 11192550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 11202550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11212550b04179614da4c71dbef195d06a7f53273438Tim Rowley // For each raster tile pixel (rx, ry) 11222550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) 11232550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 11242550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) 11252550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 11262550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Perform bounds checking. 11272550b04179614da4c71dbef195d06a7f53273438Tim Rowley if (((x + rx) < lodWidth) && 11282550b04179614da4c71dbef195d06a7f53273438Tim Rowley ((y + ry) < lodHeight)) 11292550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 11302550b04179614da4c71dbef195d06a7f53273438Tim Rowley float srcColor[4]; 11312550b04179614da4c71dbef195d06a7f53273438Tim Rowley GetSwizzledSrcColor(pSrc, rx, ry, srcColor); 11322550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11332550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry), 11342550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, 11352550b04179614da4c71dbef195d06a7f53273438Tim Rowley sampleNum, pDstSurface->lod, pDstSurface); 11362550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 11372550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelFromFloat<DstFormat>(pDst, srcColor); 11382550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 11392550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 
11402550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 11412550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 11422550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 11432550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 11442550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11452550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 11462550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat> 11472550b04179614da4c71dbef195d06a7f53273438Tim Rowley{}; 11482550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11492550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 11502550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp 11512550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 11522550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 11531b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> 11542550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 11552550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile; 11562550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 11572550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 11582550b04179614da4c71dbef195d06a7f53273438Tim Rowley 11592550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 11602550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores 
an 8x8 raster tile to the destination surface. 11612550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 11622550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 11632550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 11642550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 11652550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 11662550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 11672550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 11682550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 11692550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 11702550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 11712550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 117275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 117375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 11742550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 11752550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 11762550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 11772550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1178937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 11792550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1180937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim 
Rowley#if USE_8x2_TILE_BACKEND 11812550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1182937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1183937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1184937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1185937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* ppDsts[] = 1186937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1187937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst, // row 0, col 0 1188937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + pDstSurface->pitch, // row 1, col 0 1189937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + dx / 2, // row 0, col 1 1190937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1191937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1192937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1193937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1194937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1195937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1196937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1197937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1198937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1199937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1200937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1201937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dx; 1202937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dx; 1203937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dx; 1204937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
ppDsts[3] += dx; 1205937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1206937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1207937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dy; 1208937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dy; 1209937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dy; 1210937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dy; 1211937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1212937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 1213937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 1214937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1215937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 12162550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 12172550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 12182550b04179614da4c71dbef195d06a7f53273438Tim Rowley 12192550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 12202550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 12212550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Format conversion and convert from SOA to AOS, and store the rows. 
12222550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 12232550b04179614da4c71dbef195d06a7f53273438Tim Rowley 12242550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 12252550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 12262550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 12272550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 12282550b04179614da4c71dbef195d06a7f53273438Tim Rowley 12292550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 12302550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 12312550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1232937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 12332550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 12342550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 12352550b04179614da4c71dbef195d06a7f53273438Tim Rowley 12362550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 12372550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp 12382550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 12392550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 12401b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> 12412550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 12422550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile; 12432550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 12442550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 12452550b04179614da4c71dbef195d06a7f53273438Tim Rowley 12462550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 12472550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 12482550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 12492550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 12502550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 12512550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 12522550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 12532550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 12542550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 12552550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 12562550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 12572550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 12582550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 125975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 126075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 12612550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 12622550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 
12632550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 12642550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1265937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 12662550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1267937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 1268937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1269937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1270937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1271937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1272937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* ppDsts[] = 1273937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1274937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst, // row 0, col 0 1275937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + pDstSurface->pitch, // row 1, col 0 1276937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + dx / 2, // row 0, col 1 1277937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1278937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1279937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1280937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1281937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1282937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1283937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1284937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 
1285937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1286937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1287937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1288937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dx; 1289937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dx; 1290937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dx; 1291937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dx; 1292937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1293937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1294937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dy; 1295937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dy; 1296937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dy; 1297937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dy; 1298937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1299937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 13002550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 13012550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13022550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 13032550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 13042550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 13052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13062550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 13072550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 13082550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Format conversion and convert from SOA to AOS, and store the rows. 
13092550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 13102550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13112550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 13122550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 13132550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 13142550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 13152550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13162550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 13172550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 13182550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1319937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 13202550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 13212550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 13222550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13232550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 13242550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp 13252550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 13262550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 13271b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> 13282550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 13292550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile; 13302550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 13312550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 13322550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13332550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 13342550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 13352550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 13362550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 13372550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 13382550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 13392550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 13402550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 13412550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 13422550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 13432550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 13442550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 13452550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 134675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 134775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 13482550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 13492550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 
13502550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 13512550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1352937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 13532550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1354488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#if USE_8x2_TILE_BACKEND 1355488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1356937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1357937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1358937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1359937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* ppDsts[] = 1360488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 1361937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst, // row 0, col 0 1362937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + pDstSurface->pitch, // row 1, col 0 1363937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + dx / 2, // row 0, col 1 1364937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 1365937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1366488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1367937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1368937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1369937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1370488992221056edaf7111f9290afdf216c5e98d62Tim Rowley { 1371937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 
1372488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1373488992221056edaf7111f9290afdf216c5e98d62Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1374937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1375937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dx; 1376937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dx; 1377937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dx; 1378937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dx; 1379488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 1380488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1381937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dy; 1382937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dy; 1383937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dy; 1384937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dy; 1385488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 1386488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 13872550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; 13882550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13892550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 13902550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 13912550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; 13922550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13932550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 13942550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 13952550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Format conversion and convert from SOA to AOS, and store the rows. 
13962550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); 13972550b04179614da4c71dbef195d06a7f53273438Tim Rowley 13982550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 13992550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 14002550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH; 14012550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 14022550b04179614da4c71dbef195d06a7f53273438Tim Rowley 14032550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 14042550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 14052550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1406488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 14072550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 14082550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 14092550b04179614da4c71dbef195d06a7f53273438Tim Rowley 14102550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 14112550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp 14122550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 141375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 14142550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> 14152550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 14162550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile; 14172550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1418937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 14192550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t MAX_DST_COLUMN_BYTES = 16; 1420937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if !USE_8x2_TILE_BACKEND 14212550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 14222550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1423937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 14242550b04179614da4c71dbef195d06a7f53273438Tim Rowley 14252550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 14262550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 14272550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 14282550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 14292550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 
14302550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 14312550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 14322550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 14332550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 14342550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 14352550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 14362550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 14372550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 143875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 143975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 14402550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 14412550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 14422550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 14432550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1444937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 14452550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1446937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 1447937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1448937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1449937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 
1450937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1451937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1452937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets"); 1453937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1454937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *ppDsts[] = 1455937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 145675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst, // row 0, col 0 145775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch, // row 1, col 0 145875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 145975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 146075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 146175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 146275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 146375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 1464937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1465937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1466937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1467937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1468937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // Raster tile width is same as simd16 tile width 1469937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1470937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
1471937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1472937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1473937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1474937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1475937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 1476937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1477937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[i] += dy; 1478937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1479937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1480937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 14812550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[] = 14822550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 14832550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDst, // row 0, col 0 14842550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDst + pDstSurface->pitch, // row 1, col 0 14852550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 14862550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 14872550b04179614da4c71dbef195d06a7f53273438Tim Rowley }; 14882550b04179614da4c71dbef195d06a7f53273438Tim Rowley 14892550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 14902550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 14912550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppStartRows[] = 14922550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 14932550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[0], 14942550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[1], 14952550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[2], 14962550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[3], 
14972550b04179614da4c71dbef195d06a7f53273438Tim Rowley }; 14982550b04179614da4c71dbef195d06a7f53273438Tim Rowley 14992550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 15002550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 15012550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Format conversion and convert from SOA to AOS, and store the rows. 15022550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 15032550b04179614da4c71dbef195d06a7f53273438Tim Rowley 15042550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 15052550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 15062550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 15072550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 15082550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += SRC_COLUMN_BYTES; 15092550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 15102550b04179614da4c71dbef195d06a7f53273438Tim Rowley 15112550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; 15122550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; 15132550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; 15142550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; 15152550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1516937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 15172550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 15182550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 15192550b04179614da4c71dbef195d06a7f53273438Tim Rowley 15202550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley////////////////////////////////////////////////////////////////////////// 15212550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp 15222550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 15232550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 15242550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> 15252550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 15262550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; 15272550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1528937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 15292550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t MAX_DST_COLUMN_BYTES = 16; 1530937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if !USE_8x2_TILE_BACKEND 15312550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 15322550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; 1533937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 15342550b04179614da4c71dbef195d06a7f53273438Tim Rowley 15352550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 15362550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 15372550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 
15382550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 15392550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 15402550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 15412550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 15422550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 15432550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 15442550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 15452550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 15462550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 15472550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 154875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 154975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 15502550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 15512550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 15522550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 15532550b04179614da4c71dbef195d06a7f53273438Tim Rowley 1554937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 15552550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1556937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 1557937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1558937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 155975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; 1560937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1561937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1562937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets"); 1563937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1564937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* ppDsts[] = 1565937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 156675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst, // row 0, col 0 156775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch, // row 1, col 0 156875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 156975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 157075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 157175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 157275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 157375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 157475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 4, // row 0, col 4 157575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4, // row 1, col 4 157675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 5, // row 0, col 5 157775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + 
MAX_DST_COLUMN_BYTES * 5, // row 1, col 5 157875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 6, // row 0, col 6 157975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6, // row 1, col 6 158075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + MAX_DST_COLUMN_BYTES * 7, // row 0, col 7 158175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7, // row 1, col 7 1582937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1583937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 158475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1585937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1586937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // Raster tile width is same as simd16 tile width 158775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 1588937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1589937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1590937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1591937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1592937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1593937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 1594937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1595937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[i] += dy; 1596937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1597937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1598937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 15992550b04179614da4c71dbef195d06a7f53273438Tim Rowley struct DstPtrs 16002550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 
16012550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[8]; 16022550b04179614da4c71dbef195d06a7f53273438Tim Rowley } ptrs; 16032550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16042550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Need 8 pointers, 4 columns of 2 rows each 16052550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t y = 0; y < 2; ++y) 16062550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 16072550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t x = 0; x < 4; ++x) 16082550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 16092550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; 16102550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 16112550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 16122550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16132550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 16142550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 16152550b04179614da4c71dbef195d06a7f53273438Tim Rowley DstPtrs startPtrs = ptrs; 16162550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16172550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 16182550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 16192550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Format conversion and convert from SOA to AOS, and store the rows. 
16202550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); 16212550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16222550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 16232550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 16242550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 16252550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 16262550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; 16272550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; 16282550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; 16292550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; 16302550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += SRC_COLUMN_BYTES; 16312550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 16322550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16332550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; 16342550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; 16352550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; 16362550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; 16372550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; 16382550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; 16392550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; 
16402550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; 16412550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1642937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 16432550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 16442550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 16452550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16462550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 16472550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp 16482550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 16492550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 16502550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> 16512550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 16522550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; 1653937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 16542550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16552550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 16562550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 16572550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 16582550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 16592550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 
16602550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 16612550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 16622550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 16632550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 16642550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 16652550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t DestRowWidthBytes = 16; // 16B rows 16662550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16672550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 16682550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 16692550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 167075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 167175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 16722550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 16732550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 16742550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 16752550b04179614da4c71dbef195d06a7f53273438Tim Rowley 16762550b04179614da4c71dbef195d06a7f53273438Tim Rowley // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 16772550b04179614da4c71dbef195d06a7f53273438Tim Rowley // We can compute the offsets to each column within the raster tile once and increment from these. 1678937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 167975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 4 8x2 simd tiles in an 8x8 raster tile. 
1680937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1681937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1682937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1683937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1684937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 168575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1686937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *ppDsts[] = 1687937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1688937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst, 1689937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes, 1690937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes / 4, 1691937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes + DestRowWidthBytes / 4 1692937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1693937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1694937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1695937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 169675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // Raster tile width is same as simd16 tile width 169775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 169875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 1699937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1700937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1701937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1702937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1703937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dy; 1704937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dy; 1705937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dy; 1706937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dy; 1707937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1708937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 170975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 8 4x2 simd tiles in an 8x8 raster tile. 17102550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 17112550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 17122550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17132550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 17142550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 17152550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17162550b04179614da4c71dbef195d06a7f53273438Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
17172550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 17182550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 17192550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t rowOffset = row * DestRowWidthBytes; 17202550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17212550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pRow = pCol0 + rowOffset; 17222550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 17232550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17242550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 17252550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 17262550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17272550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[0] += DestRowWidthBytes / 4; 17282550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[1] += DestRowWidthBytes / 4; 17292550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17302550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 17312550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 17322550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1733937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 17342550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 17352550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 17362550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17372550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 17382550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp 17392550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 17402550b04179614da4c71dbef195d06a7f53273438Tim 
Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 17412550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> 17422550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 17432550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; 1744937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 17452550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17462550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 17472550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 17482550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 17492550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 17502550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 
17512550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 17522550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 17532550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 17542550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 17552550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 17562550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t DestRowWidthBytes = 16; // 16B rows 17572550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17582550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 17592550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 17602550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 176175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 176275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 17632550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 17642550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 17652550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 17662550b04179614da4c71dbef195d06a7f53273438Tim Rowley 17672550b04179614da4c71dbef195d06a7f53273438Tim Rowley // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 17682550b04179614da4c71dbef195d06a7f53273438Tim Rowley // We can compute the offsets to each column within the raster tile once and increment from these. 1769937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 177075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 4 8x2 simd tiles in an 8x8 raster tile. 
1771937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1772937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1773937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1774937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1775937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 177675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 1777937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *ppDsts[] = 1778937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1779937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst, 1780937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes, 1781937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes / 2, 1782937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes + DestRowWidthBytes / 2 1783937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1784937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1785937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1786937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 178775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // Raster tile width is same as simd16 tile width 178875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 178975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 1790937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1791937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1792937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1793937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1794937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dy; 1795937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dy; 1796937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dy; 1797937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dy; 1798937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1799937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 180075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 8 4x2 simd tiles in an 8x8 raster tile. 18012550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 18022550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 18032550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18042550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 18052550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 18062550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18072550b04179614da4c71dbef195d06a7f53273438Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
18082550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 18092550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 18102550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t rowOffset = row * DestRowWidthBytes; 18112550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18122550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pRow = pCol0 + rowOffset; 18132550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 18142550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18152550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 18162550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 18172550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18182550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[0] += DestRowWidthBytes / 2; 18192550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[1] += DestRowWidthBytes / 2; 18202550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18212550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 18222550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 18232550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1824937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 18252550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 18262550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 18272550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18282550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 18292550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp 18302550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 18312550b04179614da4c71dbef195d06a7f53273438Tim 
Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 18322550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> 18332550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 18342550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1835937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 1836937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 18372550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18382550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 18392550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 18402550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 18412550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 18422550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 
18432550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 18442550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 18452550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 18462550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 18472550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 18482550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t DestRowWidthBytes = 512; // 512B rows 18492550b04179614da4c71dbef195d06a7f53273438Tim Rowley 185075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // Punt non-full tiles to generic store 18512550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 18522550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 185375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 185475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 18552550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 18562550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 18572550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 18582550b04179614da4c71dbef195d06a7f53273438Tim Rowley 18592550b04179614da4c71dbef195d06a7f53273438Tim Rowley // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. 18602550b04179614da4c71dbef195d06a7f53273438Tim Rowley // We can compute the offsets to each column within the raster tile once and increment from these. 
1861937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 1862937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1863937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1864937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1865937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1866937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; 1867937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1868937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* ppDsts[] = 1869937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1870937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst, // row 0, col 0 1871937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes, // row 1, col 0 1872937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + dx / 2, // row 0, col 1 1873937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDst + DestRowWidthBytes + dx / 2 // row 1, col 1 1874937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1875937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1876937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1877937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1878937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) 1879937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 1880937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1881937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1882937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pSrc += 
KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1883937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1884937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dx; 1885937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dx; 1886937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dx; 1887937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dx; 1888937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1889937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1890937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dy; 1891937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dy; 1892937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dy; 1893937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dy; 1894937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 1895937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 1896937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 18972550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 18982550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pRow1 = pRow0 + DestRowWidthBytes; 18992550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19002550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 19012550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 19022550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) 19032550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 19042550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8); 19052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19062550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[] = { pRow0 + 
xRowOffset, pRow1 + xRowOffset }; 19072550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 19082550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19092550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 19102550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 19112550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 19122550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19132550b04179614da4c71dbef195d06a7f53273438Tim Rowley pRow0 += (DestRowWidthBytes * 2); 19142550b04179614da4c71dbef195d06a7f53273438Tim Rowley pRow1 += (DestRowWidthBytes * 2); 19152550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 1916937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 19172550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 19182550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 19192550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19202550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 19212550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp 19222550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 19232550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 19242550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> 19252550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 19262550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; 1927488992221056edaf7111f9290afdf216c5e98d62Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 
19282550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19292550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 19302550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 19312550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 19322550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 19332550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 19342550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 19352550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 19362550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 19372550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 19382550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 19392550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t DestRowWidthBytes = 16; // 16B rows 19402550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 
19412550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19422550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 19432550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 19442550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 194575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 194675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 19472550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 19482550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 19492550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 19502550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19512550b04179614da4c71dbef195d06a7f53273438Tim Rowley // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 19522550b04179614da4c71dbef195d06a7f53273438Tim Rowley // We can compute the offsets to each column within the raster tile once and increment from these. 1953937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 195475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 4 8x2 simd tiles in an 8x8 raster tile. 
1955937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 19562550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 19572550b04179614da4c71dbef195d06a7f53273438Tim Rowley 195875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 1959937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 1960488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 196175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 196275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley uint8_t *ppDsts[] = 1963937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 196475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst, // row 0, col 0 196575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes, // row 1, col 0 196675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes, // row 0, col 1 196775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes // row 1, col 1 1968937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 1969488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1970937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 1971937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 197275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // Raster tile width is same as simd16 tile width 197375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 197475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 
1975488992221056edaf7111f9290afdf216c5e98d62Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 1976488992221056edaf7111f9290afdf216c5e98d62Tim Rowley 1977488992221056edaf7111f9290afdf216c5e98d62Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 1978937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 1979937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[0] += dy; 1980937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[1] += dy; 1981937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[2] += dy; 1982937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[3] += dy; 1983488992221056edaf7111f9290afdf216c5e98d62Tim Rowley } 1984488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#else 198575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 8 4x2 simd tiles in an 8x8 raster tile. 1986937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 1987937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 1988937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 19892550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 19902550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 19912550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19922550b04179614da4c71dbef195d06a7f53273438Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
19932550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 19942550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 19952550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t rowOffset = row * DestRowWidthBytes; 19962550b04179614da4c71dbef195d06a7f53273438Tim Rowley 19972550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pRow = pCol0 + rowOffset; 19982550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; 19992550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20002550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 20012550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 20022550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20032550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[0] += DestColumnBytes; 20042550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[1] += DestColumnBytes; 20052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20062550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 20072550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 20082550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 2009488992221056edaf7111f9290afdf216c5e98d62Tim Rowley#endif 20102550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 20112550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 20122550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20132550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 20142550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp 20152550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 20162550b04179614da4c71dbef195d06a7f53273438Tim 
Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 20172550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> 20182550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 20192550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; 2020937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 20212550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20222550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 20232550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 20242550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 20252550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 20262550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 20272550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 20282550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 20292550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 20302550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 20312550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 20322550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t DestRowWidthBytes = 16; // 16B rows 20332550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 
20342550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20352550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 20362550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 20372550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 203875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 203975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 20402550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 20412550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 20422550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 20432550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20442550b04179614da4c71dbef195d06a7f53273438Tim Rowley // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 20452550b04179614da4c71dbef195d06a7f53273438Tim Rowley // We can compute the offsets to each column within the raster tile once and increment from these. 2046937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 204775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 4 8x2 simd tiles in an 8x8 raster tile. 
2048937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2049937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2050937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2051937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 205275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 2053937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 205475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 2055937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *ppDsts[] = 2056937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 205775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst, // row 0, col 0 205875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes, // row 1, col 0 205975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes, // row 0, col 1 206075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 206175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 2, // row 0, col 2 206275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 206375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 3, // row 0, col 3 206475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 3 // row 1, col 3 2065937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley }; 2066937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 
2067937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 2068937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 206975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // Raster tile width is same as simd16 tile width 207075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 207175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 2072937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2073937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2074937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 2075937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2076937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 2077937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 2078937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[i] += dy; 2079937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 2080937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 2081937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 208275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 8 4x2 simd tiles in an 8x8 raster tile. 2083937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 20842550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 20852550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* pCol1 = pCol0 + DestColumnBytes; 20862550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20872550b04179614da4c71dbef195d06a7f53273438Tim Rowley // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. 
20882550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 20892550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; 20902550b04179614da4c71dbef195d06a7f53273438Tim Rowley 20912550b04179614da4c71dbef195d06a7f53273438Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 20922550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) 20932550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 20942550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t rowOffset = row * DestRowWidthBytes; 20952550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[] = 20962550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 20972550b04179614da4c71dbef195d06a7f53273438Tim Rowley pCol0 + rowOffset, 20982550b04179614da4c71dbef195d06a7f53273438Tim Rowley pCol0 + rowOffset + DestRowWidthBytes, 20992550b04179614da4c71dbef195d06a7f53273438Tim Rowley pCol1 + rowOffset, 21002550b04179614da4c71dbef195d06a7f53273438Tim Rowley pCol1 + rowOffset + DestRowWidthBytes, 21012550b04179614da4c71dbef195d06a7f53273438Tim Rowley }; 21022550b04179614da4c71dbef195d06a7f53273438Tim Rowley 21032550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 21042550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 21052550b04179614da4c71dbef195d06a7f53273438Tim Rowley 21062550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[0] += DestColumnBytes * 2; 21072550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[1] += DestColumnBytes * 2; 21082550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[2] += DestColumnBytes * 2; 21092550b04179614da4c71dbef195d06a7f53273438Tim Rowley ppDsts[3] += DestColumnBytes * 2; 21102550b04179614da4c71dbef195d06a7f53273438Tim Rowley 
21112550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 21122550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += pSrcInc; 21132550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 2114937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 21152550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 21162550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 21172550b04179614da4c71dbef195d06a7f53273438Tim Rowley 21182550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 21192550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp 21202550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 21212550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 21222550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> 21232550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 21245dd0b8d3c635b67d8274c64653d825b8855b8167Ilia Mirkin typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile; 2125937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 2126937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2127937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2128937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 2129937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t TILE_Y_COL_WIDTH_BYTES = 16; 2130937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t TILE_Y_ROWS = 32; 2131937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; 
2132937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2133937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; 2134937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; 2135937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t MAX_DST_COLUMN_BYTES = 16; 2136937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2137937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; 21382550b04179614da4c71dbef195d06a7f53273438Tim Rowley static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; 21392550b04179614da4c71dbef195d06a7f53273438Tim Rowley 2140937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 21412550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 21422550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores an 8x8 raster tile to the destination surface. 21432550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to raster tile. 21442550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 21452550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to raster tile. 
21462550b04179614da4c71dbef195d06a7f53273438Tim Rowley INLINE static void Store( 21472550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrc, 21482550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 21492550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) 21502550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 2151937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 2152937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const uint32_t DestRowWidthBytes = 16; // 16B rows 2153937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 2154937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 2155937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 21562550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Punt non-full tiles to generic store 21572550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); 21582550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); 215975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 216075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight) 21612550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 21622550b04179614da4c71dbef195d06a7f53273438Tim Rowley return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); 21632550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 21642550b04179614da4c71dbef195d06a7f53273438Tim Rowley 216575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. 
216675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // We can compute the offsets to each column within the raster tile once and increment from these. 2167937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#if USE_8x2_TILE_BACKEND 216875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 4 8x2 simd tiles in an 8x8 raster tile. 2169937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 2170937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 2171937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2172937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) 217375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; 2174937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 217575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
2176937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t *ppDsts[] = 2177937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 217875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst, // row 0, col 0 217975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes, // row 1, col 0 218075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes, // row 0, col 1 218175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes, // row 1, col 1 218275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 2, // row 0, col 2 218375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2 218475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 3, // row 0, col 3 218575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3 218675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 4, // row 0, col 4 218775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4 218875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 5, // row 0, col 5 218975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5 219075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 6, // row 0, col 6 219175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6 219275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestColumnBytes * 7, // row 0, col 7 219375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDst + DestRowWidthBytes + DestColumnBytes * 7 // row 1, col 7 219475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley }; 2195937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 219675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley for 
(uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) 2197937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 219875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // Raster tile width is same as simd16 tile width 219975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); 2200937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2201937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); 2202937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2203937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; 2204937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley 2205937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) 2206937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley { 2207937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley ppDsts[i] += dy; 2208937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 2209937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley } 2210937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#else 221175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // There will be 8 4x2 simd tiles in an 8x8 raster tile. 
2212937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, 22132550b04179614da4c71dbef195d06a7f53273438Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); 22142550b04179614da4c71dbef195d06a7f53273438Tim Rowley struct DstPtrs 22152550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22162550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t* ppDsts[8]; 22172550b04179614da4c71dbef195d06a7f53273438Tim Rowley } ptrs; 22182550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22192550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Need 8 pointers, 4 columns of 2 rows each 22202550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t y = 0; y < 2; ++y) 22212550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22222550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t x = 0; x < 4; ++x) 22232550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22242550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; 22252550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22262550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22272550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22282550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) 22292550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22302550b04179614da4c71dbef195d06a7f53273438Tim Rowley DstPtrs startPtrs = ptrs; 22312550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22322550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) 22332550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22342550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Format conversion and convert from SOA to AOS, and store the rows. 
22352550b04179614da4c71dbef195d06a7f53273438Tim Rowley ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); 22362550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22372550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; 22382550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; 22392550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; 22402550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; 22412550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; 22422550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; 22432550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; 22442550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; 22452550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrc += SRC_COLUMN_BYTES; 22462550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22472550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22482550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; 22492550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; 22502550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; 22512550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; 22522550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; 22532550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; 22542550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * 
TILE_Y_COL_WIDTH_BYTES; 22552550b04179614da4c71dbef195d06a7f53273438Tim Rowley ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; 22562550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 2257937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017Tim Rowley#endif 22582550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22592550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 22602550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22612550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 22622550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// StoreMacroTile - Stores a macro tile which consists of raster tiles. 22632550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 22642550b04179614da4c71dbef195d06a7f53273438Tim Rowleytemplate<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> 22652550b04179614da4c71dbef195d06a7f53273438Tim Rowleystruct StoreMacroTile 22662550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 22672550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 22682550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores a macrotile to the destination surface using safe implementation. 22692550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to macro tile. 
22702550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 22712550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to macro tile 22722550b04179614da4c71dbef195d06a7f53273438Tim Rowley static void StoreGeneric( 22732550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrcHotTile, 22742550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 22752550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 22762550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22772550b04179614da4c71dbef195d06a7f53273438Tim Rowley PFN_STORE_TILES_INTERNAL pfnStore; 22782550b04179614da4c71dbef195d06a7f53273438Tim Rowley pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 22792550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22802550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Store each raster tile from the hot tile to the destination surface. 
22812550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 22822550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22832550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 22842550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22852550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 22862550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 22872550b04179614da4c71dbef195d06a7f53273438Tim Rowley pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 22882550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 22892550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22902550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22912550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22922550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22932550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 22942550b04179614da4c71dbef195d06a7f53273438Tim Rowley 22952550b04179614da4c71dbef195d06a7f53273438Tim Rowley typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t); 22962550b04179614da4c71dbef195d06a7f53273438Tim Rowley ////////////////////////////////////////////////////////////////////////// 22972550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @brief Stores a macrotile to the destination surface. 22982550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pSrc - Pointer to macro tile. 
22992550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param pDstSurface - Destination surface state 23002550b04179614da4c71dbef195d06a7f53273438Tim Rowley /// @param x, y - Coordinates to macro tile 23012550b04179614da4c71dbef195d06a7f53273438Tim Rowley static void Store( 23022550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint8_t *pSrcHotTile, 23032550b04179614da4c71dbef195d06a7f53273438Tim Rowley SWR_SURFACE_STATE* pDstSurface, 23042550b04179614da4c71dbef195d06a7f53273438Tim Rowley uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) 23052550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 23062550b04179614da4c71dbef195d06a7f53273438Tim Rowley PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; 230775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 23082550b04179614da4c71dbef195d06a7f53273438Tim Rowley for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 23092550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 231075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>( 231175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 0, 231275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 0, 231375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces 231475149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays 231575149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley sampleNum, 231675149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDstSurface->lod, 231775149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pDstSurface); 231875149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 231975149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear 232075149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley bool bForceGeneric = ((pDstSurface->tileMode != 
SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || 232175149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley (pDstSurface->bInterleavedSamples); 232275149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley 232375149088bea168a10f47df08fc62bcfeed744ce9Tim Rowley pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store; 23242550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 23252550b04179614da4c71dbef195d06a7f53273438Tim Rowley 23262550b04179614da4c71dbef195d06a7f53273438Tim Rowley // Store each raster tile from the hot tile to the destination surface. 23272550b04179614da4c71dbef195d06a7f53273438Tim Rowley for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) 23282550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 23292550b04179614da4c71dbef195d06a7f53273438Tim Rowley for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) 23302550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 23312550b04179614da4c71dbef195d06a7f53273438Tim Rowley for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) 23322550b04179614da4c71dbef195d06a7f53273438Tim Rowley { 23332550b04179614da4c71dbef195d06a7f53273438Tim Rowley pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex); 23342550b04179614da4c71dbef195d06a7f53273438Tim Rowley pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8); 23352550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 23362550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 23372550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 23382550b04179614da4c71dbef195d06a7f53273438Tim Rowley } 23392550b04179614da4c71dbef195d06a7f53273438Tim Rowley}; 23402550b04179614da4c71dbef195d06a7f53273438Tim Rowley 23412550b04179614da4c71dbef195d06a7f53273438Tim 
Rowley////////////////////////////////////////////////////////////////////////// 23422550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// InitStoreTilesTable - Helper for setting up the tables. 23431b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 23442550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableColor_Half1( 23452550b04179614da4c71dbef195d06a7f53273438Tim Rowley PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) 23462550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 23471b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; 23481b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; 23491b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; 23501b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; 23511b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32A32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store; 23521b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32A32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store; 23531b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; 23541b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32_SINT] = 
StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; 23551b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; 23561b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store; 23571b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32B32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store; 23581b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; 23591b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; 23601b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; 23611b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; 23621b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; 23631b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; 23641b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store; 23651b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32_UINT] = 
StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store; 23661b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 23671b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][X32_TYPELESS_G8X24_UINT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store; 23681b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; 23691b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; 23701b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16A16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store; 23711b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16A16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store; 23721b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store; 23731b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32G32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store; 23741b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; 23751b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; 
23761b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; 23771b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; 23781b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; 23791b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; 23801b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; 23811b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; 23821b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8A8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; 23831b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; 23841b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; 23851b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; 23861b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store; 
23871b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store; 23881b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; 23891b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; 23901b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; 23911b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; 23921b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10_FLOAT_A2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric; 23931b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store; 23941b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store; 23951b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store; 23961b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric; 23971b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][X24_TYPELESS_G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, 
X24_TYPELESS_G8_UINT>::StoreGeneric; 23981b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][A32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store; 23991b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; 24001b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; 24011b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; 24021b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; 24032550b04179614da4c71dbef195d06a7f53273438Tim Rowley} 24042550b04179614da4c71dbef195d06a7f53273438Tim Rowley 24051b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT> 24062550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableColor_Half2( 24072550b04179614da4c71dbef195d06a7f53273438Tim Rowley PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT]) 24082550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 24091b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R9G9B9E5_SHAREDEXP] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric; 24101b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; 24111b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10X2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, 
R10G10B10X2_USCALED>::StoreGeneric; 24121b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8A8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store; 24131b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8A8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store; 24141b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store; 24151b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store; 24161b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store; 24171b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store; 24181b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; 24191b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; 24201b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; 24211b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; 24221b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, 
B4G4R4A4_UNORM>::StoreGeneric; 24231b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; 24241b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; 24251b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; 24261b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store; 24271b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store; 24281b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store; 24291b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store; 24301b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store; 24311b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store; 24321b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store; 24331b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][A16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store; 24341b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][A16_FLOAT] = 
StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store; 24351b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; 24361b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; 24371b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store; 24381b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store; 24391b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store; 24401b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store; 24411b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][A1B5G5R5_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric; 24421b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][A4B4G4R4_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric; 24431b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store; 24441b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store; 24451b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, 
R32G32B32A32_FLOAT, R8_SINT>::Store; 24461b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; 24471b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][A8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store; 24481b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store; 24491b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store; 24501b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; 24511b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; 24521b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store; 24531b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store; 24541b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; 24551b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store; 24561b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store; 24571b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim 
Rowley table[TTileMode][R16G16B16_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store; 24581b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store; 24591b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store; 24601b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store; 24611b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store; 24621b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric; 24631b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric; 24641b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric; 24651b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric; 24661b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric; 24671b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10A2_USCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, 
B10G10R10A2_USCALED>::StoreGeneric; 24681b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10A2_SSCALED] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric; 24691b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric; 24701b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric; 24711b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store; 24721b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store; 24732550b04179614da4c71dbef195d06a7f53273438Tim Rowley} 24742550b04179614da4c71dbef195d06a7f53273438Tim Rowley 24752550b04179614da4c71dbef195d06a7f53273438Tim Rowley////////////////////////////////////////////////////////////////////////// 24762550b04179614da4c71dbef195d06a7f53273438Tim Rowley/// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables. 
24771b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 24782550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableDepth( 24792550b04179614da4c71dbef195d06a7f53273438Tim Rowley PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 24802550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 24811b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R32_FLOAT] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store; 248245d9cd36fe9a3132e32f3efda0fbcbade2c71d21Ilia Mirkin table[TTileMode][R32_FLOAT_X8X24_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store; 24831b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store; 24841b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R16_UNORM] = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store; 24852550b04179614da4c71dbef195d06a7f53273438Tim Rowley} 24862550b04179614da4c71dbef195d06a7f53273438Tim Rowley 24871b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowleytemplate <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT> 24882550b04179614da4c71dbef195d06a7f53273438Tim Rowleyvoid InitStoreTilesTableStencil( 24892550b04179614da4c71dbef195d06a7f53273438Tim Rowley PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT]) 24902550b04179614da4c71dbef195d06a7f53273438Tim Rowley{ 24911b86c050adcb9c166c2aab2f4c6e41cc07686bf3Tim Rowley table[TTileMode][R8_UINT] = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store; 24922550b04179614da4c71dbef195d06a7f53273438Tim Rowley} 2493