1c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/****************************************************************************
2c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
4c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Permission is hereby granted, free of charge, to any person obtaining a
5c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* copy of this software and associated documentation files (the "Software"),
6c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* to deal in the Software without restriction, including without limitation
7c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* and/or sell copies of the Software, and to permit persons to whom the
9c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software is furnished to do so, subject to the following conditions:
10c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
11c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* The above copyright notice and this permission notice (including the next
12c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* paragraph) shall be included in all copies or substantial portions of the
13c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* Software.
14c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
15c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* IN THE SOFTWARE.
22c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
23c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @file frontend.cpp
24c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
25c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley* @brief Implementation for Frontend which handles vertex processing,
26c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*        primitive assembly, clipping, binning, etc.
27c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley*
28c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley******************************************************************************/
29c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
30c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "api.h"
31c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "frontend.h"
32c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "backend.h"
33c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "context.h"
34c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "rdtsc_core.h"
35c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "utils.h"
36c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "threads.h"
37c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "pa.h"
38c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "clip.h"
39c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "tilemgr.h"
40c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#include "tessellator.h"
410487377dcec9122173c963360f8d302d071d3434Tim Rowley#include <limits>
42c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
43c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
44c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Helper macro to generate a bitmask
45c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic INLINE uint32_t GenMask(uint32_t numBits)
46c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
47c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return ((1U << numBits) - 1);
49c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
50c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
51c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
52c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief FE handler for SwrSync.
53c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pContext - pointer to SWR context.
54c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
55c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
56c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pUserData - Pointer to user data passed back to sync callback.
57c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @todo This should go away when we switch this to use compute threading.
58c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid ProcessSync(
59c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_CONTEXT *pContext,
60c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT *pDC,
61c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
62c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void *pUserData)
63c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
64c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    BE_WORK work;
65c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    work.type = SYNC;
66c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    work.pfnWork = ProcessSyncBE;
67c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
68c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    MacroTileMgr *pTileMgr = pDC->pTileMgr;
69c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    pTileMgr->enqueue(0, 0, &work);
70c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
71c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
72c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
7392ec820244710e1b13267d8e93f3a81d7114080eTim Rowley/// @brief FE handler for SwrDestroyContext.
7492ec820244710e1b13267d8e93f3a81d7114080eTim Rowley/// @param pContext - pointer to SWR context.
7592ec820244710e1b13267d8e93f3a81d7114080eTim Rowley/// @param pDC - pointer to draw context.
7692ec820244710e1b13267d8e93f3a81d7114080eTim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
7792ec820244710e1b13267d8e93f3a81d7114080eTim Rowley/// @param pUserData - Pointer to user data passed back to sync callback.
7892ec820244710e1b13267d8e93f3a81d7114080eTim Rowleyvoid ProcessShutdown(
7992ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    SWR_CONTEXT *pContext,
8092ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    DRAW_CONTEXT *pDC,
8192ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    uint32_t workerId,
8292ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    void *pUserData)
8392ec820244710e1b13267d8e93f3a81d7114080eTim Rowley{
8492ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    BE_WORK work;
8592ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    work.type = SHUTDOWN;
8692ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    work.pfnWork = ProcessShutdownBE;
8792ec820244710e1b13267d8e93f3a81d7114080eTim Rowley
8892ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    MacroTileMgr *pTileMgr = pDC->pTileMgr;
8992ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    // Enqueue at least 1 work item for each worker thread
9092ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    // account for number of numa nodes
9192ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
9292ec820244710e1b13267d8e93f3a81d7114080eTim Rowley
9392ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
9492ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    {
9592ec820244710e1b13267d8e93f3a81d7114080eTim Rowley        for (uint32_t n = 0; n < numNumaNodes; ++n)
9692ec820244710e1b13267d8e93f3a81d7114080eTim Rowley        {
9792ec820244710e1b13267d8e93f3a81d7114080eTim Rowley            pTileMgr->enqueue(i, n, &work);
9892ec820244710e1b13267d8e93f3a81d7114080eTim Rowley        }
9992ec820244710e1b13267d8e93f3a81d7114080eTim Rowley    }
10092ec820244710e1b13267d8e93f3a81d7114080eTim Rowley}
10192ec820244710e1b13267d8e93f3a81d7114080eTim Rowley
10292ec820244710e1b13267d8e93f3a81d7114080eTim Rowley//////////////////////////////////////////////////////////////////////////
103c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief FE handler for SwrClearRenderTarget.
104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pContext - pointer to SWR context.
105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
106c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
107c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pUserData - Pointer to user data passed back to clear callback.
108c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @todo This should go away when we switch this to use compute threading.
109c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid ProcessClear(
110c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_CONTEXT *pContext,
111c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT *pDC,
112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void *pUserData)
114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1150ff57446e3786243c6d752c91be2108595f2663eTim Rowley    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    MacroTileMgr *pTileMgr = pDC->pTileMgr;
117c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
118c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // queue a clear to each macro tile
1190ff57446e3786243c6d752c91be2108595f2663eTim Rowley    // compute macro tile bounds for the specified rect
1200ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
1210ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
1220ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
1230ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
125c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    BE_WORK work;
126c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    work.type = CLEAR;
127c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    work.pfnWork = ProcessClearBE;
1280ff57446e3786243c6d752c91be2108595f2663eTim Rowley    work.desc.clear = *pDesc;
129c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1300ff57446e3786243c6d752c91be2108595f2663eTim Rowley    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1320ff57446e3786243c6d752c91be2108595f2663eTim Rowley        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
134c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pTileMgr->enqueue(x, y, &work);
135c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
136c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
137c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
138c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
139c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
140c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief FE handler for SwrStoreTiles.
141c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pContext - pointer to SWR context.
142c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
143c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
144c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pUserData - Pointer to user data passed back to callback.
145c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @todo This should go away when we switch this to use compute threading.
146c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid ProcessStoreTiles(
147c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_CONTEXT *pContext,
148c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT *pDC,
149c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
150c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void *pUserData)
151c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1522f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    MacroTileMgr *pTileMgr = pDC->pTileMgr;
1540ff57446e3786243c6d752c91be2108595f2663eTim Rowley    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
156c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // queue a store to each macro tile
1570ff57446e3786243c6d752c91be2108595f2663eTim Rowley    // compute macro tile bounds for the specified rect
1580ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
1590ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
1600ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
1610ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
163c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // store tiles
164c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    BE_WORK work;
165c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    work.type = STORETILES;
166a907b7a5f74169906c04e9702f3c8fda99636c56Tim Rowley    work.pfnWork = ProcessStoreTilesBE;
1670ff57446e3786243c6d752c91be2108595f2663eTim Rowley    work.desc.storeTiles = *pDesc;
168c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1690ff57446e3786243c6d752c91be2108595f2663eTim Rowley    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1710ff57446e3786243c6d752c91be2108595f2663eTim Rowley        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
173c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pTileMgr->enqueue(x, y, &work);
174c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
175c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
176c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1772f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_END(FEProcessStoreTiles, 0);
178c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
179c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
180c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
181c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief FE handler for SwrInvalidateTiles.
182c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pContext - pointer to SWR context.
183c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
184c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
185c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pUserData - Pointer to user data passed back to callback.
186c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @todo This should go away when we switch this to use compute threading.
187e374d2d24b0d755c9380da0eb33e4151b1ad145fTim Rowleyvoid ProcessDiscardInvalidateTiles(
188c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_CONTEXT *pContext,
189c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT *pDC,
190c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
191c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void *pUserData)
192c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1932f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
1940ff57446e3786243c6d752c91be2108595f2663eTim Rowley    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
195c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    MacroTileMgr *pTileMgr = pDC->pTileMgr;
196c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1970ff57446e3786243c6d752c91be2108595f2663eTim Rowley    // compute macro tile bounds for the specified rect
1980ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
1990ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
2000ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
2010ff57446e3786243c6d752c91be2108595f2663eTim Rowley    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
202e374d2d24b0d755c9380da0eb33e4151b1ad145fTim Rowley
2030ff57446e3786243c6d752c91be2108595f2663eTim Rowley    if (pDesc->fullTilesOnly == false)
204e374d2d24b0d755c9380da0eb33e4151b1ad145fTim Rowley    {
205e374d2d24b0d755c9380da0eb33e4151b1ad145fTim Rowley        // include partial tiles
2060ff57446e3786243c6d752c91be2108595f2663eTim Rowley        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
2070ff57446e3786243c6d752c91be2108595f2663eTim Rowley        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
2080ff57446e3786243c6d752c91be2108595f2663eTim Rowley        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
2090ff57446e3786243c6d752c91be2108595f2663eTim Rowley        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
210e374d2d24b0d755c9380da0eb33e4151b1ad145fTim Rowley    }
211c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
2120ff57446e3786243c6d752c91be2108595f2663eTim Rowley    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
2130ff57446e3786243c6d752c91be2108595f2663eTim Rowley    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
214fee56fda6fd78f7fb10b0e8fced0a604ca43f0c0Tim Rowley
2150ff57446e3786243c6d752c91be2108595f2663eTim Rowley    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
2160ff57446e3786243c6d752c91be2108595f2663eTim Rowley    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
217fee56fda6fd78f7fb10b0e8fced0a604ca43f0c0Tim Rowley
218c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // load tiles
219c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    BE_WORK work;
220e374d2d24b0d755c9380da0eb33e4151b1ad145fTim Rowley    work.type = DISCARDINVALIDATETILES;
221e374d2d24b0d755c9380da0eb33e4151b1ad145fTim Rowley    work.pfnWork = ProcessDiscardInvalidateTilesBE;
2220ff57446e3786243c6d752c91be2108595f2663eTim Rowley    work.desc.discardInvalidateTiles = *pDesc;
223c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
2240ff57446e3786243c6d752c91be2108595f2663eTim Rowley    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
225c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
2260ff57446e3786243c6d752c91be2108595f2663eTim Rowley        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
227c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
228c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pTileMgr->enqueue(x, y, &work);
229c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
230c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
231c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
2322f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_END(FEProcessInvalidateTiles, 0);
233c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
234c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
235c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
236c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Computes the number of primitives given the number of verts.
237c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param mode - primitive topology for draw operation.
238c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param numPrims - number of vertices or indices for draw.
239c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @todo Frontend needs to be refactored. This will go in appropriate place then.
240c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyuint32_t GetNumPrims(
241c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PRIMITIVE_TOPOLOGY mode,
242c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numPrims)
243c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
244c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    switch (mode)
245c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
246c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POINT_LIST: return numPrims;
247c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_LIST: return numPrims / 3;
248c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
249c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
250c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
251c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_QUAD_LIST: return numPrims / 4;
252c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
253c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
254c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LIST: return numPrims / 2;
255c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LOOP: return numPrims;
256c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_RECT_LIST: return numPrims / 3;
257c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LIST_ADJ: return numPrims / 4;
258c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
259c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_LIST_ADJ: return numPrims / 6;
260c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
261c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
262c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_1:
263c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_2:
264c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_3:
265c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_4:
266c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_5:
267c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_6:
268c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_7:
269c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_8:
270c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_9:
271c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_10:
272c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_11:
273c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_12:
274c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_13:
275c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_14:
276c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_15:
277c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_16:
278c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_17:
279c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_18:
280c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_19:
281c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_20:
282c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_21:
283c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_22:
284c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_23:
285c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_24:
286c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_25:
287c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_26:
288c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_27:
289c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_28:
290c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_29:
291c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_30:
292c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_31:
293c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_32:
294c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        return numPrims / (mode - TOP_PATCHLIST_BASE);
295c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
296c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POLYGON:
297c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POINT_LIST_BF:
298c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_CONT:
299c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_BF:
300c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_CONT_BF:
301c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_FAN_NOSTIPPLE:
302c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_STRIP_REVERSE:
303c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_BASE:
304c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_UNKNOWN:
305c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(false, "Unsupported topology: %d", mode);
306c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        return 0;
307c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
308c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
309c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return 0;
310c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
311c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
312c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
313c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Computes the number of verts given the number of primitives.
314c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param mode - primitive topology for draw operation.
315c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param numPrims - number of primitives for draw.
316c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyuint32_t GetNumVerts(
317c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PRIMITIVE_TOPOLOGY mode,
318c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numPrims)
319c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
320c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    switch (mode)
321c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
322c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POINT_LIST: return numPrims;
323c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_LIST: return numPrims * 3;
324c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
325c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
326c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
327c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_QUAD_LIST: return numPrims * 4;
328c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
329c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
330c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LIST: return numPrims * 2;
331c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LOOP: return numPrims;
332c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_RECT_LIST: return numPrims * 3;
333c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LIST_ADJ: return numPrims * 4;
334c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
335c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_LIST_ADJ: return numPrims * 6;
336c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
337c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
338c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_1:
339c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_2:
340c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_3:
341c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_4:
342c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_5:
343c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_6:
344c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_7:
345c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_8:
346c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_9:
347c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_10:
348c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_11:
349c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_12:
350c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_13:
351c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_14:
352c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_15:
353c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_16:
354c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_17:
355c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_18:
356c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_19:
357c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_20:
358c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_21:
359c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_22:
360c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_23:
361c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_24:
362c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_25:
363c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_26:
364c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_27:
365c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_28:
366c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_29:
367c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_30:
368c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_31:
369c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_32:
370c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        return numPrims * (mode - TOP_PATCHLIST_BASE);
371c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
372c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POLYGON:
373c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POINT_LIST_BF:
374c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_CONT:
375c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_BF:
376c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_CONT_BF:
377c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_FAN_NOSTIPPLE:
378c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_STRIP_REVERSE:
379c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_BASE:
380c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_UNKNOWN:
381c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(false, "Unsupported topology: %d", mode);
382c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        return 0;
383c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
384c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
385c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return 0;
386c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
387c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
388c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
389c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Return number of verts per primitive.
390c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param topology - topology
391c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param includeAdjVerts - include adjacent verts in primitive vertices
392c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
393c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
394c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numVerts = 0;
395c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    switch (topology)
396c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
397c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POINT_LIST:
398c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_POINT_LIST_BF:
399c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        numVerts = 1;
400c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        break;
401c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LIST:
402c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP:
403c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LIST_ADJ:
404c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_LOOP:
405c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_CONT:
406c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LINE_STRIP_BF:
407c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_LISTSTRIP_ADJ:
408c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        numVerts = 2;
409c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        break;
410c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_LIST:
411c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_STRIP:
412c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRIANGLE_FAN:
413c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_LIST_ADJ:
414c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_STRIP_ADJ:
415c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_TRI_STRIP_REVERSE:
416c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_RECT_LIST:
417c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        numVerts = 3;
418c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        break;
419c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_QUAD_LIST:
420c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_QUAD_STRIP:
421c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        numVerts = 4;
422c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        break;
423c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_1:
424c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_2:
425c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_3:
426c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_4:
427c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_5:
428c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_6:
429c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_7:
430c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_8:
431c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_9:
432c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_10:
433c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_11:
434c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_12:
435c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_13:
436c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_14:
437c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_15:
438c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_16:
439c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_17:
440c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_18:
441c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_19:
442c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_20:
443c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_21:
444c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_22:
445c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_23:
446c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_24:
447c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_25:
448c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_26:
449c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_27:
450c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_28:
451c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_29:
452c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_30:
453c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_31:
454c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    case TOP_PATCHLIST_32:
455c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        numVerts = topology - TOP_PATCHLIST_BASE;
456c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        break;
457c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    default:
458c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(false, "Unsupported topology: %d", topology);
459c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        break;
460c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
461c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
462c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (includeAdjVerts)
463c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
464c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        switch (topology)
465c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
466c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_LISTSTRIP_ADJ:
467c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
468c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_TRI_STRIP_ADJ:
469c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
470c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        default: break;
471c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
472c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
473c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
474c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return numVerts;
475c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
476c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
477c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
478c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Generate mask from remaining work.
479c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param numWorkItems - Number of items being worked on by a SIMD.
480c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
482c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return _simd_castps_si(vMask(mask));
485c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
486c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
487c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
488c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief StreamOut - Streams vertex data out to SO buffers.
489c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley///        Generally, we are only streaming out a SIMDs worth of triangles.
490c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
491c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
492c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
493c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic void StreamOut(
494c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT* pDC,
495c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PA_STATE& pa,
496c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
497c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t* pPrimData,
498c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t streamIndex)
499c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
5002f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    SWR_CONTEXT *pContext = pDC->pContext;
5012f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley
5022f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_BEGIN(FEStreamout, pDC->drawId);
503c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
504c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const API_STATE& state = GetApiState(pDC);
505c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const SWR_STREAMOUT_STATE &soState = state.soState;
506c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
507c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
508c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
509c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
510c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
511c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
512c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_STREAMOUT_CONTEXT soContext = { 0 };
513c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
514c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // Setup buffer state pointers.
515c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < 4; ++i)
516c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
517c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        soContext.pBuffer[i] = &state.soBuffer[i];
518c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
519c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
520c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numPrims = pa.NumPrims();
521c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
522c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
523c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        DWORD slot = 0;
524c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint32_t soMask = soState.streamMasks[streamIndex];
525c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
526c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // Write all entries into primitive data buffer for SOS.
527c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        while (_BitScanForward(&slot, soMask))
528c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
529c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
530c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
531c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pa.AssembleSingle(paSlot, primIndex, attrib);
532c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
533c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // Attribute offset is relative offset from start of vertex.
534c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // Note that attributes start at slot 1 in the PA buffer. We need to write this
535c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // to prim data starting at slot 0. Which is why we do (slot - 1).
536c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // Also note: GL works slightly differently, and needs slot 0
537c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
538c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
539c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
540c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
541c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
542c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
543c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
544c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
545c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
546c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            soMask &= ~(1 << slot);
547c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
548c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
549c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // Update pPrimData pointer
550c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        soContext.pPrimData = pPrimData;
551c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
552c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // Call SOS
553c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
554c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        state.pfnSoFunc[streamIndex](soContext);
555c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
556c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
557c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // Update SO write offset. The driver provides memory for the update.
558c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < 4; ++i)
559c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
560c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        if (state.soBuffer[i].pWriteOffset)
561c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
562c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
5637cf187d08ae6a64c959de1cdf9004f5fb2fd097aTim Rowley        }
564c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
5657cf187d08ae6a64c959de1cdf9004f5fb2fd097aTim Rowley        if (state.soBuffer[i].soWriteEnable)
5667cf187d08ae6a64c959de1cdf9004f5fb2fd097aTim Rowley        {
5677cf187d08ae6a64c959de1cdf9004f5fb2fd097aTim Rowley            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
5687cf187d08ae6a64c959de1cdf9004f5fb2fd097aTim Rowley            pDC->dynState.SoWriteOffsetDirty[i] = true;
569c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
570c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
571c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
5724e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
5734e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
574c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
5752f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_END(FEStreamout, 1);
576c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
577c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
578c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
579c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Computes number of invocations. The current index represents
580c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley///        the start of the SIMD. The max index represents how much work
5810ff57446e3786243c6d752c91be2108595f2663eTim Rowley///        items are remaining. If there is less then a SIMD's xmin of work
582c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley///        then return the remaining amount of work.
583c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param curIndex - The start index for the SIMD.
584c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param maxIndex - The last index for all work items.
585c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic INLINE uint32_t GetNumInvocations(
586c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t curIndex,
587c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t maxIndex)
588c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
589c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t remainder = (maxIndex - curIndex);
590c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
591c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
592c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
593c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
594c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
595c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley///        The geometry shader will loop over each active streamout buffer, assembling
596c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley///        primitives for the downstream stages. When multistream output is enabled,
597c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley///        the generated stream ID buffer from the GS needs to be converted to a cut
598c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley///        buffer for the primitive assembler.
599c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param stream - stream id to generate the cut buffer for
600c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pStreamIdBase - pointer to the stream ID buffer
601c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param numEmittedVerts - Number of total verts emitted by the GS
602c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pCutBuffer - output buffer to write cuts to
603c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
604c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
605c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(stream < MAX_SO_STREAMS);
606c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
607c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
608c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
609c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
610c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t b = 0; b < numOutputBytes; ++b)
611c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
612c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint8_t curInputByte = pStreamIdBase[2*b];
613c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint8_t outByte = 0;
614c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        for (uint32_t i = 0; i < 4; ++i)
615c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
616c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            if ((curInputByte & 0x3) != stream)
617c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
618c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                outByte |= (1 << i);
619c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
620c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            curInputByte >>= 2;
621c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
622c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
623c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        curInputByte = pStreamIdBase[2 * b + 1];
624c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        for (uint32_t i = 0; i < 4; ++i)
625c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
626c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            if ((curInputByte & 0x3) != stream)
627c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
628c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                outByte |= (1 << (i + 4));
629c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
630c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            curInputByte >>= 2;
631c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
632c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
633c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        *pCutBuffer++ = outByte;
634c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
635c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
636c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
637bef222db22365c2518110d30cd1227625a86195bTim RowleyTHREAD SWR_GS_CONTEXT tlsGsContext;
638bef222db22365c2518110d30cd1227625a86195bTim Rowley
639c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
640c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Implements GS stage.
641c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
642c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
643c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pa - The primitive assembly object.
644c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pGsOut - output stream for GS
645c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <
64627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasStreamOutT,
64727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasRastT>
648c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic void GeometryShaderStage(
649c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT *pDC,
650c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
651c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PA_STATE& pa,
652c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pGsOut,
653c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pCutBuffer,
654c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pStreamCutBuffer,
655c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t* pSoPrimData,
656c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    simdscalari primID)
657c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
6582f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    SWR_CONTEXT *pContext = pDC->pContext;
6592f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley
6602f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_BEGIN(FEGeometryShader, pDC->drawId);
661c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
662c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const API_STATE& state = GetApiState(pDC);
663c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const SWR_GS_STATE* pState = &state.gsState;
664c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
665c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
666c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
667c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
668bef222db22365c2518110d30cd1227625a86195bTim Rowley    tlsGsContext.pStream = (uint8_t*)pGsOut;
669bef222db22365c2518110d30cd1227625a86195bTim Rowley    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
670bef222db22365c2518110d30cd1227625a86195bTim Rowley    tlsGsContext.PrimitiveID = primID;
671c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
672c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
673c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    simdvector attrib[MAX_ATTRIBUTES];
674c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
675c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // assemble all attributes for the input primitive
676c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
677c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
678c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
679c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        pa.Assemble(attribSlot, attrib);
680c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
681c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
682c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
683bef222db22365c2518110d30cd1227625a86195bTim Rowley            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
684c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
685c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
686c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
687c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // assemble position
688c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
689c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
690c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
691bef222db22365c2518110d30cd1227625a86195bTim Rowley        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
692c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
693c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
694c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t vertexStride = sizeof(simdvertex);
695c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
696c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
697c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
698c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t cutPrimStride;
699c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t cutInstanceStride;
700c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
701c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (pState->isSingleStream)
702c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
703c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
704c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
705c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
706c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    else
707c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
708c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
709c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
710c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
711c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
712c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // record valid prims from the frontend to avoid over binning the newly generated
713c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // prims from the GS
714c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numInputPrims = pa.NumPrims();
715c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
716c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
717c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
718bef222db22365c2518110d30cd1227625a86195bTim Rowley        tlsGsContext.InstanceID = instance;
719bef222db22365c2518110d30cd1227625a86195bTim Rowley        tlsGsContext.mask = GenerateMask(numInputPrims);
720c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
721c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // execute the geometry shader
722bef222db22365c2518110d30cd1227625a86195bTim Rowley        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
723c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
724bef222db22365c2518110d30cd1227625a86195bTim Rowley        tlsGsContext.pStream += instanceStride;
725bef222db22365c2518110d30cd1227625a86195bTim Rowley        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
726c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
727c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
728c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // set up new binner and state for the GS output topology
729c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
73027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    if (HasRastT::value)
731c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
732c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        switch (pState->outputTopology)
733c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
734c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
735c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
736c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
737c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
738c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
739c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
740c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
741c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // foreach input prim:
742c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // - setup a new PA based on the emitted verts for that prim
743c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // - loop over the new verts, calling PA to assemble each prim
744bef222db22365c2518110d30cd1227625a86195bTim Rowley    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
745c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t* pPrimitiveId = (uint32_t*)&primID;
746c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
747c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t totalPrimsGenerated = 0;
748c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
749c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
750c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
751c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
752c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
753c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
754c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            uint32_t numEmittedVerts = pVertexCount[inputPrim];
755c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            if (numEmittedVerts == 0)
756c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
757c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                continue;
758c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
759c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
760c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            uint8_t* pBase = pInstanceBase + instance * instanceStride;
761c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
762c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
763efdaf5fa3e74ca4f3d9217dc6955aef6dc698a68Tim Rowley            uint32_t numAttribs = state.feNumAttributes;
764c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
765c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
766c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
767c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                bool processCutVerts = false;
768c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
769c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                uint8_t* pCutBuffer = pCutBase;
770c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
771c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                // assign default stream ID, only relevant when GS is outputting a single stream
772c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                uint32_t streamID = 0;
773c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                if (pState->isSingleStream)
774c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
775c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    processCutVerts = true;
776c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    streamID = pState->singleStreamID;
777c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    if (streamID != stream) continue;
778c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
779c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                else
780c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
781c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    // early exit if this stream is not enabled for streamout
78227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
783c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    {
784c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        continue;
785c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    }
786c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
787c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    // multi-stream output, need to translate StreamID buffer to a cut buffer
788c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
789c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
790c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    processCutVerts = false;
791c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
792c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
793c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
794c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
795c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                while (gsPa.GetNextStreamOutput())
796c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
797c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    do
798c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    {
799c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
800c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
801c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        if (assemble)
802c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        {
803c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            totalPrimsGenerated += gsPa.NumPrims();
804c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
80527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                            if (HasStreamOutT::value)
806c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            {
807c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
808c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            }
809c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
81027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
811c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            {
812c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                simdscalari vPrimId;
813c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                // pull primitiveID from the GS output if available
814c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                if (state.gsState.emitsPrimitiveID)
815c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                {
816c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    simdvector primIdAttrib[3];
817c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
818c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
819c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                }
820c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                else
821c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                {
822c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
823c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                }
824c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
82592621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
82692621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                simdscalari vViewPortIdx;
82792621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                if (state.gsState.emitsViewportArrayIndex)
82892621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                {
82992621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                    simdvector vpiAttrib[3];
83092621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
83192621ac5d526e73469c43d524068315a81bbc869Tim Rowley
83292621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                    // OOB indices => forced to zero.
83392621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
834b311bdf92d65a0704faf8e89caa506d6d18c07e6Tim Rowley                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
835b311bdf92d65a0704faf8e89caa506d6d18c07e6Tim Rowley                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
83692621ac5d526e73469c43d524068315a81bbc869Tim Rowley
83792621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
83892621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                }
83992621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                else
84092621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                {
84192621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                    vViewPortIdx = _simd_set1_epi32(0);
84292621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                }
84392621ac5d526e73469c43d524068315a81bbc869Tim Rowley
84492621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
845c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            }
846c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        }
847c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    } while (gsPa.NextPrim());
848c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
849c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
850c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
851c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
852c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
853c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // update GS pipeline stats
8544e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
8554e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
856cd8d840ce1a8abab490d1e8c7bafa2cbb5399c4fTim Rowley	AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
8572f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_END(FEGeometryShader, 1);
858c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
859c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
860c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
861c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Allocate GS buffers
862c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
863c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param state - API state
864c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param ppGsOut - pointer to GS output buffer allocation
865c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param ppCutBuffer - pointer to GS output cut buffer allocation
866c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
867c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void **ppStreamCutBuffer)
868c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
869ec9d4c4b372df773e4453c228b938e7c6c526c4cTim Rowley    auto pArena = pDC->pArena;
870c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(pArena != nullptr);
871c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(state.gsState.gsEnable);
872c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // allocate arena space to hold GS output verts
873c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // @todo pack attribs
874c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // @todo support multiple streams
875c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t vertexStride = sizeof(simdvertex);
876c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
877c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
878c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
879c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
880c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
881c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
882c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
883c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
884c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
885c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
886c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
887c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
888c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
889c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (state.gsState.isSingleStream)
890c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
891c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
892c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        *ppStreamCutBuffer = nullptr;
893c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
894c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    else
895c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
896c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
897c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
898c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
899c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
900c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
901c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
902c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
903c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Contains all data generated by the HS and passed to the
904c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// tessellator and DS.
905c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystruct TessellationThreadLocalData
906c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
907c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_HS_CONTEXT hsContext;
908c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    ScalarPatch patchData[KNOB_SIMD_WIDTH];
909c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pTxCtx;
910c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    size_t tsCtxSize;
911c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
912c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    simdscalar* pDSOutput;
913c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    size_t numDSOutputVectors;
914c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley};
915c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
916c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyTHREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
917c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
918c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
919c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Allocate tessellation data for this worker thread.
920c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim RowleyINLINE
921c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic void AllocateTessellationData(SWR_CONTEXT* pContext)
922c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
923c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
924c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (gt_pTessellationThreadData == nullptr)
925c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
926c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        gt_pTessellationThreadData = (TessellationThreadLocalData*)
9274997169779069692c0e64df2dfb89185ae48f193Tim Rowley            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
928c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
929c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
930c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
931c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
932c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
933c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief Implements Tessellation Stages.
934c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
935c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id. Even thread has a unique id.
936c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pa - The primitive assembly object.
937c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pGsOut - output stream for GS
938c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <
93927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasGeometryShaderT,
94027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasStreamOutT,
94127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasRastT>
942c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleystatic void TessellationStages(
943c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT *pDC,
944c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
945c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PA_STATE& pa,
946c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pGsOut,
947c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pCutBuffer,
948c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pCutStreamBuffer,
949c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t* pSoPrimData,
950c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    simdscalari primID)
951c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
9522f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    SWR_CONTEXT *pContext = pDC->pContext;
953c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const API_STATE& state = GetApiState(pDC);
954c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const SWR_TS_STATE& tsState = state.tsState;
955c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
956c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(gt_pTessellationThreadData);
957c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
958c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    HANDLE tsCtx = TSInitCtx(
959c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        tsState.domain,
960c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        tsState.partitioning,
961c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        tsState.tsOutputTopology,
962c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        gt_pTessellationThreadData->pTxCtx,
963c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        gt_pTessellationThreadData->tsCtxSize);
964c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (tsCtx == nullptr)
965c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
9664997169779069692c0e64df2dfb89185ae48f193Tim Rowley        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
967c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        tsCtx = TSInitCtx(
968c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tsState.domain,
969c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tsState.partitioning,
970c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tsState.tsOutputTopology,
971c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            gt_pTessellationThreadData->pTxCtx,
972c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            gt_pTessellationThreadData->tsCtxSize);
973c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
974c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_ASSERT(tsCtx);
975c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
976c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
97727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    if (HasRastT::value)
978c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
979c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        switch (tsState.postDSTopology)
980c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
981c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
982c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
983c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
984c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
985c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
986c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
987c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
988c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
989c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    hsContext.pCPout = gt_pTessellationThreadData->patchData;
990c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    hsContext.PrimitiveID = primID;
991c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
992c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
993c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // Max storage for one attribute for an entire simdprimitive
994c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
995c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
996c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // assemble all attributes for the input primitives
997c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
998c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
999c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1000c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        pa.Assemble(attribSlot, simdattrib);
1001c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1002c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1003c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1004c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1005c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1006c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1007c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1008c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if defined(_DEBUG)
1009c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1010c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1011c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1012c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numPrims = pa.NumPrims();
1013c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    hsContext.mask = GenerateMask(numPrims);
1014c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1015c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // Run the HS
10162f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_BEGIN(FEHullShader, pDC->drawId);
1017c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
10182f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_END(FEHullShader, 0);
1019c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
10204e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley    UPDATE_STAT_FE(HsInvocations, numPrims);
1021c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1022c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const uint32_t* pPrimId = (const uint32_t*)&primID;
1023c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1024c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t p = 0; p < numPrims; ++p)
1025c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1026c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // Run Tessellator
1027c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_TS_TESSELLATED_DATA tsData = { 0 };
10282f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley        AR_BEGIN(FETessellation, pDC->drawId);
1029c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1030cd8d840ce1a8abab490d1e8c7bafa2cbb5399c4fTim Rowley		AR_EVENT(TessPrimCount(1));
10312f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley        AR_END(FETessellation, 0);
1032c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1033c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        if (tsData.NumPrimitives == 0)
1034c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1035c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            continue;
1036c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1037c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(tsData.NumDomainPoints);
1038c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1039c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // Allocate DS Output memory
1040c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1041c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1042c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1043c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1044c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
10454997169779069692c0e64df2dfb89185ae48f193Tim Rowley            AlignedFree(gt_pTessellationThreadData->pDSOutput);
10464997169779069692c0e64df2dfb89185ae48f193Tim Rowley            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1047c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1048c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1049c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1050c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1051c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1052c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if defined(_DEBUG)
1053c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1054c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1055c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1056c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // Run Domain Shader
1057c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_DS_CONTEXT dsContext;
1058c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dsContext.PrimitiveID = pPrimId[p];
1059c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dsContext.pCpIn = &hsContext.pCPout[p];
1060c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1061c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1062c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1063c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        dsContext.vectorStride = requiredDSVectorInvocations;
1064c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1065c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint32_t dsInvocations = 0;
1066c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1067c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1068c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1069c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1070c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
10712f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley            AR_BEGIN(FEDomainShader, pDC->drawId);
1072c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
10732f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley            AR_END(FEDomainShader, 0);
1074c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1075c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            dsInvocations += KNOB_SIMD_WIDTH;
1076c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
10774e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1078c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1079c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        PA_TESS tessPa(
1080c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pDC,
1081c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            dsContext.pOutputData,
1082c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            dsContext.vectorStride,
1083c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tsState.numDsOutputAttribs,
1084c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tsData.ppIndices,
1085c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tsData.NumPrimitives,
1086c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tsState.postDSTopology);
1087c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1088c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        while (tessPa.HasWork())
1089c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
109027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley            if (HasGeometryShaderT::value)
1091c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
1092c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                GeometryShaderStage<HasStreamOutT, HasRastT>(
1093c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1094c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    _simd_set1_epi32(dsContext.PrimitiveID));
1095c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
1096c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            else
1097c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
109827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                if (HasStreamOutT::value)
1099c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
1100c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1101c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
1102c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
110327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                if (HasRastT::value)
1104c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
1105c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    simdvector prim[3]; // Only deal with triangles, lines, or points
11062f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                    AR_BEGIN(FEPAAssemble, pDC->drawId);
11072be7c3e780977678eb423e75cf063f92f7d03916Tim Rowley#if SWR_ENABLE_ASSERTS
11082be7c3e780977678eb423e75cf063f92f7d03916Tim Rowley                    bool assemble =
11092be7c3e780977678eb423e75cf063f92f7d03916Tim Rowley#endif
11102be7c3e780977678eb423e75cf063f92f7d03916Tim Rowley                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
11112f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                    AR_END(FEPAAssemble, 1);
1112c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    SWR_ASSERT(assemble);
1113c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1114c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    SWR_ASSERT(pfnClipFunc);
1115c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    pfnClipFunc(pDC, tessPa, workerId, prim,
111692621ac5d526e73469c43d524068315a81bbc869Tim Rowley                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
1117c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
1118c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
1119c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1120c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            tessPa.NextPrim();
1121c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1122c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        } // while (tessPa.HasWork())
1123c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    } // for (uint32_t p = 0; p < numPrims; ++p)
1124c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1125c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    TSDestroyCtx(tsCtx);
1126c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
1127c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1128c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley//////////////////////////////////////////////////////////////////////////
1129c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @brief FE handler for SwrDraw.
1130c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @tparam IsIndexedT - Is indexed drawing enabled
1131c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @tparam HasTessellationT - Is tessellation enabled
113227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1133c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @tparam HasStreamOutT - Is stream-out enabled
1134c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @tparam HasRastT - Is rasterization enabled
1135c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pContext - pointer to SWR context.
1136c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pDC - pointer to draw context.
1137c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param workerId - thread's worker id.
1138c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley/// @param pUserData - Pointer to DRAW_WORK
1139c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleytemplate <
114027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename IsIndexedT,
1141c8835a592471a0238e296f6529b5dadb431cc622Bruce Cherniak    typename IsCutIndexEnabledT,
114227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasTessellationT,
114327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasGeometryShaderT,
114427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasStreamOutT,
114527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typename HasRastT>
1146c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowleyvoid ProcessDraw(
1147c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_CONTEXT *pContext,
1148c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_CONTEXT *pDC,
1149c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t workerId,
1150c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void *pUserData)
1151c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley{
1152c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1153c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ENABLE_TOSS_POINTS
1154c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    if (KNOB_TOSS_QUEUE_FE)
1155c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1156c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        return;
1157c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1158c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1159c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
11602f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_BEGIN(FEProcessDraw, pDC->drawId);
1161c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1162c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
1163c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const API_STATE&    state = GetApiState(pDC);
1164c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    __m256i             vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1165c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_VS_CONTEXT      vsContext;
1166c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    simdvertex          vin;
1167c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1168c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    int indexSize = 0;
1169c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t endVertex = work.numVerts;
1170c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1171c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    const int32_t* pLastRequestedIndex = nullptr;
117227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    if (IsIndexedT::value)
1173c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1174c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        switch (work.type)
1175c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1176c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case R32_UINT:
1177c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            indexSize = sizeof(uint32_t);
1178c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pLastRequestedIndex = &(work.pIB[endVertex]);
1179c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            break;
1180c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case R16_UINT:
1181c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            indexSize = sizeof(uint16_t);
1182c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // nasty address offset to last index
1183c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1184c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            break;
1185c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        case R8_UINT:
1186c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            indexSize = sizeof(uint8_t);
1187c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // nasty address offset to last index
1188c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1189c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            break;
1190c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        default:
1191c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            SWR_ASSERT(0);
1192c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1193c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1194c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    else
1195c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1196c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // No cuts, prune partial primitives.
1197c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1198c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1199c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1200c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    SWR_FETCH_CONTEXT fetchInfo = { 0 };
1201c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    fetchInfo.pStreams = &state.vertexBuffers[0];
1202c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    fetchInfo.StartInstance = work.startInstance;
1203c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    fetchInfo.StartVertex = 0;
1204c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1205c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    vsContext.pVin = &vin;
1206c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
120727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    if (IsIndexedT::value)
1208c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1209c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        fetchInfo.BaseVertex = work.baseVertex;
1210c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1211c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // if the entire index buffer isn't being consumed, set the last index
1212c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        // so that fetches < a SIMD wide will be masked off
1213542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1214c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        if (pLastRequestedIndex < fetchInfo.pLastIndex)
1215c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1216c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            fetchInfo.pLastIndex = pLastRequestedIndex;
1217c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1218c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1219c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    else
1220c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1221c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        fetchInfo.StartVertex = work.startVertex;
1222c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1223c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
12244198520a82463aa392762ec156231b95bf2685acTim Rowley#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1225c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1226c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1227c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1228c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pGsOut = nullptr;
1229c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pCutBuffer = nullptr;
1230c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    void* pStreamCutBuffer = nullptr;
123127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    if (HasGeometryShaderT::value)
1232c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1233c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1234c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1235c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
123627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    if (HasTessellationT::value)
1237c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1238c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(state.tsState.tsEnable == true);
1239c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(state.pfnHsFunc != nullptr);
1240c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(state.pfnDsFunc != nullptr);
1241c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1242c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        AllocateTessellationData(pContext);
1243c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1244c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    else
1245c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1246c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(state.tsState.tsEnable == false);
1247c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(state.pfnHsFunc == nullptr);
1248c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        SWR_ASSERT(state.pfnDsFunc == nullptr);
1249c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1250c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1251c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // allocate space for streamout input prim data
1252c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    uint32_t* pSoPrimData = nullptr;
125327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    if (HasStreamOutT::value)
1254c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1255c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1256c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1257c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1258c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    // choose primitive assembler
1259c8835a592471a0238e296f6529b5dadb431cc622Bruce Cherniak    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
1260c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    PA_STATE& pa = paFactory.GetPA();
1261c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1262c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1263c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1264c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    {
1265c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        simdscalari vIndex;
1266c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        uint32_t  i = 0;
1267c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
126827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        if (IsIndexedT::value)
1269c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1270c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            fetchInfo.pIndices = work.pIB;
1271c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1272c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        else
1273c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1274c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1275c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            fetchInfo.pIndices = (const int32_t*)&vIndex;
1276c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1277c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1278c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        fetchInfo.CurInstance = instanceNum;
1279c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        vsContext.InstanceID = instanceNum;
1280c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1281c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        while (pa.HasWork())
1282c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        {
1283c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1284c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // So we need to keep this outside of (i < endVertex) check.
1285c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            simdmask* pvCutIndices = nullptr;
128627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley            if (IsIndexedT::value)
1287c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
1288c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                pvCutIndices = &pa.GetNextVsIndices();
1289c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
1290c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1291c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            simdvertex& vout = pa.GetNextVsOutput();
1292c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            vsContext.pVout = &vout;
1293c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1294c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            if (i < endVertex)
1295c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
1296c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1297c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                // 1. Execute FS/VS for a single SIMD.
12982f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                AR_BEGIN(FEFetchShader, pDC->drawId);
1299c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                state.pfnFetchFunc(fetchInfo, vin);
13002f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                AR_END(FEFetchShader, 0);
1301c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1302c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                // forward fetch generated vertex IDs to the vertex shader
1303c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                vsContext.VertexID = fetchInfo.VertexID;
1304c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1305c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                // Setup active mask for vertex shader.
1306c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                vsContext.mask = GenerateMask(endVertex - i);
1307c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1308c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                // forward cut mask to the PA
130927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                if (IsIndexedT::value)
1310c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
1311c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1312c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
1313c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
13144e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1315c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1316c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ENABLE_TOSS_POINTS
1317c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                if (!KNOB_TOSS_FETCH)
1318c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1319c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
13202f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                    AR_BEGIN(FEVertexShader, pDC->drawId);
1321c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
13222f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                    AR_END(FEVertexShader, 0);
1323c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
13244e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1325c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
1326c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
1327c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1328c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            // 2. Assemble primitives given the last two SIMD.
1329c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            do
1330c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
1331c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1332c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                // PaAssemble returns false if there is not enough verts to assemble.
13332f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                AR_BEGIN(FEPAAssemble, pDC->drawId);
1334c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
13352f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley                AR_END(FEPAAssemble, 1);
1336c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1337c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ENABLE_TOSS_POINTS
1338c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                if (!KNOB_TOSS_FETCH)
1339c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1340c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                {
1341c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#if KNOB_ENABLE_TOSS_POINTS
1342c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    if (!KNOB_TOSS_VS)
1343c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley#endif
1344c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    {
1345c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        if (assemble)
1346c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        {
13474e8763cb0904c30d1962cf5ad52fe3a87be7b4bdTim Rowley                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1348c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
134927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                            if (HasTessellationT::value)
1350c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            {
1351c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1352c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1353c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            }
135427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                            else if (HasGeometryShaderT::value)
1355c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            {
1356c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                GeometryShaderStage<HasStreamOutT, HasRastT>(
1357c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1358c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            }
1359c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            else
1360c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            {
1361c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                // If streamout is enabled then stream vertices out to memory.
136227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                                if (HasStreamOutT::value)
1363c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                {
1364c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1365c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                }
1366c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
136727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley                                if (HasRastT::value)
1368c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                {
1369c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
1370c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
137192621ac5d526e73469c43d524068315a81bbc869Tim Rowley                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
1372c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                                }
1373c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                            }
1374c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                        }
1375c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                    }
1376c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                }
1377c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            } while (pa.NextPrim());
1378c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
1379c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            i += KNOB_SIMD_WIDTH;
138027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley            if (IsIndexedT::value)
1381c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
1382542d7dec7b8748b164150bd0818e880ed31918e3Tim Rowley                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
1383c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
1384c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            else
1385c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            {
1386c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1387c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley            }
1388c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        }
1389c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley        pa.Reset();
1390c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley    }
1391c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley
13925912552947cc84e794543589388d18aaf4ff6c86Tim Rowley
13932f86a9577adf5c43e892f899224d0f73ff1d37c2Tim Rowley    AR_END(FEProcessDraw, numPrims * work.numInstances);
1394c6e67f5a9373e916a8d2333585cb5787aa5f7bb7Tim Rowley}
139527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
139627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowleystruct FEDrawChooser
139727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley{
139827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    typedef PFN_FE_WORK_FUNC FuncType;
139927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
140027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    template <typename... ArgsB>
140127cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    static FuncType GetFunc()
140227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    {
140327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley        return ProcessDraw<ArgsB...>;
140427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    }
140527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley};
140627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
140727cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley
140827cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley// Selector for correct templated Draw front-end function
140927cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim RowleyPFN_FE_WORK_FUNC GetProcessDrawFunc(
141027cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    bool IsIndexed,
1411c8835a592471a0238e296f6529b5dadb431cc622Bruce Cherniak    bool IsCutIndexEnabled,
141227cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    bool HasTessellation,
141327cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    bool HasGeometryShader,
141427cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    bool HasStreamOut,
141527cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley    bool HasRasterization)
141627cc5924ea95d5a00ddb9d5c6ffb8853c92b1f4eTim Rowley{
1417c8835a592471a0238e296f6529b5dadb431cc622Bruce Cherniak    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1418ada27b503eab3c53d9ec1bca2cef48c5353e81f9Tim Rowley}