1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file frontend.cpp
24*
25* @brief Implementation for Frontend which handles vertex processing,
26*        primitive assembly, clipping, binning, etc.
27*
28******************************************************************************/
29
30#include "api.h"
31#include "frontend.h"
32#include "backend.h"
33#include "context.h"
34#include "rdtsc_core.h"
35#include "utils.h"
36#include "threads.h"
37#include "pa.h"
38#include "clip.h"
39#include "tilemgr.h"
40#include "tessellator.h"
41#include <limits>
42
43//////////////////////////////////////////////////////////////////////////
44/// @brief Helper macro to generate a bitmask
45static INLINE uint32_t GenMask(uint32_t numBits)
46{
47    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48    return ((1U << numBits) - 1);
49}
50
51//////////////////////////////////////////////////////////////////////////
52/// @brief FE handler for SwrSync.
53/// @param pContext - pointer to SWR context.
54/// @param pDC - pointer to draw context.
55/// @param workerId - thread's worker id. Even thread has a unique id.
56/// @param pUserData - Pointer to user data passed back to sync callback.
57/// @todo This should go away when we switch this to use compute threading.
58void ProcessSync(
59    SWR_CONTEXT *pContext,
60    DRAW_CONTEXT *pDC,
61    uint32_t workerId,
62    void *pUserData)
63{
64    BE_WORK work;
65    work.type = SYNC;
66    work.pfnWork = ProcessSyncBE;
67
68    MacroTileMgr *pTileMgr = pDC->pTileMgr;
69    pTileMgr->enqueue(0, 0, &work);
70}
71
72//////////////////////////////////////////////////////////////////////////
73/// @brief FE handler for SwrDestroyContext.
74/// @param pContext - pointer to SWR context.
75/// @param pDC - pointer to draw context.
76/// @param workerId - thread's worker id. Even thread has a unique id.
77/// @param pUserData - Pointer to user data passed back to sync callback.
78void ProcessShutdown(
79    SWR_CONTEXT *pContext,
80    DRAW_CONTEXT *pDC,
81    uint32_t workerId,
82    void *pUserData)
83{
84    BE_WORK work;
85    work.type = SHUTDOWN;
86    work.pfnWork = ProcessShutdownBE;
87
88    MacroTileMgr *pTileMgr = pDC->pTileMgr;
89    // Enqueue at least 1 work item for each worker thread
90    // account for number of numa nodes
91    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
92
93    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
94    {
95        for (uint32_t n = 0; n < numNumaNodes; ++n)
96        {
97            pTileMgr->enqueue(i, n, &work);
98        }
99    }
100}
101
102//////////////////////////////////////////////////////////////////////////
103/// @brief FE handler for SwrClearRenderTarget.
104/// @param pContext - pointer to SWR context.
105/// @param pDC - pointer to draw context.
106/// @param workerId - thread's worker id. Even thread has a unique id.
107/// @param pUserData - Pointer to user data passed back to clear callback.
108/// @todo This should go away when we switch this to use compute threading.
109void ProcessClear(
110    SWR_CONTEXT *pContext,
111    DRAW_CONTEXT *pDC,
112    uint32_t workerId,
113    void *pUserData)
114{
115    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
116    MacroTileMgr *pTileMgr = pDC->pTileMgr;
117
118    // queue a clear to each macro tile
119    // compute macro tile bounds for the specified rect
120    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
121    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
122    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
123    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
124
125    BE_WORK work;
126    work.type = CLEAR;
127    work.pfnWork = ProcessClearBE;
128    work.desc.clear = *pDesc;
129
130    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
131    {
132        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
133        {
134            pTileMgr->enqueue(x, y, &work);
135        }
136    }
137}
138
139//////////////////////////////////////////////////////////////////////////
140/// @brief FE handler for SwrStoreTiles.
141/// @param pContext - pointer to SWR context.
142/// @param pDC - pointer to draw context.
143/// @param workerId - thread's worker id. Even thread has a unique id.
144/// @param pUserData - Pointer to user data passed back to callback.
145/// @todo This should go away when we switch this to use compute threading.
146void ProcessStoreTiles(
147    SWR_CONTEXT *pContext,
148    DRAW_CONTEXT *pDC,
149    uint32_t workerId,
150    void *pUserData)
151{
152    AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
153    MacroTileMgr *pTileMgr = pDC->pTileMgr;
154    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
155
156    // queue a store to each macro tile
157    // compute macro tile bounds for the specified rect
158    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
159    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
160    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
161    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
162
163    // store tiles
164    BE_WORK work;
165    work.type = STORETILES;
166    work.pfnWork = ProcessStoreTilesBE;
167    work.desc.storeTiles = *pDesc;
168
169    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
170    {
171        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
172        {
173            pTileMgr->enqueue(x, y, &work);
174        }
175    }
176
177    AR_END(FEProcessStoreTiles, 0);
178}
179
180//////////////////////////////////////////////////////////////////////////
181/// @brief FE handler for SwrInvalidateTiles.
182/// @param pContext - pointer to SWR context.
183/// @param pDC - pointer to draw context.
184/// @param workerId - thread's worker id. Even thread has a unique id.
185/// @param pUserData - Pointer to user data passed back to callback.
186/// @todo This should go away when we switch this to use compute threading.
187void ProcessDiscardInvalidateTiles(
188    SWR_CONTEXT *pContext,
189    DRAW_CONTEXT *pDC,
190    uint32_t workerId,
191    void *pUserData)
192{
193    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
194    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
195    MacroTileMgr *pTileMgr = pDC->pTileMgr;
196
197    // compute macro tile bounds for the specified rect
198    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
199    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
200    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
201    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
202
203    if (pDesc->fullTilesOnly == false)
204    {
205        // include partial tiles
206        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
207        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
208        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
209        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
210    }
211
212    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
213    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
214
215    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
216    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
217
218    // load tiles
219    BE_WORK work;
220    work.type = DISCARDINVALIDATETILES;
221    work.pfnWork = ProcessDiscardInvalidateTilesBE;
222    work.desc.discardInvalidateTiles = *pDesc;
223
224    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
225    {
226        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
227        {
228            pTileMgr->enqueue(x, y, &work);
229        }
230    }
231
232    AR_END(FEProcessInvalidateTiles, 0);
233}
234
235//////////////////////////////////////////////////////////////////////////
236/// @brief Computes the number of primitives given the number of verts.
237/// @param mode - primitive topology for draw operation.
238/// @param numPrims - number of vertices or indices for draw.
239/// @todo Frontend needs to be refactored. This will go in appropriate place then.
240uint32_t GetNumPrims(
241    PRIMITIVE_TOPOLOGY mode,
242    uint32_t numPrims)
243{
244    switch (mode)
245    {
246    case TOP_POINT_LIST: return numPrims;
247    case TOP_TRIANGLE_LIST: return numPrims / 3;
248    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
249    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
250    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
251    case TOP_QUAD_LIST: return numPrims / 4;
252    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
253    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
254    case TOP_LINE_LIST: return numPrims / 2;
255    case TOP_LINE_LOOP: return numPrims;
256    case TOP_RECT_LIST: return numPrims / 3;
257    case TOP_LINE_LIST_ADJ: return numPrims / 4;
258    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
259    case TOP_TRI_LIST_ADJ: return numPrims / 6;
260    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
261
262    case TOP_PATCHLIST_1:
263    case TOP_PATCHLIST_2:
264    case TOP_PATCHLIST_3:
265    case TOP_PATCHLIST_4:
266    case TOP_PATCHLIST_5:
267    case TOP_PATCHLIST_6:
268    case TOP_PATCHLIST_7:
269    case TOP_PATCHLIST_8:
270    case TOP_PATCHLIST_9:
271    case TOP_PATCHLIST_10:
272    case TOP_PATCHLIST_11:
273    case TOP_PATCHLIST_12:
274    case TOP_PATCHLIST_13:
275    case TOP_PATCHLIST_14:
276    case TOP_PATCHLIST_15:
277    case TOP_PATCHLIST_16:
278    case TOP_PATCHLIST_17:
279    case TOP_PATCHLIST_18:
280    case TOP_PATCHLIST_19:
281    case TOP_PATCHLIST_20:
282    case TOP_PATCHLIST_21:
283    case TOP_PATCHLIST_22:
284    case TOP_PATCHLIST_23:
285    case TOP_PATCHLIST_24:
286    case TOP_PATCHLIST_25:
287    case TOP_PATCHLIST_26:
288    case TOP_PATCHLIST_27:
289    case TOP_PATCHLIST_28:
290    case TOP_PATCHLIST_29:
291    case TOP_PATCHLIST_30:
292    case TOP_PATCHLIST_31:
293    case TOP_PATCHLIST_32:
294        return numPrims / (mode - TOP_PATCHLIST_BASE);
295
296    case TOP_POLYGON:
297    case TOP_POINT_LIST_BF:
298    case TOP_LINE_STRIP_CONT:
299    case TOP_LINE_STRIP_BF:
300    case TOP_LINE_STRIP_CONT_BF:
301    case TOP_TRIANGLE_FAN_NOSTIPPLE:
302    case TOP_TRI_STRIP_REVERSE:
303    case TOP_PATCHLIST_BASE:
304    case TOP_UNKNOWN:
305        SWR_ASSERT(false, "Unsupported topology: %d", mode);
306        return 0;
307    }
308
309    return 0;
310}
311
312//////////////////////////////////////////////////////////////////////////
313/// @brief Computes the number of verts given the number of primitives.
314/// @param mode - primitive topology for draw operation.
315/// @param numPrims - number of primitives for draw.
316uint32_t GetNumVerts(
317    PRIMITIVE_TOPOLOGY mode,
318    uint32_t numPrims)
319{
320    switch (mode)
321    {
322    case TOP_POINT_LIST: return numPrims;
323    case TOP_TRIANGLE_LIST: return numPrims * 3;
324    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
325    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
326    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
327    case TOP_QUAD_LIST: return numPrims * 4;
328    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
329    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
330    case TOP_LINE_LIST: return numPrims * 2;
331    case TOP_LINE_LOOP: return numPrims;
332    case TOP_RECT_LIST: return numPrims * 3;
333    case TOP_LINE_LIST_ADJ: return numPrims * 4;
334    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
335    case TOP_TRI_LIST_ADJ: return numPrims * 6;
336    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
337
338    case TOP_PATCHLIST_1:
339    case TOP_PATCHLIST_2:
340    case TOP_PATCHLIST_3:
341    case TOP_PATCHLIST_4:
342    case TOP_PATCHLIST_5:
343    case TOP_PATCHLIST_6:
344    case TOP_PATCHLIST_7:
345    case TOP_PATCHLIST_8:
346    case TOP_PATCHLIST_9:
347    case TOP_PATCHLIST_10:
348    case TOP_PATCHLIST_11:
349    case TOP_PATCHLIST_12:
350    case TOP_PATCHLIST_13:
351    case TOP_PATCHLIST_14:
352    case TOP_PATCHLIST_15:
353    case TOP_PATCHLIST_16:
354    case TOP_PATCHLIST_17:
355    case TOP_PATCHLIST_18:
356    case TOP_PATCHLIST_19:
357    case TOP_PATCHLIST_20:
358    case TOP_PATCHLIST_21:
359    case TOP_PATCHLIST_22:
360    case TOP_PATCHLIST_23:
361    case TOP_PATCHLIST_24:
362    case TOP_PATCHLIST_25:
363    case TOP_PATCHLIST_26:
364    case TOP_PATCHLIST_27:
365    case TOP_PATCHLIST_28:
366    case TOP_PATCHLIST_29:
367    case TOP_PATCHLIST_30:
368    case TOP_PATCHLIST_31:
369    case TOP_PATCHLIST_32:
370        return numPrims * (mode - TOP_PATCHLIST_BASE);
371
372    case TOP_POLYGON:
373    case TOP_POINT_LIST_BF:
374    case TOP_LINE_STRIP_CONT:
375    case TOP_LINE_STRIP_BF:
376    case TOP_LINE_STRIP_CONT_BF:
377    case TOP_TRIANGLE_FAN_NOSTIPPLE:
378    case TOP_TRI_STRIP_REVERSE:
379    case TOP_PATCHLIST_BASE:
380    case TOP_UNKNOWN:
381        SWR_ASSERT(false, "Unsupported topology: %d", mode);
382        return 0;
383    }
384
385    return 0;
386}
387
388//////////////////////////////////////////////////////////////////////////
389/// @brief Return number of verts per primitive.
390/// @param topology - topology
391/// @param includeAdjVerts - include adjacent verts in primitive vertices
392INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
393{
394    uint32_t numVerts = 0;
395    switch (topology)
396    {
397    case TOP_POINT_LIST:
398    case TOP_POINT_LIST_BF:
399        numVerts = 1;
400        break;
401    case TOP_LINE_LIST:
402    case TOP_LINE_STRIP:
403    case TOP_LINE_LIST_ADJ:
404    case TOP_LINE_LOOP:
405    case TOP_LINE_STRIP_CONT:
406    case TOP_LINE_STRIP_BF:
407    case TOP_LISTSTRIP_ADJ:
408        numVerts = 2;
409        break;
410    case TOP_TRIANGLE_LIST:
411    case TOP_TRIANGLE_STRIP:
412    case TOP_TRIANGLE_FAN:
413    case TOP_TRI_LIST_ADJ:
414    case TOP_TRI_STRIP_ADJ:
415    case TOP_TRI_STRIP_REVERSE:
416    case TOP_RECT_LIST:
417        numVerts = 3;
418        break;
419    case TOP_QUAD_LIST:
420    case TOP_QUAD_STRIP:
421        numVerts = 4;
422        break;
423    case TOP_PATCHLIST_1:
424    case TOP_PATCHLIST_2:
425    case TOP_PATCHLIST_3:
426    case TOP_PATCHLIST_4:
427    case TOP_PATCHLIST_5:
428    case TOP_PATCHLIST_6:
429    case TOP_PATCHLIST_7:
430    case TOP_PATCHLIST_8:
431    case TOP_PATCHLIST_9:
432    case TOP_PATCHLIST_10:
433    case TOP_PATCHLIST_11:
434    case TOP_PATCHLIST_12:
435    case TOP_PATCHLIST_13:
436    case TOP_PATCHLIST_14:
437    case TOP_PATCHLIST_15:
438    case TOP_PATCHLIST_16:
439    case TOP_PATCHLIST_17:
440    case TOP_PATCHLIST_18:
441    case TOP_PATCHLIST_19:
442    case TOP_PATCHLIST_20:
443    case TOP_PATCHLIST_21:
444    case TOP_PATCHLIST_22:
445    case TOP_PATCHLIST_23:
446    case TOP_PATCHLIST_24:
447    case TOP_PATCHLIST_25:
448    case TOP_PATCHLIST_26:
449    case TOP_PATCHLIST_27:
450    case TOP_PATCHLIST_28:
451    case TOP_PATCHLIST_29:
452    case TOP_PATCHLIST_30:
453    case TOP_PATCHLIST_31:
454    case TOP_PATCHLIST_32:
455        numVerts = topology - TOP_PATCHLIST_BASE;
456        break;
457    default:
458        SWR_ASSERT(false, "Unsupported topology: %d", topology);
459        break;
460    }
461
462    if (includeAdjVerts)
463    {
464        switch (topology)
465        {
466        case TOP_LISTSTRIP_ADJ:
467        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
468        case TOP_TRI_STRIP_ADJ:
469        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
470        default: break;
471        }
472    }
473
474    return numVerts;
475}
476
477//////////////////////////////////////////////////////////////////////////
478/// @brief Generate mask from remaining work.
479/// @param numWorkItems - Number of items being worked on by a SIMD.
480static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
481{
482    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
483    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
484    return _simd_castps_si(vMask(mask));
485}
486
487//////////////////////////////////////////////////////////////////////////
488/// @brief StreamOut - Streams vertex data out to SO buffers.
489///        Generally, we are only streaming out a SIMDs worth of triangles.
490/// @param pDC - pointer to draw context.
491/// @param workerId - thread's worker id. Even thread has a unique id.
492/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
493static void StreamOut(
494    DRAW_CONTEXT* pDC,
495    PA_STATE& pa,
496    uint32_t workerId,
497    uint32_t* pPrimData,
498    uint32_t streamIndex)
499{
500    SWR_CONTEXT *pContext = pDC->pContext;
501
502    AR_BEGIN(FEStreamout, pDC->drawId);
503
504    const API_STATE& state = GetApiState(pDC);
505    const SWR_STREAMOUT_STATE &soState = state.soState;
506
507    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
508
509    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
510    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
511
512    SWR_STREAMOUT_CONTEXT soContext = { 0 };
513
514    // Setup buffer state pointers.
515    for (uint32_t i = 0; i < 4; ++i)
516    {
517        soContext.pBuffer[i] = &state.soBuffer[i];
518    }
519
520    uint32_t numPrims = pa.NumPrims();
521    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
522    {
523        DWORD slot = 0;
524        uint32_t soMask = soState.streamMasks[streamIndex];
525
526        // Write all entries into primitive data buffer for SOS.
527        while (_BitScanForward(&slot, soMask))
528        {
529            __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
530            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
531            pa.AssembleSingle(paSlot, primIndex, attrib);
532
533            // Attribute offset is relative offset from start of vertex.
534            // Note that attributes start at slot 1 in the PA buffer. We need to write this
535            // to prim data starting at slot 0. Which is why we do (slot - 1).
536            // Also note: GL works slightly differently, and needs slot 0
537            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
538
539            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
540            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
541            {
542                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
543
544                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
545            }
546            soMask &= ~(1 << slot);
547        }
548
549        // Update pPrimData pointer
550        soContext.pPrimData = pPrimData;
551
552        // Call SOS
553        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
554        state.pfnSoFunc[streamIndex](soContext);
555    }
556
557    // Update SO write offset. The driver provides memory for the update.
558    for (uint32_t i = 0; i < 4; ++i)
559    {
560        if (state.soBuffer[i].pWriteOffset)
561        {
562            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
563        }
564
565        if (state.soBuffer[i].soWriteEnable)
566        {
567            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
568            pDC->dynState.SoWriteOffsetDirty[i] = true;
569        }
570    }
571
572    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
573    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
574
575    AR_END(FEStreamout, 1);
576}
577
578//////////////////////////////////////////////////////////////////////////
579/// @brief Computes number of invocations. The current index represents
580///        the start of the SIMD. The max index represents how much work
581///        items are remaining. If there is less then a SIMD's xmin of work
582///        then return the remaining amount of work.
583/// @param curIndex - The start index for the SIMD.
584/// @param maxIndex - The last index for all work items.
585static INLINE uint32_t GetNumInvocations(
586    uint32_t curIndex,
587    uint32_t maxIndex)
588{
589    uint32_t remainder = (maxIndex - curIndex);
590    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
591}
592
593//////////////////////////////////////////////////////////////////////////
594/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
595///        The geometry shader will loop over each active streamout buffer, assembling
596///        primitives for the downstream stages. When multistream output is enabled,
597///        the generated stream ID buffer from the GS needs to be converted to a cut
598///        buffer for the primitive assembler.
599/// @param stream - stream id to generate the cut buffer for
600/// @param pStreamIdBase - pointer to the stream ID buffer
601/// @param numEmittedVerts - Number of total verts emitted by the GS
602/// @param pCutBuffer - output buffer to write cuts to
603void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
604{
605    SWR_ASSERT(stream < MAX_SO_STREAMS);
606
607    uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
608    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
609
610    for (uint32_t b = 0; b < numOutputBytes; ++b)
611    {
612        uint8_t curInputByte = pStreamIdBase[2*b];
613        uint8_t outByte = 0;
614        for (uint32_t i = 0; i < 4; ++i)
615        {
616            if ((curInputByte & 0x3) != stream)
617            {
618                outByte |= (1 << i);
619            }
620            curInputByte >>= 2;
621        }
622
623        curInputByte = pStreamIdBase[2 * b + 1];
624        for (uint32_t i = 0; i < 4; ++i)
625        {
626            if ((curInputByte & 0x3) != stream)
627            {
628                outByte |= (1 << (i + 4));
629            }
630            curInputByte >>= 2;
631        }
632
633        *pCutBuffer++ = outByte;
634    }
635}
636
637THREAD SWR_GS_CONTEXT tlsGsContext;
638
639//////////////////////////////////////////////////////////////////////////
640/// @brief Implements GS stage.
641/// @param pDC - pointer to draw context.
642/// @param workerId - thread's worker id. Even thread has a unique id.
643/// @param pa - The primitive assembly object.
644/// @param pGsOut - output stream for GS
645template <
646    typename HasStreamOutT,
647    typename HasRastT>
648static void GeometryShaderStage(
649    DRAW_CONTEXT *pDC,
650    uint32_t workerId,
651    PA_STATE& pa,
652    void* pGsOut,
653    void* pCutBuffer,
654    void* pStreamCutBuffer,
655    uint32_t* pSoPrimData,
656    simdscalari primID)
657{
658    SWR_CONTEXT *pContext = pDC->pContext;
659
660    AR_BEGIN(FEGeometryShader, pDC->drawId);
661
662    const API_STATE& state = GetApiState(pDC);
663    const SWR_GS_STATE* pState = &state.gsState;
664
665    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
666    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
667
668    tlsGsContext.pStream = (uint8_t*)pGsOut;
669    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
670    tlsGsContext.PrimitiveID = primID;
671
672    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
673    simdvector attrib[MAX_ATTRIBUTES];
674
675    // assemble all attributes for the input primitive
676    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
677    {
678        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
679        pa.Assemble(attribSlot, attrib);
680
681        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
682        {
683            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
684        }
685    }
686
687    // assemble position
688    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
689    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
690    {
691        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
692    }
693
694    const uint32_t vertexStride = sizeof(simdvertex);
695    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
696    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
697    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
698    uint32_t cutPrimStride;
699    uint32_t cutInstanceStride;
700
701    if (pState->isSingleStream)
702    {
703        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
704        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
705    }
706    else
707    {
708        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
709        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
710    }
711
712    // record valid prims from the frontend to avoid over binning the newly generated
713    // prims from the GS
714    uint32_t numInputPrims = pa.NumPrims();
715
716    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
717    {
718        tlsGsContext.InstanceID = instance;
719        tlsGsContext.mask = GenerateMask(numInputPrims);
720
721        // execute the geometry shader
722        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
723
724        tlsGsContext.pStream += instanceStride;
725        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
726    }
727
728    // set up new binner and state for the GS output topology
729    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
730    if (HasRastT::value)
731    {
732        switch (pState->outputTopology)
733        {
734        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
735        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
736        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
737        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
738        }
739    }
740
741    // foreach input prim:
742    // - setup a new PA based on the emitted verts for that prim
743    // - loop over the new verts, calling PA to assemble each prim
744    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
745    uint32_t* pPrimitiveId = (uint32_t*)&primID;
746
747    uint32_t totalPrimsGenerated = 0;
748    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
749    {
750        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
751        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
752        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
753        {
754            uint32_t numEmittedVerts = pVertexCount[inputPrim];
755            if (numEmittedVerts == 0)
756            {
757                continue;
758            }
759
760            uint8_t* pBase = pInstanceBase + instance * instanceStride;
761            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
762
763            uint32_t numAttribs = state.feNumAttributes;
764
765            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
766            {
767                bool processCutVerts = false;
768
769                uint8_t* pCutBuffer = pCutBase;
770
771                // assign default stream ID, only relevant when GS is outputting a single stream
772                uint32_t streamID = 0;
773                if (pState->isSingleStream)
774                {
775                    processCutVerts = true;
776                    streamID = pState->singleStreamID;
777                    if (streamID != stream) continue;
778                }
779                else
780                {
781                    // early exit if this stream is not enabled for streamout
782                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
783                    {
784                        continue;
785                    }
786
787                    // multi-stream output, need to translate StreamID buffer to a cut buffer
788                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
789                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
790                    processCutVerts = false;
791                }
792
793                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
794
795                while (gsPa.GetNextStreamOutput())
796                {
797                    do
798                    {
799                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
800
801                        if (assemble)
802                        {
803                            totalPrimsGenerated += gsPa.NumPrims();
804
805                            if (HasStreamOutT::value)
806                            {
807                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
808                            }
809
810                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
811                            {
812                                simdscalari vPrimId;
813                                // pull primitiveID from the GS output if available
814                                if (state.gsState.emitsPrimitiveID)
815                                {
816                                    simdvector primIdAttrib[3];
817                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
818                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
819                                }
820                                else
821                                {
822                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
823                                }
824
825                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
826                                simdscalari vViewPortIdx;
827                                if (state.gsState.emitsViewportArrayIndex)
828                                {
829                                    simdvector vpiAttrib[3];
830                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
831
832                                    // OOB indices => forced to zero.
833                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
834                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
835                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
836
837                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
838                                }
839                                else
840                                {
841                                    vViewPortIdx = _simd_set1_epi32(0);
842                                }
843
844                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
845                            }
846                        }
847                    } while (gsPa.NextPrim());
848                }
849            }
850        }
851    }
852
853    // update GS pipeline stats
854    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
855    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
856	AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
857    AR_END(FEGeometryShader, 1);
858}
859
860//////////////////////////////////////////////////////////////////////////
861/// @brief Allocate GS buffers
862/// @param pDC - pointer to draw context.
863/// @param state - API state
864/// @param ppGsOut - pointer to GS output buffer allocation
865/// @param ppCutBuffer - pointer to GS output cut buffer allocation
866static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
867    void **ppStreamCutBuffer)
868{
869    auto pArena = pDC->pArena;
870    SWR_ASSERT(pArena != nullptr);
871    SWR_ASSERT(state.gsState.gsEnable);
872    // allocate arena space to hold GS output verts
873    // @todo pack attribs
874    // @todo support multiple streams
875    const uint32_t vertexStride = sizeof(simdvertex);
876    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
877    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
878    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
879
880    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
881    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
882    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
883    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
884
885    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
886    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
887
888    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
889    if (state.gsState.isSingleStream)
890    {
891        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
892        *ppStreamCutBuffer = nullptr;
893    }
894    else
895    {
896        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
897        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
898    }
899
900}
901
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS. One instance is kept per thread and reused across
/// draws (see gt_pTessellationThreadData).
struct TessellationThreadLocalData
{
    // Context handed to the hull shader; its pCPout is pointed at patchData.
    SWR_HS_CONTEXT hsContext;
    // Hull shader output, one patch per SIMD lane.
    ScalarPatch patchData[KNOB_SIMD_WIDTH];
    // Memory block passed to TSInitCtx as the tessellator context storage.
    void* pTxCtx;
    // Size passed to TSInitCtx alongside pTxCtx; also used as the allocation size for pTxCtx.
    size_t tsCtxSize;

    // Domain shader output storage; re-allocated when a patch needs more room.
    simdscalar* pDSOutput;
    // Capacity of pDSOutput, counted in simdvector-sized entries.
    size_t numDSOutputVectors;
};
915
// Tessellation scratch data for the current thread. Starts out null and is
// allocated on first use by AllocateTessellationData().
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
917
918//////////////////////////////////////////////////////////////////////////
919/// @brief Allocate tessellation data for this worker thread.
920INLINE
921static void AllocateTessellationData(SWR_CONTEXT* pContext)
922{
923    /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
924    if (gt_pTessellationThreadData == nullptr)
925    {
926        gt_pTessellationThreadData = (TessellationThreadLocalData*)
927            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
928        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
929    }
930}
931
932//////////////////////////////////////////////////////////////////////////
933/// @brief Implements Tessellation Stages.
934/// @param pDC - pointer to draw context.
935/// @param workerId - thread's worker id. Even thread has a unique id.
936/// @param pa - The primitive assembly object.
937/// @param pGsOut - output stream for GS
938template <
939    typename HasGeometryShaderT,
940    typename HasStreamOutT,
941    typename HasRastT>
942static void TessellationStages(
943    DRAW_CONTEXT *pDC,
944    uint32_t workerId,
945    PA_STATE& pa,
946    void* pGsOut,
947    void* pCutBuffer,
948    void* pCutStreamBuffer,
949    uint32_t* pSoPrimData,
950    simdscalari primID)
951{
952    SWR_CONTEXT *pContext = pDC->pContext;
953    const API_STATE& state = GetApiState(pDC);
954    const SWR_TS_STATE& tsState = state.tsState;
955
956    SWR_ASSERT(gt_pTessellationThreadData);
957
958    HANDLE tsCtx = TSInitCtx(
959        tsState.domain,
960        tsState.partitioning,
961        tsState.tsOutputTopology,
962        gt_pTessellationThreadData->pTxCtx,
963        gt_pTessellationThreadData->tsCtxSize);
964    if (tsCtx == nullptr)
965    {
966        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
967        tsCtx = TSInitCtx(
968            tsState.domain,
969            tsState.partitioning,
970            tsState.tsOutputTopology,
971            gt_pTessellationThreadData->pTxCtx,
972            gt_pTessellationThreadData->tsCtxSize);
973    }
974    SWR_ASSERT(tsCtx);
975
976    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
977    if (HasRastT::value)
978    {
979        switch (tsState.postDSTopology)
980        {
981        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
982        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
983        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
984        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
985        }
986    }
987
988    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
989    hsContext.pCPout = gt_pTessellationThreadData->patchData;
990    hsContext.PrimitiveID = primID;
991
992    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
993    // Max storage for one attribute for an entire simdprimitive
994    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
995
996    // assemble all attributes for the input primitives
997    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
998    {
999        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1000        pa.Assemble(attribSlot, simdattrib);
1001
1002        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1003        {
1004            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1005        }
1006    }
1007
1008#if defined(_DEBUG)
1009    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1010#endif
1011
1012    uint32_t numPrims = pa.NumPrims();
1013    hsContext.mask = GenerateMask(numPrims);
1014
1015    // Run the HS
1016    AR_BEGIN(FEHullShader, pDC->drawId);
1017    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1018    AR_END(FEHullShader, 0);
1019
1020    UPDATE_STAT_FE(HsInvocations, numPrims);
1021
1022    const uint32_t* pPrimId = (const uint32_t*)&primID;
1023
1024    for (uint32_t p = 0; p < numPrims; ++p)
1025    {
1026        // Run Tessellator
1027        SWR_TS_TESSELLATED_DATA tsData = { 0 };
1028        AR_BEGIN(FETessellation, pDC->drawId);
1029        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1030		AR_EVENT(TessPrimCount(1));
1031        AR_END(FETessellation, 0);
1032
1033        if (tsData.NumPrimitives == 0)
1034        {
1035            continue;
1036        }
1037        SWR_ASSERT(tsData.NumDomainPoints);
1038
1039        // Allocate DS Output memory
1040        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1041        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1042        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1043        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
1044        {
1045            AlignedFree(gt_pTessellationThreadData->pDSOutput);
1046            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1047            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1048        }
1049        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1050        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1051
1052#if defined(_DEBUG)
1053        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1054#endif
1055
1056        // Run Domain Shader
1057        SWR_DS_CONTEXT dsContext;
1058        dsContext.PrimitiveID = pPrimId[p];
1059        dsContext.pCpIn = &hsContext.pCPout[p];
1060        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1061        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1062        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1063        dsContext.vectorStride = requiredDSVectorInvocations;
1064
1065        uint32_t dsInvocations = 0;
1066
1067        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1068        {
1069            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1070
1071            AR_BEGIN(FEDomainShader, pDC->drawId);
1072            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1073            AR_END(FEDomainShader, 0);
1074
1075            dsInvocations += KNOB_SIMD_WIDTH;
1076        }
1077        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1078
1079        PA_TESS tessPa(
1080            pDC,
1081            dsContext.pOutputData,
1082            dsContext.vectorStride,
1083            tsState.numDsOutputAttribs,
1084            tsData.ppIndices,
1085            tsData.NumPrimitives,
1086            tsState.postDSTopology);
1087
1088        while (tessPa.HasWork())
1089        {
1090            if (HasGeometryShaderT::value)
1091            {
1092                GeometryShaderStage<HasStreamOutT, HasRastT>(
1093                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1094                    _simd_set1_epi32(dsContext.PrimitiveID));
1095            }
1096            else
1097            {
1098                if (HasStreamOutT::value)
1099                {
1100                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1101                }
1102
1103                if (HasRastT::value)
1104                {
1105                    simdvector prim[3]; // Only deal with triangles, lines, or points
1106                    AR_BEGIN(FEPAAssemble, pDC->drawId);
1107#if SWR_ENABLE_ASSERTS
1108                    bool assemble =
1109#endif
1110                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1111                    AR_END(FEPAAssemble, 1);
1112                    SWR_ASSERT(assemble);
1113
1114                    SWR_ASSERT(pfnClipFunc);
1115                    pfnClipFunc(pDC, tessPa, workerId, prim,
1116                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
1117                }
1118            }
1119
1120            tessPa.NextPrim();
1121
1122        } // while (tessPa.HasWork())
1123    } // for (uint32_t p = 0; p < numPrims; ++p)
1124
1125    TSDestroyCtx(tsCtx);
1126}
1127
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw. For each instance, fetches and shades
///        vertices one SIMD at a time, assembles primitives, and hands
///        them to tessellation, the geometry shader, or stream-out and
///        the primitive processing function.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut index enabled (only forwarded to PA_FACTORY here)
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    // Toss point: drop the draw before any front-end work is done.
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
    const API_STATE&    state = GetApiState(pDC);
    // Per-lane offsets 0..7; added to the start vertex ID to build the
    // vertex indices of a non-indexed draw.
    __m256i             vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT      vsContext;
    simdvertex          vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws: address just past the last index this draw requests,
    // computed with the element size of the index format.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            // Unknown index format: indexSize stays 0 and pLastRequestedIndex stays null.
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    // The vertex shader reads the fetched SIMD vertex from 'vin'.
    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    // numPrims is only referenced by the AR_END at the bottom of this function.
    // NOTE(review): presumably AR_END discards its arguments when neither macro
    // is defined - confirm.
#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // GS buffers stay null unless the geometry shader stage is enabled.
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    // The template selection must agree with the tessellation state of the draw.
    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        // Offset of the first vertex in the current SIMD; advances by KNOB_SIMD_WIDTH.
        uint32_t  i = 0;

        if (IsIndexedT::value)
        {
            // Restart at the beginning of the index buffer for every instance.
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // Non-indexed: generate the indices in vIndex and let the fetch read them from there.
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            // The vertex shader writes straight into the PA's next output slot.
            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute fetch shader / vertex shader for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // Assemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

                // The toss-point checks below only exist when toss points are compiled in;
                // otherwise the braces are plain nested scopes.
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Tessellation takes priority; it invokes the GS stage itself when one is enabled.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    // Viewport array index is always 0 on this path.
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // Advance to the next SIMD of vertices.
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                // Step the index pointer by one SIMD worth of indices, in bytes.
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        // Rewind the primitive assembler before the next instance.
        pa.Reset();
    }


    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1395
1396struct FEDrawChooser
1397{
1398    typedef PFN_FE_WORK_FUNC FuncType;
1399
1400    template <typename... ArgsB>
1401    static FuncType GetFunc()
1402    {
1403        return ProcessDraw<ArgsB...>;
1404    }
1405};
1406
1407
1408// Selector for correct templated Draw front-end function
1409PFN_FE_WORK_FUNC GetProcessDrawFunc(
1410    bool IsIndexed,
1411    bool IsCutIndexEnabled,
1412    bool HasTessellation,
1413    bool HasGeometryShader,
1414    bool HasStreamOut,
1415    bool HasRasterization)
1416{
1417    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
1418}