1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file pa.h
24*
25* @brief Definitions for primitive assembly.
26*        N primitives are assembled at a time, where N is the SIMD width.
27*        A state machine, that is specific for a given topology, drives the
28*        assembly of vertices into triangles.
29*
30******************************************************************************/
31#pragma once
32
33#include "frontend.h"
34
35struct PA_STATE
36{
37    DRAW_CONTEXT *pDC{ nullptr };              // draw context
38    uint8_t* pStreamBase{ nullptr };           // vertex stream
39    uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
40
41    // The topology the binner will use. In some cases the FE changes the topology from the api state.
42    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
43
44    PA_STATE() {}
45    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
46        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
47
48    virtual bool HasWork() = 0;
49    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
50    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
51    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
52    virtual bool NextPrim() = 0;
53    virtual simdvertex& GetNextVsOutput() = 0;
54    virtual bool GetNextStreamOutput() = 0;
55    virtual simdmask& GetNextVsIndices() = 0;
56    virtual uint32_t NumPrims() = 0;
57    virtual void Reset() = 0;
58    virtual simdscalari GetPrimID(uint32_t startID) = 0;
59};
60
61// The Optimized PA is a state machine that assembles triangles from vertex shader simd
62// output. Here is the sequence
63//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
64//    2. Execute PA function to assemble and bin triangles.
65//        a.    The PA function is a set of functions that collectively make up the
66//            state machine for a given topology.
67//                1.    We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
69//                1.    We call this the current and previous simd vertex.
70//                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
71//                    order to assemble the second triangle, for a triangle list, we'll need the
72//                    last vertex from the previous simd and the first 2 vertices from the current simd.
73//                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
74//
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
// without cuts.
77struct PA_STATE_OPT : public PA_STATE
78{
79    simdvertex leadingVertex;            // For tri-fan
80    uint32_t numPrims{ 0 };              // Total number of primitives for draw.
81    uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
82
83    uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
84
85    uint32_t cur{ 0 };                   // index to current VS output.
86    uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
87    uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
88
89    uint32_t counter{ 0 };               // state counter
90    bool reset{ false };                 // reset state
91
92    uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
93    simdscalari primID;
94
95    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
96    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
97
98    PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
99    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
100    PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
101
102    // state used to advance the PA when Next is called
103    PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
104    uint32_t           nextNumSimdPrims{ 0 };
105    uint32_t           nextNumPrimsIncrement{ 0 };
106    bool               nextReset{ false };
107    bool               isStreaming{ false };
108
109    simdmask tmpIndices{ 0 };            // temporary index store for unused virtual function
110
111    PA_STATE_OPT() {}
112    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
113        bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
114
115    bool HasWork()
116    {
117        return (this->numPrimsComplete < this->numPrims) ? true : false;
118    }
119
120    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
121    {
122        simdvertex* pVertex = (simdvertex*)pStreamBase;
123        return pVertex[index].attrib[slot];
124    }
125
126    // Assembles 4 triangles. Each simdvector is a single vertex from 4
127    // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
128    bool Assemble(uint32_t slot, simdvector verts[])
129    {
130        return this->pfnPaFunc(*this, slot, verts);
131    }
132
133    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
134    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
135    {
136        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
137    }
138
139    bool NextPrim()
140    {
141        this->pfnPaFunc = this->pfnPaNextFunc;
142        this->numSimdPrims = this->nextNumSimdPrims;
143        this->numPrimsComplete += this->nextNumPrimsIncrement;
144        this->reset = this->nextReset;
145
146        if (this->isStreaming)
147        {
148            this->reset = false;
149        }
150
151        bool morePrims = false;
152
153        if (this->numSimdPrims > 0)
154        {
155            morePrims = true;
156            this->numSimdPrims--;
157        }
158        else
159        {
160            this->counter = (this->reset) ? 0 : (this->counter + 1);
161            this->reset = false;
162        }
163
164        this->pfnPaFunc = this->pfnPaNextFunc;
165
166        if (!HasWork())
167        {
168            morePrims = false;    // no more to do
169        }
170
171        return morePrims;
172    }
173
174    simdvertex& GetNextVsOutput()
175    {
176        // increment cur and prev indices
177        const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
178        this->prev = this->cur;  // prev is undefined for first state.
179        this->cur = this->counter % numSimdVerts;
180
181        simdvertex* pVertex = (simdvertex*)pStreamBase;
182        return pVertex[this->cur];
183    }
184
185    simdmask& GetNextVsIndices()
186    {
187        // unused in optimized PA, pass tmp buffer back
188        return tmpIndices;
189    }
190
191    bool GetNextStreamOutput()
192    {
193        this->prev = this->cur;
194        this->cur = this->counter;
195
196        return HasWork();
197    }
198
199    uint32_t NumPrims()
200    {
201        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
202            (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
203    }
204
205    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
206        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
207        uint32_t numSimdPrims = 0,
208        uint32_t numPrimsIncrement = 0,
209        bool reset = false)
210    {
211        this->pfnPaNextFunc = pfnPaNextFunc;
212        this->nextNumSimdPrims = numSimdPrims;
213        this->nextNumPrimsIncrement = numPrimsIncrement;
214        this->nextReset = reset;
215
216        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
217    }
218
219    void Reset()
220    {
221        this->pfnPaFunc = this->pfnPaFuncReset;
222        this->numPrimsComplete = 0;
223        this->numSimdPrims = 0;
224        this->cur = 0;
225        this->prev = 0;
226        this->first = 0;
227        this->counter = 0;
228        this->reset = false;
229    }
230
231    simdscalari GetPrimID(uint32_t startID)
232    {
233        return _simd_add_epi32(this->primID,
234            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
235    }
236};
237
238// helper C wrappers to avoid having to rewrite all the PA topology state functions
239INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
240    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
241    uint32_t numSimdPrims = 0,
242    uint32_t numPrimsIncrement = 0,
243    bool reset = false)
244{
245    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
246}
247INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
248{
249    return pa.GetSimdVector(index, slot);
250}
251
252INLINE __m128 swizzleLane0(const simdvector &a)
253{
254    simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
255    simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
256    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
257}
258
259INLINE __m128 swizzleLane1(const simdvector &a)
260{
261    simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
262    simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
263    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
264}
265
266INLINE __m128 swizzleLane2(const simdvector &a)
267{
268    simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
269    simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
270    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
271}
272
273INLINE __m128 swizzleLane3(const simdvector &a)
274{
275    simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
276    simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
277    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
278}
279
280INLINE __m128 swizzleLane4(const simdvector &a)
281{
282    simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
283    simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
284    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
285
286}
287
288INLINE __m128 swizzleLane5(const simdvector &a)
289{
290    simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
291    simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
292    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
293}
294
295INLINE __m128 swizzleLane6(const simdvector &a)
296{
297    simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
298    simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
299    return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
300}
301
302INLINE __m128 swizzleLane7(const simdvector &a)
303{
304    simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
305    simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
306    return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
307}
308
309INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
310{
311    switch (lane) {
312    case 0:
313        return swizzleLane0(a);
314    case 1:
315        return swizzleLane1(a);
316    case 2:
317        return swizzleLane2(a);
318    case 3:
319        return swizzleLane3(a);
320    case 4:
321        return swizzleLane4(a);
322    case 5:
323        return swizzleLane5(a);
324    case 6:
325        return swizzleLane6(a);
326    case 7:
327        return swizzleLane7(a);
328    default:
329        return _mm_setzero_ps();
330    }
331}
332
333// Cut-aware primitive assembler.
334struct PA_STATE_CUT : public PA_STATE
335{
336    simdmask* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
337    uint32_t numVerts{ 0 };              // number of vertices available in buffer store
338    uint32_t numAttribs{ 0 };            // number of attributes
339    int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
340    uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
341    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH];    // current index buffer for gather
342    simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
343    uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
344    uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
345    uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
346    uint32_t curVertex{ 0 };             // current unprocessed vertex
347    uint32_t startPrimId{ 0 };           // starting prim id
348    simdscalari vPrimId;                 // vector of prim ID
349    bool needOffsets{ false };           // need to compute gather offsets for current SIMD
350    uint32_t vertsPerPrim{ 0 };
351    simdvertex tmpVertex;                // temporary simdvertex for unimplemented API
352    bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
353                                         // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
354                                         // while the GS sends valid verts for every index
355    // Topology state tracking
356    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
357    uint32_t curIndex{ 0 };
358    bool reverseWinding{ false };        // indicates reverse winding for strips
359    int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
360
361    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
362    PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
363
364    PA_STATE_CUT() {}
365    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
366        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
367        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
368    {
369        numVerts = in_streamSizeInVerts;
370        numAttribs = in_numAttribs;
371        binTopology = topo;
372        needOffsets = false;
373        processCutVerts = in_processCutVerts;
374
375        numVertsToAssemble = numRemainingVerts = in_numVerts;
376        numPrimsAssembled = 0;
377        headVertex = tailVertex = curVertex = 0;
378
379        curIndex = 0;
380        pCutIndices = in_pIndices;
381        memset(indices, 0, sizeof(indices));
382        vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
383        reverseWinding = false;
384        adjExtraVert = -1;
385
386        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
387        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
388
389        switch (topo)
390        {
391        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
392        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
393        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
394        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
395                                    {
396                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
397                                    }
398                                    else
399                                    {
400                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
401                                    }
402                                    break;
403
404        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
405        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
406        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
407        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
408        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
409        default: assert(0 && "Unimplemented topology");
410        }
411    }
412
413    simdvertex& GetNextVsOutput()
414    {
415        uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
416        this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
417        this->needOffsets = true;
418        return ((simdvertex*)pStreamBase)[vertexIndex];
419    }
420
421    simdmask& GetNextVsIndices()
422    {
423        uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
424        simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
425        return *pCurCutIndex;
426    }
427
428    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
429    {
430        // unused
431        SWR_ASSERT(0 && "Not implemented");
432        return this->tmpVertex.attrib[0];
433    }
434
435    bool GetNextStreamOutput()
436    {
437        this->headVertex += KNOB_SIMD_WIDTH;
438        this->needOffsets = true;
439        return HasWork();
440    }
441
442    simdscalari GetPrimID(uint32_t startID)
443    {
444        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
445    }
446
447    void Reset()
448    {
449        this->numRemainingVerts = this->numVertsToAssemble;
450        this->numPrimsAssembled = 0;
451        this->curIndex = 0;
452        this->curVertex = 0;
453        this->tailVertex = 0;
454        this->headVertex = 0;
455        this->reverseWinding = false;
456        this->adjExtraVert = -1;
457        this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
458    }
459
460    bool HasWork()
461    {
462        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
463    }
464
465    bool IsVertexStoreFull()
466    {
467        return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
468    }
469
470    void RestartTopology()
471    {
472        this->curIndex = 0;
473        this->reverseWinding = false;
474        this->adjExtraVert = -1;
475    }
476
477    bool IsCutIndex(uint32_t vertex)
478    {
479        uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
480        uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
481        return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
482    }
483
484    // iterates across the unprocessed verts until we hit the end or we
485    // have assembled SIMD prims
486    void ProcessVerts()
487    {
488        while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
489            this->numRemainingVerts > 0 &&
490            this->curVertex != this->headVertex)
491        {
492            // if cut index, restart topology
493            if (IsCutIndex(this->curVertex))
494            {
495                if (this->processCutVerts)
496                {
497                    (this->*pfnPa)(this->curVertex, false);
498                }
499                // finish off tri strip w/ adj before restarting topo
500                if (this->adjExtraVert != -1)
501                {
502                    (this->*pfnPa)(this->curVertex, true);
503                }
504                RestartTopology();
505            }
506            else
507            {
508                (this->*pfnPa)(this->curVertex, false);
509            }
510
511            this->curVertex++;
512            if (this->curVertex >= this->numVerts) {
513               this->curVertex = 0;
514            }
515            this->numRemainingVerts--;
516        }
517
518        // special case last primitive for tri strip w/ adj
519        if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
520        {
521            (this->*pfnPa)(this->curVertex, true);
522        }
523    }
524
525    void Advance()
526    {
527        // done with current batch
528        // advance tail to the current unsubmitted vertex
529        this->tailVertex = this->curVertex;
530        this->numPrimsAssembled = 0;
531        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
532    }
533
534    bool NextPrim()
535    {
536        // if we've assembled enough prims, we can advance to the next set of verts
537        if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
538        {
539            Advance();
540        }
541        return false;
542    }
543
544    void ComputeOffsets()
545    {
546        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
547        {
548            simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
549
550            // step to simdvertex batch
551            const uint32_t simdShift = 3; // @todo make knob
552            simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
553            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
554
555            // step to index
556            const uint32_t simdMask = 0x7; // @todo make knob
557            simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
558            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
559        }
560    }
561
562    bool Assemble(uint32_t slot, simdvector result[])
563    {
564        // process any outstanding verts
565        ProcessVerts();
566
567        // return false if we don't have enough prims assembled
568        if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
569        {
570            return false;
571        }
572
573        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
574        if (this->needOffsets)
575        {
576            ComputeOffsets();
577            this->needOffsets = false;
578        }
579
580        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
581        {
582            simdscalari offsets = this->vOffsets[v];
583
584            // step to attribute
585            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
586
587            float* pBase = (float*)this->pStreamBase;
588            for (uint32_t c = 0; c < 4; ++c)
589            {
590                result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
591
592                // move base to next component
593                pBase += KNOB_SIMD_WIDTH;
594            }
595        }
596
597        return true;
598    }
599
600    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
601    {
602        // move to slot
603        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
604        {
605            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
606            uint32_t offset = pOffset[triIndex];
607            offset += sizeof(simdvector) * slot;
608            float* pVert = (float*)&tri[v];
609            for (uint32_t c = 0; c < 4; ++c)
610            {
611                float* pComponent = (float*)(this->pStreamBase + offset);
612                pVert[c] = *pComponent;
613                offset += KNOB_SIMD_WIDTH * sizeof(float);
614            }
615        }
616    }
617
618    uint32_t NumPrims()
619    {
620        return this->numPrimsAssembled;
621    }
622
623    // Per-topology functions
624    void ProcessVertTriStrip(uint32_t index, bool finish)
625    {
626        this->vert[this->curIndex] = index;
627        this->curIndex++;
628        if (this->curIndex == 3)
629        {
630            // assembled enough verts for prim, add to gather indices
631            this->indices[0][this->numPrimsAssembled] = this->vert[0];
632            if (reverseWinding)
633            {
634                this->indices[1][this->numPrimsAssembled] = this->vert[2];
635                this->indices[2][this->numPrimsAssembled] = this->vert[1];
636            }
637            else
638            {
639                this->indices[1][this->numPrimsAssembled] = this->vert[1];
640                this->indices[2][this->numPrimsAssembled] = this->vert[2];
641            }
642
643            // increment numPrimsAssembled
644            this->numPrimsAssembled++;
645
646            // set up next prim state
647            this->vert[0] = this->vert[1];
648            this->vert[1] = this->vert[2];
649            this->curIndex = 2;
650            this->reverseWinding ^= 1;
651        }
652    }
653
654    template<bool gsEnabled>
655    void AssembleTriStripAdj()
656    {
657        if (!gsEnabled)
658        {
659            this->vert[1] = this->vert[2];
660            this->vert[2] = this->vert[4];
661
662            this->indices[0][this->numPrimsAssembled] = this->vert[0];
663            this->indices[1][this->numPrimsAssembled] = this->vert[1];
664            this->indices[2][this->numPrimsAssembled] = this->vert[2];
665
666            this->vert[4] = this->vert[2];
667            this->vert[2] = this->vert[1];
668        }
669        else
670        {
671            this->indices[0][this->numPrimsAssembled] = this->vert[0];
672            this->indices[1][this->numPrimsAssembled] = this->vert[1];
673            this->indices[2][this->numPrimsAssembled] = this->vert[2];
674            this->indices[3][this->numPrimsAssembled] = this->vert[3];
675            this->indices[4][this->numPrimsAssembled] = this->vert[4];
676            this->indices[5][this->numPrimsAssembled] = this->vert[5];
677        }
678        this->numPrimsAssembled++;
679    }
680
681
682    template<bool gsEnabled>
683    void ProcessVertTriStripAdj(uint32_t index, bool finish)
684    {
685        // handle last primitive of tristrip
686        if (finish && this->adjExtraVert != -1)
687        {
688            this->vert[3] = this->adjExtraVert;
689            AssembleTriStripAdj<gsEnabled>();
690            this->adjExtraVert = -1;
691            return;
692        }
693
694        switch (this->curIndex)
695        {
696        case 0:
697        case 1:
698        case 2:
699        case 4:
700            this->vert[this->curIndex] = index;
701            this->curIndex++;
702            break;
703        case 3:
704            this->vert[5] = index;
705            this->curIndex++;
706            break;
707        case 5:
708            if (this->adjExtraVert == -1)
709            {
710                this->adjExtraVert = index;
711            }
712            else
713            {
714                this->vert[3] = index;
715                if (!gsEnabled)
716                {
717                    AssembleTriStripAdj<gsEnabled>();
718
719                    uint32_t nextTri[6];
720                    if (this->reverseWinding)
721                    {
722                        nextTri[0] = this->vert[4];
723                        nextTri[1] = this->vert[0];
724                        nextTri[2] = this->vert[2];
725                        nextTri[4] = this->vert[3];
726                        nextTri[5] = this->adjExtraVert;
727                    }
728                    else
729                    {
730                        nextTri[0] = this->vert[2];
731                        nextTri[1] = this->adjExtraVert;
732                        nextTri[2] = this->vert[3];
733                        nextTri[4] = this->vert[4];
734                        nextTri[5] = this->vert[0];
735                    }
736                    for (uint32_t i = 0; i < 6; ++i)
737                    {
738                        this->vert[i] = nextTri[i];
739                    }
740
741                    this->adjExtraVert = -1;
742                    this->reverseWinding ^= 1;
743                }
744                else
745                {
746                    this->curIndex++;
747                }
748            }
749            break;
750        case 6:
751            SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
752            AssembleTriStripAdj<gsEnabled>();
753
754            uint32_t nextTri[6];
755            if (this->reverseWinding)
756            {
757                nextTri[0] = this->vert[4];
758                nextTri[1] = this->vert[0];
759                nextTri[2] = this->vert[2];
760                nextTri[4] = this->vert[3];
761                nextTri[5] = this->adjExtraVert;
762            }
763            else
764            {
765                nextTri[0] = this->vert[2];
766                nextTri[1] = this->adjExtraVert;
767                nextTri[2] = this->vert[3];
768                nextTri[4] = this->vert[4];
769                nextTri[5] = this->vert[0];
770            }
771            for (uint32_t i = 0; i < 6; ++i)
772            {
773                this->vert[i] = nextTri[i];
774            }
775            this->reverseWinding ^= 1;
776            this->adjExtraVert = index;
777            this->curIndex--;
778            break;
779        }
780    }
781
782    void ProcessVertTriList(uint32_t index, bool finish)
783    {
784        this->vert[this->curIndex] = index;
785        this->curIndex++;
786        if (this->curIndex == 3)
787        {
788            // assembled enough verts for prim, add to gather indices
789            this->indices[0][this->numPrimsAssembled] = this->vert[0];
790            this->indices[1][this->numPrimsAssembled] = this->vert[1];
791            this->indices[2][this->numPrimsAssembled] = this->vert[2];
792
793            // increment numPrimsAssembled
794            this->numPrimsAssembled++;
795
796            // set up next prim state
797            this->curIndex = 0;
798        }
799    }
800
801    void ProcessVertTriListAdj(uint32_t index, bool finish)
802    {
803        this->vert[this->curIndex] = index;
804        this->curIndex++;
805        if (this->curIndex == 6)
806        {
807            // assembled enough verts for prim, add to gather indices
808            this->indices[0][this->numPrimsAssembled] = this->vert[0];
809            this->indices[1][this->numPrimsAssembled] = this->vert[1];
810            this->indices[2][this->numPrimsAssembled] = this->vert[2];
811            this->indices[3][this->numPrimsAssembled] = this->vert[3];
812            this->indices[4][this->numPrimsAssembled] = this->vert[4];
813            this->indices[5][this->numPrimsAssembled] = this->vert[5];
814
815            // increment numPrimsAssembled
816            this->numPrimsAssembled++;
817
818            // set up next prim state
819            this->curIndex = 0;
820        }
821    }
822
823    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
824    {
825        this->vert[this->curIndex] = index;
826        this->curIndex++;
827        if (this->curIndex == 6)
828        {
829            // assembled enough verts for prim, add to gather indices
830            this->indices[0][this->numPrimsAssembled] = this->vert[0];
831            this->indices[1][this->numPrimsAssembled] = this->vert[2];
832            this->indices[2][this->numPrimsAssembled] = this->vert[4];
833
834            // increment numPrimsAssembled
835            this->numPrimsAssembled++;
836
837            // set up next prim state
838            this->curIndex = 0;
839        }
840    }
841
842
843    void ProcessVertLineList(uint32_t index, bool finish)
844    {
845        this->vert[this->curIndex] = index;
846        this->curIndex++;
847        if (this->curIndex == 2)
848        {
849            this->indices[0][this->numPrimsAssembled] = this->vert[0];
850            this->indices[1][this->numPrimsAssembled] = this->vert[1];
851
852            this->numPrimsAssembled++;
853            this->curIndex = 0;
854        }
855    }
856
857    void ProcessVertLineStrip(uint32_t index, bool finish)
858    {
859        this->vert[this->curIndex] = index;
860        this->curIndex++;
861        if (this->curIndex == 2)
862        {
863            // assembled enough verts for prim, add to gather indices
864            this->indices[0][this->numPrimsAssembled] = this->vert[0];
865            this->indices[1][this->numPrimsAssembled] = this->vert[1];
866
867            // increment numPrimsAssembled
868            this->numPrimsAssembled++;
869
870            // set up next prim state
871            this->vert[0] = this->vert[1];
872            this->curIndex = 1;
873        }
874    }
875
876    void ProcessVertLineStripAdj(uint32_t index, bool finish)
877    {
878        this->vert[this->curIndex] = index;
879        this->curIndex++;
880        if (this->curIndex == 4)
881        {
882            // assembled enough verts for prim, add to gather indices
883            this->indices[0][this->numPrimsAssembled] = this->vert[0];
884            this->indices[1][this->numPrimsAssembled] = this->vert[1];
885            this->indices[2][this->numPrimsAssembled] = this->vert[2];
886            this->indices[3][this->numPrimsAssembled] = this->vert[3];
887
888            // increment numPrimsAssembled
889            this->numPrimsAssembled++;
890
891            // set up next prim state
892            this->vert[0] = this->vert[1];
893            this->vert[1] = this->vert[2];
894            this->vert[2] = this->vert[3];
895            this->curIndex = 3;
896        }
897    }
898
899    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
900    {
901        this->vert[this->curIndex] = index;
902        this->curIndex++;
903        if (this->curIndex == 4)
904        {
905            // assembled enough verts for prim, add to gather indices
906            this->indices[0][this->numPrimsAssembled] = this->vert[1];
907            this->indices[1][this->numPrimsAssembled] = this->vert[2];
908
909            // increment numPrimsAssembled
910            this->numPrimsAssembled++;
911
912            // set up next prim state
913            this->vert[0] = this->vert[1];
914            this->vert[1] = this->vert[2];
915            this->vert[2] = this->vert[3];
916            this->curIndex = 3;
917        }
918    }
919
920    void ProcessVertLineListAdj(uint32_t index, bool finish)
921    {
922        this->vert[this->curIndex] = index;
923        this->curIndex++;
924        if (this->curIndex == 4)
925        {
926            this->indices[0][this->numPrimsAssembled] = this->vert[0];
927            this->indices[1][this->numPrimsAssembled] = this->vert[1];
928            this->indices[2][this->numPrimsAssembled] = this->vert[2];
929            this->indices[3][this->numPrimsAssembled] = this->vert[3];
930
931            this->numPrimsAssembled++;
932            this->curIndex = 0;
933        }
934    }
935
936    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
937    {
938        this->vert[this->curIndex] = index;
939        this->curIndex++;
940        if (this->curIndex == 4)
941        {
942            this->indices[0][this->numPrimsAssembled] = this->vert[1];
943            this->indices[1][this->numPrimsAssembled] = this->vert[2];
944
945            this->numPrimsAssembled++;
946            this->curIndex = 0;
947        }
948    }
949
950    void ProcessVertPointList(uint32_t index, bool finish)
951    {
952        this->vert[this->curIndex] = index;
953        this->curIndex++;
954        if (this->curIndex == 1)
955        {
956            this->indices[0][this->numPrimsAssembled] = this->vert[0];
957            this->numPrimsAssembled++;
958            this->curIndex = 0;
959        }
960    }
961};
962
963// Primitive Assembly for data output from the DomainShader.
964struct PA_TESS : PA_STATE
965{
966    PA_TESS(
967        DRAW_CONTEXT *in_pDC,
968        const simdscalar* in_pVertData,
969        uint32_t in_attributeStrideInVectors,
970        uint32_t in_numAttributes,
971        uint32_t* (&in_ppIndices)[3],
972        uint32_t in_numPrims,
973        PRIMITIVE_TOPOLOGY in_binTopology) :
974
975        PA_STATE(in_pDC, nullptr, 0),
976        m_pVertexData(in_pVertData),
977        m_attributeStrideInVectors(in_attributeStrideInVectors),
978        m_numAttributes(in_numAttributes),
979        m_numPrims(in_numPrims)
980    {
981        m_vPrimId = _simd_setzero_si();
982        binTopology = in_binTopology;
983        m_ppIndices[0] = in_ppIndices[0];
984        m_ppIndices[1] = in_ppIndices[1];
985        m_ppIndices[2] = in_ppIndices[2];
986
987        switch (binTopology)
988        {
989        case TOP_POINT_LIST:
990            m_numVertsPerPrim = 1;
991            break;
992
993        case TOP_LINE_LIST:
994            m_numVertsPerPrim = 2;
995            break;
996
997        case TOP_TRIANGLE_LIST:
998            m_numVertsPerPrim = 3;
999            break;
1000
1001        default:
1002            SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1003            break;
1004        }
1005    }
1006
1007    bool HasWork()
1008    {
1009        return m_numPrims != 0;
1010    }
1011
1012    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1013    {
1014        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
1015        static simdvector junk;
1016        return junk;
1017    }
1018
1019    static simdscalari GenPrimMask(uint32_t numPrims)
1020    {
1021        SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
1022#if KNOB_SIMD_WIDTH == 8
1023        static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1024        {
1025            -1, -1, -1, -1, -1, -1, -1, -1,
1026             0,  0,  0,  0,  0,  0,  0,  0
1027        };
1028#elif KNOB_SIMD_WIDTH == 16
1029        static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1030        {
1031            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1032             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1033        };
1034#else
1035#error "Help, help, I can't get up!"
1036#endif
1037
1038        return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
1039    }
1040
1041    bool Assemble(uint32_t slot, simdvector verts[])
1042    {
1043        static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
1044        SWR_ASSERT(slot < m_numAttributes);
1045
1046        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1047        if (0 == numPrimsToAssemble)
1048        {
1049            return false;
1050        }
1051
1052        simdscalari mask = GenPrimMask(numPrimsToAssemble);
1053
1054        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1055        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1056        {
1057            simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
1058
1059            const float* pBase = pBaseAttrib;
1060            for (uint32_t c = 0; c < 4; ++c)
1061            {
1062                verts[i].v[c] = _simd_mask_i32gather_ps(
1063                    _simd_setzero_ps(),
1064                    pBase,
1065                    indices,
1066                    _simd_castsi_ps(mask),
1067                    4 /* gcc doesn't like sizeof(float) */);
1068                pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1069            }
1070        }
1071
1072        return true;
1073    }
1074
1075    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
1076    {
1077        SWR_ASSERT(slot < m_numAttributes);
1078        SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1079
1080        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1081        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1082        {
1083            uint32_t index = m_ppIndices[i][primIndex];
1084            const float* pVertData = pVertDataBase;
1085            float* pVert = (float*)&verts[i];
1086
1087            for (uint32_t c = 0; c < 4; ++c)
1088            {
1089                pVert[c] = pVertData[index];
1090                pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1091            }
1092        }
1093    }
1094
1095    bool NextPrim()
1096    {
1097        uint32_t numPrims = PA_TESS::NumPrims();
1098        m_numPrims -= numPrims;
1099        m_ppIndices[0] += numPrims;
1100        m_ppIndices[1] += numPrims;
1101        m_ppIndices[2] += numPrims;
1102
1103        return HasWork();
1104    }
1105
1106    simdvertex& GetNextVsOutput()
1107    {
1108        SWR_ASSERT(0, "%s", __FUNCTION__);
1109        static simdvertex junk;
1110        return junk;
1111    }
1112
1113    bool GetNextStreamOutput()
1114    {
1115        SWR_ASSERT(0, "%s", __FUNCTION__);
1116        return false;
1117    }
1118
1119    simdmask& GetNextVsIndices()
1120    {
1121        SWR_ASSERT(0, "%s", __FUNCTION__);
1122        static simdmask junk;
1123        return junk;
1124    }
1125
1126    uint32_t NumPrims()
1127    {
1128        return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
1129    }
1130
1131    void Reset() { SWR_ASSERT(0); };
1132
1133    simdscalari GetPrimID(uint32_t startID)
1134    {
1135        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1136    }
1137
1138private:
1139    const simdscalar*   m_pVertexData = nullptr;
1140    uint32_t            m_attributeStrideInVectors = 0;
1141    uint32_t            m_numAttributes = 0;
1142    uint32_t            m_numPrims = 0;
1143    uint32_t*           m_ppIndices[3];
1144
1145    uint32_t            m_numVertsPerPrim = 0;
1146
1147    simdscalari         m_vPrimId;
1148};
1149
1150// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1151// based on state.
1152template <typename IsIndexedT, typename IsCutIndexEnabledT>
1153struct PA_FACTORY
1154{
1155    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
1156    {
1157#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1158        const API_STATE& state = GetApiState(pDC);
1159        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
1160            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
1161            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
1162            topo == TOP_TRIANGLE_LIST)) ||
1163
1164            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1165            // for them in the optimized PA
1166            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
1167        {
1168            memset(&indexStore, 0, sizeof(indexStore));
1169            uint32_t numAttribs = state.feNumAttributes;
1170
1171            new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
1172                &this->indexStore[0], numVerts, numAttribs, state.topology, false);
1173            cutPA = true;
1174        }
1175        else
1176#endif
1177        {
1178            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1179            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
1180            cutPA = false;
1181        }
1182
1183    }
1184
1185    PA_STATE& GetPA()
1186    {
1187#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1188        if (cutPA)
1189        {
1190            return this->paCut;
1191        }
1192        else
1193#endif
1194        {
1195            return this->paOpt;
1196        }
1197    }
1198
1199    PA_STATE_OPT paOpt;
1200    PA_STATE_CUT paCut;
1201    bool cutPA{ false };
1202
1203    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
1204
1205    simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
1206    simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
1207};
1208