1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file clip.h
24*
25* @brief Definitions for clipping
26*
27******************************************************************************/
28#pragma once
29
30#include "common/simdintrin.h"
31#include "core/context.h"
32#include "core/pa.h"
33#include "rdtsc_core.h"
34
35// Temp storage used by the clipper
36extern THREAD simdvertex tlsTempVertices[7];
37
// Bit position at which the frustum / NEGW clip codes start. The codes are shifted
// out of the float mantissa so that a code word never forms a denormalized value
// when it is used in a float compare.
#define CLIPCODE_SHIFT 23

// Clip code bits produced per vertex by ComputeClipCodes().
enum SWR_CLIPCODES
{
    // View frustum side planes.
    FRUSTUM_LEFT     = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP      = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT    = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM   = (0x08 << CLIPCODE_SHIFT),

    // View frustum depth planes.
    FRUSTUM_NEAR     = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR      = (0x20 << CLIPCODE_SHIFT),

    // Vertex has w <= 0.
    NEGW             = (0x40 << CLIPCODE_SHIFT),

    // Guardband codes share a single high bit and are distinguished by four
    // separate LSBs. This works because guardband codes are combined as a union,
    // rather than an intersection, of clip codes.
    GUARDBAND_LEFT   = ((0x80 << CLIPCODE_SHIFT) | 0x1),
    GUARDBAND_TOP    = ((0x80 << CLIPCODE_SHIFT) | 0x2),
    GUARDBAND_RIGHT  = ((0x80 << CLIPCODE_SHIFT) | 0x4),
    GUARDBAND_BOTTOM = ((0x80 << CLIPCODE_SHIFT) | 0x8)
};
58
// Union of the clip codes for the six view frustum planes.
#define FRUSTUM_CLIP_MASK \
    (FRUSTUM_LEFT | FRUSTUM_TOP | FRUSTUM_RIGHT | FRUSTUM_BOTTOM | FRUSTUM_NEAR | FRUSTUM_FAR)

// Union of the near/far frustum codes, the four guardband codes and NEGW.
#define GUARDBAND_CLIP_MASK \
    (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | NEGW)
61
// Clip entry point operating on plain float arrays; declared here, defined elsewhere.
// NOTE(review): judging by the parameter names, pTriangle/pAttribs are the inputs and
// pOutTriangles/numVerts/pOutAttribs receive the clipped result - confirm against the
// definition.
void Clip(const float* pTriangle,
          const float* pAttribs,
          int          numAttribs,
          float*       pOutTriangles,
          int*         numVerts,
          float*       pOutAttribs);
64
65INLINE
66void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
67{
68    clipCodes = _simd_setzero_ps();
69
70    // -w
71    simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
72
73    // FRUSTUM_LEFT
74    simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
75    clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
76
77    // FRUSTUM_TOP
78    vRes = _simd_cmplt_ps(vertex.y, vNegW);
79    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
80
81    // FRUSTUM_RIGHT
82    vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
83    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
84
85    // FRUSTUM_BOTTOM
86    vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
87    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
88
89    if (state.rastState.depthClipEnable)
90    {
91        // FRUSTUM_NEAR
92        // DX clips depth [0..w], GL clips [-w..w]
93        if (state.rastState.clipHalfZ)
94        {
95            vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
96        }
97        else
98        {
99            vRes = _simd_cmplt_ps(vertex.z, vNegW);
100        }
101        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
102
103        // FRUSTUM_FAR
104        vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
105        clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
106    }
107
108    // NEGW
109    vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
110    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
111
112    // GUARDBAND_LEFT
113    simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
114    vRes = _simd_cmplt_ps(vertex.x, gbMult);
115    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
116
117    // GUARDBAND_TOP
118    gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
119    vRes = _simd_cmplt_ps(vertex.y, gbMult);
120    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
121
122    // GUARDBAND_RIGHT
123    gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
124    vRes = _simd_cmpgt_ps(vertex.x, gbMult);
125    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
126
127    // GUARDBAND_BOTTOM
128    gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
129    vRes = _simd_cmpgt_ps(vertex.y, gbMult);
130    clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
131}
132
133template<uint32_t NumVertsPerPrim>
134class Clipper
135{
136public:
137    Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
138        workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
139    {
140        static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
141    }
142
143    void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
144    {
145        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
146        {
147            ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
148        }
149    }
150
151    simdscalar ComputeClipCodeIntersection()
152    {
153        simdscalar result = this->clipCodes[0];
154        for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
155        {
156            result = _simd_and_ps(result, this->clipCodes[i]);
157        }
158        return result;
159    }
160
161    simdscalar ComputeClipCodeUnion()
162    {
163        simdscalar result = this->clipCodes[0];
164        for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
165        {
166            result = _simd_or_ps(result, this->clipCodes[i]);
167        }
168        return result;
169    }
170
171    int ComputeNegWMask()
172    {
173        simdscalar clipCodeUnion = ComputeClipCodeUnion();
174        clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
175        return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
176    }
177
178    int ComputeClipMask()
179    {
180        simdscalar clipUnion = ComputeClipCodeUnion();
181        clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
182        return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
183    }
184
185    // clipper is responsible for culling any prims with NAN coordinates
186    int ComputeNaNMask(simdvector prim[])
187    {
188        simdscalar vNanMask = _simd_setzero_ps();
189        for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
190        {
191            simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
192            vNanMask = _simd_or_ps(vNanMask, vNan01);
193            simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
194            vNanMask = _simd_or_ps(vNanMask, vNan23);
195        }
196
197        return _simd_movemask_ps(vNanMask);
198    }
199
200    int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
201    {
202        uint8_t cullMask = this->state.rastState.cullDistanceMask;
203        simdscalar vClipCullMask = _simd_setzero_ps();
204        DWORD index;
205
206        simdvector vClipCullDistLo[3];
207        simdvector vClipCullDistHi[3];
208
209        pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
210        pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
211        while (_BitScanForward(&index, cullMask))
212        {
213            cullMask &= ~(1 << index);
214            uint32_t slot = index >> 2;
215            uint32_t component = index & 0x3;
216
217            simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
218            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
219            {
220                simdscalar vCullComp;
221                if (slot == 0)
222                {
223                    vCullComp = vClipCullDistLo[e][component];
224                }
225                else
226                {
227                    vCullComp = vClipCullDistHi[e][component];
228                }
229
230                // cull if cull distance < 0 || NAN
231                simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
232                vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
233            }
234            vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
235        }
236
237        // clipper should also discard any primitive with NAN clip distance
238        uint8_t clipMask = this->state.rastState.clipDistanceMask;
239        while (_BitScanForward(&index, clipMask))
240        {
241            clipMask &= ~(1 << index);
242            uint32_t slot = index >> 2;
243            uint32_t component = index & 0x3;
244
245            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
246            {
247                simdscalar vClipComp;
248                if (slot == 0)
249                {
250                    vClipComp = vClipCullDistLo[e][component];
251                }
252                else
253                {
254                    vClipComp = vClipCullDistHi[e][component];
255                }
256
257                simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
258                vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
259            }
260        }
261
262        return _simd_movemask_ps(vClipCullMask);
263    }
264
265    // clip SIMD primitives
266    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
267    {
268        // input/output vertex store for clipper
269        simdvertex vertices[7]; // maximum 7 verts generated per triangle
270
271        LONG constantInterpMask = this->state.backendState.constantInterpolationMask;
272        uint32_t provokingVertex = 0;
273        if(pa.binTopology == TOP_TRIANGLE_FAN)
274        {
275            provokingVertex = this->state.frontendState.provokingVertex.triFan;
276        }
277        ///@todo: line topology for wireframe?
278
279        // assemble pos
280        simdvector tmpVector[NumVertsPerPrim];
281        pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
282        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
283        {
284            vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
285        }
286
287        // assemble attribs
288        const SWR_BACKEND_STATE& backendState = this->state.backendState;
289
290        int32_t maxSlot = -1;
291        for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
292        {
293            // Compute absolute attrib slot in vertex array
294            uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
295            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
296            uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot;
297
298            pa.Assemble(inputSlot, tmpVector);
299
300            // if constant interpolation enabled for this attribute, assign the provoking
301            // vertex values to all edges
302            if (_bittest(&constantInterpMask, slot))
303            {
304                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
305                {
306                    vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
307                }
308            }
309            else
310            {
311                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
312                {
313                    vertices[i].attrib[inputSlot] = tmpVector[i];
314                }
315            }
316        }
317
318        // assemble user clip distances if enabled
319        if (this->state.rastState.clipDistanceMask & 0xf)
320        {
321            pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
322            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
323            {
324                vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
325            }
326        }
327
328        if (this->state.rastState.clipDistanceMask & 0xf0)
329        {
330            pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
331            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
332            {
333                vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
334            }
335        }
336
337        uint32_t numAttribs = maxSlot + 1;
338
339        simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
340
341        // set up new PA for binning clipped primitives
342        PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
343        PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
344        if (NumVertsPerPrim == 3)
345        {
346            pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
347            clipTopology = TOP_TRIANGLE_FAN;
348
349            // so that the binner knows to bloat wide points later
350            if (pa.binTopology == TOP_POINT_LIST)
351                clipTopology = TOP_POINT_LIST;
352
353        }
354        else if (NumVertsPerPrim == 2)
355        {
356            pfnBinFunc = BinLines;
357            clipTopology = TOP_LINE_LIST;
358        }
359        else
360        {
361            SWR_ASSERT(0 && "Unexpected points in clipper.");
362        }
363
364        uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
365        uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
366        uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
367
368        const simdscalari vOffsets = _mm256_set_epi32(
369            0 * sizeof(simdvertex),  // unused lane
370            6 * sizeof(simdvertex),
371            5 * sizeof(simdvertex),
372            4 * sizeof(simdvertex),
373            3 * sizeof(simdvertex),
374            2 * sizeof(simdvertex),
375            1 * sizeof(simdvertex),
376            0 * sizeof(simdvertex));
377
378        // only need to gather 7 verts
379        // @todo dynamic mask based on actual # of verts generated per lane
380        const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
381
382        uint32_t numClippedPrims = 0;
383        for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
384        {
385            uint32_t numEmittedVerts = pVertexCount[inputPrim];
386            if (numEmittedVerts < NumVertsPerPrim)
387            {
388                continue;
389            }
390            SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
391
392            uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
393            numClippedPrims += numEmittedPrims;
394
395            // tranpose clipper output so that each lane's vertices are in SIMD order
396            // set aside space for 2 vertices, as the PA will try to read up to 16 verts
397            // for triangle fan
398            simdvertex transposedPrims[2];
399
400            // transpose pos
401            uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
402            for (uint32_t c = 0; c < 4; ++c)
403            {
404                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
405                pBase += sizeof(simdscalar);
406            }
407
408            // transpose attribs
409            pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim;
410            for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
411            {
412                uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib;
413                for (uint32_t c = 0; c < 4; ++c)
414                {
415                    transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
416                    pBase += sizeof(simdscalar);
417                }
418            }
419
420            // transpose user clip distances if enabled
421            if (this->state.rastState.clipDistanceMask & 0xf)
422            {
423                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
424                for (uint32_t c = 0; c < 4; ++c)
425                {
426                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
427                    pBase += sizeof(simdscalar);
428                }
429            }
430
431            if (this->state.rastState.clipDistanceMask & 0xf0)
432            {
433                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
434                for (uint32_t c = 0; c < 4; ++c)
435                {
436                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
437                    pBase += sizeof(simdscalar);
438                }
439            }
440
441            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
442
443            while (clipPa.GetNextStreamOutput())
444            {
445                do
446                {
447                    simdvector attrib[NumVertsPerPrim];
448                    bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
449                    if (assemble)
450                    {
451                        static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
452                        pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
453                    }
454                } while (clipPa.NextPrim());
455            }
456        }
457
458        // update global pipeline stat
459        UPDATE_STAT_FE(CPrimitives, numClippedPrims);
460    }
461
462    // execute the clipper stage
463    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
464    {
465        SWR_ASSERT(pa.pDC != nullptr);
466        SWR_CONTEXT* pContext = pa.pDC->pContext;
467
468        // set up binner based on PA state
469        PFN_PROCESS_PRIMS pfnBinner;
470        switch (pa.binTopology)
471        {
472        case TOP_POINT_LIST:
473            pfnBinner = BinPoints;
474            break;
475        case TOP_LINE_LIST:
476        case TOP_LINE_STRIP:
477        case TOP_LINE_LOOP:
478        case TOP_LINE_LIST_ADJ:
479        case TOP_LISTSTRIP_ADJ:
480            pfnBinner = BinLines;
481            break;
482        default:
483            pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
484            break;
485        };
486
487        // update clipper invocations pipeline stat
488        uint32_t numInvoc = _mm_popcnt_u32(primMask);
489        UPDATE_STAT_FE(CInvocations, numInvoc);
490
491        ComputeClipCodes(prim, viewportIdx);
492
493        // cull prims with NAN coords
494        primMask &= ~ComputeNaNMask(prim);
495
496        // user cull distance cull
497        if (this->state.rastState.cullDistanceMask)
498        {
499            primMask &= ~ComputeUserClipCullMask(pa, prim);
500        }
501
502        // cull prims outside view frustum
503        simdscalar clipIntersection = ComputeClipCodeIntersection();
504        int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
505
506        // skip clipping for points
507        uint32_t clipMask = 0;
508        if (NumVertsPerPrim != 1)
509        {
510            clipMask = primMask & ComputeClipMask();
511        }
512
513        if (clipMask)
514        {
515            AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
516            // we have to clip tris, execute the clipper, which will also
517            // call the binner
518            ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
519            AR_END(FEGuardbandClip, 1);
520        }
521        else if (validMask)
522        {
523            // update CPrimitives pipeline state
524            UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
525
526            // forward valid prims directly to binner
527            pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
528        }
529    }
530
531private:
532    inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
533    {
534        return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
535    }
536
537    inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
538    {
539        const uint32_t simdVertexStride = sizeof(simdvertex);
540        const uint32_t componentStride = sizeof(simdscalar);
541        const uint32_t attribStride = sizeof(simdvector);
542        const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
543            3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
544
545        // step to the simdvertex
546        simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
547
548        // step to the attribute and component
549        vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
550
551        // step to the lane
552        vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
553
554        return vOffsets;
555    }
556
557    // gathers a single component for a given attribute for each SIMD lane
558    inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
559    {
560        simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
561        simdscalar vSrc = _mm256_undefined_ps();
562        return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
563    }
564
565    inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
566    {
567        simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
568
569        uint32_t* pOffsets = (uint32_t*)&vOffsets;
570        float* pSrc = (float*)&vSrc;
571        uint32_t mask = _simd_movemask_ps(vMask);
572        DWORD lane;
573        while (_BitScanForward(&lane, mask))
574        {
575            mask &= ~(1 << lane);
576            uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
577            *(float*)pBuf = pSrc[lane];
578        }
579    }
580
581    template<SWR_CLIPCODES ClippingPlane>
582    inline void intersect(
583        const simdscalar& vActiveMask,  // active lanes to operate on
584        const simdscalari& s,           // index to first edge vertex v0 in pInPts.
585        const simdscalari& p,           // index to second edge vertex v1 in pInPts.
586        const simdvector& v1,           // vertex 0 position
587        const simdvector& v2,           // vertex 1 position
588        simdscalari& outIndex,          // output index.
589        const float *pInVerts,          // array of all the input positions.
590        uint32_t numInAttribs,          // number of attributes per vertex.
591        float *pOutVerts)               // array of output positions. We'll write our new intersection point at i*4.
592    {
593        // compute interpolation factor
594        simdscalar t;
595        switch (ClippingPlane)
596        {
597        case FRUSTUM_LEFT:      t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
598        case FRUSTUM_RIGHT:     t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
599        case FRUSTUM_TOP:       t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
600        case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
601        case FRUSTUM_NEAR:
602            // DX Znear plane is 0, GL is -w
603            if (this->state.rastState.clipHalfZ)
604            {
605                t = ComputeInterpFactor(v1[2], v2[2]);
606            }
607            else
608            {
609                t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
610            }
611            break;
612        case FRUSTUM_FAR:       t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
613        default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
614        };
615
616        // interpolate position and store
617        for (uint32_t c = 0; c < 4; ++c)
618        {
619            simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
620            ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
621        }
622
623        // interpolate attributes and store
624        for (uint32_t a = 0; a < numInAttribs; ++a)
625        {
626            uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
627            for (uint32_t c = 0; c < 4; ++c)
628            {
629                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
630                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
631                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
632                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
633            }
634        }
635
636        // interpolate clip distance if enabled
637        if (this->state.rastState.clipDistanceMask & 0xf)
638        {
639            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
640            for (uint32_t c = 0; c < 4; ++c)
641            {
642                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
643                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
644                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
645                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
646            }
647        }
648
649        if (this->state.rastState.clipDistanceMask & 0xf0)
650        {
651            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
652            for (uint32_t c = 0; c < 4; ++c)
653            {
654                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
655                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
656                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
657                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
658            }
659        }
660    }
661
662    template<SWR_CLIPCODES ClippingPlane>
663    inline simdscalar inside(const simdvector& v)
664    {
665        switch (ClippingPlane)
666        {
667        case FRUSTUM_LEFT:      return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
668        case FRUSTUM_RIGHT:     return _simd_cmple_ps(v[0], v[3]);
669        case FRUSTUM_TOP:       return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
670        case FRUSTUM_BOTTOM:    return _simd_cmple_ps(v[1], v[3]);
671        case FRUSTUM_NEAR:      return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
672        case FRUSTUM_FAR:       return _simd_cmple_ps(v[2], v[3]);
673        default:
674            SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
675            return _simd_setzero_ps();
676        }
677    }
678
679    template<SWR_CLIPCODES ClippingPlane>
680    simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
681    {
682        simdscalari vCurIndex = _simd_setzero_si();
683        simdscalari vOutIndex = _simd_setzero_si();
684        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
685
686        while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
687        {
688            simdscalari s = vCurIndex;
689            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
690            simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
691            p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
692
693            // gather position
694            simdvector vInPos0, vInPos1;
695            for (uint32_t c = 0; c < 4; ++c)
696            {
697                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
698                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
699            }
700
701            // compute inside mask
702            simdscalar s_in = inside<ClippingPlane>(vInPos0);
703            simdscalar p_in = inside<ClippingPlane>(vInPos1);
704
705            // compute intersection mask (s_in != p_in)
706            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
707            intersectMask = _simd_and_ps(intersectMask, vActiveMask);
708
709            // store s if inside
710            s_in = _simd_and_ps(s_in, vActiveMask);
711            if (!_simd_testz_ps(s_in, s_in))
712            {
713                // store position
714                for (uint32_t c = 0; c < 4; ++c)
715                {
716                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
717                }
718
719                // store attribs
720                for (uint32_t a = 0; a < numInAttribs; ++a)
721                {
722                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
723                    for (uint32_t c = 0; c < 4; ++c)
724                    {
725                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
726                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
727                    }
728                }
729
730                // store clip distance if enabled
731                if (this->state.rastState.clipDistanceMask & 0xf)
732                {
733                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
734                    for (uint32_t c = 0; c < 4; ++c)
735                    {
736                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
737                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
738                    }
739                }
740
741                if (this->state.rastState.clipDistanceMask & 0xf0)
742                {
743                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
744                    for (uint32_t c = 0; c < 4; ++c)
745                    {
746                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
747                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
748                    }
749                }
750
751                // increment outIndex
752                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
753            }
754
755            // compute and store intersection
756            if (!_simd_testz_ps(intersectMask, intersectMask))
757            {
758                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
759
760                // increment outIndex for active lanes
761                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
762            }
763
764            // increment loop index and update active mask
765            vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
766            vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
767        }
768
769        return vOutIndex;
770    }
771
772    template<SWR_CLIPCODES ClippingPlane>
773    simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
774    {
775        simdscalari vCurIndex = _simd_setzero_si();
776        simdscalari vOutIndex = _simd_setzero_si();
777        simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
778
779        if (!_simd_testz_ps(vActiveMask, vActiveMask))
780        {
781            simdscalari s = vCurIndex;
782            simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
783
784            // gather position
785            simdvector vInPos0, vInPos1;
786            for (uint32_t c = 0; c < 4; ++c)
787            {
788                vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
789                vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
790            }
791
792            // compute inside mask
793            simdscalar s_in = inside<ClippingPlane>(vInPos0);
794            simdscalar p_in = inside<ClippingPlane>(vInPos1);
795
796            // compute intersection mask (s_in != p_in)
797            simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
798            intersectMask = _simd_and_ps(intersectMask, vActiveMask);
799
800            // store s if inside
801            s_in = _simd_and_ps(s_in, vActiveMask);
802            if (!_simd_testz_ps(s_in, s_in))
803            {
804                for (uint32_t c = 0; c < 4; ++c)
805                {
806                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
807                }
808
809                // interpolate attributes and store
810                for (uint32_t a = 0; a < numInAttribs; ++a)
811                {
812                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
813                    for (uint32_t c = 0; c < 4; ++c)
814                    {
815                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
816                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
817                    }
818                }
819
820                // increment outIndex
821                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
822            }
823
824            // compute and store intersection
825            if (!_simd_testz_ps(intersectMask, intersectMask))
826            {
827                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
828
829                // increment outIndex for active lanes
830                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
831            }
832
833            // store p if inside
834            p_in = _simd_and_ps(p_in, vActiveMask);
835            if (!_simd_testz_ps(p_in, p_in))
836            {
837                for (uint32_t c = 0; c < 4; ++c)
838                {
839                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
840                }
841
842                // interpolate attributes and store
843                for (uint32_t a = 0; a < numInAttribs; ++a)
844                {
845                    uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a;
846                    for (uint32_t c = 0; c < 4; ++c)
847                    {
848                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
849                        ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
850                    }
851                }
852
853                // increment outIndex
854                vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
855            }
856        }
857
858        return vOutIndex;
859    }
860
861    //////////////////////////////////////////////////////////////////////////
862    /// @brief Vertical clipper. Clips SIMD primitives at a time
863    /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
864    /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
865    /// @param numAttribs - number of valid input attribs, including position
866    simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
867    {
868        // temp storage
869        float* pTempVerts = (float*)&tlsTempVertices[0];
870
871        // zero out num input verts for non-active lanes
872        simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
873        vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
874
875        // clip prims to frustum
876        simdscalari vNumOutPts;
877        if (NumVertsPerPrim == 3)
878        {
879            vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
880            vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
881            vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
882            vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
883            vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
884            vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
885        }
886        else
887        {
888            SWR_ASSERT(NumVertsPerPrim == 2);
889            vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
890            vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
891            vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
892            vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
893            vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
894            vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
895        }
896
897        // restore num verts for non-clipped, active lanes
898        simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
899        vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
900
901        return vNumOutPts;
902    }
903
904    const uint32_t workerId{ 0 };           // worker id; stays 0 unless a constructor (not visible here) initializes it
905    DRAW_CONTEXT* pDC{ nullptr };           // draw context pointer, null by default; presumably non-owning - TODO confirm
906    const API_STATE& state;                 // reference member, so it must be bound by a constructor; the clip routines read state.rastState.clipDistanceMask
907    simdscalar clipCodes[NumVertsPerPrim];  // one SIMD value per primitive vertex; presumably SWR_CLIPCODES bits per lane - TODO confirm
908};
909
910
911// Pipeline stage functions: one clip entry point each for triangles, lines and points.
// Only the prototypes appear here; the definitions are not in this part of the file.
// All three take the same parameter list. NOTE(review): primMask is a scalar uint32_t,
// presumably one bit per SIMD lane of prims[] - confirm against the definitions.
912void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
913void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
914void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
915