1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file frontend.h
24*
25* @brief Definitions for Frontend which handles vertex processing,
26*        primitive assembly, clipping, binning, etc.
27*
28******************************************************************************/
29#pragma once
30#include "context.h"
31#include <type_traits>
32
33// Calculates the A and B coefficients for the 3 edges of the triangle
34//
35// maths for edge equations:
36//   standard form of a line in 2d
37//   Ax + By + C = 0
38//   A = y0 - y1
39//   B = x1 - x0
40//   C = x0y1 - x1y0
41INLINE
42void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
43{
44    // vYsub = y1 y2 y0 dc
45    __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
46    // vY =    y0 y1 y2 dc
47    vA = _mm_sub_ps(vY, vYsub);
48
49    // Result:
50    // A[0] = y0 - y1
51    // A[1] = y1 - y2
52    // A[2] = y2 - y0
53
54    // vXsub = x1 x2 x0 dc
55    __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
56    // vX =    x0 x1 x2 dc
57    vB = _mm_sub_ps(vXsub, vX);
58
59    // Result:
60    // B[0] = x1 - x0
61    // B[1] = x2 - x1
62    // B[2] = x0 - x2
63}
64
65INLINE
66void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3])
67{
68    // generate edge equations
69    // A = y0 - y1
70    // B = x1 - x0
71    vA[0] = _simd_sub_ps(vY[0], vY[1]);
72    vA[1] = _simd_sub_ps(vY[1], vY[2]);
73    vA[2] = _simd_sub_ps(vY[2], vY[0]);
74
75    vB[0] = _simd_sub_ps(vX[1], vX[0]);
76    vB[1] = _simd_sub_ps(vX[2], vX[1]);
77    vB[2] = _simd_sub_ps(vX[0], vX[2]);
78}
79
80INLINE
81void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
82{
83    // generate edge equations
84    // A = y0 - y1
85    // B = x1 - x0
86    // C = x0y1 - x1y0
87    __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
88    vA = _mm_sub_epi32(vY, vYsub);
89
90    __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
91    vB = _mm_sub_epi32(vXsub, vX);
92}
93
94INLINE
95void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
96{
97    // A = y0 - y1
98    // B = x1 - x0
99    vA[0] = _simd_sub_epi32(vY[0], vY[1]);
100    vA[1] = _simd_sub_epi32(vY[1], vY[2]);
101    vA[2] = _simd_sub_epi32(vY[2], vY[0]);
102
103    vB[0] = _simd_sub_epi32(vX[1], vX[0]);
104    vB[1] = _simd_sub_epi32(vX[2], vX[1]);
105    vB[2] = _simd_sub_epi32(vX[0], vX[2]);
106}
107// Calculate the determinant of the triangle
108// 2 vectors between the 3 points: P, Q
109// Px = x0-x2, Py = y0-y2
110// Qx = x1-x2, Qy = y1-y2
111//       |Px Qx|
112// det = |     | = PxQy - PyQx
113//       |Py Qy|
114// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
115//               try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
116//               : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
117//               : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
118//               : B[2]*A[1] - A[2]*B[1]
119INLINE
120float calcDeterminantInt(const __m128i vA, const __m128i vB)
121{
122    // vAShuf = [A1, A0, A2, A0]
123    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
124    // vBShuf = [B2, B0, B1, B0]
125    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
126    // vMul = [A1*B2, B1*A2]
127    __m128i vMul   = _mm_mul_epi32(vAShuf, vBShuf);
128
129    // shuffle upper to lower
130    // vMul2 = [B1*A2, B1*A2]
131    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
132    //vMul = [A1*B2 - B1*A2]
133    vMul = _mm_sub_epi64(vMul, vMul2);
134
135    int64_t result;
136    _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
137
138    double dResult = (double)result;
139    dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
140
141    return (float)dResult;
142}
143
144INLINE
145void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
146{
147    // refer to calcDeterminantInt comment for calculation explanation
148    // A1*B2
149    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);     // 0 0 1 1 4 4 5 5
150    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);     // 2 2 3 3 6 6 7 7
151
152    simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
153    simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
154
155    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo);        // 0 1 4 5
156    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi);        // 2 3 6 7
157
158    // B1*A2
159    simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
160    simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
161
162    simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
163    simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
164
165    simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
166    simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
167
168    // A1*B2 - A2*B1
169    simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
170    simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
171
172    // shuffle 0 1 4 5 -> 0 1 2 3
173    simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
174    simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
175
176    pvDet[0] = vResultLo;
177    pvDet[1] = vResultHi;
178}
179
180INLINE
181void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
182{
183    // C = -Ax - By
184    vC  = _mm_mul_ps(vA, vX);
185    __m128 vCy = _mm_mul_ps(vB, vY);
186    vC  = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
187    vC  = _mm_sub_ps(vC, vCy);
188}
189
190INLINE
191void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix)
192{
193    vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00));
194    vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30));
195
196    vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11));
197    vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31));
198
199    vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22));
200    vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32));
201}
202
203template<uint32_t NumVerts>
204INLINE
205void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
206{
207    simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
208    simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
209    simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
210    simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
211    simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
212    simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);
213
214    for (uint32_t i = 0; i < NumVerts; ++i)
215    {
216        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
217        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
218        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
219    }
220}
221
222template<uint32_t NumVerts>
223INLINE
224void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx)
225{
226    // perform a gather of each matrix element based on the viewport array indexes
227    simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
228    simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
229    simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
230    simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
231    simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
232    simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
233
234    for (uint32_t i = 0; i < NumVerts; ++i)
235    {
236        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
237        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
238        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
239    }
240}
241
242INLINE
243void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox)
244{
245    // Need horizontal fp min here
246    __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
247    __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
248
249    __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
250    __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
251
252
253    __m128i vMinX = _mm_min_epi32(vX, vX1);
254            vMinX = _mm_min_epi32(vMinX, vX2);
255
256    __m128i vMaxX = _mm_max_epi32(vX, vX1);
257            vMaxX = _mm_max_epi32(vMaxX, vX2);
258
259    __m128i vMinY = _mm_min_epi32(vY, vY1);
260            vMinY = _mm_min_epi32(vMinY, vY2);
261
262    __m128i vMaxY = _mm_max_epi32(vY, vY1);
263            vMaxY = _mm_max_epi32(vMaxY, vY2);
264
265    bbox.xmin = _mm_extract_epi32(vMinX, 0);
266    bbox.xmax = _mm_extract_epi32(vMaxX, 0);
267    bbox.ymin = _mm_extract_epi32(vMinY, 0);
268    bbox.ymax = _mm_extract_epi32(vMaxY, 0);
269}
270
271INLINE
272bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
273{
274    const API_STATE& state = GetApiState(pDC);
275
276    return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
277            state.rastState.pointSize == 1.0f &&
278            !state.rastState.pointParam &&
279            !state.rastState.pointSpriteEnable);
280}
281
282INLINE
283bool vHasNaN(const __m128& vec)
284{
285    const __m128 result = _mm_cmpunord_ps(vec, vec);
286    const int32_t mask = _mm_movemask_ps(result);
287    return (mask != 0);
288}
289
290uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
291uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
292
293
294// ProcessDraw front-end function.  All combinations of parameter values are available
295PFN_FE_WORK_FUNC GetProcessDrawFunc(
296    bool IsIndexed,
297    bool IsCutIndexEnabled,
298    bool HasTessellation,
299    bool HasGeometryShader,
300    bool HasStreamOut,
301    bool HasRasterization);
302
303void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
304void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
305void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
306void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
307void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
308
309PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
310
311struct PA_STATE_BASE;  // forward decl
312void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
313void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
314
315