1/**************************************************************************** 2* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22* 23* @file frontend.h 24* 25* @brief Definitions for Frontend which handles vertex processing, 26* primitive assembly, clipping, binning, etc. 27* 28******************************************************************************/ 29#pragma once 30#include "context.h" 31#include <type_traits> 32 33// Calculates the A and B coefficients for the 3 edges of the triangle 34// 35// maths for edge equations: 36// standard form of a line in 2d 37// Ax + By + C = 0 38// A = y0 - y1 39// B = x1 - x0 40// C = x0y1 - x1y0 41INLINE 42void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) 43{ 44 // vYsub = y1 y2 y0 dc 45 __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); 46 // vY = y0 y1 y2 dc 47 vA = _mm_sub_ps(vY, vYsub); 48 49 // Result: 50 // A[0] = y0 - y1 51 // A[1] = y1 - y2 52 // A[2] = y2 - y0 53 54 // vXsub = x1 x2 x0 dc 55 __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); 56 // vX = x0 x1 x2 dc 57 vB = _mm_sub_ps(vXsub, vX); 58 59 // Result: 60 // B[0] = x1 - x0 61 // B[1] = x2 - x1 62 // B[2] = x0 - x2 63} 64 65INLINE 66void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3]) 67{ 68 // generate edge equations 69 // A = y0 - y1 70 // B = x1 - x0 71 vA[0] = _simd_sub_ps(vY[0], vY[1]); 72 vA[1] = _simd_sub_ps(vY[1], vY[2]); 73 vA[2] = _simd_sub_ps(vY[2], vY[0]); 74 75 vB[0] = _simd_sub_ps(vX[1], vX[0]); 76 vB[1] = _simd_sub_ps(vX[2], vX[1]); 77 vB[2] = _simd_sub_ps(vX[0], vX[2]); 78} 79 80INLINE 81void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) 82{ 83 // generate edge equations 84 // A = y0 - y1 85 // B = x1 - x0 86 // C = x0y1 - x1y0 87 __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); 88 vA = _mm_sub_epi32(vY, vYsub); 89 90 __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); 91 vB = _mm_sub_epi32(vXsub, vX); 92} 93 94INLINE 95void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) 96{ 97 // A = y0 - y1 98 // B = x1 - x0 99 vA[0] = _simd_sub_epi32(vY[0], vY[1]); 100 vA[1] = _simd_sub_epi32(vY[1], vY[2]); 101 vA[2] = _simd_sub_epi32(vY[2], vY[0]); 102 103 vB[0] = _simd_sub_epi32(vX[1], vX[0]); 104 vB[1] = _simd_sub_epi32(vX[2], vX[1]); 105 vB[2] = _simd_sub_epi32(vX[0], vX[2]); 106} 107// Calculate the determinant of the triangle 108// 2 vectors between the 3 points: P, Q 109// Px = x0-x2, Py = y0-y2 110// Qx = x1-x2, Qy = y1-y2 111// |Px Qx| 112// det = | | = PxQy - PyQx 113// |Py Qy| 114// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) 115// try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx 116// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1)) 117// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1) 118// : B[2]*A[1] - A[2]*B[1] 119INLINE 120float calcDeterminantInt(const __m128i vA, const __m128i vB) 121{ 122 // vAShuf = [A1, A0, A2, A0] 123 __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1)); 124 // vBShuf = [B2, B0, B1, B0] 125 __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2)); 126 // vMul = [A1*B2, B1*A2] 127 __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); 128 129 // shuffle upper to lower 130 // vMul2 = [B1*A2, B1*A2] 131 __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2)); 132 //vMul = [A1*B2 - B1*A2] 133 vMul = _mm_sub_epi64(vMul, vMul2); 134 135 int64_t result; 136 _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); 137 138 double dResult = (double)result; 139 dResult = dResult * (1.0 / FIXED_POINT16_SCALE); 140 141 return (float)dResult; 142} 143 144INLINE 145void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet) 146{ 147 // refer to calcDeterminantInt comment for calculation explanation 148 // A1*B2 149 simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 150 simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 151 152 simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]); 153 simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]); 154 155 simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 156 simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 157 158 // B1*A2 159 simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]); 160 simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]); 161 162 simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]); 163 simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]); 164 165 simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo); 166 simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi); 167 168 // A1*B2 - A2*B1 169 simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo); 170 simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi); 171 172 // shuffle 0 1 4 5 -> 0 1 2 3 173 simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20); 174 simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31); 175 176 pvDet[0] = vResultLo; 177 pvDet[1] = vResultHi; 178} 179 180INLINE 181void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC) 182{ 183 // C = -Ax - By 184 vC = _mm_mul_ps(vA, vX); 185 __m128 vCy = _mm_mul_ps(vB, vY); 186 vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); 187 vC = _mm_sub_ps(vC, vCy); 188} 189 190INLINE 191void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix) 192{ 193 vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00)); 194 vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30)); 195 196 vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11)); 197 vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31)); 198 199 vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22)); 200 vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32)); 201} 202 203template<uint32_t NumVerts> 204INLINE 205void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) 206{ 207 simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]); 208 simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]); 209 simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]); 210 simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]); 211 simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]); 212 simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]); 213 214 for (uint32_t i = 0; i < NumVerts; ++i) 215 { 216 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); 217 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); 218 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); 219 } 220} 221 222template<uint32_t NumVerts> 223INLINE 224void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx) 225{ 226 // perform a gather of each matrix element based on the viewport array indexes 227 simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); 228 simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4); 229 simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4); 230 simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4); 231 simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4); 232 simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4); 233 234 for (uint32_t i = 0; i < NumVerts; ++i) 235 { 236 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); 237 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); 238 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); 239 } 240} 241 242INLINE 243void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox) 244{ 245 // Need horizontal fp min here 246 __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); 247 __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); 248 249 __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); 250 __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); 251 252 253 __m128i vMinX = _mm_min_epi32(vX, vX1); 254 vMinX = _mm_min_epi32(vMinX, vX2); 255 256 __m128i vMaxX = _mm_max_epi32(vX, vX1); 257 vMaxX = _mm_max_epi32(vMaxX, vX2); 258 259 __m128i vMinY = _mm_min_epi32(vY, vY1); 260 vMinY = _mm_min_epi32(vMinY, vY2); 261 262 __m128i vMaxY = _mm_max_epi32(vY, vY1); 263 vMaxY = _mm_max_epi32(vMaxY, vY2); 264 265 bbox.xmin = _mm_extract_epi32(vMinX, 0); 266 bbox.xmax = _mm_extract_epi32(vMaxX, 0); 267 bbox.ymin = _mm_extract_epi32(vMinY, 0); 268 bbox.ymax = _mm_extract_epi32(vMaxY, 0); 269} 270 271INLINE 272bool CanUseSimplePoints(DRAW_CONTEXT *pDC) 273{ 274 const API_STATE& state = GetApiState(pDC); 275 276 return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && 277 state.rastState.pointSize == 1.0f && 278 !state.rastState.pointParam && 279 !state.rastState.pointSpriteEnable); 280} 281 282INLINE 283bool vHasNaN(const __m128& vec) 284{ 285 const __m128 result = _mm_cmpunord_ps(vec, vec); 286 const int32_t mask = _mm_movemask_ps(result); 287 return (mask != 0); 288} 289 290uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); 291uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); 292 293 294// ProcessDraw front-end function. All combinations of parameter values are available 295PFN_FE_WORK_FUNC GetProcessDrawFunc( 296 bool IsIndexed, 297 bool IsCutIndexEnabled, 298 bool HasTessellation, 299 bool HasGeometryShader, 300 bool HasStreamOut, 301 bool HasRasterization); 302 303void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 304void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 305void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 306void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 307void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); 308 309PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative); 310 311struct PA_STATE_BASE; // forward decl 312void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); 313void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); 314 315