1/**************************************************************************** 2* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22* 23* @file frontend.cpp 24* 25* @brief Implementation for Frontend which handles vertex processing, 26* primitive assembly, clipping, binning, etc. 
27* 28******************************************************************************/ 29 30#include "api.h" 31#include "frontend.h" 32#include "backend.h" 33#include "context.h" 34#include "rdtsc_core.h" 35#include "utils.h" 36#include "threads.h" 37#include "pa.h" 38#include "clip.h" 39#include "tilemgr.h" 40#include "tessellator.h" 41#include <limits> 42 43////////////////////////////////////////////////////////////////////////// 44/// @brief Helper macro to generate a bitmask 45static INLINE uint32_t GenMask(uint32_t numBits) 46{ 47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); 48 return ((1U << numBits) - 1); 49} 50 51////////////////////////////////////////////////////////////////////////// 52/// @brief FE handler for SwrSync. 53/// @param pContext - pointer to SWR context. 54/// @param pDC - pointer to draw context. 55/// @param workerId - thread's worker id. Even thread has a unique id. 56/// @param pUserData - Pointer to user data passed back to sync callback. 57/// @todo This should go away when we switch this to use compute threading. 58void ProcessSync( 59 SWR_CONTEXT *pContext, 60 DRAW_CONTEXT *pDC, 61 uint32_t workerId, 62 void *pUserData) 63{ 64 BE_WORK work; 65 work.type = SYNC; 66 work.pfnWork = ProcessSyncBE; 67 68 MacroTileMgr *pTileMgr = pDC->pTileMgr; 69 pTileMgr->enqueue(0, 0, &work); 70} 71 72////////////////////////////////////////////////////////////////////////// 73/// @brief FE handler for SwrDestroyContext. 74/// @param pContext - pointer to SWR context. 75/// @param pDC - pointer to draw context. 76/// @param workerId - thread's worker id. Even thread has a unique id. 77/// @param pUserData - Pointer to user data passed back to sync callback. 
78void ProcessShutdown( 79 SWR_CONTEXT *pContext, 80 DRAW_CONTEXT *pDC, 81 uint32_t workerId, 82 void *pUserData) 83{ 84 BE_WORK work; 85 work.type = SHUTDOWN; 86 work.pfnWork = ProcessShutdownBE; 87 88 MacroTileMgr *pTileMgr = pDC->pTileMgr; 89 // Enqueue at least 1 work item for each worker thread 90 // account for number of numa nodes 91 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1; 92 93 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i) 94 { 95 for (uint32_t n = 0; n < numNumaNodes; ++n) 96 { 97 pTileMgr->enqueue(i, n, &work); 98 } 99 } 100} 101 102////////////////////////////////////////////////////////////////////////// 103/// @brief FE handler for SwrClearRenderTarget. 104/// @param pContext - pointer to SWR context. 105/// @param pDC - pointer to draw context. 106/// @param workerId - thread's worker id. Even thread has a unique id. 107/// @param pUserData - Pointer to user data passed back to clear callback. 108/// @todo This should go away when we switch this to use compute threading. 
109void ProcessClear( 110 SWR_CONTEXT *pContext, 111 DRAW_CONTEXT *pDC, 112 uint32_t workerId, 113 void *pUserData) 114{ 115 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData; 116 MacroTileMgr *pTileMgr = pDC->pTileMgr; 117 118 // queue a clear to each macro tile 119 // compute macro tile bounds for the specified rect 120 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; 121 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; 122 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; 123 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; 124 125 BE_WORK work; 126 work.type = CLEAR; 127 work.pfnWork = ProcessClearBE; 128 work.desc.clear = *pDesc; 129 130 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) 131 { 132 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) 133 { 134 pTileMgr->enqueue(x, y, &work); 135 } 136 } 137} 138 139////////////////////////////////////////////////////////////////////////// 140/// @brief FE handler for SwrStoreTiles. 141/// @param pContext - pointer to SWR context. 142/// @param pDC - pointer to draw context. 143/// @param workerId - thread's worker id. Even thread has a unique id. 144/// @param pUserData - Pointer to user data passed back to callback. 145/// @todo This should go away when we switch this to use compute threading. 
146void ProcessStoreTiles( 147 SWR_CONTEXT *pContext, 148 DRAW_CONTEXT *pDC, 149 uint32_t workerId, 150 void *pUserData) 151{ 152 AR_BEGIN(FEProcessStoreTiles, pDC->drawId); 153 MacroTileMgr *pTileMgr = pDC->pTileMgr; 154 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData; 155 156 // queue a store to each macro tile 157 // compute macro tile bounds for the specified rect 158 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; 159 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; 160 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; 161 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; 162 163 // store tiles 164 BE_WORK work; 165 work.type = STORETILES; 166 work.pfnWork = ProcessStoreTilesBE; 167 work.desc.storeTiles = *pDesc; 168 169 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) 170 { 171 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) 172 { 173 pTileMgr->enqueue(x, y, &work); 174 } 175 } 176 177 AR_END(FEProcessStoreTiles, 0); 178} 179 180////////////////////////////////////////////////////////////////////////// 181/// @brief FE handler for SwrInvalidateTiles. 182/// @param pContext - pointer to SWR context. 183/// @param pDC - pointer to draw context. 184/// @param workerId - thread's worker id. Even thread has a unique id. 185/// @param pUserData - Pointer to user data passed back to callback. 186/// @todo This should go away when we switch this to use compute threading. 
187void ProcessDiscardInvalidateTiles( 188 SWR_CONTEXT *pContext, 189 DRAW_CONTEXT *pDC, 190 uint32_t workerId, 191 void *pUserData) 192{ 193 AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId); 194 DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; 195 MacroTileMgr *pTileMgr = pDC->pTileMgr; 196 197 // compute macro tile bounds for the specified rect 198 uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM; 199 uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1; 200 uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM; 201 uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1; 202 203 if (pDesc->fullTilesOnly == false) 204 { 205 // include partial tiles 206 macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; 207 macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; 208 macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; 209 macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; 210 } 211 212 SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X); 213 SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y); 214 215 macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X); 216 macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y); 217 218 // load tiles 219 BE_WORK work; 220 work.type = DISCARDINVALIDATETILES; 221 work.pfnWork = ProcessDiscardInvalidateTilesBE; 222 work.desc.discardInvalidateTiles = *pDesc; 223 224 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) 225 { 226 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) 227 { 228 pTileMgr->enqueue(x, y, &work); 229 } 230 } 231 232 AR_END(FEProcessInvalidateTiles, 0); 233} 234 235////////////////////////////////////////////////////////////////////////// 236/// @brief Computes the number of primitives given the number of verts. 237/// @param mode - primitive topology for draw operation. 
238/// @param numPrims - number of vertices or indices for draw. 239/// @todo Frontend needs to be refactored. This will go in appropriate place then. 240uint32_t GetNumPrims( 241 PRIMITIVE_TOPOLOGY mode, 242 uint32_t numPrims) 243{ 244 switch (mode) 245 { 246 case TOP_POINT_LIST: return numPrims; 247 case TOP_TRIANGLE_LIST: return numPrims / 3; 248 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2; 249 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2; 250 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1; 251 case TOP_QUAD_LIST: return numPrims / 4; 252 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2; 253 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1; 254 case TOP_LINE_LIST: return numPrims / 2; 255 case TOP_LINE_LOOP: return numPrims; 256 case TOP_RECT_LIST: return numPrims / 3; 257 case TOP_LINE_LIST_ADJ: return numPrims / 4; 258 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3; 259 case TOP_TRI_LIST_ADJ: return numPrims / 6; 260 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 
0 : (numPrims / 2) - 2; 261 262 case TOP_PATCHLIST_1: 263 case TOP_PATCHLIST_2: 264 case TOP_PATCHLIST_3: 265 case TOP_PATCHLIST_4: 266 case TOP_PATCHLIST_5: 267 case TOP_PATCHLIST_6: 268 case TOP_PATCHLIST_7: 269 case TOP_PATCHLIST_8: 270 case TOP_PATCHLIST_9: 271 case TOP_PATCHLIST_10: 272 case TOP_PATCHLIST_11: 273 case TOP_PATCHLIST_12: 274 case TOP_PATCHLIST_13: 275 case TOP_PATCHLIST_14: 276 case TOP_PATCHLIST_15: 277 case TOP_PATCHLIST_16: 278 case TOP_PATCHLIST_17: 279 case TOP_PATCHLIST_18: 280 case TOP_PATCHLIST_19: 281 case TOP_PATCHLIST_20: 282 case TOP_PATCHLIST_21: 283 case TOP_PATCHLIST_22: 284 case TOP_PATCHLIST_23: 285 case TOP_PATCHLIST_24: 286 case TOP_PATCHLIST_25: 287 case TOP_PATCHLIST_26: 288 case TOP_PATCHLIST_27: 289 case TOP_PATCHLIST_28: 290 case TOP_PATCHLIST_29: 291 case TOP_PATCHLIST_30: 292 case TOP_PATCHLIST_31: 293 case TOP_PATCHLIST_32: 294 return numPrims / (mode - TOP_PATCHLIST_BASE); 295 296 case TOP_POLYGON: 297 case TOP_POINT_LIST_BF: 298 case TOP_LINE_STRIP_CONT: 299 case TOP_LINE_STRIP_BF: 300 case TOP_LINE_STRIP_CONT_BF: 301 case TOP_TRIANGLE_FAN_NOSTIPPLE: 302 case TOP_TRI_STRIP_REVERSE: 303 case TOP_PATCHLIST_BASE: 304 case TOP_UNKNOWN: 305 SWR_ASSERT(false, "Unsupported topology: %d", mode); 306 return 0; 307 } 308 309 return 0; 310} 311 312////////////////////////////////////////////////////////////////////////// 313/// @brief Computes the number of verts given the number of primitives. 314/// @param mode - primitive topology for draw operation. 315/// @param numPrims - number of primitives for draw. 316uint32_t GetNumVerts( 317 PRIMITIVE_TOPOLOGY mode, 318 uint32_t numPrims) 319{ 320 switch (mode) 321 { 322 case TOP_POINT_LIST: return numPrims; 323 case TOP_TRIANGLE_LIST: return numPrims * 3; 324 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0; 325 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0; 326 case TOP_TRIANGLE_DISC: return numPrims ? 
numPrims + 1 : 0; 327 case TOP_QUAD_LIST: return numPrims * 4; 328 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0; 329 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0; 330 case TOP_LINE_LIST: return numPrims * 2; 331 case TOP_LINE_LOOP: return numPrims; 332 case TOP_RECT_LIST: return numPrims * 3; 333 case TOP_LINE_LIST_ADJ: return numPrims * 4; 334 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0; 335 case TOP_TRI_LIST_ADJ: return numPrims * 6; 336 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0; 337 338 case TOP_PATCHLIST_1: 339 case TOP_PATCHLIST_2: 340 case TOP_PATCHLIST_3: 341 case TOP_PATCHLIST_4: 342 case TOP_PATCHLIST_5: 343 case TOP_PATCHLIST_6: 344 case TOP_PATCHLIST_7: 345 case TOP_PATCHLIST_8: 346 case TOP_PATCHLIST_9: 347 case TOP_PATCHLIST_10: 348 case TOP_PATCHLIST_11: 349 case TOP_PATCHLIST_12: 350 case TOP_PATCHLIST_13: 351 case TOP_PATCHLIST_14: 352 case TOP_PATCHLIST_15: 353 case TOP_PATCHLIST_16: 354 case TOP_PATCHLIST_17: 355 case TOP_PATCHLIST_18: 356 case TOP_PATCHLIST_19: 357 case TOP_PATCHLIST_20: 358 case TOP_PATCHLIST_21: 359 case TOP_PATCHLIST_22: 360 case TOP_PATCHLIST_23: 361 case TOP_PATCHLIST_24: 362 case TOP_PATCHLIST_25: 363 case TOP_PATCHLIST_26: 364 case TOP_PATCHLIST_27: 365 case TOP_PATCHLIST_28: 366 case TOP_PATCHLIST_29: 367 case TOP_PATCHLIST_30: 368 case TOP_PATCHLIST_31: 369 case TOP_PATCHLIST_32: 370 return numPrims * (mode - TOP_PATCHLIST_BASE); 371 372 case TOP_POLYGON: 373 case TOP_POINT_LIST_BF: 374 case TOP_LINE_STRIP_CONT: 375 case TOP_LINE_STRIP_BF: 376 case TOP_LINE_STRIP_CONT_BF: 377 case TOP_TRIANGLE_FAN_NOSTIPPLE: 378 case TOP_TRI_STRIP_REVERSE: 379 case TOP_PATCHLIST_BASE: 380 case TOP_UNKNOWN: 381 SWR_ASSERT(false, "Unsupported topology: %d", mode); 382 return 0; 383 } 384 385 return 0; 386} 387 388////////////////////////////////////////////////////////////////////////// 389/// @brief Return number of verts per primitive. 
390/// @param topology - topology 391/// @param includeAdjVerts - include adjacent verts in primitive vertices 392INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts) 393{ 394 uint32_t numVerts = 0; 395 switch (topology) 396 { 397 case TOP_POINT_LIST: 398 case TOP_POINT_LIST_BF: 399 numVerts = 1; 400 break; 401 case TOP_LINE_LIST: 402 case TOP_LINE_STRIP: 403 case TOP_LINE_LIST_ADJ: 404 case TOP_LINE_LOOP: 405 case TOP_LINE_STRIP_CONT: 406 case TOP_LINE_STRIP_BF: 407 case TOP_LISTSTRIP_ADJ: 408 numVerts = 2; 409 break; 410 case TOP_TRIANGLE_LIST: 411 case TOP_TRIANGLE_STRIP: 412 case TOP_TRIANGLE_FAN: 413 case TOP_TRI_LIST_ADJ: 414 case TOP_TRI_STRIP_ADJ: 415 case TOP_TRI_STRIP_REVERSE: 416 case TOP_RECT_LIST: 417 numVerts = 3; 418 break; 419 case TOP_QUAD_LIST: 420 case TOP_QUAD_STRIP: 421 numVerts = 4; 422 break; 423 case TOP_PATCHLIST_1: 424 case TOP_PATCHLIST_2: 425 case TOP_PATCHLIST_3: 426 case TOP_PATCHLIST_4: 427 case TOP_PATCHLIST_5: 428 case TOP_PATCHLIST_6: 429 case TOP_PATCHLIST_7: 430 case TOP_PATCHLIST_8: 431 case TOP_PATCHLIST_9: 432 case TOP_PATCHLIST_10: 433 case TOP_PATCHLIST_11: 434 case TOP_PATCHLIST_12: 435 case TOP_PATCHLIST_13: 436 case TOP_PATCHLIST_14: 437 case TOP_PATCHLIST_15: 438 case TOP_PATCHLIST_16: 439 case TOP_PATCHLIST_17: 440 case TOP_PATCHLIST_18: 441 case TOP_PATCHLIST_19: 442 case TOP_PATCHLIST_20: 443 case TOP_PATCHLIST_21: 444 case TOP_PATCHLIST_22: 445 case TOP_PATCHLIST_23: 446 case TOP_PATCHLIST_24: 447 case TOP_PATCHLIST_25: 448 case TOP_PATCHLIST_26: 449 case TOP_PATCHLIST_27: 450 case TOP_PATCHLIST_28: 451 case TOP_PATCHLIST_29: 452 case TOP_PATCHLIST_30: 453 case TOP_PATCHLIST_31: 454 case TOP_PATCHLIST_32: 455 numVerts = topology - TOP_PATCHLIST_BASE; 456 break; 457 default: 458 SWR_ASSERT(false, "Unsupported topology: %d", topology); 459 break; 460 } 461 462 if (includeAdjVerts) 463 { 464 switch (topology) 465 { 466 case TOP_LISTSTRIP_ADJ: 467 case TOP_LINE_LIST_ADJ: numVerts = 4; break; 
468 case TOP_TRI_STRIP_ADJ: 469 case TOP_TRI_LIST_ADJ: numVerts = 6; break; 470 default: break; 471 } 472 } 473 474 return numVerts; 475} 476 477////////////////////////////////////////////////////////////////////////// 478/// @brief Generate mask from remaining work. 479/// @param numWorkItems - Number of items being worked on by a SIMD. 480static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) 481{ 482 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining; 483 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; 484 return _simd_castps_si(vMask(mask)); 485} 486 487////////////////////////////////////////////////////////////////////////// 488/// @brief StreamOut - Streams vertex data out to SO buffers. 489/// Generally, we are only streaming out a SIMDs worth of triangles. 490/// @param pDC - pointer to draw context. 491/// @param workerId - thread's worker id. Even thread has a unique id. 492/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris) 493static void StreamOut( 494 DRAW_CONTEXT* pDC, 495 PA_STATE& pa, 496 uint32_t workerId, 497 uint32_t* pPrimData, 498 uint32_t streamIndex) 499{ 500 SWR_CONTEXT *pContext = pDC->pContext; 501 502 AR_BEGIN(FEStreamout, pDC->drawId); 503 504 const API_STATE& state = GetApiState(pDC); 505 const SWR_STREAMOUT_STATE &soState = state.soState; 506 507 uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); 508 509 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex. 510 uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t); 511 512 SWR_STREAMOUT_CONTEXT soContext = { 0 }; 513 514 // Setup buffer state pointers. 
515 for (uint32_t i = 0; i < 4; ++i) 516 { 517 soContext.pBuffer[i] = &state.soBuffer[i]; 518 } 519 520 uint32_t numPrims = pa.NumPrims(); 521 for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) 522 { 523 DWORD slot = 0; 524 uint32_t soMask = soState.streamMasks[streamIndex]; 525 526 // Write all entries into primitive data buffer for SOS. 527 while (_BitScanForward(&slot, soMask)) 528 { 529 __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide) 530 uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT; 531 pa.AssembleSingle(paSlot, primIndex, attrib); 532 533 // Attribute offset is relative offset from start of vertex. 534 // Note that attributes start at slot 1 in the PA buffer. We need to write this 535 // to prim data starting at slot 0. Which is why we do (slot - 1). 536 // Also note: GL works slightly differently, and needs slot 0 537 uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t); 538 539 // Store each vertex's attrib at appropriate locations in pPrimData buffer. 540 for (uint32_t v = 0; v < soVertsPerPrim; ++v) 541 { 542 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride); 543 544 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]); 545 } 546 soMask &= ~(1 << slot); 547 } 548 549 // Update pPrimData pointer 550 soContext.pPrimData = pPrimData; 551 552 // Call SOS 553 SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function."); 554 state.pfnSoFunc[streamIndex](soContext); 555 } 556 557 // Update SO write offset. The driver provides memory for the update. 
558 for (uint32_t i = 0; i < 4; ++i) 559 { 560 if (state.soBuffer[i].pWriteOffset) 561 { 562 *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); 563 } 564 565 if (state.soBuffer[i].soWriteEnable) 566 { 567 pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); 568 pDC->dynState.SoWriteOffsetDirty[i] = true; 569 } 570 } 571 572 UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); 573 UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); 574 575 AR_END(FEStreamout, 1); 576} 577 578////////////////////////////////////////////////////////////////////////// 579/// @brief Computes number of invocations. The current index represents 580/// the start of the SIMD. The max index represents how much work 581/// items are remaining. If there is less then a SIMD's xmin of work 582/// then return the remaining amount of work. 583/// @param curIndex - The start index for the SIMD. 584/// @param maxIndex - The last index for all work items. 585static INLINE uint32_t GetNumInvocations( 586 uint32_t curIndex, 587 uint32_t maxIndex) 588{ 589 uint32_t remainder = (maxIndex - curIndex); 590 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder; 591} 592 593////////////////////////////////////////////////////////////////////////// 594/// @brief Converts a streamId buffer to a cut buffer for the given stream id. 595/// The geometry shader will loop over each active streamout buffer, assembling 596/// primitives for the downstream stages. When multistream output is enabled, 597/// the generated stream ID buffer from the GS needs to be converted to a cut 598/// buffer for the primitive assembler. 
599/// @param stream - stream id to generate the cut buffer for 600/// @param pStreamIdBase - pointer to the stream ID buffer 601/// @param numEmittedVerts - Number of total verts emitted by the GS 602/// @param pCutBuffer - output buffer to write cuts to 603void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer) 604{ 605 SWR_ASSERT(stream < MAX_SO_STREAMS); 606 607 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8; 608 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U); 609 610 for (uint32_t b = 0; b < numOutputBytes; ++b) 611 { 612 uint8_t curInputByte = pStreamIdBase[2*b]; 613 uint8_t outByte = 0; 614 for (uint32_t i = 0; i < 4; ++i) 615 { 616 if ((curInputByte & 0x3) != stream) 617 { 618 outByte |= (1 << i); 619 } 620 curInputByte >>= 2; 621 } 622 623 curInputByte = pStreamIdBase[2 * b + 1]; 624 for (uint32_t i = 0; i < 4; ++i) 625 { 626 if ((curInputByte & 0x3) != stream) 627 { 628 outByte |= (1 << (i + 4)); 629 } 630 curInputByte >>= 2; 631 } 632 633 *pCutBuffer++ = outByte; 634 } 635} 636 637THREAD SWR_GS_CONTEXT tlsGsContext; 638 639////////////////////////////////////////////////////////////////////////// 640/// @brief Implements GS stage. 641/// @param pDC - pointer to draw context. 642/// @param workerId - thread's worker id. Even thread has a unique id. 643/// @param pa - The primitive assembly object. 
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - buffer the GS writes its cut bits (single stream) or
///        stream IDs (multi-stream) to; handed to the GS context as
///        pCutOrStreamIdBuffer.
/// @param pStreamCutBuffer - scratch buffer that receives the per-stream cut
///        bits produced by ProcessStreamIdBuffer; only used for multi-stream output.
/// @param pSoPrimData - scratch buffer forwarded to StreamOut.
/// @param primID - primitive IDs of the input primitives, one per SIMD lane.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    // NOTE(review): pContext is not referenced directly below; presumably the
    // AR_* / UPDATE_STAT_FE macros pick it up by name - keep it in scope.
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    // point the thread's GS context at the output buffers for this invocation
    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    // adjacency verts are included because the GS consumes them as inputs
    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Output layout: per GS instance there are KNOB_SIMD_WIDTH input-prim
    // regions, each holding numSimdBatches simdvertex records. These strides
    // use the same formulas as AllocateGsBuffers.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // cut buffer: 1 bit per emitted vertex
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // stream ID buffer: 2 bits per emitted vertex, stride aligned to 4 bytes.
        // NOTE(review): the "* 2 / 8" truncates before AlignUp, so e.g. 17 verts
        // (34 bits) yields a 4 byte stride - confirm this matches what the GS writes.
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // run the shader once per GS instance, advancing the output pointers each time
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    // (pfnClipFunc stays nullptr when rasterization is disabled or the topology
    // is unexpected; it is only called under HasRastT below)
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    // Both SIMD values are reinterpreted as arrays of per-lane uint32_t.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            // NOTE(review): the vertex count is indexed by input prim only, so the
            // same value is used for every instance - presumably it reflects the
            // last shader invocation above; confirm for instanceCount > 1.
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // note: this local intentionally shadows the pCutBuffer parameter
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                // primitive assembler over the verts this input prim emitted
                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // only the stream selected for rasterization is clipped/binned
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // otherwise every generated prim inherits the input prim's ID
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    // NOTE(review): presumably a signed compare, in which case
                                    // negative indices are not cleared - confirm.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
    AR_END(FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
863/// @param state - API state 864/// @param ppGsOut - pointer to GS output buffer allocation 865/// @param ppCutBuffer - pointer to GS output cut buffer allocation 866static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer, 867 void **ppStreamCutBuffer) 868{ 869 auto pArena = pDC->pArena; 870 SWR_ASSERT(pArena != nullptr); 871 SWR_ASSERT(state.gsState.gsEnable); 872 // allocate arena space to hold GS output verts 873 // @todo pack attribs 874 // @todo support multiple streams 875 const uint32_t vertexStride = sizeof(simdvertex); 876 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; 877 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH; 878 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float)); 879 880 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; 881 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4); 882 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; 883 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; 884 885 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the 886 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance 887 888 // allocate space for temporary per-stream cut buffer if multi-stream is enabled 889 if (state.gsState.isSingleStream) 890 { 891 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); 892 *ppStreamCutBuffer = nullptr; 893 } 894 else 895 { 896 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float)); 897 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); 898 } 899 900} 901 902////////////////////////////////////////////////////////////////////////// 903/// 
@brief Contains all data generated by the HS and passed to the 904/// tessellator and DS. 905struct TessellationThreadLocalData 906{ 907 SWR_HS_CONTEXT hsContext; 908 ScalarPatch patchData[KNOB_SIMD_WIDTH]; 909 void* pTxCtx; 910 size_t tsCtxSize; 911 912 simdscalar* pDSOutput; 913 size_t numDSOutputVectors; 914}; 915 916THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; 917 918////////////////////////////////////////////////////////////////////////// 919/// @brief Allocate tessellation data for this worker thread. 920INLINE 921static void AllocateTessellationData(SWR_CONTEXT* pContext) 922{ 923 /// @TODO - Don't use thread local storage. Use Worker local storage instead. 924 if (gt_pTessellationThreadData == nullptr) 925 { 926 gt_pTessellationThreadData = (TessellationThreadLocalData*) 927 AlignedMalloc(sizeof(TessellationThreadLocalData), 64); 928 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); 929 } 930} 931 932////////////////////////////////////////////////////////////////////////// 933/// @brief Implements Tessellation Stages. 934/// @param pDC - pointer to draw context. 935/// @param workerId - thread's worker id. Even thread has a unique id. 936/// @param pa - The primitive assembly object. 
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - GS cut buffer; forwarded unchanged to GeometryShaderStage
/// @param pCutStreamBuffer - GS per-stream cut buffer; forwarded unchanged to GeometryShaderStage
/// @param pSoPrimData - scratch space for stream-out primitive data; forwarded to
///                      GeometryShaderStage / StreamOut
/// @param primID - per-lane primitive IDs of the input patches
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    // Thread-local tessellation data must have been allocated by the caller
    // (ProcessDraw calls AllocateTessellationData before drawing).
    SWR_ASSERT(gt_pTessellationThreadData);

    // First attempt: initialize the tessellator context in the thread-local memory block.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        // Initialization failed: allocate a block of tsCtxSize bytes and retry.
        // NOTE(review): presumably TSInitCtx writes the required size back into tsCtxSize
        // on failure - confirm against the tessellator header.
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Select the clipper entry point matching the post-DS topology; only needed when
    // rasterization is enabled.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    // The HS writes its control point output into the thread-local patch storage.
    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Debug builds fill the HS output with a recognizable byte pattern.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    // Scalar view of the SIMD primitive IDs so each patch's ID can be read by index.
    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade one input patch at a time.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        AR_END(FETessellation, 0);

        // Nothing was generated for this patch; move on to the next one.
        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        // Grow the thread-local DS output buffer only when the current one is too small.
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // Debug builds fill the DS output with a recognizable byte pattern.
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        // Number of domain points covered by the DS invocations issued so far.
        uint32_t dsInvocations = 0;

        // One DS invocation per SIMD-width group of domain points.
        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // Mask is built from the number of domain points still to be shaded.
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // Primitive assembler over the DS output, indexed by the tessellator's index lists.
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                // GS enabled: it takes over stream-out and rasterization for these prims.
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
                    // The result of Assemble is only captured when asserts are compiled in.
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    // All generated primitives inherit the primitive ID of the source patch.
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is cut index enabled (forwarded to PA_FACTORY)
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    // Toss point: skip all front-end work for this draw.
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
    const API_STATE&    state = GetApiState(pDC);
    // Per-lane offsets 0..7, added to the start vertex ID to form vertex indices for
    // non-indexed draws. NOTE(review): hard-coded to 8 lanes - assumes KNOB_SIMD_WIDTH == 8.
    __m256i             vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT      vsContext;
    simdvertex          vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws: address one past the last index requested by this draw,
    // computed according to the index element size.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    // numPrims is only declared when one of these knobs is defined; its only use is the
    // AR_END(FEProcessDraw, ...) at the end of this function.
#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    // The template flag and the API state must agree on whether tessellation is enabled.
    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        // Index of the first vertex of the current SIMD batch within this instance.
        uint32_t  i = 0;

        if (IsIndexedT::value)
        {
            // Indexed: fetch reads indices directly from the index buffer.
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // Non-indexed: fetch reads indices from vIndex, a vector of consecutive vertex IDs.
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Hand the assembled primitives to the first enabled downstream stage:
                            // tessellation, else geometry shader, else stream-out / rasterization.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // Advance to the next SIMD batch of vertices.
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                // Step the index pointer by one SIMD batch of indices of the current index size.
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        // Reset the primitive assembler before processing the next instance.
        pa.Reset();
    }


    AR_END(FEProcessDraw, numPrims * work.numInstances);
}

/// @brief Chooser used with TemplateArgUnroller: returns the ProcessDraw
///        instantiation for a given pack of template arguments.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};


// Selector for correct templated Draw front-end function
// The six flags are passed to TemplateArgUnroller in the same order as ProcessDraw's
// template parameters.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}