1/**************************************************************************** 2* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 3* 4* Permission is hereby granted, free of charge, to any person obtaining a 5* copy of this software and associated documentation files (the "Software"), 6* to deal in the Software without restriction, including without limitation 7* the rights to use, copy, modify, merge, publish, distribute, sublicense, 8* and/or sell copies of the Software, and to permit persons to whom the 9* Software is furnished to do so, subject to the following conditions: 10* 11* The above copyright notice and this permission notice (including the next 12* paragraph) shall be included in all copies or substantial portions of the 13* Software. 14* 15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21* IN THE SOFTWARE. 22* 23* @file api.cpp 24* 25* @brief API implementation 26* 27******************************************************************************/ 28 29#include <cfloat> 30#include <cmath> 31#include <cstdio> 32#include <new> 33 34#include "core/api.h" 35#include "core/backend.h" 36#include "core/context.h" 37#include "core/depthstencil.h" 38#include "core/frontend.h" 39#include "core/rasterizer.h" 40#include "core/rdtsc_core.h" 41#include "core/threads.h" 42#include "core/tilemgr.h" 43#include "core/clip.h" 44#include "core/utils.h" 45 46#include "common/simdintrin.h" 47#include "common/os.h" 48 49static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y }; 50 51void SetupDefaultState(SWR_CONTEXT *pContext); 52 53static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) 54{ 55 return (SWR_CONTEXT*)hContext; 56} 57 58void WakeAllThreads(SWR_CONTEXT *pContext) 59{ 60 pContext->FifosNotEmpty.notify_all(); 61} 62 63////////////////////////////////////////////////////////////////////////// 64/// @brief Create SWR Context. 65/// @param pCreateInfo - pointer to creation info. 66HANDLE SwrCreateContext( 67 SWR_CREATECONTEXT_INFO* pCreateInfo) 68{ 69 RDTSC_RESET(); 70 RDTSC_INIT(0); 71 72 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); 73 memset(pContextMem, 0, sizeof(SWR_CONTEXT)); 74 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); 75 76 pContext->privateStateSize = pCreateInfo->privateStateSize; 77 78 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); 79 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); 80 81 pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); 82 pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); 83 84 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) 85 { 86 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); 87 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); 88 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); 89 90 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); 91 } 92 93 pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; 94 pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; 95 pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; 96 pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; 97 pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; 98 99 if (pCreateInfo->pThreadInfo) 100 { 101 pContext->threadInfo = *pCreateInfo->pThreadInfo; 102 } 103 104 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); 105 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); 106 new (&pContext->WaitLock) std::mutex(); 107 new (&pContext->FifosNotEmpty) std::condition_variable(); 108 109 CreateThreadPool(pContext, &pContext->threadPool); 110 111 pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; 112 pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads]; 113 114#if defined(KNOB_ENABLE_AR) 115 // Setup ArchRast thread contexts which includes +1 for API thread. 116 pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1]; 117 pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API); 118#endif 119 120 // Allocate scratch space for workers. 121 ///@note We could lazily allocate this but its rather small amount of memory. 122 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) 123 { 124#if defined(_WIN32) 125 uint32_t numaNode = pContext->threadPool.pThreadData ? 126 pContext->threadPool.pThreadData[i].numaId : 0; 127 pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma( 128 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE), 129 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, 130 numaNode); 131#else 132 pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); 133#endif 134 135#if defined(KNOB_ENABLE_AR) 136 // Initialize worker thread context for ArchRast. 137 pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER); 138#endif 139 } 140 141 // State setup AFTER context is fully initialized 142 SetupDefaultState(pContext); 143 144 // initialize hot tile manager 145 pContext->pHotTileMgr = new HotTileMgr(); 146 147 // initialize function pointer tables 148 InitClearTilesTable(); 149 150 // initialize callback functions 151 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; 152 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; 153 pContext->pfnClearTile = pCreateInfo->pfnClearTile; 154 pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; 155 pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; 156 pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; 157 158 159 // pass pointer to bucket manager back to caller 160#ifdef KNOB_ENABLE_RDTSC 161 pCreateInfo->pBucketMgr = &gBucketMgr; 162#endif 163 164 pCreateInfo->contextSaveSize = sizeof(API_STATE); 165 166 StartThreadPool(pContext, &pContext->threadPool); 167 168 return (HANDLE)pContext; 169} 170 171void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) 172{ 173 memcpy(&dst.state, &src.state, sizeof(API_STATE)); 174} 175 176template<bool IsDraw> 177void QueueWork(SWR_CONTEXT *pContext) 178{ 179 DRAW_CONTEXT* pDC = pContext->pCurDrawContext; 180 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; 181 182 if (IsDraw) 183 { 184 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex]; 185 pDC->pTileMgr->initialize(); 186 } 187 188 // Each worker thread looks at a DC for both FE and BE work at different times and so we 189 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers 190 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and 191 // then moved on if all work is done.) 192 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads; 193 194 if (IsDraw) 195 { 196 InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE); 197 } 198 199 _ReadWriteBarrier(); 200 { 201 std::unique_lock<std::mutex> lock(pContext->WaitLock); 202 pContext->dcRing.Enqueue(); 203 } 204 205 if (pContext->threadInfo.SINGLE_THREADED) 206 { 207 // flush denormals to 0 208 uint32_t mxcsr = _mm_getcsr(); 209 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); 210 211 if (IsDraw) 212 { 213 uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; 214 WorkOnFifoFE(pContext, 0, curDraw[0]); 215 WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0); 216 } 217 else 218 { 219 uint32_t curDispatch = pContext->pCurDrawContext->drawId; 220 WorkOnCompute(pContext, 0, curDispatch); 221 } 222 223 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers). 224 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {} 225 226 // restore csr 227 _mm_setcsr(mxcsr); 228 } 229 else 230 { 231 AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId); 232 WakeAllThreads(pContext); 233 AR_API_END(APIDrawWakeAllThreads, 1); 234 } 235 236 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. 237 pContext->pPrevDrawContext = pContext->pCurDrawContext; 238 pContext->pCurDrawContext = nullptr; 239} 240 241INLINE void QueueDraw(SWR_CONTEXT* pContext) 242{ 243 QueueWork<true>(pContext); 244} 245 246INLINE void QueueDispatch(SWR_CONTEXT* pContext) 247{ 248 QueueWork<false>(pContext); 249} 250 251DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) 252{ 253 AR_API_BEGIN(APIGetDrawContext, 0); 254 // If current draw context is null then need to obtain a new draw context to use from ring. 255 if (pContext->pCurDrawContext == nullptr) 256 { 257 // Need to wait for a free entry. 258 while (pContext->dcRing.IsFull()) 259 { 260 _mm_pause(); 261 } 262 263 uint64_t curDraw = pContext->dcRing.GetHead(); 264 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; 265 266 if ((pContext->frameCount - pContext->lastFrameChecked) > 2 || 267 (curDraw - pContext->lastDrawChecked) > 0x10000) 268 { 269 // Take this opportunity to clean-up old arena allocations 270 pContext->cachingArenaAllocator.FreeOldBlocks(); 271 272 pContext->lastFrameChecked = pContext->frameCount; 273 pContext->lastDrawChecked = curDraw; 274 } 275 276 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; 277 pContext->pCurDrawContext = pCurDrawContext; 278 279 // Assign next available entry in DS ring to this DC. 280 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; 281 pCurDrawContext->pState = &pContext->dsRing[dsIndex]; 282 283 // Copy previous state to current state. 284 if (pContext->pPrevDrawContext) 285 { 286 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; 287 288 // If we're splitting our draw then we can just use the same state from the previous 289 // draw. In this case, we won't increment the DS ring index so the next non-split 290 // draw can receive the state. 291 if (isSplitDraw == false) 292 { 293 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); 294 295 // Should have been cleaned up previously 296 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); 297 298 pCurDrawContext->pState->pPrivateState = nullptr; 299 300 pContext->curStateId++; // Progress state ring index forward. 301 } 302 else 303 { 304 // If its a split draw then just copy the state pointer over 305 // since its the same draw. 306 pCurDrawContext->pState = pPrevDrawContext->pState; 307 SWR_ASSERT(pPrevDrawContext->cleanupState == false); 308 } 309 } 310 else 311 { 312 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); 313 pContext->curStateId++; // Progress state ring index forward. 314 } 315 316 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); 317 318 // Reset dependency 319 pCurDrawContext->dependent = false; 320 pCurDrawContext->dependentFE = false; 321 322 pCurDrawContext->pContext = pContext; 323 pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 324 325 pCurDrawContext->doneFE = false; 326 pCurDrawContext->FeLock = 0; 327 pCurDrawContext->threadsDone = 0; 328 pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr; 329 330 pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads); 331 332 // Assign unique drawId for this DC 333 pCurDrawContext->drawId = pContext->dcRing.GetHead(); 334 335 pCurDrawContext->cleanupState = true; 336 337 } 338 else 339 { 340 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); 341 } 342 343 AR_API_END(APIGetDrawContext, 0); 344 return pContext->pCurDrawContext; 345} 346 347API_STATE* GetDrawState(SWR_CONTEXT *pContext) 348{ 349 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 350 SWR_ASSERT(pDC->pState != nullptr); 351 352 return &pDC->pState->state; 353} 354 355void SwrDestroyContext(HANDLE hContext) 356{ 357 SWR_CONTEXT *pContext = GetContext(hContext); 358 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 359 360 pDC->FeWork.type = SHUTDOWN; 361 pDC->FeWork.pfnWork = ProcessShutdown; 362 363 //enqueue 364 QueueDraw(pContext); 365 366 DestroyThreadPool(pContext, &pContext->threadPool); 367 368 // free the fifos 369 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) 370 { 371 delete[] pContext->dcRing[i].dynState.pStats; 372 delete pContext->dcRing[i].pArena; 373 delete pContext->dsRing[i].pArena; 374 pContext->pMacroTileManagerArray[i].~MacroTileMgr(); 375 pContext->pDispatchQueueArray[i].~DispatchQueue(); 376 } 377 378 AlignedFree(pContext->pDispatchQueueArray); 379 AlignedFree(pContext->pMacroTileManagerArray); 380 381 // Free scratch space. 382 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) 383 { 384#if defined(_WIN32) 385 VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE); 386#else 387 AlignedFree(pContext->ppScratch[i]); 388#endif 389 390#if defined(KNOB_ENABLE_AR) 391 ArchRast::DestroyThreadContext(pContext->pArContext[i]); 392#endif 393 } 394 395 delete[] pContext->ppScratch; 396 delete[] pContext->pStats; 397 398 delete(pContext->pHotTileMgr); 399 400 pContext->~SWR_CONTEXT(); 401 AlignedFree(GetContext(hContext)); 402} 403 404void SWR_API SwrSaveState( 405 HANDLE hContext, 406 void* pOutputStateBlock, 407 size_t memSize) 408{ 409 SWR_CONTEXT *pContext = GetContext(hContext); 410 auto pSrc = GetDrawState(pContext); 411 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc)); 412 413 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc)); 414} 415 416void SWR_API SwrRestoreState( 417 HANDLE hContext, 418 const void* pStateBlock, 419 size_t memSize) 420{ 421 SWR_CONTEXT *pContext = GetContext(hContext); 422 auto pDst = GetDrawState(pContext); 423 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst)); 424 425 memcpy(pDst, pStateBlock, sizeof(*pDst)); 426} 427 428void SetupDefaultState(SWR_CONTEXT *pContext) 429{ 430 API_STATE* pState = GetDrawState(pContext); 431 432 pState->rastState.cullMode = SWR_CULLMODE_NONE; 433 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; 434 435 pState->depthBoundsState.depthBoundsTestEnable = false; 436 pState->depthBoundsState.depthBoundsTestMinValue = 0.0f; 437 pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f; 438} 439 440void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3) 441{ 442 SWR_ASSERT(pfnFunc != nullptr); 443 444 SWR_CONTEXT *pContext = GetContext(hContext); 445 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 446 447 AR_API_BEGIN(APISync, 0); 448 449 pDC->FeWork.type = SYNC; 450 pDC->FeWork.pfnWork = ProcessSync; 451 452 // Setup callback function 453 pDC->retireCallback.pfnCallbackFunc = pfnFunc; 454 pDC->retireCallback.userData = userData; 455 pDC->retireCallback.userData2 = userData2; 456 pDC->retireCallback.userData3 = userData3; 457 458 //enqueue 459 QueueDraw(pContext); 460 461 AR_API_END(APISync, 1); 462} 463 464void SwrWaitForIdle(HANDLE hContext) 465{ 466 SWR_CONTEXT *pContext = GetContext(hContext); 467 468 AR_API_BEGIN(APIWaitForIdle, 0); 469 470 while (!pContext->dcRing.IsEmpty()) 471 { 472 _mm_pause(); 473 } 474 475 AR_API_END(APIWaitForIdle, 1); 476} 477 478void SwrWaitForIdleFE(HANDLE hContext) 479{ 480 SWR_CONTEXT *pContext = GetContext(hContext); 481 482 AR_API_BEGIN(APIWaitForIdle, 0); 483 484 while (pContext->drawsOutstandingFE > 0) 485 { 486 _mm_pause(); 487 } 488 489 AR_API_END(APIWaitForIdle, 1); 490} 491 492void SwrSetVertexBuffers( 493 HANDLE hContext, 494 uint32_t numBuffers, 495 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) 496{ 497 API_STATE* pState = GetDrawState(GetContext(hContext)); 498 499 for (uint32_t i = 0; i < numBuffers; ++i) 500 { 501 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; 502 pState->vertexBuffers[pVB->index] = *pVB; 503 } 504} 505 506void SwrSetIndexBuffer( 507 HANDLE hContext, 508 const SWR_INDEX_BUFFER_STATE* pIndexBuffer) 509{ 510 API_STATE* pState = GetDrawState(GetContext(hContext)); 511 512 pState->indexBuffer = *pIndexBuffer; 513} 514 515void SwrSetFetchFunc( 516 HANDLE hContext, 517 PFN_FETCH_FUNC pfnFetchFunc) 518{ 519 API_STATE* pState = GetDrawState(GetContext(hContext)); 520 521 pState->pfnFetchFunc = pfnFetchFunc; 522} 523 524void SwrSetSoFunc( 525 HANDLE hContext, 526 PFN_SO_FUNC pfnSoFunc, 527 uint32_t streamIndex) 528{ 529 API_STATE* pState = GetDrawState(GetContext(hContext)); 530 531 SWR_ASSERT(streamIndex < MAX_SO_STREAMS); 532 533 pState->pfnSoFunc[streamIndex] = pfnSoFunc; 534} 535 536void SwrSetSoState( 537 HANDLE hContext, 538 SWR_STREAMOUT_STATE* pSoState) 539{ 540 API_STATE* pState = GetDrawState(GetContext(hContext)); 541 542 pState->soState = *pSoState; 543} 544 545void SwrSetSoBuffers( 546 HANDLE hContext, 547 SWR_STREAMOUT_BUFFER* pSoBuffer, 548 uint32_t slot) 549{ 550 API_STATE* pState = GetDrawState(GetContext(hContext)); 551 552 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); 553 554 pState->soBuffer[slot] = *pSoBuffer; 555} 556 557void SwrSetVertexFunc( 558 HANDLE hContext, 559 PFN_VERTEX_FUNC pfnVertexFunc) 560{ 561 API_STATE* pState = GetDrawState(GetContext(hContext)); 562 563 pState->pfnVertexFunc = pfnVertexFunc; 564} 565 566void SwrSetFrontendState( 567 HANDLE hContext, 568 SWR_FRONTEND_STATE *pFEState) 569{ 570 API_STATE* pState = GetDrawState(GetContext(hContext)); 571 pState->frontendState = *pFEState; 572} 573 574void SwrSetGsState( 575 HANDLE hContext, 576 SWR_GS_STATE *pGSState) 577{ 578 API_STATE* pState = GetDrawState(GetContext(hContext)); 579 pState->gsState = *pGSState; 580} 581 582void SwrSetGsFunc( 583 HANDLE hContext, 584 PFN_GS_FUNC pfnGsFunc) 585{ 586 API_STATE* pState = GetDrawState(GetContext(hContext)); 587 pState->pfnGsFunc = pfnGsFunc; 588} 589 590void SwrSetCsFunc( 591 HANDLE hContext, 592 PFN_CS_FUNC pfnCsFunc, 593 uint32_t totalThreadsInGroup, 594 uint32_t totalSpillFillSize) 595{ 596 API_STATE* pState = GetDrawState(GetContext(hContext)); 597 pState->pfnCsFunc = pfnCsFunc; 598 pState->totalThreadsInGroup = totalThreadsInGroup; 599 pState->totalSpillFillSize = totalSpillFillSize; 600} 601 602void SwrSetTsState( 603 HANDLE hContext, 604 SWR_TS_STATE *pState) 605{ 606 API_STATE* pApiState = GetDrawState(GetContext(hContext)); 607 pApiState->tsState = *pState; 608} 609 610void SwrSetHsFunc( 611 HANDLE hContext, 612 PFN_HS_FUNC pfnFunc) 613{ 614 API_STATE* pApiState = GetDrawState(GetContext(hContext)); 615 pApiState->pfnHsFunc = pfnFunc; 616} 617 618void SwrSetDsFunc( 619 HANDLE hContext, 620 PFN_DS_FUNC pfnFunc) 621{ 622 API_STATE* pApiState = GetDrawState(GetContext(hContext)); 623 pApiState->pfnDsFunc = pfnFunc; 624} 625 626void SwrSetDepthStencilState( 627 HANDLE hContext, 628 SWR_DEPTH_STENCIL_STATE *pDSState) 629{ 630 API_STATE* pState = GetDrawState(GetContext(hContext)); 631 632 pState->depthStencilState = *pDSState; 633} 634 635void SwrSetBackendState( 636 HANDLE hContext, 637 SWR_BACKEND_STATE *pBEState) 638{ 639 API_STATE* pState = GetDrawState(GetContext(hContext)); 640 641 pState->backendState = *pBEState; 642} 643 644void SwrSetDepthBoundsState( 645 HANDLE hContext, 646 SWR_DEPTH_BOUNDS_STATE *pDBState) 647{ 648 API_STATE* pState = GetDrawState(GetContext(hContext)); 649 650 pState->depthBoundsState = *pDBState; 651} 652 653void SwrSetPixelShaderState( 654 HANDLE hContext, 655 SWR_PS_STATE *pPSState) 656{ 657 API_STATE *pState = GetDrawState(GetContext(hContext)); 658 pState->psState = *pPSState; 659} 660 661void SwrSetBlendState( 662 HANDLE hContext, 663 SWR_BLEND_STATE *pBlendState) 664{ 665 API_STATE *pState = GetDrawState(GetContext(hContext)); 666 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); 667} 668 669void SwrSetBlendFunc( 670 HANDLE hContext, 671 uint32_t renderTarget, 672 PFN_BLEND_JIT_FUNC pfnBlendFunc) 673{ 674 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); 675 API_STATE *pState = GetDrawState(GetContext(hContext)); 676 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; 677} 678 679// update guardband multipliers for the viewport 680void updateGuardbands(API_STATE *pState) 681{ 682 uint32_t numGbs = pState->gsState.emitsRenderTargetArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; 683 684 for(uint32_t i = 0; i < numGbs; ++i) 685 { 686 // guardband center is viewport center 687 pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; 688 pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; 689 pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; 690 pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; 691 } 692} 693 694void SwrSetRastState( 695 HANDLE hContext, 696 const SWR_RASTSTATE *pRastState) 697{ 698 SWR_CONTEXT *pContext = GetContext(hContext); 699 API_STATE* pState = GetDrawState(pContext); 700 701 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); 702} 703 704void SwrSetViewports( 705 HANDLE hContext, 706 uint32_t numViewports, 707 const SWR_VIEWPORT* pViewports, 708 const SWR_VIEWPORT_MATRICES* pMatrices) 709{ 710 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, 711 "Invalid number of viewports."); 712 713 SWR_CONTEXT *pContext = GetContext(hContext); 714 API_STATE* pState = GetDrawState(pContext); 715 716 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); 717 // @todo Faster to copy portions of the SOA or just copy all of it? 718 memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES)); 719 720 updateGuardbands(pState); 721} 722 723void SwrSetScissorRects( 724 HANDLE hContext, 725 uint32_t numScissors, 726 const SWR_RECT* pScissors) 727{ 728 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, 729 "Invalid number of scissor rects."); 730 731 API_STATE* pState = GetDrawState(GetContext(hContext)); 732 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0])); 733}; 734 735void SetupMacroTileScissors(DRAW_CONTEXT *pDC) 736{ 737 API_STATE *pState = &pDC->pState->state; 738 uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; 739 pState->scissorsTileAligned = true; 740 741 for (uint32_t index = 0; index < numScissors; ++index) 742 { 743 SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index]; 744 745 // Set up scissor dimensions based on scissor or viewport 746 if (pState->rastState.scissorEnable) 747 { 748 scissorInFixedPoint = pState->scissorRects[index]; 749 } 750 else 751 { 752 // the vp width and height must be added to origin un-rounded then the result round to -inf. 753 // The cast to int works for rounding assuming all [left, right, top, bottom] are positive. 754 scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x; 755 scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width); 756 scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y; 757 scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height); 758 } 759 760 // Clamp to max rect 761 scissorInFixedPoint &= g_MaxScissorRect; 762 763 // Test for tile alignment 764 bool tileAligned; 765 tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0; 766 tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0; 767 tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0; 768 tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0; 769 770 pState->scissorsTileAligned &= tileAligned; 771 772 // Scale to fixed point 773 scissorInFixedPoint.xmin *= FIXED_POINT_SCALE; 774 scissorInFixedPoint.xmax *= FIXED_POINT_SCALE; 775 scissorInFixedPoint.ymin *= FIXED_POINT_SCALE; 776 scissorInFixedPoint.ymax *= FIXED_POINT_SCALE; 777 778 // Make scissor inclusive 779 scissorInFixedPoint.xmax -= 1; 780 scissorInFixedPoint.ymax -= 1; 781 } 782} 783 784// templated backend function tables 785extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; 786extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2]; 787extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2]; 788extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]; 789void SetupPipeline(DRAW_CONTEXT *pDC) 790{ 791 SWR_CONTEXT* pContext = pDC->pContext; 792 DRAW_STATE* pState = pDC->pState; 793 const SWR_RASTSTATE &rastState = pState->state.rastState; 794 const SWR_PS_STATE &psState = pState->state.psState; 795 BACKEND_FUNCS& backendFuncs = pState->backendFuncs; 796 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; 797 798 // setup backend 799 if (psState.pfnPixelShader == nullptr) 800 { 801 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; 802 } 803 else 804 { 805 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0; 806 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; 807 const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; 808 809 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; 810 811 // select backend function 812 switch(psState.shadingRate) 813 { 814 case SWR_SHADING_RATE_PIXEL: 815 if(bMultisampleEnable) 816 { 817 // always need to generate I & J per sample for Z interpolation 818 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); 819 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ]; 820 } 821 else 822 { 823 // always need to generate I & J per pixel for Z interpolation 824 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); 825 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; 826 } 827 break; 828 case SWR_SHADING_RATE_SAMPLE: 829 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); 830 // always need to generate I & J per sample for Z interpolation 831 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); 832 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ]; 833 break; 834 default: 835 SWR_ASSERT(0 && "Invalid shading rate"); 836 break; 837 } 838 } 839 840 PFN_PROCESS_PRIMS pfnBinner; 841 switch (pState->state.topology) 842 { 843 case TOP_POINT_LIST: 844 pState->pfnProcessPrims = ClipPoints; 845 pfnBinner = BinPoints; 846 break; 847 case TOP_LINE_LIST: 848 case TOP_LINE_STRIP: 849 case TOP_LINE_LOOP: 850 case TOP_LINE_LIST_ADJ: 851 case TOP_LISTSTRIP_ADJ: 852 pState->pfnProcessPrims = ClipLines; 853 pfnBinner = BinLines; 854 break; 855 default: 856 pState->pfnProcessPrims = ClipTriangles; 857 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0)); 858 break; 859 }; 860 861 862 // disable clipper if viewport transform is disabled 863 if (pState->state.frontendState.vpTransformDisable) 864 { 865 pState->pfnProcessPrims = pfnBinner; 866 } 867 868 if ((pState->state.psState.pfnPixelShader == nullptr) && 869 (pState->state.depthStencilState.depthTestEnable == FALSE) && 870 (pState->state.depthStencilState.depthWriteEnable == FALSE) && 871 (pState->state.depthStencilState.stencilTestEnable == FALSE) && 872 (pState->state.depthStencilState.stencilWriteEnable == FALSE) && 873 (pState->state.backendState.numAttributes == 0)) 874 { 875 pState->pfnProcessPrims = nullptr; 876 } 877 878 if (pState->state.soState.rasterizerDisable == true) 879 { 880 pState->pfnProcessPrims = nullptr; 881 } 882 883 884 // set up the frontend attribute count 885 pState->state.feNumAttributes = 0; 886 const SWR_BACKEND_STATE& backendState = pState->state.backendState; 887 if (backendState.swizzleEnable) 888 { 889 // attribute swizzling is enabled, iterate over the map and record the max attribute used 890 for (uint32_t i = 0; i < backendState.numAttributes; ++i) 891 { 892 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1); 893 } 894 } 895 else 896 { 897 pState->state.feNumAttributes = pState->state.backendState.numAttributes; 898 } 899 900 if (pState->state.soState.soEnable) 901 { 902 uint32_t streamMasks = 0; 903 for (uint32_t i = 0; i < 4; ++i) 904 { 905 streamMasks |= pState->state.soState.streamMasks[i]; 906 } 907 908 DWORD maxAttrib; 909 if (_BitScanReverse(&maxAttrib, streamMasks)) 910 { 911 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1)); 912 } 913 } 914 915 // complicated logic to test for cases where we don't need backing hottile memory for a draw 916 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled. 917 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable && 918 !pState->state.depthStencilState.depthWriteEnable && 919 !pState->state.depthBoundsState.depthBoundsTestEnable && 920 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && 921 (pState->state.depthStencilState.depthTestEnable || 922 pState->state.depthStencilState.depthWriteEnable || 923 pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false; 924 925 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable && 926 !pState->state.depthStencilState.stencilWriteEnable && 927 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) || 928 // for stencil we have to check the double sided state as well 929 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable && 930 !pState->state.depthStencilState.stencilWriteEnable && 931 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && 932 (pState->state.depthStencilState.stencilTestEnable || 933 pState->state.depthStencilState.stencilWriteEnable)) ? true : false; 934 935 uint32_t numRTs = pState->state.psState.numRenderTargets; 936 pState->state.colorHottileEnable = 0; 937 if (psState.pfnPixelShader != nullptr) 938 { 939 for (uint32_t rt = 0; rt < numRTs; ++rt) 940 { 941 pState->state.colorHottileEnable |= 942 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha || 943 !pState->state.blendState.renderTarget[rt].writeDisableRed || 944 !pState->state.blendState.renderTarget[rt].writeDisableGreen || 945 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; 946 } 947 } 948 949 // Setup depth quantization function 950 if (pState->state.depthHottileEnable) 951 { 952 switch (pState->state.rastState.depthFormat) 953 { 954 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break; 955 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break; 956 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break; 957 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break; 958 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion."); 959 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; 960 } 961 } 962 else 963 { 964 // set up pass-through quantize if depth isn't enabled 965 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; 966 } 967} 968 969////////////////////////////////////////////////////////////////////////// 970/// @brief InitDraw 971/// @param pDC - Draw context to initialize for this draw. 972void InitDraw( 973 DRAW_CONTEXT *pDC, 974 bool isSplitDraw) 975{ 976 // We don't need to re-setup the scissors/pipeline state again for split draw. 977 if (isSplitDraw == false) 978 { 979 SetupMacroTileScissors(pDC); 980 SetupPipeline(pDC); 981 } 982 983 984} 985 986////////////////////////////////////////////////////////////////////////// 987/// @brief We can split the draw for certain topologies for better performance. 988/// @param totalVerts - Total vertices for draw 989/// @param topology - Topology used for draw 990uint32_t MaxVertsPerDraw( 991 DRAW_CONTEXT* pDC, 992 uint32_t totalVerts, 993 PRIMITIVE_TOPOLOGY topology) 994{ 995 API_STATE& state = pDC->pState->state; 996 997 uint32_t vertsPerDraw = totalVerts; 998 999 if (state.soState.soEnable) 1000 { 1001 return totalVerts; 1002 } 1003 1004 switch (topology) 1005 { 1006 case TOP_POINT_LIST: 1007 case TOP_TRIANGLE_LIST: 1008 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW; 1009 break; 1010 1011 case TOP_PATCHLIST_1: 1012 case TOP_PATCHLIST_2: 1013 case TOP_PATCHLIST_3: 1014 case TOP_PATCHLIST_4: 1015 case TOP_PATCHLIST_5: 1016 case TOP_PATCHLIST_6: 1017 case TOP_PATCHLIST_7: 1018 case TOP_PATCHLIST_8: 1019 case TOP_PATCHLIST_9: 1020 case TOP_PATCHLIST_10: 1021 case TOP_PATCHLIST_11: 1022 case TOP_PATCHLIST_12: 1023 case TOP_PATCHLIST_13: 1024 case TOP_PATCHLIST_14: 1025 case TOP_PATCHLIST_15: 1026 case TOP_PATCHLIST_16: 1027 case TOP_PATCHLIST_17: 1028 case TOP_PATCHLIST_18: 1029 case TOP_PATCHLIST_19: 1030 case TOP_PATCHLIST_20: 1031 case TOP_PATCHLIST_21: 1032 case TOP_PATCHLIST_22: 1033 case TOP_PATCHLIST_23: 1034 case TOP_PATCHLIST_24: 1035 case TOP_PATCHLIST_25: 1036 case TOP_PATCHLIST_26: 1037 case TOP_PATCHLIST_27: 1038 case TOP_PATCHLIST_28: 1039 case TOP_PATCHLIST_29: 1040 case TOP_PATCHLIST_30: 1041 case TOP_PATCHLIST_31: 1042 case TOP_PATCHLIST_32: 1043 if (pDC->pState->state.tsState.tsEnable) 1044 { 1045 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; 1046 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; 1047 } 1048 break; 1049 1050 // The Primitive Assembly code can only handle 1 RECT at a time. 1051 case TOP_RECT_LIST: 1052 vertsPerDraw = 3; 1053 break; 1054 1055 default: 1056 // We are not splitting up draws for other topologies. 1057 break; 1058 } 1059 1060 return vertsPerDraw; 1061} 1062 1063 1064////////////////////////////////////////////////////////////////////////// 1065/// @brief DrawInstanced 1066/// @param hContext - Handle passed back from SwrCreateContext 1067/// @param topology - Specifies topology for draw. 1068/// @param numVerts - How many vertices to read sequentially from vertex data (per instance). 1069/// @param startVertex - Specifies start vertex for draw. (vertex data) 1070/// @param numInstances - How many instances to render. 1071/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1072void DrawInstanced( 1073 HANDLE hContext, 1074 PRIMITIVE_TOPOLOGY topology, 1075 uint32_t numVertices, 1076 uint32_t startVertex, 1077 uint32_t numInstances = 1, 1078 uint32_t startInstance = 0) 1079{ 1080 if (KNOB_TOSS_DRAW) 1081 { 1082 return; 1083 } 1084 1085 SWR_CONTEXT *pContext = GetContext(hContext); 1086 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1087 1088 AR_API_BEGIN(APIDraw, pDC->drawId); 1089 AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertices, startVertex, numInstances, startInstance)); 1090 1091 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); 1092 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); 1093 uint32_t remainingVerts = numVertices; 1094 1095 API_STATE *pState = &pDC->pState->state; 1096 pState->topology = topology; 1097 pState->forceFront = false; 1098 1099 // disable culling for points/lines 1100 uint32_t oldCullMode = pState->rastState.cullMode; 1101 if (topology == TOP_POINT_LIST) 1102 { 1103 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1104 pState->forceFront = true; 1105 } 1106 else if (topology == TOP_RECT_LIST) 1107 { 1108 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1109 } 1110 1111 1112 int draw = 0; 1113 while (remainingVerts) 1114 { 1115 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? 1116 remainingVerts : maxVertsPerDraw; 1117 1118 bool isSplitDraw = (draw > 0) ? true : false; 1119 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); 1120 InitDraw(pDC, isSplitDraw); 1121 1122 pDC->FeWork.type = DRAW; 1123 pDC->FeWork.pfnWork = GetProcessDrawFunc( 1124 false, // IsIndexed 1125 false, // bEnableCutIndex 1126 pState->tsState.tsEnable, 1127 pState->gsState.gsEnable, 1128 pState->soState.soEnable, 1129 pDC->pState->pfnProcessPrims != nullptr); 1130 pDC->FeWork.desc.draw.numVerts = numVertsForDraw; 1131 pDC->FeWork.desc.draw.startVertex = startVertex; 1132 pDC->FeWork.desc.draw.numInstances = numInstances; 1133 pDC->FeWork.desc.draw.startInstance = startInstance; 1134 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; 1135 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; 1136 1137 pDC->cleanupState = (remainingVerts == numVertsForDraw); 1138 1139 //enqueue DC 1140 QueueDraw(pContext); 1141 1142 remainingVerts -= numVertsForDraw; 1143 draw++; 1144 } 1145 1146 // restore culling state 1147 pDC = GetDrawContext(pContext); 1148 pDC->pState->state.rastState.cullMode = oldCullMode; 1149 1150 1151 AR_API_END(APIDraw, numVertices * numInstances); 1152} 1153 1154////////////////////////////////////////////////////////////////////////// 1155/// @brief SwrDraw 1156/// @param hContext - Handle passed back from SwrCreateContext 1157/// @param topology - Specifies topology for draw. 1158/// @param startVertex - Specifies start vertex in vertex buffer for draw. 1159/// @param primCount - Number of vertices. 1160void SwrDraw( 1161 HANDLE hContext, 1162 PRIMITIVE_TOPOLOGY topology, 1163 uint32_t startVertex, 1164 uint32_t numVertices) 1165{ 1166 DrawInstanced(hContext, topology, numVertices, startVertex); 1167} 1168 1169////////////////////////////////////////////////////////////////////////// 1170/// @brief SwrDrawInstanced 1171/// @param hContext - Handle passed back from SwrCreateContext 1172/// @param topology - Specifies topology for draw. 1173/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. 1174/// @param numInstances - How many instances to render. 1175/// @param startVertex - Specifies start vertex for draw. (vertex data) 1176/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1177void SwrDrawInstanced( 1178 HANDLE hContext, 1179 PRIMITIVE_TOPOLOGY topology, 1180 uint32_t numVertsPerInstance, 1181 uint32_t numInstances, 1182 uint32_t startVertex, 1183 uint32_t startInstance 1184 ) 1185{ 1186 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); 1187} 1188 1189////////////////////////////////////////////////////////////////////////// 1190/// @brief DrawIndexedInstanced 1191/// @param hContext - Handle passed back from SwrCreateContext 1192/// @param topology - Specifies topology for draw. 1193/// @param numIndices - Number of indices to read sequentially from index buffer. 1194/// @param indexOffset - Starting index into index buffer. 1195/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 1196/// @param numInstances - Number of instances to render. 1197/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1198void DrawIndexedInstance( 1199 HANDLE hContext, 1200 PRIMITIVE_TOPOLOGY topology, 1201 uint32_t numIndices, 1202 uint32_t indexOffset, 1203 int32_t baseVertex, 1204 uint32_t numInstances = 1, 1205 uint32_t startInstance = 0) 1206{ 1207 if (KNOB_TOSS_DRAW) 1208 { 1209 return; 1210 } 1211 1212 SWR_CONTEXT *pContext = GetContext(hContext); 1213 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1214 API_STATE* pState = &pDC->pState->state; 1215 1216 AR_API_BEGIN(APIDrawIndexed, pDC->drawId); 1217 AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance)); 1218 1219 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); 1220 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); 1221 uint32_t remainingIndices = numIndices; 1222 1223 uint32_t indexSize = 0; 1224 switch (pState->indexBuffer.format) 1225 { 1226 case R32_UINT: indexSize = sizeof(uint32_t); break; 1227 case R16_UINT: indexSize = sizeof(uint16_t); break; 1228 case R8_UINT: indexSize = sizeof(uint8_t); break; 1229 default: 1230 SWR_ASSERT(0); 1231 } 1232 1233 int draw = 0; 1234 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; 1235 pIB += (uint64_t)indexOffset * (uint64_t)indexSize; 1236 1237 pState->topology = topology; 1238 pState->forceFront = false; 1239 1240 // disable culling for points/lines 1241 uint32_t oldCullMode = pState->rastState.cullMode; 1242 if (topology == TOP_POINT_LIST) 1243 { 1244 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1245 pState->forceFront = true; 1246 } 1247 else if (topology == TOP_RECT_LIST) 1248 { 1249 pState->rastState.cullMode = SWR_CULLMODE_NONE; 1250 } 1251 1252 1253 while (remainingIndices) 1254 { 1255 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? 1256 remainingIndices : maxIndicesPerDraw; 1257 1258 // When breaking up draw, we need to obtain new draw context for each iteration. 1259 bool isSplitDraw = (draw > 0) ? true : false; 1260 1261 pDC = GetDrawContext(pContext, isSplitDraw); 1262 InitDraw(pDC, isSplitDraw); 1263 1264 pDC->FeWork.type = DRAW; 1265 pDC->FeWork.pfnWork = GetProcessDrawFunc( 1266 true, // IsIndexed 1267 pState->frontendState.bEnableCutIndex, 1268 pState->tsState.tsEnable, 1269 pState->gsState.gsEnable, 1270 pState->soState.soEnable, 1271 pDC->pState->pfnProcessPrims != nullptr); 1272 pDC->FeWork.desc.draw.pDC = pDC; 1273 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; 1274 pDC->FeWork.desc.draw.pIB = (int*)pIB; 1275 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; 1276 1277 pDC->FeWork.desc.draw.numInstances = numInstances; 1278 pDC->FeWork.desc.draw.startInstance = startInstance; 1279 pDC->FeWork.desc.draw.baseVertex = baseVertex; 1280 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; 1281 1282 pDC->cleanupState = (remainingIndices == numIndicesForDraw); 1283 1284 //enqueue DC 1285 QueueDraw(pContext); 1286 1287 pIB += maxIndicesPerDraw * indexSize; 1288 remainingIndices -= numIndicesForDraw; 1289 draw++; 1290 } 1291 1292 // Restore culling state 1293 pDC = GetDrawContext(pContext); 1294 pDC->pState->state.rastState.cullMode = oldCullMode; 1295 1296 1297 AR_API_END(APIDrawIndexed, numIndices * numInstances); 1298} 1299 1300 1301////////////////////////////////////////////////////////////////////////// 1302/// @brief DrawIndexed 1303/// @param hContext - Handle passed back from SwrCreateContext 1304/// @param topology - Specifies topology for draw. 1305/// @param numIndices - Number of indices to read sequentially from index buffer. 1306/// @param indexOffset - Starting index into index buffer. 1307/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 1308void SwrDrawIndexed( 1309 HANDLE hContext, 1310 PRIMITIVE_TOPOLOGY topology, 1311 uint32_t numIndices, 1312 uint32_t indexOffset, 1313 int32_t baseVertex 1314 ) 1315{ 1316 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); 1317} 1318 1319////////////////////////////////////////////////////////////////////////// 1320/// @brief SwrDrawIndexedInstanced 1321/// @param hContext - Handle passed back from SwrCreateContext 1322/// @param topology - Specifies topology for draw. 1323/// @param numIndices - Number of indices to read sequentially from index buffer. 1324/// @param numInstances - Number of instances to render. 1325/// @param indexOffset - Starting index into index buffer. 1326/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 1327/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) 1328void SwrDrawIndexedInstanced( 1329 HANDLE hContext, 1330 PRIMITIVE_TOPOLOGY topology, 1331 uint32_t numIndices, 1332 uint32_t numInstances, 1333 uint32_t indexOffset, 1334 int32_t baseVertex, 1335 uint32_t startInstance) 1336{ 1337 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); 1338} 1339 1340////////////////////////////////////////////////////////////////////////// 1341/// @brief SwrInvalidateTiles 1342/// @param hContext - Handle passed back from SwrCreateContext 1343/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. 1344/// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to 1345/// be hottile size-aligned. 1346void SWR_API SwrInvalidateTiles( 1347 HANDLE hContext, 1348 uint32_t attachmentMask, 1349 const SWR_RECT& invalidateRect) 1350{ 1351 if (KNOB_TOSS_DRAW) 1352 { 1353 return; 1354 } 1355 1356 SWR_CONTEXT *pContext = GetContext(hContext); 1357 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1358 1359 pDC->FeWork.type = DISCARDINVALIDATETILES; 1360 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; 1361 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; 1362 pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect; 1363 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; 1364 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; 1365 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; 1366 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; 1367 1368 //enqueue 1369 QueueDraw(pContext); 1370} 1371 1372////////////////////////////////////////////////////////////////////////// 1373/// @brief SwrDiscardRect 1374/// @param hContext - Handle passed back from SwrCreateContext 1375/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. 1376/// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be 1377/// discarded. 1378void SWR_API SwrDiscardRect( 1379 HANDLE hContext, 1380 uint32_t attachmentMask, 1381 const SWR_RECT& rect) 1382{ 1383 if (KNOB_TOSS_DRAW) 1384 { 1385 return; 1386 } 1387 1388 SWR_CONTEXT *pContext = GetContext(hContext); 1389 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1390 1391 // Queue a load to the hottile 1392 pDC->FeWork.type = DISCARDINVALIDATETILES; 1393 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; 1394 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; 1395 pDC->FeWork.desc.discardInvalidateTiles.rect = rect; 1396 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; 1397 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; 1398 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; 1399 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; 1400 1401 //enqueue 1402 QueueDraw(pContext); 1403} 1404 1405////////////////////////////////////////////////////////////////////////// 1406/// @brief SwrDispatch 1407/// @param hContext - Handle passed back from SwrCreateContext 1408/// @param threadGroupCountX - Number of thread groups dispatched in X direction 1409/// @param threadGroupCountY - Number of thread groups dispatched in Y direction 1410/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction 1411void SwrDispatch( 1412 HANDLE hContext, 1413 uint32_t threadGroupCountX, 1414 uint32_t threadGroupCountY, 1415 uint32_t threadGroupCountZ) 1416{ 1417 if (KNOB_TOSS_DRAW) 1418 { 1419 return; 1420 } 1421 1422 SWR_CONTEXT *pContext = GetContext(hContext); 1423 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1424 1425 AR_API_BEGIN(APIDispatch, pDC->drawId); 1426 AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ)); 1427 pDC->isCompute = true; // This is a compute context. 1428 1429 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); 1430 1431 pTaskData->threadGroupCountX = threadGroupCountX; 1432 pTaskData->threadGroupCountY = threadGroupCountY; 1433 pTaskData->threadGroupCountZ = threadGroupCountZ; 1434 1435 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; 1436 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; 1437 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; 1438 pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE); 1439 1440 QueueDispatch(pContext); 1441 AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ); 1442} 1443 1444// Deswizzles, converts and stores current contents of the hot tiles to surface 1445// described by pState 1446void SWR_API SwrStoreTiles( 1447 HANDLE hContext, 1448 uint32_t attachmentMask, 1449 SWR_TILE_STATE postStoreTileState, 1450 const SWR_RECT& storeRect) 1451{ 1452 if (KNOB_TOSS_DRAW) 1453 { 1454 return; 1455 } 1456 1457 SWR_CONTEXT *pContext = GetContext(hContext); 1458 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1459 1460 AR_API_BEGIN(APIStoreTiles, pDC->drawId); 1461 1462 pDC->FeWork.type = STORETILES; 1463 pDC->FeWork.pfnWork = ProcessStoreTiles; 1464 pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask; 1465 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; 1466 pDC->FeWork.desc.storeTiles.rect = storeRect; 1467 pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect; 1468 1469 //enqueue 1470 QueueDraw(pContext); 1471 1472 AR_API_END(APIStoreTiles, 1); 1473} 1474 1475////////////////////////////////////////////////////////////////////////// 1476/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil 1477/// @param hContext - Handle passed back from SwrCreateContext 1478/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear 1479/// @param renderTargetArrayIndex - the RT array index to clear 1480/// @param clearColor - color use for clearing render targets 1481/// @param z - depth value use for clearing depth buffer 1482/// @param stencil - stencil value used for clearing stencil buffer 1483/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers 1484void SWR_API SwrClearRenderTarget( 1485 HANDLE hContext, 1486 uint32_t attachmentMask, 1487 uint32_t renderTargetArrayIndex, 1488 const float clearColor[4], 1489 float z, 1490 uint8_t stencil, 1491 const SWR_RECT& clearRect) 1492{ 1493 if (KNOB_TOSS_DRAW) 1494 { 1495 return; 1496 } 1497 1498 SWR_CONTEXT *pContext = GetContext(hContext); 1499 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1500 1501 AR_API_BEGIN(APIClearRenderTarget, pDC->drawId); 1502 1503 pDC->FeWork.type = CLEAR; 1504 pDC->FeWork.pfnWork = ProcessClear; 1505 pDC->FeWork.desc.clear.rect = clearRect; 1506 pDC->FeWork.desc.clear.rect &= g_MaxScissorRect; 1507 pDC->FeWork.desc.clear.attachmentMask = attachmentMask; 1508 pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex; 1509 pDC->FeWork.desc.clear.clearDepth = z; 1510 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; 1511 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; 1512 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2]; 1513 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3]; 1514 pDC->FeWork.desc.clear.clearStencil = stencil; 1515 1516 // enqueue draw 1517 QueueDraw(pContext); 1518 1519 AR_API_END(APIClearRenderTarget, 1); 1520} 1521 1522////////////////////////////////////////////////////////////////////////// 1523/// @brief Returns a pointer to the private context state for the current 1524/// draw operation. This is used for external componets such as the 1525/// sampler. 1526/// SWR is responsible for the allocation of the private context state. 1527/// @param hContext - Handle passed back from SwrCreateContext 1528VOID* SwrGetPrivateContextState( 1529 HANDLE hContext) 1530{ 1531 SWR_CONTEXT* pContext = GetContext(hContext); 1532 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1533 DRAW_STATE* pState = pDC->pState; 1534 1535 if (pState->pPrivateState == nullptr) 1536 { 1537 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); 1538 } 1539 1540 return pState->pPrivateState; 1541} 1542 1543////////////////////////////////////////////////////////////////////////// 1544/// @brief Clients can use this to allocate memory for draw/dispatch 1545/// operations. The memory will automatically be freed once operation 1546/// has completed. Client can use this to allocate binding tables, 1547/// etc. needed for shader execution. 1548/// @param hContext - Handle passed back from SwrCreateContext 1549/// @param size - Size of allocation 1550/// @param align - Alignment needed for allocation. 1551VOID* SwrAllocDrawContextMemory( 1552 HANDLE hContext, 1553 uint32_t size, 1554 uint32_t align) 1555{ 1556 SWR_CONTEXT* pContext = GetContext(hContext); 1557 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1558 1559 return pDC->pState->pArena->AllocAligned(size, align); 1560} 1561 1562////////////////////////////////////////////////////////////////////////// 1563/// @brief Enables stats counting 1564/// @param hContext - Handle passed back from SwrCreateContext 1565/// @param enable - If true then counts are incremented. 1566void SwrEnableStatsFE( 1567 HANDLE hContext, 1568 bool enable) 1569{ 1570 SWR_CONTEXT *pContext = GetContext(hContext); 1571 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1572 1573 pDC->pState->state.enableStatsFE = enable; 1574} 1575 1576////////////////////////////////////////////////////////////////////////// 1577/// @brief Enables stats counting 1578/// @param hContext - Handle passed back from SwrCreateContext 1579/// @param enable - If true then counts are incremented. 1580void SwrEnableStatsBE( 1581 HANDLE hContext, 1582 bool enable) 1583{ 1584 SWR_CONTEXT *pContext = GetContext(hContext); 1585 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1586 1587 pDC->pState->state.enableStatsBE = enable; 1588} 1589 1590////////////////////////////////////////////////////////////////////////// 1591/// @brief Mark end of frame - used for performance profiling 1592/// @param hContext - Handle passed back from SwrCreateContext 1593void SWR_API SwrEndFrame( 1594 HANDLE hContext) 1595{ 1596 SWR_CONTEXT *pContext = GetContext(hContext); 1597 DRAW_CONTEXT* pDC = GetDrawContext(pContext); 1598 1599 RDTSC_ENDFRAME(); 1600 AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId)); 1601 1602 pContext->frameCount++; 1603} 1604 1605