1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file api.cpp
24  *
25  * @brief API implementation
26  *
27  ******************************************************************************/
28 
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33 
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45 #include "core/tileset.h"
46 
47 #include "common/os.h"
48 
// Largest scissor rectangle supported; computed scissors are clamped against it
// (see SetupMacroTileScissors).
static const SWR_RECT g_MaxScissorRect = {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y};

// Forward declaration: defined later in this file, called from SwrCreateContext.
void SetupDefaultState(SWR_CONTEXT* pContext);
52 
GetContext(HANDLE hContext)53 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
54 {
55     return (SWR_CONTEXT*)hContext;
56 }
57 
WakeAllThreads(SWR_CONTEXT * pContext)58 void WakeAllThreads(SWR_CONTEXT* pContext)
59 {
60     pContext->FifosNotEmpty.notify_all();
61 }
62 
//////////////////////////////////////////////////////////////////////////
/// @brief Create SWR Context.
///
/// Allocates and zero-fills the context, copies callbacks and configuration
/// from the creation info, builds the draw-context / draw-state rings and
/// their arenas, creates and starts the thread pool, allocates per-worker
/// scratch and stats storage, and installs the default API state.
///
/// @param pCreateInfo - pointer to creation info. Also used for output:
///        contextSaveSize is always written, and hArEventManager /
///        pBucketMgr are written when KNOB_ENABLE_AR / KNOB_ENABLE_RDTSC
///        are defined.
/// @return opaque handle to the new context (the SWR_CONTEXT pointer).
HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
{
    // Zero the raw storage first, then construct the context in place.
    // NOTE(review): the AlignedMalloc result is not checked for failure.
    void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
    memset(pContextMem, 0, sizeof(SWR_CONTEXT));
    SWR_CONTEXT* pContext = new (pContextMem) SWR_CONTEXT();

    pContext->privateStateSize = pCreateInfo->privateStateSize;

    // initialize callback functions
    pContext->pfnLoadTile                = pCreateInfo->pfnLoadTile;
    pContext->pfnStoreTile               = pCreateInfo->pfnStoreTile;
    pContext->pfnTranslateGfxptrForRead  = pCreateInfo->pfnTranslateGfxptrForRead;
    pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
    pContext->pfnMakeGfxPtr              = pCreateInfo->pfnMakeGfxPtr;
    pContext->pfnCreateMemoryContext     = pCreateInfo->pfnCreateMemoryContext;
    pContext->pfnDestroyMemoryContext    = pCreateInfo->pfnDestroyMemoryContext;
    pContext->pfnUpdateSoWriteOffset     = pCreateInfo->pfnUpdateSoWriteOffset;
    pContext->pfnUpdateStats             = pCreateInfo->pfnUpdateStats;
    pContext->pfnUpdateStatsFE           = pCreateInfo->pfnUpdateStatsFE;
    pContext->pfnUpdateStreamOut         = pCreateInfo->pfnUpdateStreamOut;


    pContext->hExternalMemory = pCreateInfo->hExternalMemory;

    // Ring depth: the knob default, unless the caller supplied a non-zero override.
    pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
    if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
    {
        pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
    }

    pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
    pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);

    // Raw storage for one MacroTileMgr and one DispatchQueue per ring entry;
    // the objects themselves are constructed in place in the loop below.
    pContext->pMacroTileManagerArray =
        (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
    pContext->pDispatchQueueArray =
        (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);

    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
    {
        // Each draw context gets its own arena; its tile manager is built on that arena.
        pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();

        // Each draw state gets a separate arena of its own.
        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
    }

    // Thread configuration: caller-provided, otherwise taken from the knobs.
    if (pCreateInfo->pThreadInfo)
    {
        pContext->threadInfo = *pCreateInfo->pThreadInfo;
    }
    else
    {
        pContext->threadInfo.MAX_WORKER_THREADS      = KNOB_MAX_WORKER_THREADS;
        pContext->threadInfo.BASE_NUMA_NODE          = KNOB_BASE_NUMA_NODE;
        pContext->threadInfo.BASE_CORE               = KNOB_BASE_CORE;
        pContext->threadInfo.BASE_THREAD             = KNOB_BASE_THREAD;
        pContext->threadInfo.MAX_NUMA_NODES          = KNOB_MAX_NUMA_NODES;
        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
        pContext->threadInfo.MAX_THREADS_PER_CORE    = KNOB_MAX_THREADS_PER_CORE;
        pContext->threadInfo.SINGLE_THREADED         = KNOB_SINGLE_THREADED;
    }

    // API thread configuration: caller-provided, otherwise bind API thread 0
    // with one reserved thread and one API thread per core.
    if (pCreateInfo->pApiThreadInfo)
    {
        pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
    }
    else
    {
        pContext->apiThreadInfo.bindAPIThread0        = true;
        pContext->apiThreadInfo.numAPIReservedThreads = 1;
        pContext->apiThreadInfo.numAPIThreadsPerCore  = 1;
    }

    // Optional; left as zero-initialized when the caller provides none.
    if (pCreateInfo->pWorkerPrivateState)
    {
        pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState;
    }

    // Re-zero and re-construct the wait lock and condition variable in place.
    // NOTE(review): both were already constructed by the SWR_CONTEXT placement-new
    // above, so this constructs them a second time without destroying them first.
    memset((void*)&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
    memset((void*)&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
    new (&pContext->WaitLock) std::mutex();
    new (&pContext->FifosNotEmpty) std::condition_variable();

    CreateThreadPool(pContext, &pContext->threadPool);

    if (pContext->apiThreadInfo.bindAPIThread0)
    {
        BindApiThread(pContext, 0);
    }

    // QueueWork passes this tile set to WorkOnFifoBE when running single threaded.
    if (pContext->threadInfo.SINGLE_THREADED)
    {
        pContext->pSingleThreadLockedTiles = new TileSet();
    }

    // Per-worker arrays, sized by NumWorkerThreads.
    // NOTE(review): NumWorkerThreads is presumably set by CreateThreadPool above -- confirm.
    pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
    pContext->pStats =
        (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);

#if defined(KNOB_ENABLE_AR)
    // Setup ArchRast thread contexts which includes +1 for API thread.
    // The API thread's context occupies the last slot (index NumWorkerThreads).
    pContext->pArContext = new HANDLE[pContext->NumWorkerThreads + 1];
    pContext->pArContext[pContext->NumWorkerThreads] =
        ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
#endif

#if defined(KNOB_ENABLE_RDTSC)
    pContext->pBucketMgr = new BucketManager(pCreateInfo->contextName);
    RDTSC_RESET(pContext->pBucketMgr);
    RDTSC_INIT(pContext->pBucketMgr, 0);
#endif

    // Allocate scratch space for workers.
    ///@note We could lazily allocate this but its rather small amount of memory.
    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    {
#if defined(_WIN32)
        // Allocate on the worker's NUMA node when thread data exists, else node 0.
        uint32_t numaNode =
            pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0;
        pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(),
                                                              nullptr,
                                                              KNOB_WORKER_SCRATCH_SPACE_SIZE,
                                                              MEM_RESERVE | MEM_COMMIT,
                                                              PAGE_READWRITE,
                                                              numaNode);
#else
        pContext->ppScratch[i] =
            (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4);
#endif

#if defined(KNOB_ENABLE_AR)
        // Initialize worker thread context for ArchRast.
        pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);

        // NOTE(review): unlike the _WIN32 path above, pThreadData is dereferenced here
        // without a null check -- confirm it is always valid when NumWorkerThreads > 0.
        SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
        pWorkerData->hArContext = pContext->pArContext[i];
#endif


    }

#if defined(KNOB_ENABLE_AR)
    // cache the API thread event manager, for use with sim layer
    pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads];
#endif

    // State setup AFTER context is fully initialized
    SetupDefaultState(pContext);

    // initialize hot tile manager
    pContext->pHotTileMgr = new HotTileMgr();

    // pass pointer to bucket manager back to caller
#ifdef KNOB_ENABLE_RDTSC
    pCreateInfo->pBucketMgr = pContext->pBucketMgr;
#endif

    // Size a caller needs for SwrSaveState / SwrRestoreState buffers.
    pCreateInfo->contextSaveSize = sizeof(API_STATE);

    StartThreadPool(pContext, &pContext->threadPool);

    return (HANDLE)pContext;
}
230 
CopyState(DRAW_STATE & dst,const DRAW_STATE & src)231 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
232 {
233     memcpy((void*)&dst.state, (void*)&src.state, sizeof(API_STATE));
234 }
235 
//////////////////////////////////////////////////////////////////////////
/// @brief Submit the current draw context to the DC ring and kick off its
///        processing, then clear pCurDrawContext so the next state call
///        obtains a fresh draw context.
///
/// In single-threaded mode the work is executed and completed inline on the
/// calling thread; otherwise all worker threads are woken.
///
/// @tparam IsDraw - true for draw work (sets up the macro tile manager and
///         counts the draw as outstanding FE work), false for compute dispatch.
/// @param pContext - context whose pCurDrawContext is queued; it must be non-null.
template <bool IsDraw>
void QueueWork(SWR_CONTEXT* pContext)
{
    DRAW_CONTEXT* pDC     = pContext->pCurDrawContext;
    uint32_t      dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;

    if (IsDraw)
    {
        // Draws use the tile manager that belongs to this ring slot.
        pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
        pDC->pTileMgr->initialize();
    }

    // Each worker thread looks at a DC for both FE and BE work at different times, so the
    // threadsDone counter starts at the number of FE threads plus the number of BE threads.
    // When the threadsDone counter has reached 0 then all workers have moved past this DC.
    // (i.e. Each worker has checked this DC for both FE and BE work and then moved on if all
    // work is done.)
    pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;

    if (IsDraw)
    {
        InterlockedIncrement(&pContext->drawsOutstandingFE);
    }

    // All of the DC setup above is written before the barrier; the enqueue follows it.
    _ReadWriteBarrier();
    {
        std::unique_lock<std::mutex> lock(pContext->WaitLock);
        pContext->dcRing.Enqueue();
    }

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        // Run the work inline on the calling thread.
        uint32_t mxcsr = SetOptimalVectorCSR();

        if (IsDraw)
        {
            uint32_t curDraw[2] = {pContext->pCurDrawContext->drawId,
                                   pContext->pCurDrawContext->drawId};
            WorkOnFifoFE(pContext, 0, curDraw[0]);
            WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
        }
        else
        {
            uint32_t curDispatch = pContext->pCurDrawContext->drawId;
            WorkOnCompute(pContext, 0, curDispatch);
        }

        // Dequeue the work here, if not already done, since we're single threaded (i.e. no
        // workers).
        while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0)
        {
        }

        // restore csr
        RestoreVectorCSR(mxcsr);
    }
    else
    {
        RDTSC_BEGIN(pContext->pBucketMgr, APIDrawWakeAllThreads, pDC->drawId);
        WakeAllThreads(pContext);
        RDTSC_END(pContext->pBucketMgr, APIDrawWakeAllThreads, 1);
    }

    // Set current draw context to NULL so that next state call forces a new draw context to be
    // created and populated.
    pContext->pPrevDrawContext = pContext->pCurDrawContext;
    pContext->pCurDrawContext  = nullptr;
}
303 
QueueDraw(SWR_CONTEXT * pContext)304 INLINE void QueueDraw(SWR_CONTEXT* pContext)
305 {
306     QueueWork<true>(pContext);
307 }
308 
QueueDispatch(SWR_CONTEXT * pContext)309 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
310 {
311     QueueWork<false>(pContext);
312 }
313 
//////////////////////////////////////////////////////////////////////////
/// @brief Return the context's current draw context, obtaining and
///        initializing a new one from the DC ring if there is none.
///
/// Obtaining a new DC spins until the ring has a free entry, occasionally
/// frees old arena blocks, binds a draw state to the DC and resets the DC's
/// per-draw fields.
///
/// @param pContext - context to fetch the draw context for.
/// @param isSplitDraw - when true and a previous DC exists, the new DC shares
///        the previous DC's state pointer instead of receiving a copy, and
///        the DS ring index is not advanced. Only valid when a new DC is
///        actually obtained.
/// @return the current draw context (pContext->pCurDrawContext).
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT* pContext, bool isSplitDraw = false)
{
    RDTSC_BEGIN(pContext->pBucketMgr, APIGetDrawContext, 0);
    // If current draw context is null then need to obtain a new draw context to use from ring.
    if (pContext->pCurDrawContext == nullptr)
    {
        // Need to wait for a free entry.
        while (pContext->dcRing.IsFull())
        {
            _mm_pause();
        }

        uint64_t curDraw = pContext->dcRing.GetHead();
        uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;

        // Housekeeping runs once more than 2 frames or more than 0x10000 draws have passed
        // since the last check.
        if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
            (curDraw - pContext->lastDrawChecked) > 0x10000)
        {
            // Take this opportunity to clean-up old arena allocations
            pContext->cachingArenaAllocator.FreeOldBlocks();

            pContext->lastFrameChecked = pContext->frameCount;
            pContext->lastDrawChecked  = curDraw;
        }

        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
        pContext->pCurDrawContext     = pCurDrawContext;

        // Assign next available entry in DS ring to this DC.
        uint32_t dsIndex        = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
        pCurDrawContext->pState = &pContext->dsRing[dsIndex];

        // Copy previous state to current state.
        if (pContext->pPrevDrawContext)
        {
            DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;

            // If we're splitting our draw then we can just use the same state from the previous
            // draw. In this case, we won't increment the DS ring index so the next non-split
            // draw can receive the state.
            if (isSplitDraw == false)
            {
                CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);

                // Should have been cleaned up previously
                SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);

                // Only the API_STATE is inherited; private state starts out unset.
                pCurDrawContext->pState->pPrivateState = nullptr;

                pContext->curStateId++; // Progress state ring index forward.
            }
            else
            {
                // If its a split draw then just copy the state pointer over
                // since its the same draw.
                pCurDrawContext->pState = pPrevDrawContext->pState;
                SWR_ASSERT(pPrevDrawContext->cleanupState == false);
            }
        }
        else
        {
            // No previous DC: nothing to inherit, just claim the DS ring entry.
            SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
            pContext->curStateId++; // Progress state ring index forward.
        }

        SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);

        // Reset dependency
        pCurDrawContext->dependent   = false;
        pCurDrawContext->dependentFE = false;

        pCurDrawContext->pContext  = pContext;
        pCurDrawContext->isCompute = false; // Dispatch has to set this to true.

        pCurDrawContext->doneFE                         = false;
        pCurDrawContext->FeLock                         = 0;
        pCurDrawContext->threadsDone                    = 0;
        pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;

        pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);

        // Assign unique drawId for this DC
        pCurDrawContext->drawId = pContext->dcRing.GetHead();

        pCurDrawContext->cleanupState = true;
    }
    else
    {
        SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
    }

    RDTSC_END(pContext->pBucketMgr, APIGetDrawContext, 0);
    return pContext->pCurDrawContext;
}
408 
GetDrawState(SWR_CONTEXT * pContext)409 API_STATE* GetDrawState(SWR_CONTEXT* pContext)
410 {
411     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
412     SWR_ASSERT(pDC->pState != nullptr);
413 
414     return &pDC->pState->state;
415 }
416 
SwrDestroyContext(HANDLE hContext)417 void SwrDestroyContext(HANDLE hContext)
418 {
419     SWR_CONTEXT*  pContext = GetContext(hContext);
420     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
421 
422     pDC->FeWork.type    = SHUTDOWN;
423     pDC->FeWork.pfnWork = ProcessShutdown;
424 
425     // enqueue
426     QueueDraw(pContext);
427 
428     DestroyThreadPool(pContext, &pContext->threadPool);
429 
430     // free the fifos
431     for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
432     {
433         AlignedFree(pContext->dcRing[i].dynState.pStats);
434         delete pContext->dcRing[i].pArena;
435         delete pContext->dsRing[i].pArena;
436         pContext->pMacroTileManagerArray[i].~MacroTileMgr();
437         pContext->pDispatchQueueArray[i].~DispatchQueue();
438     }
439 
440     AlignedFree(pContext->pDispatchQueueArray);
441     AlignedFree(pContext->pMacroTileManagerArray);
442 
443     // Free scratch space.
444     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
445     {
446 #if defined(_WIN32)
447         VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
448 #else
449         AlignedFree(pContext->ppScratch[i]);
450 #endif
451 
452 #if defined(KNOB_ENABLE_AR)
453         ArchRast::DestroyThreadContext(pContext->pArContext[i]);
454 #endif
455     }
456 
457 #if defined(KNOB_ENABLE_RDTSC)
458     delete pContext->pBucketMgr;
459 #endif
460 
461     delete[] pContext->ppScratch;
462     AlignedFree(pContext->pStats);
463 
464     delete pContext->pHotTileMgr;
465     delete pContext->pSingleThreadLockedTiles;
466 
467     pContext->~SWR_CONTEXT();
468     AlignedFree(GetContext(hContext));
469 }
470 
SwrBindApiThread(HANDLE hContext,uint32_t apiThreadId)471 void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
472 {
473     SWR_CONTEXT* pContext = GetContext(hContext);
474     BindApiThread(pContext, apiThreadId);
475 }
476 
SwrSaveState(HANDLE hContext,void * pOutputStateBlock,size_t memSize)477 void SWR_API SwrSaveState(HANDLE hContext, void* pOutputStateBlock, size_t memSize)
478 {
479     SWR_CONTEXT* pContext = GetContext(hContext);
480     auto         pSrc     = GetDrawState(pContext);
481     assert(pOutputStateBlock && memSize >= sizeof(*pSrc));
482 
483     memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
484 }
485 
SwrRestoreState(HANDLE hContext,const void * pStateBlock,size_t memSize)486 void SWR_API SwrRestoreState(HANDLE hContext, const void* pStateBlock, size_t memSize)
487 {
488     SWR_CONTEXT* pContext = GetContext(hContext);
489     auto         pDst     = GetDrawState(pContext);
490     assert(pStateBlock && memSize >= sizeof(*pDst));
491 
492     memcpy((void*)pDst, (void*)pStateBlock, sizeof(*pDst));
493 }
494 
SetupDefaultState(SWR_CONTEXT * pContext)495 void SetupDefaultState(SWR_CONTEXT* pContext)
496 {
497     API_STATE* pState = GetDrawState(pContext);
498 
499     pState->rastState.cullMode     = SWR_CULLMODE_NONE;
500     pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
501 
502     pState->depthBoundsState.depthBoundsTestEnable   = false;
503     pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
504     pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
505 }
506 
SwrSync(HANDLE hContext,PFN_CALLBACK_FUNC pfnFunc,uint64_t userData,uint64_t userData2,uint64_t userData3)507 void SWR_API SwrSync(HANDLE            hContext,
508                      PFN_CALLBACK_FUNC pfnFunc,
509                      uint64_t          userData,
510                      uint64_t          userData2,
511                      uint64_t          userData3)
512 {
513     SWR_ASSERT(pfnFunc != nullptr);
514 
515     SWR_CONTEXT*  pContext = GetContext(hContext);
516     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
517 
518     RDTSC_BEGIN(pContext->pBucketMgr, APISync, 0);
519 
520     pDC->FeWork.type    = SYNC;
521     pDC->FeWork.pfnWork = ProcessSync;
522 
523     // Setup callback function
524     pDC->retireCallback.pfnCallbackFunc = pfnFunc;
525     pDC->retireCallback.userData        = userData;
526     pDC->retireCallback.userData2       = userData2;
527     pDC->retireCallback.userData3       = userData3;
528 
529     AR_API_EVENT(SwrSyncEvent(pDC->drawId));
530 
531     // enqueue
532     QueueDraw(pContext);
533 
534     RDTSC_END(pContext->pBucketMgr, APISync, 1);
535 }
536 
SwrStallBE(HANDLE hContext)537 void SwrStallBE(HANDLE hContext)
538 {
539     SWR_CONTEXT*  pContext = GetContext(hContext);
540     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
541 
542     pDC->dependent = true;
543 }
544 
SwrWaitForIdle(HANDLE hContext)545 void SwrWaitForIdle(HANDLE hContext)
546 {
547     SWR_CONTEXT* pContext = GetContext(hContext);
548 
549     RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
550 
551     while (!pContext->dcRing.IsEmpty())
552     {
553         _mm_pause();
554     }
555 
556     RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
557 }
558 
SwrWaitForIdleFE(HANDLE hContext)559 void SwrWaitForIdleFE(HANDLE hContext)
560 {
561     SWR_CONTEXT* pContext = GetContext(hContext);
562 
563     RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
564 
565     while (pContext->drawsOutstandingFE > 0)
566     {
567         _mm_pause();
568     }
569 
570     RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
571 }
572 
SwrSetVertexBuffers(HANDLE hContext,uint32_t numBuffers,const SWR_VERTEX_BUFFER_STATE * pVertexBuffers)573 void SwrSetVertexBuffers(HANDLE                         hContext,
574                          uint32_t                       numBuffers,
575                          const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
576 {
577     API_STATE* pState = GetDrawState(GetContext(hContext));
578 
579     for (uint32_t i = 0; i < numBuffers; ++i)
580     {
581         const SWR_VERTEX_BUFFER_STATE* pVB = &pVertexBuffers[i];
582         pState->vertexBuffers[pVB->index]  = *pVB;
583     }
584 }
585 
SwrSetIndexBuffer(HANDLE hContext,const SWR_INDEX_BUFFER_STATE * pIndexBuffer)586 void SwrSetIndexBuffer(HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
587 {
588     API_STATE* pState = GetDrawState(GetContext(hContext));
589 
590     pState->indexBuffer = *pIndexBuffer;
591 }
592 
SwrSetFetchFunc(HANDLE hContext,PFN_FETCH_FUNC pfnFetchFunc)593 void SwrSetFetchFunc(HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc)
594 {
595     API_STATE* pState = GetDrawState(GetContext(hContext));
596 
597     pState->pfnFetchFunc = pfnFetchFunc;
598 }
599 
SwrSetSoFunc(HANDLE hContext,PFN_SO_FUNC pfnSoFunc,uint32_t streamIndex)600 void SwrSetSoFunc(HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex)
601 {
602     API_STATE* pState = GetDrawState(GetContext(hContext));
603 
604     SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
605 
606     pState->pfnSoFunc[streamIndex] = pfnSoFunc;
607 }
608 
SwrSetSoState(HANDLE hContext,SWR_STREAMOUT_STATE * pSoState)609 void SwrSetSoState(HANDLE hContext, SWR_STREAMOUT_STATE* pSoState)
610 {
611     API_STATE* pState = GetDrawState(GetContext(hContext));
612 
613     pState->soState = *pSoState;
614 }
615 
SwrSetSoBuffers(HANDLE hContext,SWR_STREAMOUT_BUFFER * pSoBuffer,uint32_t slot)616 void SwrSetSoBuffers(HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot)
617 {
618     API_STATE* pState = GetDrawState(GetContext(hContext));
619 
620     SWR_ASSERT((slot < MAX_SO_STREAMS), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
621 
622     // remember buffer status in case of future resume StreamOut
623     if ((pState->soBuffer[slot].pBuffer != 0) && (pSoBuffer->pBuffer == 0))
624 	pState->soPausedBuffer[slot] = pState->soBuffer[slot];
625 
626     // resume
627     if (pState->soPausedBuffer[slot].pBuffer == pSoBuffer->pBuffer)
628 	pState->soBuffer[slot] = pState->soPausedBuffer[slot];
629     else
630         pState->soBuffer[slot] = *pSoBuffer;
631 }
632 
SwrSetVertexFunc(HANDLE hContext,PFN_VERTEX_FUNC pfnVertexFunc)633 void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc)
634 {
635     API_STATE* pState = GetDrawState(GetContext(hContext));
636 
637     pState->pfnVertexFunc = pfnVertexFunc;
638 }
639 
SwrSetFrontendState(HANDLE hContext,SWR_FRONTEND_STATE * pFEState)640 void SwrSetFrontendState(HANDLE hContext, SWR_FRONTEND_STATE* pFEState)
641 {
642     API_STATE* pState     = GetDrawState(GetContext(hContext));
643     pState->frontendState = *pFEState;
644 }
645 
SwrSetGsState(HANDLE hContext,SWR_GS_STATE * pGSState)646 void SwrSetGsState(HANDLE hContext, SWR_GS_STATE* pGSState)
647 {
648     API_STATE* pState = GetDrawState(GetContext(hContext));
649     pState->gsState   = *pGSState;
650 }
651 
SwrSetGsFunc(HANDLE hContext,PFN_GS_FUNC pfnGsFunc)652 void SwrSetGsFunc(HANDLE hContext, PFN_GS_FUNC pfnGsFunc)
653 {
654     API_STATE* pState = GetDrawState(GetContext(hContext));
655     pState->pfnGsFunc = pfnGsFunc;
656 }
657 
SwrSetCsFunc(HANDLE hContext,PFN_CS_FUNC pfnCsFunc,uint32_t totalThreadsInGroup,uint32_t totalSpillFillSize,uint32_t scratchSpaceSizePerWarp,uint32_t numWarps)658 void SwrSetCsFunc(HANDLE      hContext,
659                   PFN_CS_FUNC pfnCsFunc,
660                   uint32_t    totalThreadsInGroup,
661                   uint32_t    totalSpillFillSize,
662                   uint32_t    scratchSpaceSizePerWarp,
663                   uint32_t    numWarps)
664 {
665     API_STATE* pState               = GetDrawState(GetContext(hContext));
666     pState->pfnCsFunc               = pfnCsFunc;
667     pState->totalThreadsInGroup     = totalThreadsInGroup;
668     pState->totalSpillFillSize      = totalSpillFillSize;
669     pState->scratchSpaceSizePerWarp = scratchSpaceSizePerWarp;
670     pState->scratchSpaceNumWarps    = numWarps;
671 }
672 
SwrSetTsState(HANDLE hContext,SWR_TS_STATE * pState)673 void SwrSetTsState(HANDLE hContext, SWR_TS_STATE* pState)
674 {
675     API_STATE* pApiState = GetDrawState(GetContext(hContext));
676     pApiState->tsState   = *pState;
677 }
678 
SwrSetHsFunc(HANDLE hContext,PFN_HS_FUNC pfnFunc)679 void SwrSetHsFunc(HANDLE hContext, PFN_HS_FUNC pfnFunc)
680 {
681     API_STATE* pApiState = GetDrawState(GetContext(hContext));
682     pApiState->pfnHsFunc = pfnFunc;
683 }
684 
SwrSetDsFunc(HANDLE hContext,PFN_DS_FUNC pfnFunc)685 void SwrSetDsFunc(HANDLE hContext, PFN_DS_FUNC pfnFunc)
686 {
687     API_STATE* pApiState = GetDrawState(GetContext(hContext));
688     pApiState->pfnDsFunc = pfnFunc;
689 }
690 
SwrSetDepthStencilState(HANDLE hContext,SWR_DEPTH_STENCIL_STATE * pDSState)691 void SwrSetDepthStencilState(HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pDSState)
692 {
693     API_STATE* pState = GetDrawState(GetContext(hContext));
694 
695     pState->depthStencilState = *pDSState;
696 }
697 
SwrSetBackendState(HANDLE hContext,SWR_BACKEND_STATE * pBEState)698 void SwrSetBackendState(HANDLE hContext, SWR_BACKEND_STATE* pBEState)
699 {
700     API_STATE* pState = GetDrawState(GetContext(hContext));
701 
702     pState->backendState = *pBEState;
703 }
704 
SwrSetDepthBoundsState(HANDLE hContext,SWR_DEPTH_BOUNDS_STATE * pDBState)705 void SwrSetDepthBoundsState(HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pDBState)
706 {
707     API_STATE* pState = GetDrawState(GetContext(hContext));
708 
709     pState->depthBoundsState = *pDBState;
710 }
711 
SwrSetPixelShaderState(HANDLE hContext,SWR_PS_STATE * pPSState)712 void SwrSetPixelShaderState(HANDLE hContext, SWR_PS_STATE* pPSState)
713 {
714     API_STATE* pState = GetDrawState(GetContext(hContext));
715     pState->psState   = *pPSState;
716 }
717 
SwrSetBlendState(HANDLE hContext,SWR_BLEND_STATE * pBlendState)718 void SwrSetBlendState(HANDLE hContext, SWR_BLEND_STATE* pBlendState)
719 {
720     API_STATE* pState = GetDrawState(GetContext(hContext));
721     memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
722 }
723 
SwrSetBlendFunc(HANDLE hContext,uint32_t renderTarget,PFN_BLEND_JIT_FUNC pfnBlendFunc)724 void SwrSetBlendFunc(HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc)
725 {
726     SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
727     API_STATE* pState                  = GetDrawState(GetContext(hContext));
728     pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
729 }
730 
731 // update guardband multipliers for the viewport
updateGuardbands(API_STATE * pState)732 void updateGuardbands(API_STATE* pState)
733 {
734     uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
735 
736     for (uint32_t i = 0; i < numGbs; ++i)
737     {
738         // guardband center is viewport center
739         pState->gbState.left[i]   = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
740         pState->gbState.right[i]  = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
741         pState->gbState.top[i]    = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
742         pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
743     }
744 }
745 
SwrSetRastState(HANDLE hContext,const SWR_RASTSTATE * pRastState)746 void SwrSetRastState(HANDLE hContext, const SWR_RASTSTATE* pRastState)
747 {
748     SWR_CONTEXT* pContext = GetContext(hContext);
749     API_STATE*   pState   = GetDrawState(pContext);
750 
751     memcpy((void*)&pState->rastState, (void*)pRastState, sizeof(SWR_RASTSTATE));
752 }
753 
SwrSetViewports(HANDLE hContext,uint32_t numViewports,const SWR_VIEWPORT * pViewports,const SWR_VIEWPORT_MATRICES * pMatrices)754 void SwrSetViewports(HANDLE                       hContext,
755                      uint32_t                     numViewports,
756                      const SWR_VIEWPORT*          pViewports,
757                      const SWR_VIEWPORT_MATRICES* pMatrices)
758 {
759     SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports.");
760 
761     SWR_CONTEXT* pContext = GetContext(hContext);
762     API_STATE*   pState   = GetDrawState(pContext);
763 
764     memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
765     // @todo Faster to copy portions of the SOA or just copy all of it?
766     memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
767 }
768 
SwrSetScissorRects(HANDLE hContext,uint32_t numScissors,const SWR_RECT * pScissors)769 void SwrSetScissorRects(HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors)
770 {
771     SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects.");
772 
773     API_STATE* pState = GetDrawState(GetContext(hContext));
774     memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
775 };
776 
SetupMacroTileScissors(DRAW_CONTEXT * pDC)777 void SetupMacroTileScissors(DRAW_CONTEXT* pDC)
778 {
779     API_STATE* pState = &pDC->pState->state;
780     uint32_t numScissors =
781         pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
782     pState->scissorsTileAligned = true;
783 
784     for (uint32_t index = 0; index < numScissors; ++index)
785     {
786         SWR_RECT& scissorInFixedPoint = pState->scissorsInFixedPoint[index];
787 
788         // Set up scissor dimensions based on scissor or viewport
789         if (pState->rastState.scissorEnable)
790         {
791             scissorInFixedPoint = pState->scissorRects[index];
792         }
793         else
794         {
795             // the vp width and height must be added to origin un-rounded then the result round to
796             // -inf. The cast to int works for rounding assuming all [left, right, top, bottom] are
797             // positive.
798             scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
799             scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
800             scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
801             scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
802         }
803 
804         // Clamp to max rect
805         scissorInFixedPoint &= g_MaxScissorRect;
806 
807         // Test for tile alignment
808         bool tileAligned;
809         tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
810         tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
811         tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
812         tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
813 
814         pState->scissorsTileAligned &= tileAligned;
815 
816         // Scale to fixed point
817         scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
818         scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
819         scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
820         scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
821 
822         // Make scissor inclusive
823         scissorInFixedPoint.xmax -= 1;
824         scissorInFixedPoint.ymax -= 1;
825     }
826 }
827 
828 
829 // templated backend function tables
830 
SetupPipeline(DRAW_CONTEXT * pDC)831 void SetupPipeline(DRAW_CONTEXT* pDC)
832 {
833     DRAW_STATE*          pState       = pDC->pState;
834     const SWR_RASTSTATE& rastState    = pState->state.rastState;
835     const SWR_PS_STATE&  psState      = pState->state.psState;
836     BACKEND_FUNCS&       backendFuncs = pState->backendFuncs;
837 
838     // setup backend
839     if (psState.pfnPixelShader == nullptr)
840     {
841         backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
842     }
843     else
844     {
845         const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
846         const bool     bMultisampleEnable =
847             ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
848         const uint32_t centroid =
849             ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
850         const uint32_t canEarlyZ =
851             (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
852         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
853 
854         // select backend function
855         switch (psState.shadingRate)
856         {
857         case SWR_SHADING_RATE_PIXEL:
858             if (bMultisampleEnable)
859             {
860                 // always need to generate I & J per sample for Z interpolation
861                 barycentricsMask =
862                     (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
863                 backendFuncs.pfnBackend =
864                     gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern]
865                                           [psState.inputCoverage][centroid][forcedSampleCount]
866                                           [canEarlyZ]
867                     ;
868             }
869             else
870             {
871                 // always need to generate I & J per pixel for Z interpolation
872                 barycentricsMask =
873                     (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
874                 backendFuncs.pfnBackend =
875                     gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
876             }
877             break;
878         case SWR_SHADING_RATE_SAMPLE:
879             SWR_ASSERT(rastState.bIsCenterPattern != true);
880             // always need to generate I & J per sample for Z interpolation
881             barycentricsMask =
882                 (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
883             backendFuncs.pfnBackend =
884                 gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]
885                                        [canEarlyZ];
886             break;
887         default:
888             SWR_ASSERT(0 && "Invalid shading rate");
889             break;
890         }
891     }
892 
893     SWR_ASSERT(backendFuncs.pfnBackend);
894 
895     PFN_PROCESS_PRIMS pfnBinner;
896 #if USE_SIMD16_FRONTEND
897     PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
898 #endif
899     switch (pState->state.topology)
900     {
901     case TOP_POINT_LIST:
902         pState->pfnProcessPrims = ClipPoints;
903         pfnBinner               = BinPoints;
904 #if USE_SIMD16_FRONTEND
905         pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
906         pfnBinner_simd16               = BinPoints_simd16;
907 #endif
908         break;
909     case TOP_LINE_LIST:
910     case TOP_LINE_STRIP:
911     case TOP_LINE_LOOP:
912     case TOP_LINE_LIST_ADJ:
913     case TOP_LISTSTRIP_ADJ:
914         pState->pfnProcessPrims = ClipLines;
915         pfnBinner               = BinLines;
916 #if USE_SIMD16_FRONTEND
917         pState->pfnProcessPrims_simd16 = ClipLines_simd16;
918         pfnBinner_simd16               = BinLines_simd16;
919 #endif
920         break;
921     default:
922         pState->pfnProcessPrims = ClipTriangles;
923         pfnBinner               = GetBinTrianglesFunc((rastState.conservativeRast > 0));
924 #if USE_SIMD16_FRONTEND
925         pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
926         pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
927 #endif
928         break;
929     };
930 
931 
932     // Disable clipper if viewport transform is disabled or if clipper is disabled
933     if (pState->state.frontendState.vpTransformDisable || !pState->state.rastState.clipEnable)
934     {
935         pState->pfnProcessPrims = pfnBinner;
936 #if USE_SIMD16_FRONTEND
937         pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
938 #endif
939     }
940 
941     // Disable rasterizer and backend if no pixel, no depth/stencil, and no attributes
942     if ((pState->state.psState.pfnPixelShader == nullptr) &&
943         (pState->state.depthStencilState.depthTestEnable == FALSE) &&
944         (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
945         (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
946         (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
947         (pState->state.backendState.numAttributes == 0))
948     {
949         pState->pfnProcessPrims = nullptr;
950 #if USE_SIMD16_FRONTEND
951         pState->pfnProcessPrims_simd16 = nullptr;
952 #endif
953     }
954 
955     if (pState->state.soState.rasterizerDisable == true)
956     {
957         pState->pfnProcessPrims = nullptr;
958 #if USE_SIMD16_FRONTEND
959         pState->pfnProcessPrims_simd16 = nullptr;
960 #endif
961     }
962 
963 
964     // set up the frontend attribute count
965     pState->state.feNumAttributes         = 0;
966     const SWR_BACKEND_STATE& backendState = pState->state.backendState;
967     if (backendState.swizzleEnable)
968     {
969         // attribute swizzling is enabled, iterate over the map and record the max attribute used
970         for (uint32_t i = 0; i < backendState.numAttributes; ++i)
971         {
972             pState->state.feNumAttributes =
973                 std::max(pState->state.feNumAttributes,
974                          (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
975         }
976     }
977     else
978     {
979         pState->state.feNumAttributes = pState->state.backendState.numAttributes;
980     }
981 
982     if (pState->state.soState.soEnable)
983     {
984         uint64_t streamMasks = 0;
985         for (uint32_t i = 0; i < 4; ++i)
986         {
987             streamMasks |= pState->state.soState.streamMasks[i];
988         }
989 
990         unsigned long maxAttrib;
991         if (_BitScanReverse64(&maxAttrib, streamMasks))
992         {
993             pState->state.feNumAttributes =
994                 std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
995         }
996     }
997 
998     // complicated logic to test for cases where we don't need backing hottile memory for a draw
999     // have to check for the special case where depth/stencil test is enabled but depthwrite is
1000     // disabled.
1001     pState->state.depthHottileEnable =
1002         ((!(pState->state.depthStencilState.depthTestEnable &&
1003             !pState->state.depthStencilState.depthWriteEnable &&
1004             !pState->state.depthBoundsState.depthBoundsTestEnable &&
1005             pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
1006          (pState->state.depthStencilState.depthTestEnable ||
1007           pState->state.depthStencilState.depthWriteEnable ||
1008           pState->state.depthBoundsState.depthBoundsTestEnable))
1009             ? true
1010             : false;
1011 
1012     pState->state.stencilHottileEnable =
1013         (((!(pState->state.depthStencilState.stencilTestEnable &&
1014              !pState->state.depthStencilState.stencilWriteEnable &&
1015              pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
1016           // for stencil we have to check the double sided state as well
1017           (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
1018              !pState->state.depthStencilState.stencilWriteEnable &&
1019              pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
1020          (pState->state.depthStencilState.stencilTestEnable ||
1021           pState->state.depthStencilState.stencilWriteEnable))
1022             ? true
1023             : false;
1024 
1025     uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
1026 
1027     // Disable hottile for surfaces with no writes
1028     if (psState.pfnPixelShader != nullptr)
1029     {
1030         unsigned long rt;
1031         uint32_t rtMask = pState->state.psState.renderTargetMask;
1032         while (_BitScanForward(&rt, rtMask))
1033         {
1034             rtMask &= ~(1 << rt);
1035 
1036             if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
1037                 pState->state.blendState.renderTarget[rt].writeDisableRed &&
1038                 pState->state.blendState.renderTarget[rt].writeDisableGreen &&
1039                 pState->state.blendState.renderTarget[rt].writeDisableBlue)
1040             {
1041                 hotTileEnable &= ~(1 << rt);
1042             }
1043         }
1044     }
1045 
1046     pState->state.colorHottileEnable = hotTileEnable;
1047 
1048     // Setup depth quantization function
1049     if (pState->state.depthHottileEnable)
1050     {
1051         switch (pState->state.rastState.depthFormat)
1052         {
1053         case R32_FLOAT_X8X24_TYPELESS:
1054             pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>;
1055             break;
1056         case R32_FLOAT:
1057             pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1058             break;
1059         case R24_UNORM_X8_TYPELESS:
1060             pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>;
1061             break;
1062         case R16_UNORM:
1063             pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>;
1064             break;
1065         default:
1066             SWR_INVALID("Unsupported depth format for depth quantiztion.");
1067             pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1068         }
1069     }
1070     else
1071     {
1072         // set up pass-through quantize if depth isn't enabled
1073         pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1074     }
1075 
1076     // Generate guardbands
1077     updateGuardbands(&pState->state);
1078 }
1079 
1080 //////////////////////////////////////////////////////////////////////////
1081 /// @brief InitDraw
1082 /// @param pDC - Draw context to initialize for this draw.
InitDraw(DRAW_CONTEXT * pDC,bool isSplitDraw)1083 void InitDraw(DRAW_CONTEXT* pDC, bool isSplitDraw)
1084 {
1085     // We don't need to re-setup the scissors/pipeline state again for split draw.
1086     if (isSplitDraw == false)
1087     {
1088         SetupMacroTileScissors(pDC);
1089         SetupPipeline(pDC);
1090     }
1091 
1092 }
1093 
1094 //////////////////////////////////////////////////////////////////////////
1095 /// @brief We can split the draw for certain topologies for better performance.
1096 /// @param totalVerts - Total vertices for draw
1097 /// @param topology - Topology used for draw
MaxVertsPerDraw(DRAW_CONTEXT * pDC,uint32_t totalVerts,PRIMITIVE_TOPOLOGY topology)1098 uint32_t MaxVertsPerDraw(DRAW_CONTEXT* pDC, uint32_t totalVerts, PRIMITIVE_TOPOLOGY topology)
1099 {
1100     API_STATE& state = pDC->pState->state;
1101 
1102     // We can not split draws that have streamout enabled because there is no practical way
1103     // to support multiple threads generating SO data for a single set of buffers.
1104     if (state.soState.soEnable)
1105     {
1106         return totalVerts;
1107     }
1108 
1109     // The Primitive Assembly code can only handle 1 RECT at a time. Specified with only 3 verts.
1110     if (topology == TOP_RECT_LIST)
1111     {
1112         return 3;
1113     }
1114 
1115     // Is split drawing disabled?
1116     if (KNOB_DISABLE_SPLIT_DRAW)
1117     {
1118         return totalVerts;
1119     }
1120 
1121     uint32_t vertsPerDraw = totalVerts;
1122 
1123     switch (topology)
1124     {
1125     case TOP_POINT_LIST:
1126     case TOP_TRIANGLE_LIST:
1127         vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
1128         break;
1129 
1130     case TOP_PATCHLIST_1:
1131     case TOP_PATCHLIST_2:
1132     case TOP_PATCHLIST_3:
1133     case TOP_PATCHLIST_4:
1134     case TOP_PATCHLIST_5:
1135     case TOP_PATCHLIST_6:
1136     case TOP_PATCHLIST_7:
1137     case TOP_PATCHLIST_8:
1138     case TOP_PATCHLIST_9:
1139     case TOP_PATCHLIST_10:
1140     case TOP_PATCHLIST_11:
1141     case TOP_PATCHLIST_12:
1142     case TOP_PATCHLIST_13:
1143     case TOP_PATCHLIST_14:
1144     case TOP_PATCHLIST_15:
1145     case TOP_PATCHLIST_16:
1146     case TOP_PATCHLIST_17:
1147     case TOP_PATCHLIST_18:
1148     case TOP_PATCHLIST_19:
1149     case TOP_PATCHLIST_20:
1150     case TOP_PATCHLIST_21:
1151     case TOP_PATCHLIST_22:
1152     case TOP_PATCHLIST_23:
1153     case TOP_PATCHLIST_24:
1154     case TOP_PATCHLIST_25:
1155     case TOP_PATCHLIST_26:
1156     case TOP_PATCHLIST_27:
1157     case TOP_PATCHLIST_28:
1158     case TOP_PATCHLIST_29:
1159     case TOP_PATCHLIST_30:
1160     case TOP_PATCHLIST_31:
1161     case TOP_PATCHLIST_32:
1162         if (pDC->pState->state.tsState.tsEnable)
1163         {
1164             uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1165             vertsPerDraw          = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1166         }
1167         break;
1168     default:
1169         // We are not splitting up draws for other topologies.
1170         break;
1171     }
1172 
1173     return vertsPerDraw;
1174 }
1175 
1176 //////////////////////////////////////////////////////////////////////////
1177 /// @brief DrawInstanced
1178 /// @param hContext - Handle passed back from SwrCreateContext
1179 /// @param topology - Specifies topology for draw.
1180 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1181 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1182 /// @param numInstances - How many instances to render.
1183 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1184 /// (instanced data)
DrawInstanced(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numVertices,uint32_t startVertex,uint32_t numInstances=1,uint32_t startInstance=0)1185 void DrawInstanced(HANDLE             hContext,
1186                    PRIMITIVE_TOPOLOGY topology,
1187                    uint32_t           numVertices,
1188                    uint32_t           startVertex,
1189                    uint32_t           numInstances  = 1,
1190                    uint32_t           startInstance = 0)
1191 {
1192     if (KNOB_TOSS_DRAW)
1193     {
1194         return;
1195     }
1196 
1197     SWR_CONTEXT*  pContext = GetContext(hContext);
1198     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1199 
1200     RDTSC_BEGIN(pContext->pBucketMgr, APIDraw, pDC->drawId);
1201 
1202     uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1203     uint32_t primsPerDraw    = GetNumPrims(topology, maxVertsPerDraw);
1204     uint32_t remainingVerts  = numVertices;
1205 
1206     API_STATE* pState  = &pDC->pState->state;
1207     pState->topology   = topology;
1208     pState->forceFront = false;
1209 
1210     // disable culling for points/lines
1211     uint32_t oldCullMode = pState->rastState.cullMode;
1212     if (topology == TOP_POINT_LIST)
1213     {
1214         pState->rastState.cullMode = SWR_CULLMODE_NONE;
1215         pState->forceFront         = true;
1216     }
1217     else if (topology == TOP_RECT_LIST)
1218     {
1219         pState->rastState.cullMode = SWR_CULLMODE_NONE;
1220     }
1221 
1222     int draw = 0;
1223     while (remainingVerts)
1224     {
1225         uint32_t numVertsForDraw =
1226             (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw;
1227 
1228         bool          isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
1229         DRAW_CONTEXT* pDC         = GetDrawContext(pContext, isSplitDraw);
1230         InitDraw(pDC, isSplitDraw);
1231 
1232         pDC->FeWork.type                    = DRAW;
1233         pDC->FeWork.pfnWork                 = GetProcessDrawFunc(false, // IsIndexed
1234                                                  false, // bEnableCutIndex
1235                                                  pState->tsState.tsEnable,
1236                                                  pState->gsState.gsEnable,
1237                                                  pState->soState.soEnable,
1238                                                  pDC->pState->pfnProcessPrims != nullptr);
1239         pDC->FeWork.desc.draw.numVerts      = numVertsForDraw;
1240         pDC->FeWork.desc.draw.startVertex   = startVertex;
1241         pDC->FeWork.desc.draw.numInstances  = numInstances;
1242         pDC->FeWork.desc.draw.startInstance = startInstance;
1243         pDC->FeWork.desc.draw.startPrimID   = draw * primsPerDraw;
1244         pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1245 
1246         pDC->cleanupState = (remainingVerts == numVertsForDraw);
1247 
1248         // enqueue DC
1249         QueueDraw(pContext);
1250 
1251         AR_API_EVENT(DrawInstancedEvent(pDC->drawId,
1252                                         topology,
1253                                         numVertsForDraw,
1254                                         startVertex,
1255                                         numInstances,
1256                                         startInstance,
1257                                         pState->tsState.tsEnable,
1258                                         pState->gsState.gsEnable,
1259                                         pState->soState.soEnable,
1260                                         pState->gsState.outputTopology,
1261                                         draw));
1262 
1263         remainingVerts -= numVertsForDraw;
1264         draw++;
1265     }
1266 
1267     // restore culling state
1268     pDC                                   = GetDrawContext(pContext);
1269     pDC->pState->state.rastState.cullMode = oldCullMode;
1270 
1271     RDTSC_END(pContext->pBucketMgr, APIDraw, numVertices * numInstances);
1272 }
1273 
1274 //////////////////////////////////////////////////////////////////////////
1275 /// @brief SwrDraw
1276 /// @param hContext - Handle passed back from SwrCreateContext
1277 /// @param topology - Specifies topology for draw.
1278 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1279 /// @param primCount - Number of vertices.
SwrDraw(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t startVertex,uint32_t numVertices)1280 void SwrDraw(HANDLE             hContext,
1281              PRIMITIVE_TOPOLOGY topology,
1282              uint32_t           startVertex,
1283              uint32_t           numVertices)
1284 {
1285     DrawInstanced(hContext, topology, numVertices, startVertex);
1286 }
1287 
1288 //////////////////////////////////////////////////////////////////////////
1289 /// @brief SwrDrawInstanced
1290 /// @param hContext - Handle passed back from SwrCreateContext
1291 /// @param topology - Specifies topology for draw.
1292 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1293 /// @param numInstances - How many instances to render.
1294 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1295 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1296 /// (instanced data)
SwrDrawInstanced(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numVertsPerInstance,uint32_t numInstances,uint32_t startVertex,uint32_t startInstance)1297 void SwrDrawInstanced(HANDLE             hContext,
1298                       PRIMITIVE_TOPOLOGY topology,
1299                       uint32_t           numVertsPerInstance,
1300                       uint32_t           numInstances,
1301                       uint32_t           startVertex,
1302                       uint32_t           startInstance)
1303 {
1304     DrawInstanced(
1305         hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1306 }
1307 
1308 //////////////////////////////////////////////////////////////////////////
1309 /// @brief DrawIndexedInstanced
1310 /// @param hContext - Handle passed back from SwrCreateContext
1311 /// @param topology - Specifies topology for draw.
1312 /// @param numIndices - Number of indices to read sequentially from index buffer.
1313 /// @param indexOffset - Starting index into index buffer.
1314 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1315 /// @param numInstances - Number of instances to render.
1316 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1317 /// (instanced data)
DrawIndexedInstance(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numIndices,uint32_t indexOffset,int32_t baseVertex,uint32_t numInstances=1,uint32_t startInstance=0)1318 void DrawIndexedInstance(HANDLE             hContext,
1319                          PRIMITIVE_TOPOLOGY topology,
1320                          uint32_t           numIndices,
1321                          uint32_t           indexOffset,
1322                          int32_t            baseVertex,
1323                          uint32_t           numInstances  = 1,
1324                          uint32_t           startInstance = 0)
1325 {
1326     if (KNOB_TOSS_DRAW)
1327     {
1328         return;
1329     }
1330 
1331     SWR_CONTEXT*  pContext = GetContext(hContext);
1332     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1333     API_STATE*    pState   = &pDC->pState->state;
1334 
1335     RDTSC_BEGIN(pContext->pBucketMgr, APIDrawIndexed, pDC->drawId);
1336 
1337     uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1338     uint32_t primsPerDraw      = GetNumPrims(topology, maxIndicesPerDraw);
1339     uint32_t remainingIndices  = numIndices;
1340 
1341     uint32_t indexSize = 0;
1342     switch (pState->indexBuffer.format)
1343     {
1344     case R32_UINT:
1345         indexSize = sizeof(uint32_t);
1346         break;
1347     case R16_UINT:
1348         indexSize = sizeof(uint16_t);
1349         break;
1350     case R8_UINT:
1351         indexSize = sizeof(uint8_t);
1352         break;
1353     default:
1354         SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
1355     }
1356 
1357     int      draw = 0;
1358     gfxptr_t xpIB = pState->indexBuffer.xpIndices;
1359     xpIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1360 
1361     pState->topology   = topology;
1362     pState->forceFront = false;
1363 
1364     // disable culling for points/lines
1365     uint32_t oldCullMode = pState->rastState.cullMode;
1366     if (topology == TOP_POINT_LIST)
1367     {
1368         pState->rastState.cullMode = SWR_CULLMODE_NONE;
1369         pState->forceFront         = true;
1370     }
1371     else if (topology == TOP_RECT_LIST)
1372     {
1373         pState->rastState.cullMode = SWR_CULLMODE_NONE;
1374     }
1375 
1376     while (remainingIndices)
1377     {
1378         uint32_t numIndicesForDraw =
1379             (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw;
1380 
1381         // When breaking up draw, we need to obtain new draw context for each iteration.
1382         bool isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
1383 
1384         pDC = GetDrawContext(pContext, isSplitDraw);
1385         InitDraw(pDC, isSplitDraw);
1386 
1387         pDC->FeWork.type                 = DRAW;
1388         pDC->FeWork.pfnWork              = GetProcessDrawFunc(true, // IsIndexed
1389                                                  pState->frontendState.bEnableCutIndex,
1390                                                  pState->tsState.tsEnable,
1391                                                  pState->gsState.gsEnable,
1392                                                  pState->soState.soEnable,
1393                                                  pDC->pState->pfnProcessPrims != nullptr);
1394         pDC->FeWork.desc.draw.pDC        = pDC;
1395         pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1396         pDC->FeWork.desc.draw.xpIB       = xpIB;
1397         pDC->FeWork.desc.draw.type       = pDC->pState->state.indexBuffer.format;
1398 
1399         pDC->FeWork.desc.draw.numInstances  = numInstances;
1400         pDC->FeWork.desc.draw.startInstance = startInstance;
1401         pDC->FeWork.desc.draw.baseVertex    = baseVertex;
1402         pDC->FeWork.desc.draw.startPrimID   = draw * primsPerDraw;
1403 
1404         pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1405 
1406         // enqueue DC
1407         QueueDraw(pContext);
1408 
1409         AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId,
1410                                                topology,
1411                                                numIndicesForDraw,
1412                                                indexOffset,
1413                                                baseVertex,
1414                                                numInstances,
1415                                                startInstance,
1416                                                pState->tsState.tsEnable,
1417                                                pState->gsState.gsEnable,
1418                                                pState->soState.soEnable,
1419                                                pState->gsState.outputTopology,
1420                                                draw));
1421 
1422         xpIB += maxIndicesPerDraw * indexSize;
1423         remainingIndices -= numIndicesForDraw;
1424         draw++;
1425     }
1426 
1427     // Restore culling state
1428     pDC                                   = GetDrawContext(pContext);
1429     pDC->pState->state.rastState.cullMode = oldCullMode;
1430 
1431     RDTSC_END(pContext->pBucketMgr, APIDrawIndexed, numIndices * numInstances);
1432 }
1433 
1434 //////////////////////////////////////////////////////////////////////////
1435 /// @brief DrawIndexed
1436 /// @param hContext - Handle passed back from SwrCreateContext
1437 /// @param topology - Specifies topology for draw.
1438 /// @param numIndices - Number of indices to read sequentially from index buffer.
1439 /// @param indexOffset - Starting index into index buffer.
1440 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
SwrDrawIndexed(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numIndices,uint32_t indexOffset,int32_t baseVertex)1441 void SwrDrawIndexed(HANDLE             hContext,
1442                     PRIMITIVE_TOPOLOGY topology,
1443                     uint32_t           numIndices,
1444                     uint32_t           indexOffset,
1445                     int32_t            baseVertex)
1446 {
1447     DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1448 }
1449 
1450 //////////////////////////////////////////////////////////////////////////
1451 /// @brief SwrDrawIndexedInstanced
1452 /// @param hContext - Handle passed back from SwrCreateContext
1453 /// @param topology - Specifies topology for draw.
1454 /// @param numIndices - Number of indices to read sequentially from index buffer.
1455 /// @param numInstances - Number of instances to render.
1456 /// @param indexOffset - Starting index into index buffer.
1457 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1458 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1459 /// (instanced data)
SwrDrawIndexedInstanced(HANDLE hContext,PRIMITIVE_TOPOLOGY topology,uint32_t numIndices,uint32_t numInstances,uint32_t indexOffset,int32_t baseVertex,uint32_t startInstance)1460 void SwrDrawIndexedInstanced(HANDLE             hContext,
1461                              PRIMITIVE_TOPOLOGY topology,
1462                              uint32_t           numIndices,
1463                              uint32_t           numInstances,
1464                              uint32_t           indexOffset,
1465                              int32_t            baseVertex,
1466                              uint32_t           startInstance)
1467 {
1468     DrawIndexedInstance(
1469         hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1470 }
1471 
1472 //////////////////////////////////////////////////////////////////////////
1473 /// @brief SwrInvalidateTiles
1474 /// @param hContext - Handle passed back from SwrCreateContext
1475 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
1476 /// invalidate.
1477 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
1478 ///                         be hottile size-aligned.
SwrInvalidateTiles(HANDLE hContext,uint32_t attachmentMask,const SWR_RECT & invalidateRect)1479 void SWR_API SwrInvalidateTiles(HANDLE          hContext,
1480                                 uint32_t        attachmentMask,
1481                                 const SWR_RECT& invalidateRect)
1482 {
1483     if (KNOB_TOSS_DRAW)
1484     {
1485         return;
1486     }
1487 
1488     SWR_CONTEXT*  pContext = GetContext(hContext);
1489     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1490 
1491     pDC->FeWork.type                                       = DISCARDINVALIDATETILES;
1492     pDC->FeWork.pfnWork                                    = ProcessDiscardInvalidateTiles;
1493     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1494     pDC->FeWork.desc.discardInvalidateTiles.rect           = invalidateRect;
1495     pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1496     pDC->FeWork.desc.discardInvalidateTiles.newTileState   = SWR_TILE_INVALID;
1497     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1498     pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly  = false;
1499 
1500     // enqueue
1501     QueueDraw(pContext);
1502 
1503     AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
1504 }
1505 
1506 //////////////////////////////////////////////////////////////////////////
1507 /// @brief SwrDiscardRect
1508 /// @param hContext - Handle passed back from SwrCreateContext
1509 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1510 /// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
1511 ///               discarded.
SwrDiscardRect(HANDLE hContext,uint32_t attachmentMask,const SWR_RECT & rect)1512 void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect)
1513 {
1514     if (KNOB_TOSS_DRAW)
1515     {
1516         return;
1517     }
1518 
1519     SWR_CONTEXT*  pContext = GetContext(hContext);
1520     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1521 
1522     // Queue a load to the hottile
1523     pDC->FeWork.type                                       = DISCARDINVALIDATETILES;
1524     pDC->FeWork.pfnWork                                    = ProcessDiscardInvalidateTiles;
1525     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1526     pDC->FeWork.desc.discardInvalidateTiles.rect           = rect;
1527     pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1528     pDC->FeWork.desc.discardInvalidateTiles.newTileState   = SWR_TILE_RESOLVED;
1529     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1530     pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly  = true;
1531 
1532     // enqueue
1533     QueueDraw(pContext);
1534 
1535     AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
1536 }
1537 
1538 //////////////////////////////////////////////////////////////////////////
1539 /// @brief SwrDispatch
1540 /// @param hContext - Handle passed back from SwrCreateContext
1541 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1542 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1543 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
SwrDispatch(HANDLE hContext,uint32_t threadGroupCountX,uint32_t threadGroupCountY,uint32_t threadGroupCountZ)1544 void SwrDispatch(HANDLE   hContext,
1545                  uint32_t threadGroupCountX,
1546                  uint32_t threadGroupCountY,
1547                  uint32_t threadGroupCountZ
1548 
1549 )
1550 {
1551     if (KNOB_TOSS_DRAW)
1552     {
1553         return;
1554     }
1555 
1556     SWR_CONTEXT*  pContext = GetContext(hContext);
1557     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1558 
1559     RDTSC_BEGIN(pContext->pBucketMgr, APIDispatch, pDC->drawId);
1560     AR_API_EVENT(
1561         DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
1562     pDC->isCompute = true; // This is a compute context.
1563 
1564     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1565 
1566     pTaskData->threadGroupCountX = threadGroupCountX;
1567     pTaskData->threadGroupCountY = threadGroupCountY;
1568     pTaskData->threadGroupCountZ = threadGroupCountZ;
1569 
1570     pTaskData->enableThreadDispatch = false;
1571 
1572     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1573     uint32_t dcIndex           = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
1574     pDC->pDispatch             = &pContext->pDispatchQueueArray[dcIndex];
1575     pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
1576 
1577     QueueDispatch(pContext);
1578     RDTSC_END(pContext->pBucketMgr,
1579               APIDispatch,
1580               threadGroupCountX * threadGroupCountY * threadGroupCountZ);
1581 }
1582 
1583 // Deswizzles, converts and stores current contents of the hot tiles to surface
1584 // described by pState
SwrStoreTiles(HANDLE hContext,uint32_t attachmentMask,SWR_TILE_STATE postStoreTileState,const SWR_RECT & storeRect)1585 void SWR_API SwrStoreTiles(HANDLE          hContext,
1586                            uint32_t        attachmentMask,
1587                            SWR_TILE_STATE  postStoreTileState,
1588                            const SWR_RECT& storeRect)
1589 {
1590     if (KNOB_TOSS_DRAW)
1591     {
1592         return;
1593     }
1594 
1595     SWR_CONTEXT*  pContext = GetContext(hContext);
1596     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1597 
1598     RDTSC_BEGIN(pContext->pBucketMgr, APIStoreTiles, pDC->drawId);
1599 
1600     pDC->FeWork.type                               = STORETILES;
1601     pDC->FeWork.pfnWork                            = ProcessStoreTiles;
1602     pDC->FeWork.desc.storeTiles.attachmentMask     = attachmentMask;
1603     pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1604     pDC->FeWork.desc.storeTiles.rect               = storeRect;
1605     pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
1606 
1607     // enqueue
1608     QueueDraw(pContext);
1609 
1610     AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
1611 
1612     RDTSC_END(pContext->pBucketMgr, APIStoreTiles, 1);
1613 }
1614 
1615 //////////////////////////////////////////////////////////////////////////
1616 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
1617 /// @param hContext - Handle passed back from SwrCreateContext
1618 /// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
1619 /// @param renderTargetArrayIndex - the RT array index to clear
1620 /// @param clearColor - color use for clearing render targets
1621 /// @param z - depth value use for clearing depth buffer
1622 /// @param stencil - stencil value used for clearing stencil buffer
1623 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
SwrClearRenderTarget(HANDLE hContext,uint32_t attachmentMask,uint32_t renderTargetArrayIndex,const float clearColor[4],float z,uint8_t stencil,const SWR_RECT & clearRect)1624 void SWR_API SwrClearRenderTarget(HANDLE          hContext,
1625                                   uint32_t        attachmentMask,
1626                                   uint32_t        renderTargetArrayIndex,
1627                                   const float     clearColor[4],
1628                                   float           z,
1629                                   uint8_t         stencil,
1630                                   const SWR_RECT& clearRect)
1631 {
1632     if (KNOB_TOSS_DRAW)
1633     {
1634         return;
1635     }
1636 
1637     SWR_CONTEXT*  pContext = GetContext(hContext);
1638     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1639 
1640     RDTSC_BEGIN(pContext->pBucketMgr, APIClearRenderTarget, pDC->drawId);
1641 
1642     pDC->FeWork.type            = CLEAR;
1643     pDC->FeWork.pfnWork         = ProcessClear;
1644     pDC->FeWork.desc.clear.rect = clearRect;
1645     pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
1646     pDC->FeWork.desc.clear.attachmentMask         = attachmentMask;
1647     pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
1648     pDC->FeWork.desc.clear.clearDepth             = z;
1649     pDC->FeWork.desc.clear.clearRTColor[0]        = clearColor[0];
1650     pDC->FeWork.desc.clear.clearRTColor[1]        = clearColor[1];
1651     pDC->FeWork.desc.clear.clearRTColor[2]        = clearColor[2];
1652     pDC->FeWork.desc.clear.clearRTColor[3]        = clearColor[3];
1653     pDC->FeWork.desc.clear.clearStencil           = stencil;
1654 
1655     // enqueue draw
1656     QueueDraw(pContext);
1657 
1658     RDTSC_END(pContext->pBucketMgr, APIClearRenderTarget, 1);
1659 }
1660 
1661 //////////////////////////////////////////////////////////////////////////
1662 /// @brief Returns a pointer to the private context state for the current
1663 ///        draw operation. This is used for external componets such as the
1664 ///        sampler.
1665 ///        SWR is responsible for the allocation of the private context state.
1666 /// @param hContext - Handle passed back from SwrCreateContext
SwrGetPrivateContextState(HANDLE hContext)1667 VOID* SwrGetPrivateContextState(HANDLE hContext)
1668 {
1669     SWR_CONTEXT*  pContext = GetContext(hContext);
1670     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1671     DRAW_STATE*   pState   = pDC->pState;
1672 
1673     if (pState->pPrivateState == nullptr)
1674     {
1675         pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize,
1676                                                              KNOB_SIMD_WIDTH * sizeof(float));
1677     }
1678 
1679     return pState->pPrivateState;
1680 }
1681 
1682 //////////////////////////////////////////////////////////////////////////
1683 /// @brief Clients can use this to allocate memory for draw/dispatch
1684 ///        operations. The memory will automatically be freed once operation
1685 ///        has completed. Client can use this to allocate binding tables,
1686 ///        etc. needed for shader execution.
1687 /// @param hContext - Handle passed back from SwrCreateContext
1688 /// @param size - Size of allocation
1689 /// @param align - Alignment needed for allocation.
SwrAllocDrawContextMemory(HANDLE hContext,uint32_t size,uint32_t align)1690 VOID* SwrAllocDrawContextMemory(HANDLE hContext, uint32_t size, uint32_t align)
1691 {
1692     SWR_CONTEXT*  pContext = GetContext(hContext);
1693     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1694 
1695     return pDC->pState->pArena->AllocAligned(size, align);
1696 }
1697 
1698 //////////////////////////////////////////////////////////////////////////
1699 /// @brief Enables stats counting
1700 /// @param hContext - Handle passed back from SwrCreateContext
1701 /// @param enable - If true then counts are incremented.
SwrEnableStatsFE(HANDLE hContext,bool enable)1702 void SwrEnableStatsFE(HANDLE hContext, bool enable)
1703 {
1704     SWR_CONTEXT*  pContext = GetContext(hContext);
1705     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1706 
1707     pDC->pState->state.enableStatsFE = enable;
1708 }
1709 
1710 //////////////////////////////////////////////////////////////////////////
1711 /// @brief Enables stats counting
1712 /// @param hContext - Handle passed back from SwrCreateContext
1713 /// @param enable - If true then counts are incremented.
SwrEnableStatsBE(HANDLE hContext,bool enable)1714 void SwrEnableStatsBE(HANDLE hContext, bool enable)
1715 {
1716     SWR_CONTEXT*  pContext = GetContext(hContext);
1717     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1718 
1719     pDC->pState->state.enableStatsBE = enable;
1720 }
1721 
1722 //////////////////////////////////////////////////////////////////////////
1723 /// @brief Mark end of frame - used for performance profiling
1724 /// @param hContext - Handle passed back from SwrCreateContext
SwrEndFrame(HANDLE hContext)1725 void SWR_API SwrEndFrame(HANDLE hContext)
1726 {
1727     SWR_CONTEXT*  pContext = GetContext(hContext);
1728     DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
1729     (void)pDC; // var used
1730 
1731     RDTSC_ENDFRAME(pContext->pBucketMgr);
1732     AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
1733 
1734     pContext->frameCount++;
1735 }
1736 
1737 void InitSimLoadTilesTable();
1738 void InitSimStoreTilesTable();
1739 void InitSimClearTilesTable();
1740 
1741 void InitClearTilesTable();
1742 void InitBackendFuncTables();
1743 
1744 //////////////////////////////////////////////////////////////////////////
1745 /// @brief Initialize swr backend and memory internal tables
SwrInit()1746 void SwrInit()
1747 {
1748     InitClearTilesTable();
1749     InitBackendFuncTables();
1750     InitRasterizerFunctions();
1751 }
1752 
SwrGetInterface(SWR_INTERFACE & out_funcs)1753 void SwrGetInterface(SWR_INTERFACE& out_funcs)
1754 {
1755     out_funcs.pfnSwrCreateContext          = SwrCreateContext;
1756     out_funcs.pfnSwrDestroyContext         = SwrDestroyContext;
1757     out_funcs.pfnSwrBindApiThread          = SwrBindApiThread;
1758     out_funcs.pfnSwrSaveState              = SwrSaveState;
1759     out_funcs.pfnSwrRestoreState           = SwrRestoreState;
1760     out_funcs.pfnSwrSync                   = SwrSync;
1761     out_funcs.pfnSwrStallBE                = SwrStallBE;
1762     out_funcs.pfnSwrWaitForIdle            = SwrWaitForIdle;
1763     out_funcs.pfnSwrWaitForIdleFE          = SwrWaitForIdleFE;
1764     out_funcs.pfnSwrSetVertexBuffers       = SwrSetVertexBuffers;
1765     out_funcs.pfnSwrSetIndexBuffer         = SwrSetIndexBuffer;
1766     out_funcs.pfnSwrSetFetchFunc           = SwrSetFetchFunc;
1767     out_funcs.pfnSwrSetSoFunc              = SwrSetSoFunc;
1768     out_funcs.pfnSwrSetSoState             = SwrSetSoState;
1769     out_funcs.pfnSwrSetSoBuffers           = SwrSetSoBuffers;
1770     out_funcs.pfnSwrSetVertexFunc          = SwrSetVertexFunc;
1771     out_funcs.pfnSwrSetFrontendState       = SwrSetFrontendState;
1772     out_funcs.pfnSwrSetGsState             = SwrSetGsState;
1773     out_funcs.pfnSwrSetGsFunc              = SwrSetGsFunc;
1774     out_funcs.pfnSwrSetCsFunc              = SwrSetCsFunc;
1775     out_funcs.pfnSwrSetTsState             = SwrSetTsState;
1776     out_funcs.pfnSwrSetHsFunc              = SwrSetHsFunc;
1777     out_funcs.pfnSwrSetDsFunc              = SwrSetDsFunc;
1778     out_funcs.pfnSwrSetDepthStencilState   = SwrSetDepthStencilState;
1779     out_funcs.pfnSwrSetBackendState        = SwrSetBackendState;
1780     out_funcs.pfnSwrSetDepthBoundsState    = SwrSetDepthBoundsState;
1781     out_funcs.pfnSwrSetPixelShaderState    = SwrSetPixelShaderState;
1782     out_funcs.pfnSwrSetBlendState          = SwrSetBlendState;
1783     out_funcs.pfnSwrSetBlendFunc           = SwrSetBlendFunc;
1784     out_funcs.pfnSwrDraw                   = SwrDraw;
1785     out_funcs.pfnSwrDrawInstanced          = SwrDrawInstanced;
1786     out_funcs.pfnSwrDrawIndexed            = SwrDrawIndexed;
1787     out_funcs.pfnSwrDrawIndexedInstanced   = SwrDrawIndexedInstanced;
1788     out_funcs.pfnSwrInvalidateTiles        = SwrInvalidateTiles;
1789     out_funcs.pfnSwrDiscardRect            = SwrDiscardRect;
1790     out_funcs.pfnSwrDispatch               = SwrDispatch;
1791     out_funcs.pfnSwrStoreTiles             = SwrStoreTiles;
1792     out_funcs.pfnSwrClearRenderTarget      = SwrClearRenderTarget;
1793     out_funcs.pfnSwrSetRastState           = SwrSetRastState;
1794     out_funcs.pfnSwrSetViewports           = SwrSetViewports;
1795     out_funcs.pfnSwrSetScissorRects        = SwrSetScissorRects;
1796     out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
1797     out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
1798     out_funcs.pfnSwrEnableStatsFE          = SwrEnableStatsFE;
1799     out_funcs.pfnSwrEnableStatsBE          = SwrEnableStatsBE;
1800     out_funcs.pfnSwrEndFrame               = SwrEndFrame;
1801     out_funcs.pfnSwrInit                   = SwrInit;
1802 }
1803