1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  ****************************************************************************/
23 
24 #include <stdio.h>
25 #include <thread>
26 #include <algorithm>
27 #include <float.h>
28 #include <vector>
29 #include <utility>
30 #include <fstream>
31 #include <string>
32 
33 #if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
34 #include <pthread.h>
35 #include <sched.h>
36 #include <unistd.h>
37 #endif
38 
39 #ifdef __APPLE__
40 #include <sys/types.h>
41 #include <sys/sysctl.h>
42 #endif
43 
44 #include "common/os.h"
45 #include "core/api.h"
46 #include "context.h"
47 #include "frontend.h"
48 #include "backend.h"
49 #include "rasterizer.h"
50 #include "rdtsc_core.h"
51 #include "tilemgr.h"
52 #include "tileset.h"
53 
54 
/// @brief One core as recorded by CalculateProcessorTopology.
///
/// procGroup holds the Windows processor group of the core; the Linux and macOS
/// paths store the core id in it instead. threadIds lists the ids of the hardware
/// threads that were found on this core.
struct Core
{
    uint32_t              procGroup{0}; ///< processor group (Windows) / core id (Linux, macOS)
    std::vector<uint32_t> threadIds;    ///< hardware thread ids that belong to this core
};
61 
62 struct NumaNode
63 {
64     uint32_t          numaId;
65     std::vector<Core> cores;
66 };
67 
68 typedef std::vector<NumaNode> CPUNumaNodes;
69 
CalculateProcessorTopology(CPUNumaNodes & out_nodes,uint32_t & out_numThreadsPerProcGroup)70 void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
71 {
72     out_nodes.clear();
73     out_numThreadsPerProcGroup = 0;
74 
75 #if defined(_WIN32)
76 
77     std::vector<KAFFINITY> threadMaskPerProcGroup;
78 
79     static std::mutex           m;
80     std::lock_guard<std::mutex> l(m);
81 
82     DWORD bufSize = 0;
83 
84     BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
85     SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
86 
87     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
88         (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
89     SWR_ASSERT(pBufferMem);
90 
91     ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
92     SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
93 
94     uint32_t                                 count   = bufSize / pBufferMem->Size;
95     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
96 
97     for (uint32_t i = 0; i < count; ++i)
98     {
99         SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
100         for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
101         {
102             auto&    gmask     = pBuffer->Processor.GroupMask[g];
103             uint32_t threadId  = 0;
104             uint32_t procGroup = gmask.Group;
105 
106             Core* pCore = nullptr;
107 
108             while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
109             {
110                 // clear mask
111                 KAFFINITY threadMask = KAFFINITY(1) << threadId;
112                 gmask.Mask &= ~threadMask;
113 
114                 if (procGroup >= threadMaskPerProcGroup.size())
115                 {
116                     threadMaskPerProcGroup.resize(procGroup + 1);
117                 }
118 
119                 if (threadMaskPerProcGroup[procGroup] & threadMask)
120                 {
121                     // Already seen this mask.  This means that we are in 32-bit mode and
122                     // have seen more than 32 HW threads for this procGroup
123                     // Don't use it
124 #if defined(_WIN64)
125                     SWR_INVALID("Shouldn't get here in 64-bit mode");
126 #endif
127                     continue;
128                 }
129 
130                 threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
131 
132                 // Find Numa Node
133                 uint32_t         numaId  = 0;
134                 PROCESSOR_NUMBER procNum = {};
135                 procNum.Group            = WORD(procGroup);
136                 procNum.Number           = UCHAR(threadId);
137 
138                 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
139                 SWR_ASSERT(ret);
140 
141                 // Store data
142                 if (out_nodes.size() <= numaId)
143                 {
144                     out_nodes.resize(numaId + 1);
145                 }
146                 auto& numaNode  = out_nodes[numaId];
147                 numaNode.numaId = numaId;
148 
149                 if (nullptr == pCore)
150                 {
151                     numaNode.cores.push_back(Core());
152                     pCore            = &numaNode.cores.back();
153                     pCore->procGroup = procGroup;
154                 }
155                 pCore->threadIds.push_back(threadId);
156                 if (procGroup == 0)
157                 {
158                     out_numThreadsPerProcGroup++;
159                 }
160             }
161         }
162         pBuffer = PtrAdd(pBuffer, pBuffer->Size);
163     }
164 
165     free(pBufferMem);
166 
167 #elif defined(__linux__) || defined(__gnu_linux__)
168 
169     // Parse /proc/cpuinfo to get full topology
170     std::ifstream input("/proc/cpuinfo");
171     std::string   line;
172     char*         c;
173     uint32_t      procId = uint32_t(-1);
174     uint32_t      coreId = uint32_t(-1);
175     uint32_t      physId = uint32_t(-1);
176 
177     while (std::getline(input, line))
178     {
179         if (line.find("processor") != std::string::npos)
180         {
181             auto data_start = line.find(": ") + 2;
182             procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
183             continue;
184         }
185         if (line.find("core id") != std::string::npos)
186         {
187             auto data_start = line.find(": ") + 2;
188             coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
189             continue;
190         }
191         if (line.find("physical id") != std::string::npos)
192         {
193             auto data_start = line.find(": ") + 2;
194             physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
195             continue;
196         }
197         if (line.length() == 0)
198         {
199             if (physId + 1 > out_nodes.size())
200                 out_nodes.resize(physId + 1);
201             auto& numaNode  = out_nodes[physId];
202             numaNode.numaId = physId;
203 
204             if (coreId + 1 > numaNode.cores.size())
205                 numaNode.cores.resize(coreId + 1);
206             auto& core     = numaNode.cores[coreId];
207             core.procGroup = coreId;
208             core.threadIds.push_back(procId);
209         }
210     }
211 
212     out_numThreadsPerProcGroup = 0;
213     for (auto& node : out_nodes)
214     {
215         for (auto& core : node.cores)
216         {
217             out_numThreadsPerProcGroup += core.threadIds.size();
218         }
219     }
220 
221 #elif defined(__APPLE__)
222 
223     auto numProcessors  = 0;
224     auto numCores       = 0;
225     auto numPhysicalIds = 0;
226 
227     int    value;
228     size_t size = sizeof(value);
229 
230     int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
231     SWR_ASSERT(result == 0);
232     numPhysicalIds = value;
233 
234     result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
235     SWR_ASSERT(result == 0);
236     numProcessors = value;
237 
238     result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
239     SWR_ASSERT(result == 0);
240     numCores = value;
241 
242     out_nodes.resize(numPhysicalIds);
243 
244     for (auto physId = 0; physId < numPhysicalIds; ++physId)
245     {
246         auto& numaNode = out_nodes[physId];
247         auto  procId   = 0;
248 
249         numaNode.cores.resize(numCores);
250 
251         while (procId < numProcessors)
252         {
253             for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
254             {
255                 auto& core = numaNode.cores[coreId];
256 
257                 core.procGroup = coreId;
258                 core.threadIds.push_back(procId);
259             }
260         }
261     }
262 
263     out_numThreadsPerProcGroup = 0;
264 
265     for (auto& node : out_nodes)
266     {
267         for (auto& core : node.cores)
268         {
269             out_numThreadsPerProcGroup += core.threadIds.size();
270         }
271     }
272 
273 #else
274 
275 #error Unsupported platform
276 
277 #endif
278 
279     // Prune empty cores and numa nodes
280     for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
281     {
282         // Erase empty cores (first)
283         for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
284         {
285             if (core_it->threadIds.size() == 0)
286             {
287                 core_it = node_it->cores.erase(core_it);
288             }
289             else
290             {
291                 ++core_it;
292             }
293         }
294 
295         // Erase empty numa nodes (second)
296         if (node_it->cores.size() == 0)
297         {
298             node_it = out_nodes.erase(node_it);
299         }
300         else
301         {
302             ++node_it;
303         }
304     }
305 }
306 
bindThread(SWR_CONTEXT * pContext,uint32_t threadId,uint32_t procGroupId=0,bool bindProcGroup=false)307 void bindThread(SWR_CONTEXT* pContext,
308                 uint32_t     threadId,
309                 uint32_t     procGroupId   = 0,
310                 bool         bindProcGroup = false)
311 {
312     // Only bind threads when MAX_WORKER_THREADS isn't set.
313     if (pContext->threadInfo.SINGLE_THREADED ||
314         (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
315     {
316         return;
317     }
318 
319 #if defined(_WIN32)
320 
321     GROUP_AFFINITY affinity = {};
322     affinity.Group          = procGroupId;
323 
324 #if !defined(_WIN64)
325     if (threadId >= 32)
326     {
327         // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
328         SWR_INVALID("Shouldn't get here");
329 
330         // In a 32-bit process on Windows it is impossible to bind
331         // to logical processors 32-63 within a processor group.
332         // In this case set the mask to 0 and let the system assign
333         // the processor.  Hopefully it will make smart choices.
334         affinity.Mask = 0;
335     }
336     else
337 #endif
338     {
339         // If MAX_WORKER_THREADS is set, only bind to the proc group,
340         // Not the individual HW thread.
341         if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
342         {
343             affinity.Mask = KAFFINITY(1) << threadId;
344         }
345         else
346         {
347             affinity.Mask = KAFFINITY(0);
348         }
349     }
350 
351     if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
352     {
353         SWR_INVALID("Failed to set Thread Affinity");
354     }
355 
356 #elif defined(__linux__) || defined(__gnu_linux__)
357 
358     cpu_set_t cpuset;
359     pthread_t thread = pthread_self();
360     CPU_ZERO(&cpuset);
361     CPU_SET(threadId, &cpuset);
362 
363     int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
364     if (err != 0)
365     {
366         fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
367     }
368 
369 #endif
370 }
371 
372 INLINE
GetEnqueuedDraw(SWR_CONTEXT * pContext)373 uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
374 {
375     return pContext->dcRing.GetHead();
376 }
377 
378 INLINE
GetDC(SWR_CONTEXT * pContext,uint32_t drawId)379 DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
380 {
381     return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
382 }
383 
384 INLINE
IDComparesLess(uint32_t a,uint32_t b)385 bool IDComparesLess(uint32_t a, uint32_t b)
386 {
387     // Use signed delta to ensure that wrap-around to 0 is correctly handled.
388     int32_t delta = int32_t(a - b);
389     return (delta < 0);
390 }
391 
392 // returns true if dependency not met
393 INLINE
CheckDependency(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t lastRetiredDraw)394 bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
395 {
396     return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
397 }
398 
CheckDependencyFE(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t lastRetiredDraw)399 bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
400 {
401     return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
402 }
403 
404 //////////////////////////////////////////////////////////////////////////
405 /// @brief Update client stats.
UpdateClientStats(SWR_CONTEXT * pContext,uint32_t workerId,DRAW_CONTEXT * pDC)406 INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
407 {
408     if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
409     {
410         return;
411     }
412 
413     DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
414     OSALIGNLINE(SWR_STATS) stats{0};
415 
416     // Sum up stats across all workers before sending to client.
417     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
418     {
419         stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
420         stats.PsInvocations += dynState.pStats[i].PsInvocations;
421         stats.CsInvocations += dynState.pStats[i].CsInvocations;
422 
423     }
424 
425 
426     pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
427 }
428 
ExecuteCallbacks(SWR_CONTEXT * pContext,uint32_t workerId,DRAW_CONTEXT * pDC)429 INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
430 {
431     UpdateClientStats(pContext, workerId, pDC);
432 
433     if (pDC->retireCallback.pfnCallbackFunc)
434     {
435         pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
436                                             pDC->retireCallback.userData2,
437                                             pDC->retireCallback.userData3);
438 
439         // Callbacks to external code *could* change floating point control state
440         // Reset our optimal flags
441         SetOptimalVectorCSR();
442     }
443 }
444 
445 // inlined-only version
CompleteDrawContextInl(SWR_CONTEXT * pContext,uint32_t workerId,DRAW_CONTEXT * pDC)446 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
447 {
448     int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
449     SWR_ASSERT(result >= 0);
450 
451     AR_FLUSH(pDC->drawId);
452 
453     if (result == 0)
454     {
455         ExecuteCallbacks(pContext, workerId, pDC);
456 
457 
458         // Cleanup memory allocations
459         pDC->pArena->Reset(true);
460         if (!pDC->isCompute)
461         {
462             pDC->pTileMgr->initialize();
463         }
464         if (pDC->cleanupState)
465         {
466             pDC->pState->pArena->Reset(true);
467         }
468 
469         _ReadWriteBarrier();
470 
471         pContext->dcRing.Dequeue(); // Remove from tail
472     }
473 
474     return result;
475 }
476 
477 // available to other translation modules
CompleteDrawContext(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC)478 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
479 {
480     return CompleteDrawContextInl(pContext, 0, pDC);
481 }
482 
FindFirstIncompleteDraw(SWR_CONTEXT * pContext,uint32_t workerId,uint32_t & curDrawBE,uint32_t & drawEnqueued)483 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
484                                     uint32_t     workerId,
485                                     uint32_t&    curDrawBE,
486                                     uint32_t&    drawEnqueued)
487 {
488     // increment our current draw id to the first incomplete draw
489     drawEnqueued = GetEnqueuedDraw(pContext);
490     while (IDComparesLess(curDrawBE, drawEnqueued))
491     {
492         DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
493 
494         // If its not compute and FE is not done then break out of loop.
495         if (!pDC->doneFE && !pDC->isCompute)
496             break;
497 
498         bool isWorkComplete =
499             pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
500 
501         if (isWorkComplete)
502         {
503             curDrawBE++;
504             CompleteDrawContextInl(pContext, workerId, pDC);
505         }
506         else
507         {
508             break;
509         }
510     }
511 
512     // If there are no more incomplete draws then return false.
513     return IDComparesLess(curDrawBE, drawEnqueued);
514 }
515 
516 //////////////////////////////////////////////////////////////////////////
517 /// @brief If there is any BE work then go work on it.
518 /// @param pContext - pointer to SWR context.
519 /// @param workerId - The unique worker ID that is assigned to this thread.
520 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
521 /// thread
522 ///                    has its own curDrawBE counter and this ensures that each worker processes all
523 ///                    the draws in order.
524 /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
525 ///                      own set and each time it fails to lock a macrotile, because its already
526 ///                      locked, then it will add that tile to the lockedTiles set. As a worker
527 ///                      begins to work on future draws the lockedTiles ensure that it doesn't work
528 ///                      on tiles that may still have work pending in a previous draw. Additionally,
529 ///                      the lockedTiles is hueristic that can steer a worker back to the same
530 ///                      macrotile that it had been working on in a previous draw.
531 /// @returns        true if worker thread should shutdown
WorkOnFifoBE(SWR_CONTEXT * pContext,uint32_t workerId,uint32_t & curDrawBE,TileSet & lockedTiles,uint32_t numaNode,uint32_t numaMask)532 bool WorkOnFifoBE(SWR_CONTEXT* pContext,
533                   uint32_t     workerId,
534                   uint32_t&    curDrawBE,
535                   TileSet&     lockedTiles,
536                   uint32_t     numaNode,
537                   uint32_t     numaMask)
538 {
539     bool bShutdown = false;
540 
541     // Find the first incomplete draw that has pending work. If no such draw is found then
542     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
543     uint32_t drawEnqueued = 0;
544     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
545     {
546         return false;
547     }
548 
549     uint32_t lastRetiredDraw =
550         pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
551 
552     // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
553     lockedTiles.clear();
554 
555     // Try to work on each draw in order of the available draws in flight.
556     //   1. If we're on curDrawBE, we can work on any macrotile that is available.
557     //   2. If we're trying to work on draws after curDrawBE, we are restricted to
558     //      working on those macrotiles that are known to be complete in the prior draw to
559     //      maintain order. The locked tiles provides the history to ensures this.
560     for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
561     {
562         DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
563 
564         if (pDC->isCompute)
565             return false; // We don't look at compute work.
566 
567         // First wait for FE to be finished with this draw. This keeps threading model simple
568         // but if there are lots of bubbles between draws then serializing FE and BE may
569         // need to be revisited.
570         if (!pDC->doneFE)
571             return false;
572 
573         // If this draw is dependent on a previous draw then we need to bail.
574         if (CheckDependency(pContext, pDC, lastRetiredDraw))
575         {
576             return false;
577         }
578 
579         // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
580         auto& macroTiles = pDC->pTileMgr->getDirtyTiles();
581 
582         for (auto tile : macroTiles)
583         {
584             uint32_t tileID = tile->mId;
585 
586             // Only work on tiles for this numa node
587             uint32_t x, y;
588             pDC->pTileMgr->getTileIndices(tileID, x, y);
589             if (((x ^ y) & numaMask) != numaNode)
590             {
591                 _mm_pause();
592                 continue;
593             }
594 
595             if (!tile->getNumQueued())
596             {
597                 _mm_pause();
598                 continue;
599             }
600 
601             // can only work on this draw if it's not in use by other threads
602             if (lockedTiles.get(tileID))
603             {
604                 _mm_pause();
605                 continue;
606             }
607 
608             if (tile->tryLock())
609             {
610                 BE_WORK* pWork;
611 
612                 RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId);
613 
614                 uint32_t numWorkItems = tile->getNumQueued();
615                 SWR_ASSERT(numWorkItems);
616 
617                 pWork = tile->peek();
618                 SWR_ASSERT(pWork);
619                 if (pWork->type == DRAW)
620                 {
621                     pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
622                 }
623                 else if (pWork->type == SHUTDOWN)
624                 {
625                     bShutdown = true;
626                 }
627 
628                 while ((pWork = tile->peek()) != nullptr)
629                 {
630                     pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
631                     tile->dequeue();
632                 }
633                 RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems);
634 
635                 _ReadWriteBarrier();
636 
637                 pDC->pTileMgr->markTileComplete(tileID);
638 
639                 // Optimization: If the draw is complete and we're the last one to have worked on it
640                 // then we can reset the locked list as we know that all previous draws before the
641                 // next are guaranteed to be complete.
642                 if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
643                 {
644                     // We can increment the current BE and safely move to next draw since we know
645                     // this draw is complete.
646                     curDrawBE++;
647                     CompleteDrawContextInl(pContext, workerId, pDC);
648 
649                     lastRetiredDraw++;
650 
651                     lockedTiles.clear();
652                     break;
653                 }
654 
655                 if (bShutdown)
656                 {
657                     break;
658                 }
659             }
660             else
661             {
662                 // This tile is already locked. So let's add it to our locked tiles set. This way we
663                 // don't try locking this one again.
664                 lockedTiles.set(tileID);
665                 _mm_pause();
666             }
667         }
668     }
669 
670     return bShutdown;
671 }
672 
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Called when FE work is complete for this DC.
CompleteDrawFE(SWR_CONTEXT * pContext,uint32_t workerId,DRAW_CONTEXT * pDC)675 INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
676 {
677     if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
678     {
679         SWR_STATS_FE& stats = pDC->dynState.statsFE;
680 
681         AR_EVENT(FrontendStatsEvent(pDC->drawId,
682                                     stats.IaVertices,
683                                     stats.IaPrimitives,
684                                     stats.VsInvocations,
685                                     stats.HsInvocations,
686                                     stats.DsInvocations,
687                                     stats.GsInvocations,
688                                     stats.GsPrimitives,
689                                     stats.CInvocations,
690                                     stats.CPrimitives,
691                                     stats.SoPrimStorageNeeded[0],
692                                     stats.SoPrimStorageNeeded[1],
693                                     stats.SoPrimStorageNeeded[2],
694                                     stats.SoPrimStorageNeeded[3],
695                                     stats.SoNumPrimsWritten[0],
696                                     stats.SoNumPrimsWritten[1],
697                                     stats.SoNumPrimsWritten[2],
698                                     stats.SoNumPrimsWritten[3]));
699         AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
700 
701         pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
702     }
703 
704     if (pContext->pfnUpdateSoWriteOffset)
705     {
706         for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
707         {
708             if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
709                 (pDC->pState->state.soBuffer[i].soWriteEnable))
710             {
711                 pContext->pfnUpdateSoWriteOffset(
712                     GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
713             }
714         }
715     }
716 
717     if (pContext->pfnUpdateStreamOut)
718         pContext->pfnUpdateStreamOut(GetPrivateState(pDC),  pDC->dynState.soPrims);
719 
720     // Ensure all streaming writes are globally visible before marking this FE done
721     _mm_mfence();
722     pDC->doneFE = true;
723 
724     InterlockedDecrement(&pContext->drawsOutstandingFE);
725 }
726 
WorkOnFifoFE(SWR_CONTEXT * pContext,uint32_t workerId,uint32_t & curDrawFE)727 void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
728 {
729     // Try to grab the next DC from the ring
730     uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
731     while (IDComparesLess(curDrawFE, drawEnqueued))
732     {
733         uint32_t      dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
734         DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
735         if (pDC->isCompute || pDC->doneFE)
736         {
737             CompleteDrawContextInl(pContext, workerId, pDC);
738             curDrawFE++;
739         }
740         else
741         {
742             break;
743         }
744     }
745 
746     uint32_t lastRetiredFE = curDrawFE - 1;
747     uint32_t curDraw       = curDrawFE;
748     while (IDComparesLess(curDraw, drawEnqueued))
749     {
750         uint32_t      dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
751         DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
752 
753         if (!pDC->FeLock && !pDC->isCompute)
754         {
755             if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
756             {
757                 return;
758             }
759 
760             uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
761             if (initial == 0)
762             {
763                 // successfully grabbed the DC, now run the FE
764                 pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
765 
766                 CompleteDrawFE(pContext, workerId, pDC);
767             }
768             else
769             {
770                 _mm_pause();
771             }
772         }
773         else
774         {
775             _mm_pause();
776         }
777 
778         curDraw++;
779     }
780 }
781 
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief If there is any compute work then go work on it.
784 /// @param pContext - pointer to SWR context.
785 /// @param workerId - The unique worker ID that is assigned to this thread.
786 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
787 /// thread
788 ///                    has its own curDrawBE counter and this ensures that each worker processes all
789 ///                    the draws in order.
WorkOnCompute(SWR_CONTEXT * pContext,uint32_t workerId,uint32_t & curDrawBE)790 void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
791 {
792     uint32_t drawEnqueued = 0;
793     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
794     {
795         return;
796     }
797 
798     uint32_t lastRetiredDraw =
799         pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
800 
801     for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
802     {
803         DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
804         if (pDC->isCompute == false)
805             return;
806 
807         // check dependencies
808         if (CheckDependency(pContext, pDC, lastRetiredDraw))
809         {
810             return;
811         }
812 
813         SWR_ASSERT(pDC->pDispatch != nullptr);
814         DispatchQueue& queue = *pDC->pDispatch;
815 
816         // Is there any work remaining?
817         if (queue.getNumQueued() > 0)
818         {
819             void*    pSpillFillBuffer = nullptr;
820             void*    pScratchSpace    = nullptr;
821             uint32_t threadGroupId    = 0;
822             while (queue.getWork(threadGroupId))
823             {
824                 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
825                 queue.finishedWork();
826             }
827 
828             // Ensure all streaming writes are globally visible before moving onto the next draw
829             _mm_mfence();
830         }
831     }
832 }
833 
BindApiThread(SWR_CONTEXT * pContext,uint32_t apiThreadId)834 void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
835 {
836     if (nullptr == pContext)
837     {
838         return;
839     }
840 
841     if (apiThreadId >= pContext->threadPool.numReservedThreads)
842     {
843         if (pContext->threadPool.numReservedThreads)
844         {
845             const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
846             // Just bind to the process group used for API thread 0
847             bindThread(pContext, 0, threadData.procGroupId, true);
848         }
849         return;
850     }
851 
852     const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];
853 
854     bindThread(
855         pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
856 }
857 
858 template <bool IsFEThread, bool IsBEThread>
workerThreadMain(LPVOID pData)859 DWORD workerThreadMain(LPVOID pData)
860 {
861     THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
862     SWR_CONTEXT* pContext    = pThreadData->pContext;
863     uint32_t     threadId    = pThreadData->threadId;
864     uint32_t     workerId    = pThreadData->workerId;
865 
866     bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
867 
868     {
869         char threadName[64];
870         sprintf_s(threadName,
871 #if defined(_WIN32)
872                   "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
873 #else
874                   // linux pthread name limited to 16 chars (including \0)
875                   "w%03d-n%d-c%03d-t%d",
876 #endif
877                   workerId,
878                   pThreadData->numaId,
879                   pThreadData->coreId,
880                   pThreadData->htId);
881         SetCurrentThreadName(threadName);
882     }
883 
884     RDTSC_INIT(pContext->pBucketMgr, threadId);
885 
886     // Only need offset numa index from base for correct masking
887     uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
888     uint32_t numaMask = pContext->threadPool.numaMask;
889 
890     SetOptimalVectorCSR();
891 
892     // Track tiles locked by other threads. If we try to lock a macrotile and find its already
893     // locked then we'll add it to this list so that we don't try and lock it again.
894     TileSet lockedTiles;
895 
896     // each worker has the ability to work on any of the queued draws as long as certain
897     // conditions are met. the data associated
898     // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
899     // has moved on to the next draw when he determines there is no more work to do. The api
900     // thread will not increment the head of the dc ring until all workers have moved past the
901     // current head.
902     // the logic to determine what to work on is:
903     // 1- try to work on the FE any draw that is queued. For now there are no dependencies
904     //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
905     //    we'll need dependency tracking to force serialization on FEs.  The worker will try
906     //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
907     //    trying until he reaches the tail.
908     // 2- BE work must be done in strict order. we accomplish this today by pulling work off
909     //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
910     //    any work left by comparing the total # of binned work items and the total # of completed
911     //    work items. If they are equal, then there is no more work to do for this draw, and
912     //    the worker can safely increment its oldestDraw counter and move on to the next draw.
913     std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
914 
915     auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
916 
917     uint32_t curDrawBE = 0;
918     uint32_t curDrawFE = 0;
919 
920     bool bShutdown = false;
921 
922     while (true)
923     {
924         if (bShutdown && !threadHasWork(curDrawBE))
925         {
926             break;
927         }
928 
929         uint32_t loop = 0;
930         while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
931         {
932             _mm_pause();
933         }
934 
935         if (!threadHasWork(curDrawBE))
936         {
937             lock.lock();
938 
939             // check for thread idle condition again under lock
940             if (threadHasWork(curDrawBE))
941             {
942                 lock.unlock();
943                 continue;
944             }
945 
946             pContext->FifosNotEmpty.wait(lock);
947             lock.unlock();
948         }
949 
950         if (IsBEThread)
951         {
952             RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
953             bShutdown |=
954                 WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
955             RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
956 
957             WorkOnCompute(pContext, workerId, curDrawBE);
958         }
959 
960         if (IsFEThread)
961         {
962             WorkOnFifoFE(pContext, workerId, curDrawFE);
963 
964             if (!IsBEThread)
965             {
966                 curDrawBE = curDrawFE;
967             }
968         }
969     }
970 
971     return 0;
972 }
973 template <>
974 DWORD workerThreadMain<false, false>(LPVOID) = delete;
975 
976 template <bool IsFEThread, bool IsBEThread>
workerThreadInit(LPVOID pData)977 DWORD workerThreadInit(LPVOID pData)
978 {
979 #if defined(_MSC_VER)
980     __try
981 #endif // _WIN32
982     {
983         return workerThreadMain<IsFEThread, IsBEThread>(pData);
984     }
985 
986 #if defined(_MSC_VER)
987     __except (EXCEPTION_CONTINUE_SEARCH)
988     {
989     }
990 
991 #endif // _WIN32
992 
993     return 1;
994 }
995 template <>
996 DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
997 
InitPerThreadStats(SWR_CONTEXT * pContext,uint32_t numThreads)998 static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
999 {
1000     // Initialize DRAW_CONTEXT's per-thread stats
1001     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
1002     {
1003         pContext->dcRing[dc].dynState.pStats =
1004             (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
1005         memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
1006     }
1007 }
1008 
1009 //////////////////////////////////////////////////////////////////////////
1010 /// @brief Creates thread pool info but doesn't launch threads.
1011 /// @param pContext - pointer to context
1012 /// @param pPool - pointer to thread pool object.
CreateThreadPool(SWR_CONTEXT * pContext,THREAD_POOL * pPool)1013 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1014 {
1015     CPUNumaNodes nodes;
1016     uint32_t     numThreadsPerProcGroup = 0;
1017     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
1018     assert(numThreadsPerProcGroup > 0);
1019 
1020     // Assumption, for asymmetric topologies, multi-threaded cores will appear
1021     // in the list before single-threaded cores.  This appears to be true for
1022     // Windows when the total HW threads is limited to 64.
1023     uint32_t numHWNodes        = (uint32_t)nodes.size();
1024     uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
1025     uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
1026 
1027 #if defined(_WIN32) && !defined(_WIN64)
1028     if (!pContext->threadInfo.MAX_WORKER_THREADS)
1029     {
1030         // Limit 32-bit windows to bindable HW threads only
1031         if ((numHWCoresPerNode * numHWHyperThreads) > 32)
1032         {
1033             numHWCoresPerNode = 32 / numHWHyperThreads;
1034         }
1035     }
1036 #endif
1037 
1038     // Calculate num HW threads.  Due to asymmetric topologies, this is not
1039     // a trivial multiplication.
1040     uint32_t numHWThreads = 0;
1041     for (auto const& node : nodes)
1042     {
1043         for (auto const& core : node.cores)
1044         {
1045             numHWThreads += (uint32_t)core.threadIds.size();
1046         }
1047     }
1048 
1049     uint32_t numNodes        = numHWNodes;
1050     uint32_t numCoresPerNode = numHWCoresPerNode;
1051     uint32_t numHyperThreads = numHWHyperThreads;
1052 
1053     // Calc used threads per-core
1054     if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
1055     {
1056         numHyperThreads -= pContext->threadInfo.BASE_THREAD;
1057     }
1058     else
1059     {
1060         SWR_ASSERT(false,
1061                    "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
1062                    pContext->threadInfo.BASE_THREAD,
1063                    numHyperThreads);
1064         pContext->threadInfo.BASE_THREAD = 0;
1065     }
1066 
1067     if (pContext->threadInfo.MAX_THREADS_PER_CORE)
1068     {
1069         numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
1070     }
1071 
1072     // Prune any cores that don't support the number of threads
1073     if (numHyperThreads > 1)
1074     {
1075         for (auto& node : nodes)
1076         {
1077             uint32_t numUsableCores = 0;
1078             for (auto& core : node.cores)
1079             {
1080                 numUsableCores += (core.threadIds.size() >= numHyperThreads);
1081             }
1082             numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
1083         }
1084     }
1085 
1086     // Calc used cores per NUMA node
1087     if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
1088     {
1089         numCoresPerNode -= pContext->threadInfo.BASE_CORE;
1090     }
1091     else
1092     {
1093         SWR_ASSERT(false,
1094                    "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
1095                    pContext->threadInfo.BASE_CORE,
1096                    numCoresPerNode);
1097         pContext->threadInfo.BASE_CORE = 0;
1098     }
1099 
1100     if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
1101     {
1102         numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
1103     }
1104 
1105     // Calc used NUMA nodes
1106     if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
1107     {
1108         numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
1109     }
1110     else
1111     {
1112         SWR_ASSERT(
1113             false,
1114             "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
1115             pContext->threadInfo.BASE_NUMA_NODE,
1116             numNodes);
1117         pContext->threadInfo.BASE_NUMA_NODE = 0;
1118     }
1119 
1120     if (pContext->threadInfo.MAX_NUMA_NODES)
1121     {
1122         numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
1123     }
1124 
1125     // Calculate numThreads - at this point everything should be symmetric
1126     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
1127     SWR_REL_ASSERT(numThreads <= numHWThreads);
1128 
1129     uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
1130     uint32_t& numAPIThreadsPerCore  = pContext->apiThreadInfo.numAPIThreadsPerCore;
1131     uint32_t  numRemovedThreads     = 0;
1132 
1133     if (pContext->threadInfo.SINGLE_THREADED)
1134     {
1135         numAPIReservedThreads      = 0;
1136         numThreads                 = 1;
1137         pContext->NumWorkerThreads = 1;
1138         pContext->NumFEThreads     = 1;
1139         pContext->NumBEThreads     = 1;
1140         pPool->numThreads          = 0;
1141     }
1142     else if (pContext->threadInfo.MAX_WORKER_THREADS)
1143     {
1144         numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
1145         pContext->threadInfo.BASE_NUMA_NODE = 0;
1146         pContext->threadInfo.BASE_CORE      = 0;
1147         pContext->threadInfo.BASE_THREAD    = 0;
1148         numAPIReservedThreads               = 0;
1149     }
1150     else
1151     {
1152         if (numAPIReservedThreads >= numThreads)
1153         {
1154             numAPIReservedThreads = 0;
1155         }
1156         else if (numAPIReservedThreads)
1157         {
1158             numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
1159 
1160             if (0 == numAPIThreadsPerCore)
1161             {
1162                 numAPIThreadsPerCore = numHWHyperThreads;
1163             }
1164 
1165             numRemovedThreads = numAPIReservedThreads;
1166             if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
1167             {
1168                 // Adjust removed threads to make logic below work
1169                 numRemovedThreads =
1170                     std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
1171             }
1172 
1173             numThreads -= numRemovedThreads;
1174         }
1175     }
1176 
1177     InitPerThreadStats(pContext, numThreads);
1178 
1179     if (pContext->threadInfo.SINGLE_THREADED)
1180     {
1181         numAPIReservedThreads = 0;
1182         numThreads            = 1;
1183     }
1184 
1185     if (numAPIReservedThreads)
1186     {
1187         pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
1188         SWR_ASSERT(pPool->pApiThreadData);
1189         if (!pPool->pApiThreadData)
1190         {
1191             numAPIReservedThreads = 0;
1192         }
1193         else
1194         {
1195             memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
1196         }
1197     }
1198     pPool->numReservedThreads = numAPIReservedThreads;
1199 
1200     pPool->numThreads          = numThreads;
1201     pContext->NumWorkerThreads = pPool->numThreads;
1202 
1203     pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
1204     assert(pPool->pThreadData);
1205     memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
1206     pPool->numaMask = 0;
1207 
1208     // Allocate worker private data
1209     pPool->pWorkerPrivateDataArray = nullptr;
1210     if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
1211     {
1212         pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
1213         pContext->workerPrivateState.pfnInitWorkerData = nullptr;
1214         pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
1215     }
1216 
1217     // initialize contents of SWR_WORKER_DATA
1218     size_t perWorkerSize =
1219         AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
1220     size_t totalSize = perWorkerSize * pPool->numThreads;
1221     if (totalSize)
1222     {
1223         pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
1224         SWR_ASSERT(pPool->pWorkerPrivateDataArray);
1225 
1226         void* pWorkerData = pPool->pWorkerPrivateDataArray;
1227         for (uint32_t i = 0; i < pPool->numThreads; ++i)
1228         {
1229             pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
1230             if (pContext->workerPrivateState.pfnInitWorkerData)
1231             {
1232                 pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
1233             }
1234             pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
1235         }
1236     }
1237 
1238     if (pContext->threadInfo.SINGLE_THREADED)
1239     {
1240         return;
1241     }
1242 
1243     pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
1244     assert(pPool->pThreads);
1245 
1246     if (pContext->threadInfo.MAX_WORKER_THREADS)
1247     {
1248         bool     bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
1249         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
1250         // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
1251         // But Windows will still require binding to specific process groups
1252         for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
1253         {
1254             pPool->pThreadData[workerId].workerId           = workerId;
1255             pPool->pThreadData[workerId].procGroupId        = workerId % numProcGroups;
1256             pPool->pThreadData[workerId].threadId           = 0;
1257             pPool->pThreadData[workerId].numaId             = 0;
1258             pPool->pThreadData[workerId].coreId             = 0;
1259             pPool->pThreadData[workerId].htId               = 0;
1260             pPool->pThreadData[workerId].pContext           = pContext;
1261             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
1262 
1263             pContext->NumBEThreads++;
1264             pContext->NumFEThreads++;
1265         }
1266     }
1267     else
1268     {
1269         // numa distribution assumes workers on all nodes
1270         bool useNuma = true;
1271         if (numCoresPerNode * numHyperThreads == 1)
1272         {
1273             useNuma = false;
1274         }
1275 
1276         if (useNuma)
1277         {
1278             pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
1279         }
1280         else
1281         {
1282             pPool->numaMask = 0;
1283         }
1284 
1285         uint32_t workerId           = 0;
1286         uint32_t numReservedThreads = numAPIReservedThreads;
1287         for (uint32_t n = 0; n < numNodes; ++n)
1288         {
1289             if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
1290             {
1291                 break;
1292             }
1293             auto&    node     = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
1294             uint32_t numCores = numCoresPerNode;
1295             for (uint32_t c = 0; c < numCores; ++c)
1296             {
1297                 if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
1298                 {
1299                     break;
1300                 }
1301 
1302                 auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
1303                 for (uint32_t t = 0; t < numHyperThreads; ++t)
1304                 {
1305                     if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
1306                     {
1307                         break;
1308                     }
1309 
1310                     if (numRemovedThreads)
1311                     {
1312                         --numRemovedThreads;
1313                         assert(numReservedThreads);
1314                         --numReservedThreads;
1315                         pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
1316                         pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1317                         pPool->pApiThreadData[numReservedThreads].threadId    = core.threadIds[t];
1318                         pPool->pApiThreadData[numReservedThreads].numaId =
1319                             useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1320                         pPool->pApiThreadData[numReservedThreads].coreId =
1321                             c + pContext->threadInfo.BASE_CORE;
1322                         pPool->pApiThreadData[numReservedThreads].htId =
1323                             t + pContext->threadInfo.BASE_THREAD;
1324                         pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
1325                         pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1326 
1327                         if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
1328                         {
1329                             --numReservedThreads;
1330                             pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
1331                             pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1332                             pPool->pApiThreadData[numReservedThreads].threadId =
1333                                 core.threadIds[t + 1];
1334                             pPool->pApiThreadData[numReservedThreads].numaId =
1335                                 useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1336                             pPool->pApiThreadData[numReservedThreads].coreId =
1337                                 c + pContext->threadInfo.BASE_CORE;
1338                             pPool->pApiThreadData[numReservedThreads].htId =
1339                                 t + pContext->threadInfo.BASE_THREAD;
1340                             pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
1341                             pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1342                         }
1343 
1344                         continue;
1345                     }
1346 
1347                     SWR_ASSERT(workerId < numThreads);
1348 
1349                     pPool->pThreadData[workerId].workerId    = workerId;
1350                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
1351                     pPool->pThreadData[workerId].threadId =
1352                         core.threadIds[t + pContext->threadInfo.BASE_THREAD];
1353                     pPool->pThreadData[workerId].numaId =
1354                         useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1355                     pPool->pThreadData[workerId].coreId   = c + pContext->threadInfo.BASE_CORE;
1356                     pPool->pThreadData[workerId].htId     = t + pContext->threadInfo.BASE_THREAD;
1357                     pPool->pThreadData[workerId].pContext = pContext;
1358                     pPool->pThreadData[workerId].forceBindProcGroup = false;
1359 
1360                     pContext->NumBEThreads++;
1361                     pContext->NumFEThreads++;
1362 
1363                     ++workerId;
1364                 }
1365             }
1366         }
1367         SWR_ASSERT(workerId == pContext->NumWorkerThreads);
1368     }
1369 }
1370 
1371 //////////////////////////////////////////////////////////////////////////
1372 /// @brief Launches worker threads in thread pool.
1373 /// @param pContext - pointer to context
1374 /// @param pPool - pointer to thread pool object.
StartThreadPool(SWR_CONTEXT * pContext,THREAD_POOL * pPool)1375 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1376 {
1377     if (pContext->threadInfo.SINGLE_THREADED)
1378     {
1379         return;
1380     }
1381 
1382     for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
1383     {
1384         pPool->pThreads[workerId] =
1385             new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
1386     }
1387 }
1388 
1389 //////////////////////////////////////////////////////////////////////////
1390 /// @brief Destroys thread pool.
1391 /// @param pContext - pointer to context
1392 /// @param pPool - pointer to thread pool object.
DestroyThreadPool(SWR_CONTEXT * pContext,THREAD_POOL * pPool)1393 void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1394 {
1395     // Wait for all threads to finish
1396     SwrWaitForIdle(pContext);
1397 
1398     // Wait for threads to finish and destroy them
1399     for (uint32_t t = 0; t < pPool->numThreads; ++t)
1400     {
1401         if (!pContext->threadInfo.SINGLE_THREADED)
1402         {
1403             // Detach from thread.  Cannot join() due to possibility (in Windows) of code
1404             // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns.
1405             pPool->pThreads[t]->detach();
1406             delete (pPool->pThreads[t]);
1407         }
1408 
1409         if (pContext->workerPrivateState.pfnFinishWorkerData)
1410         {
1411             pContext->workerPrivateState.pfnFinishWorkerData(
1412                 pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
1413         }
1414     }
1415 
1416     delete[] pPool->pThreads;
1417 
1418     // Clean up data used by threads
1419     delete[] pPool->pThreadData;
1420     delete[] pPool->pApiThreadData;
1421 
1422     AlignedFree(pPool->pWorkerPrivateDataArray);
1423 }
1424