1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file frontend.cpp
24  *
25  * @brief Implementation for Frontend which handles vertex processing,
26  *        primitive assembly, clipping, binning, etc.
27  *
28  ******************************************************************************/
29 
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 #include <iostream>
43 
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief FE handler for SwrSync.
46 /// @param pContext - pointer to SWR context.
47 /// @param pDC - pointer to draw context.
48 /// @param workerId - thread's worker id. Even thread has a unique id.
49 /// @param pUserData - Pointer to user data passed back to sync callback.
50 /// @todo This should go away when we switch this to use compute threading.
ProcessSync(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)51 void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
52 {
53     BE_WORK work;
54     work.type    = SYNC;
55     work.pfnWork = ProcessSyncBE;
56 
57     MacroTileMgr* pTileMgr = pDC->pTileMgr;
58     pTileMgr->enqueue(0, 0, &work);
59 }
60 
61 //////////////////////////////////////////////////////////////////////////
62 /// @brief FE handler for SwrDestroyContext.
63 /// @param pContext - pointer to SWR context.
64 /// @param pDC - pointer to draw context.
65 /// @param workerId - thread's worker id. Even thread has a unique id.
66 /// @param pUserData - Pointer to user data passed back to sync callback.
ProcessShutdown(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)67 void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
68 {
69     BE_WORK work;
70     work.type    = SHUTDOWN;
71     work.pfnWork = ProcessShutdownBE;
72 
73     MacroTileMgr* pTileMgr = pDC->pTileMgr;
74     // Enqueue at least 1 work item for each worker thread
75     // account for number of numa nodes
76     uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
77 
78     for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
79     {
80         for (uint32_t n = 0; n < numNumaNodes; ++n)
81         {
82             pTileMgr->enqueue(i, n, &work);
83         }
84     }
85 }
86 
87 //////////////////////////////////////////////////////////////////////////
88 /// @brief FE handler for SwrClearRenderTarget.
89 /// @param pContext - pointer to SWR context.
90 /// @param pDC - pointer to draw context.
91 /// @param workerId - thread's worker id. Even thread has a unique id.
92 /// @param pUserData - Pointer to user data passed back to clear callback.
93 /// @todo This should go away when we switch this to use compute threading.
ProcessClear(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)94 void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
95 {
96     CLEAR_DESC*   pDesc    = (CLEAR_DESC*)pUserData;
97     MacroTileMgr* pTileMgr = pDC->pTileMgr;
98 
99     // queue a clear to each macro tile
100     // compute macro tile bounds for the specified rect
101     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
102     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
103     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
104     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
105 
106     BE_WORK work;
107     work.type       = CLEAR;
108     work.pfnWork    = ProcessClearBE;
109     work.desc.clear = *pDesc;
110 
111     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
112     {
113         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
114         {
115             pTileMgr->enqueue(x, y, &work);
116         }
117     }
118 }
119 
120 //////////////////////////////////////////////////////////////////////////
121 /// @brief FE handler for SwrStoreTiles.
122 /// @param pContext - pointer to SWR context.
123 /// @param pDC - pointer to draw context.
124 /// @param workerId - thread's worker id. Even thread has a unique id.
125 /// @param pUserData - Pointer to user data passed back to callback.
126 /// @todo This should go away when we switch this to use compute threading.
ProcessStoreTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)127 void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
128 {
129     RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
130     MacroTileMgr*     pTileMgr = pDC->pTileMgr;
131     STORE_TILES_DESC* pDesc    = (STORE_TILES_DESC*)pUserData;
132 
133     // queue a store to each macro tile
134     // compute macro tile bounds for the specified rect
135     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
136     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
137     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
138     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
139 
140     // store tiles
141     BE_WORK work;
142     work.type            = STORETILES;
143     work.pfnWork         = ProcessStoreTilesBE;
144     work.desc.storeTiles = *pDesc;
145 
146     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
147     {
148         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
149         {
150             pTileMgr->enqueue(x, y, &work);
151         }
152     }
153 
154     RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
155 }
156 
157 //////////////////////////////////////////////////////////////////////////
158 /// @brief FE handler for SwrInvalidateTiles.
159 /// @param pContext - pointer to SWR context.
160 /// @param pDC - pointer to draw context.
161 /// @param workerId - thread's worker id. Even thread has a unique id.
162 /// @param pUserData - Pointer to user data passed back to callback.
163 /// @todo This should go away when we switch this to use compute threading.
ProcessDiscardInvalidateTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)164 void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
165                                    DRAW_CONTEXT* pDC,
166                                    uint32_t      workerId,
167                                    void*         pUserData)
168 {
169     RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
170     DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
171     MacroTileMgr*                  pTileMgr = pDC->pTileMgr;
172 
173     // compute macro tile bounds for the specified rect
174     uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
175     uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
176     uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
177     uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
178 
179     if (pDesc->fullTilesOnly == false)
180     {
181         // include partial tiles
182         macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
183         macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
184         macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
185         macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
186     }
187 
188     SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
189     SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
190 
191     macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
192     macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
193 
194     // load tiles
195     BE_WORK work;
196     work.type                        = DISCARDINVALIDATETILES;
197     work.pfnWork                     = ProcessDiscardInvalidateTilesBE;
198     work.desc.discardInvalidateTiles = *pDesc;
199 
200     for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
201     {
202         for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
203         {
204             pTileMgr->enqueue(x, y, &work);
205         }
206     }
207 
208     RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
209 }
210 
211 //////////////////////////////////////////////////////////////////////////
212 /// @brief Computes the number of primitives given the number of verts.
213 /// @param mode - primitive topology for draw operation.
214 /// @param numPrims - number of vertices or indices for draw.
215 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
GetNumPrims(PRIMITIVE_TOPOLOGY mode,uint32_t numPrims)216 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
217 {
218     switch (mode)
219     {
220     case TOP_POINT_LIST:
221         return numPrims;
222     case TOP_TRIANGLE_LIST:
223         return numPrims / 3;
224     case TOP_TRIANGLE_STRIP:
225         return numPrims < 3 ? 0 : numPrims - 2;
226     case TOP_TRIANGLE_FAN:
227         return numPrims < 3 ? 0 : numPrims - 2;
228     case TOP_TRIANGLE_DISC:
229         return numPrims < 2 ? 0 : numPrims - 1;
230     case TOP_QUAD_LIST:
231         return numPrims / 4;
232     case TOP_QUAD_STRIP:
233         return numPrims < 4 ? 0 : (numPrims - 2) / 2;
234     case TOP_LINE_STRIP:
235         return numPrims < 2 ? 0 : numPrims - 1;
236     case TOP_LINE_LIST:
237         return numPrims / 2;
238     case TOP_LINE_LOOP:
239         return numPrims;
240     case TOP_RECT_LIST:
241         return numPrims / 3;
242     case TOP_LINE_LIST_ADJ:
243         return numPrims / 4;
244     case TOP_LISTSTRIP_ADJ:
245         return numPrims < 3 ? 0 : numPrims - 3;
246     case TOP_TRI_LIST_ADJ:
247         return numPrims / 6;
248     case TOP_TRI_STRIP_ADJ:
249         return numPrims < 4 ? 0 : (numPrims / 2) - 2;
250 
251     case TOP_PATCHLIST_1:
252     case TOP_PATCHLIST_2:
253     case TOP_PATCHLIST_3:
254     case TOP_PATCHLIST_4:
255     case TOP_PATCHLIST_5:
256     case TOP_PATCHLIST_6:
257     case TOP_PATCHLIST_7:
258     case TOP_PATCHLIST_8:
259     case TOP_PATCHLIST_9:
260     case TOP_PATCHLIST_10:
261     case TOP_PATCHLIST_11:
262     case TOP_PATCHLIST_12:
263     case TOP_PATCHLIST_13:
264     case TOP_PATCHLIST_14:
265     case TOP_PATCHLIST_15:
266     case TOP_PATCHLIST_16:
267     case TOP_PATCHLIST_17:
268     case TOP_PATCHLIST_18:
269     case TOP_PATCHLIST_19:
270     case TOP_PATCHLIST_20:
271     case TOP_PATCHLIST_21:
272     case TOP_PATCHLIST_22:
273     case TOP_PATCHLIST_23:
274     case TOP_PATCHLIST_24:
275     case TOP_PATCHLIST_25:
276     case TOP_PATCHLIST_26:
277     case TOP_PATCHLIST_27:
278     case TOP_PATCHLIST_28:
279     case TOP_PATCHLIST_29:
280     case TOP_PATCHLIST_30:
281     case TOP_PATCHLIST_31:
282     case TOP_PATCHLIST_32:
283         return numPrims / (mode - TOP_PATCHLIST_BASE);
284 
285     case TOP_POLYGON:
286     case TOP_POINT_LIST_BF:
287     case TOP_LINE_STRIP_CONT:
288     case TOP_LINE_STRIP_BF:
289     case TOP_LINE_STRIP_CONT_BF:
290     case TOP_TRIANGLE_FAN_NOSTIPPLE:
291     case TOP_TRI_STRIP_REVERSE:
292     case TOP_PATCHLIST_BASE:
293     case TOP_UNKNOWN:
294         SWR_INVALID("Unsupported topology: %d", mode);
295         return 0;
296     }
297 
298     return 0;
299 }
300 
301 //////////////////////////////////////////////////////////////////////////
302 /// @brief Computes the number of verts given the number of primitives.
303 /// @param mode - primitive topology for draw operation.
304 /// @param numPrims - number of primitives for draw.
GetNumVerts(PRIMITIVE_TOPOLOGY mode,uint32_t numPrims)305 uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
306 {
307     switch (mode)
308     {
309     case TOP_POINT_LIST:
310         return numPrims;
311     case TOP_TRIANGLE_LIST:
312         return numPrims * 3;
313     case TOP_TRIANGLE_STRIP:
314         return numPrims ? numPrims + 2 : 0;
315     case TOP_TRIANGLE_FAN:
316         return numPrims ? numPrims + 2 : 0;
317     case TOP_TRIANGLE_DISC:
318         return numPrims ? numPrims + 1 : 0;
319     case TOP_QUAD_LIST:
320         return numPrims * 4;
321     case TOP_QUAD_STRIP:
322         return numPrims ? numPrims * 2 + 2 : 0;
323     case TOP_LINE_STRIP:
324         return numPrims ? numPrims + 1 : 0;
325     case TOP_LINE_LIST:
326         return numPrims * 2;
327     case TOP_LINE_LOOP:
328         return numPrims;
329     case TOP_RECT_LIST:
330         return numPrims * 3;
331     case TOP_LINE_LIST_ADJ:
332         return numPrims * 4;
333     case TOP_LISTSTRIP_ADJ:
334         return numPrims ? numPrims + 3 : 0;
335     case TOP_TRI_LIST_ADJ:
336         return numPrims * 6;
337     case TOP_TRI_STRIP_ADJ:
338         return numPrims ? (numPrims + 2) * 2 : 0;
339 
340     case TOP_PATCHLIST_1:
341     case TOP_PATCHLIST_2:
342     case TOP_PATCHLIST_3:
343     case TOP_PATCHLIST_4:
344     case TOP_PATCHLIST_5:
345     case TOP_PATCHLIST_6:
346     case TOP_PATCHLIST_7:
347     case TOP_PATCHLIST_8:
348     case TOP_PATCHLIST_9:
349     case TOP_PATCHLIST_10:
350     case TOP_PATCHLIST_11:
351     case TOP_PATCHLIST_12:
352     case TOP_PATCHLIST_13:
353     case TOP_PATCHLIST_14:
354     case TOP_PATCHLIST_15:
355     case TOP_PATCHLIST_16:
356     case TOP_PATCHLIST_17:
357     case TOP_PATCHLIST_18:
358     case TOP_PATCHLIST_19:
359     case TOP_PATCHLIST_20:
360     case TOP_PATCHLIST_21:
361     case TOP_PATCHLIST_22:
362     case TOP_PATCHLIST_23:
363     case TOP_PATCHLIST_24:
364     case TOP_PATCHLIST_25:
365     case TOP_PATCHLIST_26:
366     case TOP_PATCHLIST_27:
367     case TOP_PATCHLIST_28:
368     case TOP_PATCHLIST_29:
369     case TOP_PATCHLIST_30:
370     case TOP_PATCHLIST_31:
371     case TOP_PATCHLIST_32:
372         return numPrims * (mode - TOP_PATCHLIST_BASE);
373 
374     case TOP_POLYGON:
375     case TOP_POINT_LIST_BF:
376     case TOP_LINE_STRIP_CONT:
377     case TOP_LINE_STRIP_BF:
378     case TOP_LINE_STRIP_CONT_BF:
379     case TOP_TRIANGLE_FAN_NOSTIPPLE:
380     case TOP_TRI_STRIP_REVERSE:
381     case TOP_PATCHLIST_BASE:
382     case TOP_UNKNOWN:
383         SWR_INVALID("Unsupported topology: %d", mode);
384         return 0;
385     }
386 
387     return 0;
388 }
389 
390 //////////////////////////////////////////////////////////////////////////
391 /// @brief Return number of verts per primitive.
392 /// @param topology - topology
393 /// @param includeAdjVerts - include adjacent verts in primitive vertices
NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology,bool includeAdjVerts)394 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
395 {
396     uint32_t numVerts = 0;
397     switch (topology)
398     {
399     case TOP_POINT_LIST:
400     case TOP_POINT_LIST_BF:
401         numVerts = 1;
402         break;
403     case TOP_LINE_LIST:
404     case TOP_LINE_STRIP:
405     case TOP_LINE_LIST_ADJ:
406     case TOP_LINE_LOOP:
407     case TOP_LINE_STRIP_CONT:
408     case TOP_LINE_STRIP_BF:
409     case TOP_LISTSTRIP_ADJ:
410         numVerts = 2;
411         break;
412     case TOP_TRIANGLE_LIST:
413     case TOP_TRIANGLE_STRIP:
414     case TOP_TRIANGLE_FAN:
415     case TOP_TRI_LIST_ADJ:
416     case TOP_TRI_STRIP_ADJ:
417     case TOP_TRI_STRIP_REVERSE:
418     case TOP_RECT_LIST:
419         numVerts = 3;
420         break;
421     case TOP_QUAD_LIST:
422     case TOP_QUAD_STRIP:
423         numVerts = 4;
424         break;
425     case TOP_PATCHLIST_1:
426     case TOP_PATCHLIST_2:
427     case TOP_PATCHLIST_3:
428     case TOP_PATCHLIST_4:
429     case TOP_PATCHLIST_5:
430     case TOP_PATCHLIST_6:
431     case TOP_PATCHLIST_7:
432     case TOP_PATCHLIST_8:
433     case TOP_PATCHLIST_9:
434     case TOP_PATCHLIST_10:
435     case TOP_PATCHLIST_11:
436     case TOP_PATCHLIST_12:
437     case TOP_PATCHLIST_13:
438     case TOP_PATCHLIST_14:
439     case TOP_PATCHLIST_15:
440     case TOP_PATCHLIST_16:
441     case TOP_PATCHLIST_17:
442     case TOP_PATCHLIST_18:
443     case TOP_PATCHLIST_19:
444     case TOP_PATCHLIST_20:
445     case TOP_PATCHLIST_21:
446     case TOP_PATCHLIST_22:
447     case TOP_PATCHLIST_23:
448     case TOP_PATCHLIST_24:
449     case TOP_PATCHLIST_25:
450     case TOP_PATCHLIST_26:
451     case TOP_PATCHLIST_27:
452     case TOP_PATCHLIST_28:
453     case TOP_PATCHLIST_29:
454     case TOP_PATCHLIST_30:
455     case TOP_PATCHLIST_31:
456     case TOP_PATCHLIST_32:
457         numVerts = topology - TOP_PATCHLIST_BASE;
458         break;
459     default:
460         SWR_INVALID("Unsupported topology: %d", topology);
461         break;
462     }
463 
464     if (includeAdjVerts)
465     {
466         switch (topology)
467         {
468         case TOP_LISTSTRIP_ADJ:
469         case TOP_LINE_LIST_ADJ:
470             numVerts = 4;
471             break;
472         case TOP_TRI_STRIP_ADJ:
473         case TOP_TRI_LIST_ADJ:
474             numVerts = 6;
475             break;
476         default:
477             break;
478         }
479     }
480 
481     return numVerts;
482 }
483 
484 //////////////////////////////////////////////////////////////////////////
485 /// @brief Generate mask from remaining work.
486 /// @param numWorkItems - Number of items being worked on by a SIMD.
GenerateMask(uint32_t numItemsRemaining)487 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
488 {
489     uint32_t numActive =
490         (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
491     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
492     return _simd_castps_si(_simd_vmask_ps(mask));
493 }
494 
GenerateMask16(uint32_t numItemsRemaining)495 static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
496 {
497     uint32_t numActive =
498         (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
499     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
500     return _simd16_castps_si(_simd16_vmask_ps(mask));
501 }
502 
503 //////////////////////////////////////////////////////////////////////////
504 /// @brief StreamOut - Streams vertex data out to SO buffers.
505 ///        Generally, we are only streaming out a SIMDs worth of triangles.
506 /// @param pDC - pointer to draw context.
507 /// @param workerId - thread's worker id. Even thread has a unique id.
508 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
StreamOut(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,uint32_t * pPrimData,uint32_t streamIndex)509 static void StreamOut(
510     DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
511 {
512     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);
513 
514     void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
515 
516     const API_STATE&           state   = GetApiState(pDC);
517     const SWR_STREAMOUT_STATE& soState = state.soState;
518 
519     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
520 
521     // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
522     // vertex.
523     uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
524 
525     SWR_STREAMOUT_CONTEXT soContext = {0};
526 
527     // Setup buffer state pointers.
528     for (uint32_t i = 0; i < 4; ++i)
529     {
530         soContext.pBuffer[i] = &state.soBuffer[i];
531     }
532 
533     uint32_t numPrims = pa.NumPrims();
534 
535     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
536     {
537         unsigned long slot = 0;
538         uint64_t soMask = soState.streamMasks[streamIndex];
539 
540         // Write all entries into primitive data buffer for SOS.
541         while (_BitScanForward64(&slot, soMask))
542         {
543             simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
544             uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
545             pa.AssembleSingle(paSlot, primIndex, attrib);
546 
547             // Attribute offset is relative offset from start of vertex.
548             // Note that attributes start at slot 1 in the PA buffer. We need to write this
549             // to prim data starting at slot 0. Which is why we do (slot - 1).
550             // Also note: GL works slightly differently, and needs slot 0
551             uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
552 
553             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
554             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
555             {
556                 uint32_t* pPrimDataAttrib =
557                     pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
558 
559                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
560             }
561 
562             soMask &= ~(uint64_t(1) << slot);
563         }
564 
565         // Update pPrimData pointer
566         soContext.pPrimData = pPrimData;
567 
568         // Call SOS
569         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
570                    "Trying to execute uninitialized streamout jit function.");
571         state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
572     }
573 
574     // Update SO write offset. The driver provides memory for the update.
575     for (uint32_t i = 0; i < 4; ++i)
576     {
577         if (state.soBuffer[i].pWriteOffset)
578         {
579             bool  nullTileAccessed = false;
580             void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
581                 GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
582             *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
583         }
584 
585         if (state.soBuffer[i].soWriteEnable)
586         {
587             pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
588             pDC->dynState.SoWriteOffsetDirty[i] = true;
589         }
590     }
591 
592     pDC->dynState.soPrims += soContext.numPrimsWritten;
593 
594     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
595     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
596 
597     RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
598 }
599 
600 #if USE_SIMD16_FRONTEND
601 //////////////////////////////////////////////////////////////////////////
602 /// Is value an even number (a multiple of two)
603 ///
604 template <typename T>
IsEven(T value)605 INLINE static bool IsEven(T value)
606 {
607     return (value & 1) == 0;
608 }
609 
610 //////////////////////////////////////////////////////////////////////////
611 /// Round up value to an even number (a multiple of two)
612 ///
613 template <typename T>
RoundUpEven(T value)614 INLINE static T RoundUpEven(T value)
615 {
616     return (value + 1) & ~1;
617 }
618 
619 //////////////////////////////////////////////////////////////////////////
620 /// Round down value to an even number (a multiple of two)
621 ///
622 template <typename T>
RoundDownEven(T value)623 INLINE static T RoundDownEven(T value)
624 {
625     return value & ~1;
626 }
627 
628 //////////////////////////////////////////////////////////////////////////
629 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
630 ///
631 /// vertexCount is in terms of the source simdvertexes and must be even
632 ///
633 /// attribCount will limit the vector copies to those attribs specified
634 ///
635 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
636 ///
PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex * vertex_simd16,const simdvertex * vertex,uint32_t vertexCount,uint32_t attribCount)637 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex*     vertex_simd16,
638                                            const simdvertex* vertex,
639                                            uint32_t          vertexCount,
640                                            uint32_t          attribCount)
641 {
642     SWR_ASSERT(vertex);
643     SWR_ASSERT(vertex_simd16);
644     SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
645 
646     simd16vertex temp;
647 
648     for (uint32_t i = 0; i < vertexCount; i += 2)
649     {
650         for (uint32_t j = 0; j < attribCount; j += 1)
651         {
652             for (uint32_t k = 0; k < 4; k += 1)
653             {
654                 temp.attrib[j][k] =
655                     _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
656 
657                 if ((i + 1) < vertexCount)
658                 {
659                     temp.attrib[j][k] =
660                         _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
661                 }
662             }
663         }
664 
665         for (uint32_t j = 0; j < attribCount; j += 1)
666         {
667             vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
668         }
669     }
670 }
671 
672 #endif
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Computes number of invocations. The current index represents
675 ///        the start of the SIMD. The max index represents how much work
676 ///        items are remaining. If there is less then a SIMD's xmin of work
677 ///        then return the remaining amount of work.
678 /// @param curIndex - The start index for the SIMD.
679 /// @param maxIndex - The last index for all work items.
GetNumInvocations(uint32_t curIndex,uint32_t maxIndex)680 static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
681 {
682     uint32_t remainder = (maxIndex - curIndex);
683 #if USE_SIMD16_FRONTEND
684     return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
685 #else
686     return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
687 #endif
688 }
689 
690 //////////////////////////////////////////////////////////////////////////
691 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
692 ///        The geometry shader will loop over each active streamout buffer, assembling
693 ///        primitives for the downstream stages. When multistream output is enabled,
694 ///        the generated stream ID buffer from the GS needs to be converted to a cut
695 ///        buffer for the primitive assembler.
696 /// @param stream - stream id to generate the cut buffer for
697 /// @param pStreamIdBase - pointer to the stream ID buffer
698 /// @param numEmittedVerts - Number of total verts emitted by the GS
699 /// @param pCutBuffer - output buffer to write cuts to
ProcessStreamIdBuffer(uint32_t stream,uint8_t * pStreamIdBase,uint32_t numEmittedVerts,uint8_t * pCutBuffer)700 void ProcessStreamIdBuffer(uint32_t stream,
701                            uint8_t* pStreamIdBase,
702                            uint32_t numEmittedVerts,
703                            uint8_t* pCutBuffer)
704 {
705     SWR_ASSERT(stream < MAX_SO_STREAMS);
706 
707     uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8;
708 
709     for (uint32_t b = 0; b < numOutputBytes; ++b)
710     {
711         uint8_t curInputByte = pStreamIdBase[2 * b];
712         uint8_t outByte      = 0;
713         for (uint32_t i = 0; i < 4; ++i)
714         {
715             if ((curInputByte & 0x3) != stream)
716             {
717                 outByte |= (1 << i);
718             }
719             curInputByte >>= 2;
720         }
721 
722         curInputByte = pStreamIdBase[2 * b + 1];
723         for (uint32_t i = 0; i < 4; ++i)
724         {
725             if ((curInputByte & 0x3) != stream)
726             {
727                 outByte |= (1 << (i + 4));
728             }
729             curInputByte >>= 2;
730         }
731 
732         *pCutBuffer++ = outByte;
733     }
734 }
735 
736 // Buffers that are allocated if GS is enabled
737 struct GsBuffers
738 {
739     uint8_t* pGsIn;
740     uint8_t* pGsOut[KNOB_SIMD_WIDTH];
741     uint8_t* pGsTransposed;
742     void*    pStreamCutBuffer;
743 };
744 
745 //////////////////////////////////////////////////////////////////////////
746 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
747 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
748 /// assembler
749 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
750 /// @param numVerts - Number of vertices outputted by the GS
751 /// @param numAttribs - Number of attributes per vertex
752 template <typename SIMD_T, uint32_t SimdWidth>
TransposeSOAtoAOS(uint8_t * pDst,uint8_t * pSrc,uint32_t numVerts,uint32_t numAttribs)753 void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
754 {
755     uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
756     uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
757 
758     OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
759 
760     for (uint32_t i = 0; i < SimdWidth; ++i)
761     {
762         gatherOffsets[i] = srcVertexStride * i;
763     }
764     auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
765 
766     uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
767     uint32_t remainingVerts = numVerts;
768 
769     for (uint32_t s = 0; s < numSimd; ++s)
770     {
771         uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
772         uint8_t* pDstBase = pDst + s * dstVertexStride;
773 
774         // Compute mask to prevent src overflow
775         uint32_t mask = std::min(remainingVerts, SimdWidth);
776         mask          = GenMask(mask);
777         auto vMask    = SIMD_T::vmask_ps(mask);
778         auto viMask   = SIMD_T::castps_si(vMask);
779 
780         for (uint32_t a = 0; a < numAttribs; ++a)
781         {
782             auto attribGatherX = SIMD_T::mask_i32gather_ps(
783                 SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
784             auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
785                                                            (const float*)(pSrcBase + sizeof(float)),
786                                                            vGatherOffsets,
787                                                            vMask);
788             auto attribGatherZ =
789                 SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
790                                           (const float*)(pSrcBase + sizeof(float) * 2),
791                                           vGatherOffsets,
792                                           vMask);
793             auto attribGatherW =
794                 SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
795                                           (const float*)(pSrcBase + sizeof(float) * 3),
796                                           vGatherOffsets,
797                                           vMask);
798 
799             SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
800             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
801             SIMD_T::maskstore_ps(
802                 (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
803             SIMD_T::maskstore_ps(
804                 (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
805 
806             pSrcBase += sizeof(float) * 4;
807             pDstBase += sizeof(Float<SIMD_T>) * 4;
808         }
809         remainingVerts -= SimdWidth;
810     }
811 }
812 
813 
814 //////////////////////////////////////////////////////////////////////////
815 /// @brief Implements GS stage.
816 /// @param pDC - pointer to draw context.
817 /// @param workerId - thread's worker id. Even thread has a unique id.
818 /// @param pa - The primitive assembly object.
819 /// @param pGsOut - output stream for GS
820 template <typename HasStreamOutT, typename HasRastT>
GeometryShaderStage(DRAW_CONTEXT * pDC,uint32_t workerId,PA_STATE & pa,GsBuffers * pGsBuffers,uint32_t * pSoPrimData,uint32_t numPrims_simd8,simdscalari const & primID)821 static void GeometryShaderStage(DRAW_CONTEXT* pDC,
822                                 uint32_t      workerId,
823                                 PA_STATE&     pa,
824                                 GsBuffers*    pGsBuffers,
825                                 uint32_t*     pSoPrimData,
826 #if USE_SIMD16_FRONTEND
827                                 uint32_t numPrims_simd8,
828 #endif
829                                 simdscalari const& primID)
830 {
831     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);
832 
833     void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
834 
835     const API_STATE&    state  = GetApiState(pDC);
836     const SWR_GS_STATE* pState = &state.gsState;
837     SWR_GS_CONTEXT      gsContext;
838 
839     static uint8_t sNullBuffer[128] = {0};
840 
841     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
842     {
843         gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
844     }
845     gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
846     gsContext.PrimitiveID = primID;
847 
848     uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
849     simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
850 
851     // assemble all attributes for the input primitive
852     gsContext.inputVertStride = pState->inputVertStride;
853     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
854     {
855         uint32_t attribOffset = slot + pState->vertexAttribOffset;
856         pa.Assemble(attribOffset, attrib);
857 
858         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
859         {
860             gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i];
861         }
862     }
863 
864     // record valid prims from the frontend to avoid over binning the newly generated
865     // prims from the GS
866 #if USE_SIMD16_FRONTEND
867     uint32_t numInputPrims = numPrims_simd8;
868 #else
869     uint32_t numInputPrims = pa.NumPrims();
870 #endif
871 
872     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
873     {
874         gsContext.InstanceID = instance;
875         gsContext.mask       = GenerateMask(numInputPrims);
876 
877         // execute the geometry shader
878         state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
879         AR_EVENT(GSStats((HANDLE)&gsContext.stats));
880 
881         for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
882         {
883             gsContext.pStreams[i] += pState->allocationSize;
884         }
885     }
886 
887     // set up new binner and state for the GS output topology
888 #if USE_SIMD16_FRONTEND
889     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
890     if (HasRastT::value)
891     {
892         switch (pState->outputTopology)
893         {
894         case TOP_RECT_LIST:
895             pfnClipFunc = ClipRectangles_simd16;
896             break;
897         case TOP_TRIANGLE_STRIP:
898             pfnClipFunc = ClipTriangles_simd16;
899             break;
900         case TOP_LINE_STRIP:
901             pfnClipFunc = ClipLines_simd16;
902             break;
903         case TOP_POINT_LIST:
904             pfnClipFunc = ClipPoints_simd16;
905             break;
906         default:
907             SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
908         }
909     }
910 
911 #else
912     PFN_PROCESS_PRIMS pfnClipFunc   = nullptr;
913     if (HasRastT::value)
914     {
915         switch (pState->outputTopology)
916         {
917         case TOP_RECT_LIST:
918             pfnClipFunc = ClipRectangles;
919             break;
920         case TOP_TRIANGLE_STRIP:
921             pfnClipFunc = ClipTriangles;
922             break;
923         case TOP_LINE_STRIP:
924             pfnClipFunc = ClipLines;
925             break;
926         case TOP_POINT_LIST:
927             pfnClipFunc = ClipPoints;
928             break;
929         default:
930             SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
931         }
932     }
933 
934 #endif
935     // foreach input prim:
936     // - setup a new PA based on the emitted verts for that prim
937     // - loop over the new verts, calling PA to assemble each prim
938     uint32_t* pPrimitiveId = (uint32_t*)&primID;
939 
940     uint32_t totalPrimsGenerated = 0;
941     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
942     {
943         uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
944 
945         // Vertex count is either emitted by shader or static
946         uint32_t vertexCount = 0;
947         if (pState->staticVertexCount)
948         {
949             vertexCount = pState->staticVertexCount;
950         }
951         else
952         {
953             // If emitted in shader, it should be the stored in the first dword of the output buffer
954             vertexCount = *(uint32_t*)pInstanceBase;
955         }
956 
957         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
958         {
959             uint32_t numEmittedVerts = vertexCount;
960             if (numEmittedVerts == 0)
961             {
962                 continue;
963             }
964 
965             uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
966             uint8_t* pCutBase =
967                 pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
968             uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
969 
970 #if USE_SIMD16_FRONTEND
971             TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
972                                                           pVertexBaseAOS,
973                                                           vertexCount,
974                                                           pState->outputVertexSize);
975 #else
976             TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
977                                                         pVertexBaseAOS,
978                                                         vertexCount,
979                                                         pState->outputVertexSize);
980 #endif
981 
982             uint32_t numAttribs = state.feNumAttributes;
983 
984             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
985             {
986                 bool     processCutVerts = false;
987                 uint8_t* pCutBuffer      = pCutBase;
988 
989                 // assign default stream ID, only relevant when GS is outputting a single stream
990                 uint32_t streamID = 0;
991                 if (pState->isSingleStream)
992                 {
993                     processCutVerts = true;
994                     streamID        = pState->singleStreamID;
995                     if (streamID != stream)
996                         continue;
997                 }
998                 else
999                 {
1000                     // early exit if this stream is not enabled for streamout
1001                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
1002                     {
1003                         continue;
1004                     }
1005 
1006                     // multi-stream output, need to translate StreamID buffer to a cut buffer
1007                     ProcessStreamIdBuffer(
1008                         stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
1009                     pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
1010                     processCutVerts = false;
1011                 }
1012 
1013 #if USE_SIMD16_FRONTEND
1014                 PA_STATE_CUT gsPa(pDC,
1015                                   (uint8_t*)pGsBuffers->pGsTransposed,
1016                                   numEmittedVerts,
1017                                   pState->outputVertexSize,
1018                                   reinterpret_cast<simd16mask*>(pCutBuffer),
1019                                   numEmittedVerts,
1020                                   numAttribs,
1021                                   pState->outputTopology,
1022                                   processCutVerts,
1023                                   pa.numVertsPerPrim);
1024 
1025 #else
1026                 PA_STATE_CUT gsPa(pDC,
1027                                   (uint8_t*)pGsBuffers->pGsTransposed,
1028                                   numEmittedVerts,
1029                                   pState->outputVertexSize,
1030                                   pCutBuffer,
1031                                   numEmittedVerts,
1032                                   numAttribs,
1033                                   pState->outputTopology,
1034                                   processCutVerts,
1035                                   pa.numVertsPerPrim);
1036 
1037 #endif
1038                 while (gsPa.GetNextStreamOutput())
1039                 {
1040                     do
1041                     {
1042 #if USE_SIMD16_FRONTEND
1043                         simd16vector attrib_simd16[3];
1044 
1045                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
1046 
1047 #else
1048                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
1049 
1050 #endif
1051                         if (assemble)
1052                         {
1053                             totalPrimsGenerated += gsPa.NumPrims();
1054 
1055                             if (HasStreamOutT::value)
1056                             {
1057 #if ENABLE_AVX512_SIMD16
1058                                 gsPa.useAlternateOffset = false;
1059 #endif
1060                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
1061                             }
1062 
1063                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
1064                             {
1065 #if USE_SIMD16_FRONTEND
1066                                 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
1067 
1068                                 // Gather data from the SVG if provided.
1069                                 simd16scalari vViewportIdx = SIMD16::setzero_si();
1070                                 simd16scalari vRtIdx       = SIMD16::setzero_si();
1071                                 SIMD16::Vec4  svgAttrib[4];
1072 
1073                                 if (state.backendState.readViewportArrayIndex ||
1074                                     state.backendState.readRenderTargetArrayIndex)
1075                                 {
1076                                     gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1077                                 }
1078 
1079                                 if (state.backendState.readViewportArrayIndex)
1080                                 {
1081                                     vViewportIdx =
1082                                         SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1083                                     gsPa.viewportArrayActive = true;
1084                                 }
1085                                 if (state.backendState.readRenderTargetArrayIndex)
1086                                 {
1087                                     vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1088                                     gsPa.rtArrayActive = true;
1089                                 }
1090 
1091                                 {
1092                                     // OOB VPAI indices => forced to zero.
1093                                     vViewportIdx =
1094                                         SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1095                                     simd16scalari vNumViewports =
1096                                         SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1097                                     simd16scalari vClearMask =
1098                                         SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1099                                     vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
1100 
1101                                     gsPa.useAlternateOffset = false;
1102                                     pfnClipFunc(pDC,
1103                                                 gsPa,
1104                                                 workerId,
1105                                                 attrib_simd16,
1106                                                 GenMask(gsPa.NumPrims()),
1107                                                 vPrimId,
1108                                                 vViewportIdx,
1109                                                 vRtIdx);
1110                                 }
1111 #else
1112                                 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
1113 
1114                                 // Gather data from the SVG if provided.
1115                                 simdscalari vViewportIdx = SIMD::setzero_si();
1116                                 simdscalari vRtIdx       = SIMD::setzero_si();
1117                                 SIMD::Vec4  svgAttrib[4];
1118 
1119                                 if (state.backendState.readViewportArrayIndex ||
1120                                     state.backendState.readRenderTargetArrayIndex)
1121                                 {
1122                                     gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1123                                 }
1124 
1125                                 if (state.backendState.readViewportArrayIndex)
1126                                 {
1127                                     vViewportIdx =
1128                                         SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1129 
1130                                     // OOB VPAI indices => forced to zero.
1131                                     vViewportIdx =
1132                                         SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
1133                                     simdscalari vNumViewports =
1134                                         SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1135                                     simdscalari vClearMask =
1136                                         SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
1137                                     vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
1138                                     gsPa.viewportArrayActive = true;
1139                                 }
1140                                 if (state.backendState.readRenderTargetArrayIndex)
1141                                 {
1142                                     vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1143                                     gsPa.rtArrayActive = true;
1144                                 }
1145 
1146                                 pfnClipFunc(pDC,
1147                                             gsPa,
1148                                             workerId,
1149                                             attrib,
1150                                             GenMask(gsPa.NumPrims()),
1151                                             vPrimId,
1152                                             vViewportIdx,
1153                                             vRtIdx);
1154 #endif
1155                             }
1156                         }
1157                     } while (gsPa.NextPrim());
1158                 }
1159             }
1160         }
1161     }
1162 
1163     // update GS pipeline stats
1164     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1165     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1166     AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
1167     RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
1168 }
1169 
1170 //////////////////////////////////////////////////////////////////////////
1171 /// @brief Allocate GS buffers
1172 /// @param pDC - pointer to draw context.
1173 /// @param state - API state
1174 /// @param ppGsOut - pointer to GS output buffer allocation
1175 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1176 template <typename SIMD_T, uint32_t SIMD_WIDTH>
AllocateGsBuffers(DRAW_CONTEXT * pDC,const API_STATE & state,uint32_t vertsPerPrim,GsBuffers * pGsBuffers)1177 static INLINE void AllocateGsBuffers(DRAW_CONTEXT*    pDC,
1178                                      const API_STATE& state,
1179                                      uint32_t         vertsPerPrim,
1180                                      GsBuffers*       pGsBuffers)
1181 {
1182     auto pArena = pDC->pArena;
1183     SWR_ASSERT(pArena != nullptr);
1184     SWR_ASSERT(state.gsState.gsEnable);
1185 
1186     const SWR_GS_STATE& gsState = state.gsState;
1187 
1188     // Allocate storage for vertex inputs
1189     uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
1190     pGsBuffers->pGsIn           = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
1191 
1192     // Allocate arena space to hold GS output verts
1193     const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
1194 
1195     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1196     {
1197         pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
1198     }
1199 
1200     // Allocate storage for transposed GS output
1201     uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
1202     uint32_t transposedBufferSize =
1203         numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
1204     pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
1205 
1206     // Allocate storage to hold temporary stream->cut buffer, if necessary
1207     if (state.gsState.isSingleStream)
1208     {
1209         pGsBuffers->pStreamCutBuffer = nullptr;
1210     }
1211     else
1212     {
1213         pGsBuffers->pStreamCutBuffer =
1214             (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
1215     }
1216 }
1217 
1218 //////////////////////////////////////////////////////////////////////////
1219 /// @brief Contains all data generated by the HS and passed to the
1220 /// tessellator and DS.
1221 struct TessellationThreadLocalData
1222 {
1223     SWR_HS_CONTEXT hsContext;
1224     void*          pTxCtx;
1225     size_t         tsCtxSize;
1226 
1227     uint8_t*    pHSOutput;
1228     size_t      hsOutputAllocSize;
1229 
1230     simdscalar* pDSOutput;
1231     size_t      dsOutputAllocSize;
1232 };
1233 
1234 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1235 
1236 //////////////////////////////////////////////////////////////////////////
1237 /// @brief Allocate tessellation data for this worker thread.
1238 INLINE
AllocateTessellationData(SWR_CONTEXT * pContext)1239 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1240 {
1241     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
1242     if (gt_pTessellationThreadData == nullptr)
1243     {
1244         gt_pTessellationThreadData =
1245             (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1246         memset((void*)gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1247     }
1248 }
1249 
1250 //////////////////////////////////////////////////////////////////////////
1251 /// @brief Implements Tessellation Stages.
1252 /// @param pDC - pointer to draw context.
1253 /// @param workerId - thread's worker id. Even thread has a unique id.
1254 /// @param pa - The primitive assembly object.
1255 /// @param pGsOut - output stream for GS
1256 template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
TessellationStages(DRAW_CONTEXT * pDC,uint32_t workerId,PA_STATE & pa,GsBuffers * pGsBuffers,uint32_t * pSoPrimData,uint32_t numPrims_simd8,simdscalari const & primID)1257 static void TessellationStages(DRAW_CONTEXT* pDC,
1258                                uint32_t      workerId,
1259                                PA_STATE&     pa,
1260                                GsBuffers*    pGsBuffers,
1261                                uint32_t*     pSoPrimData,
1262 #if USE_SIMD16_FRONTEND
1263                                uint32_t numPrims_simd8,
1264 #endif
1265                                simdscalari const& primID)
1266 {
1267     const API_STATE&    state   = GetApiState(pDC);
1268     const SWR_TS_STATE& tsState = state.tsState;
1269     void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1270 
1271     SWR_ASSERT(gt_pTessellationThreadData);
1272 
1273     HANDLE tsCtx = TSInitCtx(tsState.domain,
1274                              tsState.partitioning,
1275                              tsState.tsOutputTopology,
1276                              gt_pTessellationThreadData->pTxCtx,
1277                              gt_pTessellationThreadData->tsCtxSize);
1278     if (tsCtx == nullptr)
1279     {
1280         gt_pTessellationThreadData->pTxCtx =
1281             AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1282         tsCtx = TSInitCtx(tsState.domain,
1283                           tsState.partitioning,
1284                           tsState.tsOutputTopology,
1285                           gt_pTessellationThreadData->pTxCtx,
1286                           gt_pTessellationThreadData->tsCtxSize);
1287     }
1288     SWR_ASSERT(tsCtx);
1289 
1290 #if USE_SIMD16_FRONTEND
1291     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
1292     if (HasRastT::value)
1293     {
1294         switch (tsState.postDSTopology)
1295         {
1296         case TOP_TRIANGLE_LIST:
1297             pfnClipFunc = ClipTriangles_simd16;
1298             break;
1299         case TOP_LINE_LIST:
1300             pfnClipFunc = ClipLines_simd16;
1301             break;
1302         case TOP_POINT_LIST:
1303             pfnClipFunc = ClipPoints_simd16;
1304             break;
1305         default:
1306             SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1307         }
1308     }
1309 
1310 #else
1311     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1312     if (HasRastT::value)
1313     {
1314         switch (tsState.postDSTopology)
1315         {
1316         case TOP_TRIANGLE_LIST:
1317             pfnClipFunc = ClipTriangles;
1318             break;
1319         case TOP_LINE_LIST:
1320             pfnClipFunc = ClipLines;
1321             break;
1322         case TOP_POINT_LIST:
1323             pfnClipFunc = ClipPoints;
1324             break;
1325         default:
1326             SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1327         }
1328     }
1329 
1330 #endif
1331     SWR_HS_CONTEXT& hsContext       = gt_pTessellationThreadData->hsContext;
1332     hsContext.PrimitiveID           = primID;
1333     hsContext.outputSize = tsState.hsAllocationSize;
1334 
1335     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1336     // Max storage for one attribute for an entire simdprimitive
1337     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1338 
1339     // Assemble position separately
1340     // TESS_TODO: this could be avoided - fix it
1341     pa.Assemble(VERTEX_POSITION_SLOT, simdattrib);
1342     for (uint32_t i = 0; i < numVertsPerPrim; ++i) {
1343         hsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = simdattrib[i];
1344     }
1345 
1346     // assemble all attributes for the input primitives
1347     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1348     {
1349         uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
1350         pa.Assemble(attribSlot, simdattrib);
1351 
1352         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1353         {
1354             hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
1355         }
1356     }
1357 
1358     // Allocate HS output storage
1359     uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;
1360 
1361     if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
1362     {
1363         AlignedFree(gt_pTessellationThreadData->pHSOutput);
1364         gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
1365         gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
1366     }
1367 
1368     hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;
1369 
1370 #if defined(_DEBUG)
1371     //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1372 #endif
1373     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1374 
1375 #if USE_SIMD16_FRONTEND
1376     uint32_t numPrims = numPrims_simd8;
1377 #else
1378     uint32_t numPrims = pa.NumPrims();
1379 #endif
1380     hsContext.mask = GenerateMask(numPrims);
1381 
1382     // Run the HS
1383     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
1384     state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
1385     RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);
1386 
1387     UPDATE_STAT_FE(HsInvocations, numPrims);
1388     AR_EVENT(HSStats((HANDLE)&hsContext.stats));
1389 
1390     const uint32_t* pPrimId = (const uint32_t*)&primID;
1391 
1392     for (uint32_t p = 0; p < numPrims; ++p)
1393     {
1394         ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);
1395 
1396         SWR_TESSELLATION_FACTORS tessFactors;
1397         tessFactors                    = hsContext.pCPout[p].tessFactors;
1398 
1399           // Run Tessellator
1400         SWR_TS_TESSELLATED_DATA tsData = {0};
1401         RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
1402         TSTessellate(tsCtx, tessFactors, tsData);
1403         AR_EVENT(TessPrimCount(1));
1404         RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);
1405 
1406         if (tsData.NumPrimitives == 0)
1407         {
1408             continue;
1409         }
1410         SWR_ASSERT(tsData.NumDomainPoints);
1411 
1412         // Allocate DS Output memory
1413         uint32_t requiredDSVectorInvocations =
1414             AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1415 #if USE_SIMD16_FRONTEND
1416         size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
1417                                    tsState.dsAllocationSize; // simd8 -> simd16, padding
1418 #else
1419         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
1420         size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
1421 #endif
1422         if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
1423         {
1424             AlignedFree(gt_pTessellationThreadData->pDSOutput);
1425             gt_pTessellationThreadData->pDSOutput =
1426                 (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1427             gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
1428         }
1429         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1430         SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
1431 
1432 #if defined(_DEBUG)
1433         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1434 #endif
1435 
1436         // Run Domain Shader
1437         SWR_DS_CONTEXT dsContext;
1438         dsContext.PrimitiveID           = pPrimId[p];
1439         dsContext.pCpIn                 = pCPout;
1440         dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
1441         dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
1442         dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
1443         dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
1444 #if USE_SIMD16_FRONTEND
1445         dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
1446 #else
1447         dsContext.vectorStride         = requiredDSVectorInvocations;
1448 #endif
1449 
1450         uint32_t dsInvocations = 0;
1451 
1452         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
1453              ++dsContext.vectorOffset)
1454         {
1455             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1456 
1457             RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
1458             state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
1459             RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);
1460 
1461             AR_EVENT(DSStats((HANDLE)&dsContext.stats));
1462 
1463             dsInvocations += KNOB_SIMD_WIDTH;
1464         }
1465         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1466 
1467 #if USE_SIMD16_FRONTEND
1468         SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
1469 
1470 #endif
1471         PA_TESS tessPa(
1472             pDC,
1473 #if USE_SIMD16_FRONTEND
1474             reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
1475             dsContext.vectorStride / 2,                                   // simd8 -> simd16
1476 #else
1477             dsContext.pOutputData,
1478             dsContext.vectorStride,
1479 #endif
1480             SWR_VTX_NUM_SLOTS,
1481             tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
1482             tsData.ppIndices,
1483             tsData.NumPrimitives,
1484             tsState.postDSTopology,
1485             NumVertsPerPrim(tsState.postDSTopology, false));
1486 
1487         while (tessPa.HasWork())
1488         {
1489 #if USE_SIMD16_FRONTEND
1490             const uint32_t numPrims    = tessPa.NumPrims();
1491             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1492             const uint32_t numPrims_hi =
1493                 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1494 
1495             const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
1496             const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
1497             const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
1498 
1499 #endif
1500             if (HasGeometryShaderT::value)
1501             {
1502 #if USE_SIMD16_FRONTEND
1503                 tessPa.useAlternateOffset = false;
1504                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1505                     pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1506 
1507                 if (numPrims_hi)
1508                 {
1509                     tessPa.useAlternateOffset = true;
1510                     GeometryShaderStage<HasStreamOutT, HasRastT>(
1511                         pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1512                 }
1513 #else
1514                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1515                     pDC,
1516                     workerId,
1517                     tessPa,
1518                     pGsBuffers,
1519                     pSoPrimData,
1520                     _simd_set1_epi32(dsContext.PrimitiveID));
1521 #endif
1522             }
1523             else
1524             {
1525                 if (HasStreamOutT::value)
1526                 {
1527 #if ENABLE_AVX512_SIMD16
1528                     tessPa.useAlternateOffset = false;
1529 #endif
1530                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1531                 }
1532 
1533                 if (HasRastT::value)
1534                 {
1535 #if USE_SIMD16_FRONTEND
1536                     simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
1537 #else
1538                     simdvector prim[3]; // Only deal with triangles, lines, or points
1539 #endif
1540                     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
1541                     bool assemble =
1542 #if USE_SIMD16_FRONTEND
1543                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1544 #else
1545                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1546 #endif
1547                     RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
1548                     SWR_ASSERT(assemble);
1549 
1550                     SWR_ASSERT(pfnClipFunc);
1551 #if USE_SIMD16_FRONTEND
1552                     // Gather data from the SGV if provided.
1553                     simd16scalari vViewportIdx = SIMD16::setzero_si();
1554                     simd16scalari vRtIdx       = SIMD16::setzero_si();
1555                     SIMD16::Vec4 svgAttrib[4] = {SIMD16::setzero_ps()};
1556 
1557                     if (state.backendState.readViewportArrayIndex ||
1558                         state.backendState.readRenderTargetArrayIndex)
1559                     {
1560                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1561                     }
1562 
1563                     if (state.backendState.readViewportArrayIndex)
1564                     {
1565                         vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1566                         tessPa.viewportArrayActive = true;
1567                     }
1568                     if (state.backendState.readRenderTargetArrayIndex)
1569                     {
1570                         vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1571                         tessPa.rtArrayActive = true;
1572                     }
1573 
1574 
1575                     {
1576                         // OOB VPAI indices => forced to zero.
1577                         vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1578                         simd16scalari vNumViewports =
1579                             SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1580                         simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1581                         vViewportIdx             = SIMD16::and_si(vClearMask, vViewportIdx);
1582 
1583                         tessPa.useAlternateOffset = false;
1584                         pfnClipFunc(pDC,
1585                                     tessPa,
1586                                     workerId,
1587                                     prim_simd16,
1588                                     GenMask(numPrims),
1589                                     primID,
1590                                     vViewportIdx,
1591                                     vRtIdx);
1592                     }
1593 #else
1594                     // Gather data from the SGV if provided.
1595                     simdscalari vViewportIdx = SIMD::setzero_si();
1596                     simdscalari vRtIdx       = SIMD::setzero_si();
1597                     SIMD::Vec4  svgAttrib[4];
1598 
1599                     if (state.backendState.readViewportArrayIndex ||
1600                         state.backendState.readRenderTargetArrayIndex)
1601                     {
1602                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1603                     }
1604 
1605                     if (state.backendState.readViewportArrayIndex)
1606                     {
1607                         vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1608 
1609                         // OOB VPAI indices => forced to zero.
1610                         vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
1611                         simdscalari vNumViewports  = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1612                         simdscalari vClearMask     = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
1613                         vViewportIdx               = SIMD::and_si(vClearMask, vViewportIdx);
1614                         tessPa.viewportArrayActive = true;
1615                     }
1616                     if (state.backendState.readRenderTargetArrayIndex)
1617                     {
1618                         vRtIdx               = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1619                         tessPa.rtArrayActive = true;
1620                     }
1621                     pfnClipFunc(pDC,
1622                                 tessPa,
1623                                 workerId,
1624                                 prim,
1625                                 GenMask(tessPa.NumPrims()),
1626                                 _simd_set1_epi32(dsContext.PrimitiveID),
1627                                 vViewportIdx,
1628                                 vRtIdx);
1629 #endif
1630                 }
1631             }
1632 
1633             tessPa.NextPrim();
1634 
1635         } // while (tessPa.HasWork())
1636     }     // for (uint32_t p = 0; p < numPrims; ++p)
1637 
1638 #if USE_SIMD16_FRONTEND
1639     if (gt_pTessellationThreadData->pDSOutput != nullptr)
1640     {
1641         AlignedFree(gt_pTessellationThreadData->pDSOutput);
1642         gt_pTessellationThreadData->pDSOutput = nullptr;
1643     }
1644     gt_pTessellationThreadData->dsOutputAllocSize = 0;
1645 
1646 #endif
1647     TSDestroyCtx(tsCtx);
1648 }
1649 
//////////////////////////////////////////////////////////////////////////
/// Scratch vertex store that ProcessDraw hands to PA_FACTORY for primitive
/// assembly, together with the byte size of its current allocation.
/// ProcessDraw regrows the buffer (AlignedFree, then AlignedMalloc with
/// 64-byte alignment) whenever the size it needs exceeds gVertexStoreSize.
/// NOTE(review): THREAD is presumably a thread-local storage specifier, so
/// each worker thread would own its own buffer - confirm against the macro.
/// NOTE(review): no final release of this buffer appears in the code visible
/// here - verify it is freed at thread/context teardown.
1650 THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
1651 THREAD uint32_t gVertexStoreSize           = 0;
1652 
1653 //////////////////////////////////////////////////////////////////////////
1654 /// @brief FE handler for SwrDraw.
1655 /// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is cut index handling enabled
1656 /// @tparam HasTessellationT - Is tessellation enabled
1657 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1658 /// @tparam HasStreamOutT - Is stream-out enabled
1659 /// @tparam HasRastT - Is rasterization enabled
1660 /// @param pContext - pointer to SWR context.
1661 /// @param pDC - pointer to draw context.
1662 /// @param workerId - thread's worker id.
1663 /// @param pUserData - Pointer to DRAW_WORK
1664 template <typename IsIndexedT,
1665           typename IsCutIndexEnabledT,
1666           typename HasTessellationT,
1667           typename HasGeometryShaderT,
1668           typename HasStreamOutT,
1669           typename HasRastT>
ProcessDraw(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)1670 void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
1671 {
1672 #if KNOB_ENABLE_TOSS_POINTS
1673     if (KNOB_TOSS_QUEUE_FE)
1674     {
1675         return;
1676     }
1677 #endif
1678 
1679     RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);
1680 
1681     void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1682 
1683     DRAW_WORK&       work  = *(DRAW_WORK*)pUserData;
1684     const API_STATE& state = GetApiState(pDC);
1685 
1686     uint32_t indexSize = 0;
1687     uint32_t endVertex = work.numVerts;
1688 
1689     gfxptr_t xpLastRequestedIndex = 0;
1690     if (IsIndexedT::value)
1691     {
1692         switch (work.type)
1693         {
1694         case R32_UINT:
1695             indexSize = sizeof(uint32_t);
1696             break;
1697         case R16_UINT:
1698             indexSize = sizeof(uint16_t);
1699             break;
1700         case R8_UINT:
1701             indexSize = sizeof(uint8_t);
1702             break;
1703         default:
1704             SWR_INVALID("Invalid work.type: %d", work.type);
1705         }
1706         xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
1707     }
1708     else
1709     {
1710         // No cuts, prune partial primitives.
1711         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1712     }
1713 
1714 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1715     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1716 #endif
1717 
1718     GsBuffers gsBuffers;
1719     if (HasGeometryShaderT::value)
1720     {
1721 #if USE_SIMD16_FRONTEND
1722         AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
1723             pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1724 #else
1725         AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
1726             pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1727 #endif
1728     }
1729 
1730     if (HasTessellationT::value)
1731     {
1732         SWR_ASSERT(state.tsState.tsEnable == true);
1733         SWR_ASSERT(state.pfnHsFunc != nullptr);
1734         SWR_ASSERT(state.pfnDsFunc != nullptr);
1735 
1736         AllocateTessellationData(pContext);
1737     }
1738     else
1739     {
1740         SWR_ASSERT(state.tsState.tsEnable == false);
1741         SWR_ASSERT(state.pfnHsFunc == nullptr);
1742         SWR_ASSERT(state.pfnDsFunc == nullptr);
1743     }
1744 
1745     // allocate space for streamout input prim data
1746     uint32_t* pSoPrimData = nullptr;
1747     if (HasStreamOutT::value)
1748     {
1749         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1750     }
1751 
1752     const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
1753 #if USE_SIMD16_FRONTEND
1754     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
1755 #else
1756     uint32_t          simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
1757 #endif
1758 
1759     SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1760 
1761     // Compute storage requirements for vertex store
1762     // TODO: allocation needs to be rethought for better cut support
1763     uint32_t numVerts        = vertexCount + 2; // Need extra space for PA state machine
1764     uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
1765 
1766     // grow the vertex store for the PA as necessary
1767     if (gVertexStoreSize < vertexStoreSize)
1768     {
1769         if (gpVertexStore != nullptr)
1770         {
1771             AlignedFree(gpVertexStore);
1772             gpVertexStore = nullptr;
1773         }
1774 
1775         SWR_ASSERT(gpVertexStore == nullptr);
1776 
1777         gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
1778         gVertexStoreSize = vertexStoreSize;
1779 
1780         SWR_ASSERT(gpVertexStore != nullptr);
1781     }
1782 
1783     // choose primitive assembler
1784 
1785     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
1786                                                          state.topology,
1787                                                          work.numVerts,
1788                                                          gpVertexStore,
1789                                                          numVerts,
1790                                                          state.frontendState.vsVertexSize,
1791                                                          GetNumVerts(state.topology, 1));
1792     PA_STATE&                                  pa = paFactory.GetPA();
1793 
1794 #if USE_SIMD16_FRONTEND
1795 #if USE_SIMD16_SHADERS
1796     simd16vertex vin;
1797 #else
1798     simdvertex vin_lo;
1799     simdvertex vin_hi;
1800 #endif
1801     SWR_VS_CONTEXT vsContext_lo;
1802     SWR_VS_CONTEXT vsContext_hi;
1803 
1804 #if USE_SIMD16_SHADERS
1805     vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
1806     vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
1807 #else
1808     vsContext_lo.pVin = &vin_lo;
1809     vsContext_hi.pVin = &vin_hi;
1810 #endif
1811     vsContext_lo.AlternateOffset = 0;
1812     vsContext_hi.AlternateOffset = 1;
1813 
1814     SWR_FETCH_CONTEXT fetchInfo_lo = {0};
1815 
1816     fetchInfo_lo.pStreams      = &state.vertexBuffers[0];
1817     fetchInfo_lo.StartInstance = work.startInstance;
1818     fetchInfo_lo.StartVertex   = 0;
1819 
1820     if (IsIndexedT::value)
1821     {
1822         fetchInfo_lo.BaseVertex = work.baseVertex;
1823 
1824         // if the entire index buffer isn't being consumed, set the last index
1825         // so that fetches < a SIMD wide will be masked off
1826         fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
1827         if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
1828         {
1829             fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
1830         }
1831     }
1832     else
1833     {
1834         fetchInfo_lo.StartVertex = work.startVertex;
1835     }
1836 
1837     SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
1838 
1839     const simd16scalari vScale =
1840         _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1841 
1842     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1843     {
1844         uint32_t i = 0;
1845 
1846         simd16scalari vIndex;
1847 
1848         if (IsIndexedT::value)
1849         {
1850             fetchInfo_lo.xpIndices = work.xpIB;
1851             fetchInfo_hi.xpIndices =
1852                 fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
1853         }
1854         else
1855         {
1856             vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1857 
1858             fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
1859             fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
1860                 GetPrivateState(pDC),
1861                 &vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
1862         }
1863 
1864         fetchInfo_lo.CurInstance = instanceNum;
1865         fetchInfo_hi.CurInstance = instanceNum;
1866 
1867         vsContext_lo.InstanceID = instanceNum;
1868         vsContext_hi.InstanceID = instanceNum;
1869 
1870         while (pa.HasWork())
1871         {
1872             // GetNextVsOutput currently has the side effect of updating some PA state machine
1873             // state. So we need to keep this outside of (i < endVertex) check.
1874 
1875             simdmask* pvCutIndices_lo = nullptr;
1876             simdmask* pvCutIndices_hi = nullptr;
1877 
1878             if (IsIndexedT::value)
1879             {
1880                 // simd16mask <=> simdmask[2]
1881 
1882                 pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
1883                 pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
1884             }
1885 
1886             simd16vertex& vout = pa.GetNextVsOutput();
1887 
1888             vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
1889             vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
1890 
1891             if (i < endVertex)
1892             {
1893                 if (!IsIndexedT::value)
1894                 {
1895                     fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
1896                     uint32_t offset;
1897                     offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
1898                     offset *= 4; // convert from index to address
1899 #if USE_SIMD16_SHADERS
1900                     fetchInfo_lo.xpLastIndex += offset;
1901 #else
1902                     fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH);
1903                     uint32_t offset2 =
1904                         std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH;
1905                     assert(offset >= 0);
1906                     fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
1907                     fetchInfo_hi.xpLastIndex += offset2;
1908 #endif
1909                 }
1910                 // 1. Execute FS/VS for a single SIMD.
1911                 RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
1912 #if USE_SIMD16_SHADERS
1913                 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
1914 #else
1915                 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
1916 
1917                 if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1918                 {
1919                     state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
1920                 }
1921 #endif
1922                 RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
1923 
1924                 // forward fetch generated vertex IDs to the vertex shader
1925 #if USE_SIMD16_SHADERS
1926 #if USE_SIMD16_VS
1927                 vsContext_lo.VertexID16 =
1928                     _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1929                 vsContext_lo.VertexID16 =
1930                     _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1931 #else
1932                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1933                 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1934 #endif
1935 #else
1936                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1937                 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1938 #endif
1939 
1940                 // Setup active mask for vertex shader.
1941 #if USE_SIMD16_VS
1942                 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1943 #else
1944                 vsContext_lo.mask     = GenerateMask(endVertex - i);
1945                 vsContext_hi.mask     = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1946 #endif
1947 
1948                 // forward cut mask to the PA
1949                 if (IsIndexedT::value)
1950                 {
1951 #if USE_SIMD16_SHADERS
1952                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1953                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1954 #else
1955                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1956                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1957 #endif
1958                 }
1959 
1960                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1961 
1962 #if KNOB_ENABLE_TOSS_POINTS
1963                 if (!KNOB_TOSS_FETCH)
1964 #endif
1965                 {
1966                     RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
1967 #if USE_SIMD16_VS
1968                     state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1969                     AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1970 #else
1971                     state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
1972                     AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
1973 
1974                     if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
1975                     {
1976                         state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
1977                         AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
1978                     }
1979 #endif
1980                     RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
1981 
1982                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1983                 }
1984             }
1985 
1986             // 2. Assemble primitives given the last two SIMD.
1987             do
1988             {
1989                 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1990 
1991                 RDTSC_START(pContext->pBucketMgr, FEPAAssemble);
1992                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1993                 RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0);
1994 
1995 #if KNOB_ENABLE_TOSS_POINTS
1996                 if (!KNOB_TOSS_FETCH)
1997 #endif
1998                 {
1999 #if KNOB_ENABLE_TOSS_POINTS
2000                     if (!KNOB_TOSS_VS)
2001 #endif
2002                     {
2003                         if (assemble)
2004                         {
2005                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2006 
2007                             const uint32_t numPrims = pa.NumPrims();
2008                             const uint32_t numPrims_lo =
2009                                 std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
2010                             const uint32_t numPrims_hi =
2011                                 std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
2012 
2013                             const simd16scalari primID    = pa.GetPrimID(work.startPrimID);
2014                             const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
2015                             const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
2016 
2017                             if (HasTessellationT::value)
2018                             {
2019                                 pa.useAlternateOffset = false;
2020                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2021                                     pDC,
2022                                     workerId,
2023                                     pa,
2024                                     &gsBuffers,
2025                                     pSoPrimData,
2026                                     numPrims_lo,
2027                                     primID_lo);
2028 
2029                                 if (numPrims_hi)
2030                                 {
2031                                     pa.useAlternateOffset = true;
2032                                     TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2033                                         pDC,
2034                                         workerId,
2035                                         pa,
2036                                         &gsBuffers,
2037                                         pSoPrimData,
2038                                         numPrims_hi,
2039                                         primID_hi);
2040                                 }
2041                             }
2042                             else if (HasGeometryShaderT::value)
2043                             {
2044                                 pa.useAlternateOffset = false;
2045                                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2046                                                                              workerId,
2047                                                                              pa,
2048                                                                              &gsBuffers,
2049                                                                              pSoPrimData,
2050                                                                              numPrims_lo,
2051                                                                              primID_lo);
2052 
2053                                 if (numPrims_hi)
2054                                 {
2055                                     pa.useAlternateOffset = true;
2056                                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
2057                                                                                  workerId,
2058                                                                                  pa,
2059                                                                                  &gsBuffers,
2060                                                                                  pSoPrimData,
2061                                                                                  numPrims_hi,
2062                                                                                  primID_hi);
2063                                 }
2064                             }
2065                             else
2066                             {
2067                                 // If streamout is enabled then stream vertices out to memory.
2068                                 if (HasStreamOutT::value)
2069                                 {
2070                                     pa.useAlternateOffset = false;
2071                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2072                                 }
2073 
2074                                 if (HasRastT::value)
2075                                 {
2076                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
2077                                     // Gather data from the SGV if provided.
2078                                     simd16scalari vpai = SIMD16::setzero_si();
2079                                     simd16scalari rtai = SIMD16::setzero_si();
2080                                     SIMD16::Vec4  svgAttrib[4];
2081 
2082                                     if (state.backendState.readViewportArrayIndex ||
2083                                         state.backendState.readRenderTargetArrayIndex)
2084                                     {
2085                                         pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2086                                     }
2087 
2088                                     if (state.backendState.readViewportArrayIndex)
2089                                     {
2090                                         vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2091                                         pa.viewportArrayActive = true;
2092                                     }
2093                                     if (state.backendState.readRenderTargetArrayIndex)
2094                                     {
2095                                         rtai =
2096                                             SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2097                                         pa.rtArrayActive = true;
2098                                     }
2099 
2100                                     {
2101                                         // OOB VPAI indices => forced to zero.
2102                                         vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
2103                                         simd16scalari vNumViewports =
2104                                             SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2105                                         simd16scalari vClearMask =
2106                                             SIMD16::cmplt_epi32(vpai, vNumViewports);
2107                                         vpai = SIMD16::and_si(vClearMask, vpai);
2108 
2109                                         pa.useAlternateOffset = false;
2110                                         pDC->pState->pfnProcessPrims_simd16(pDC,
2111                                                                             pa,
2112                                                                             workerId,
2113                                                                             prim_simd16,
2114                                                                             GenMask(numPrims),
2115                                                                             primID,
2116                                                                             vpai,
2117                                                                             rtai);
2118                                     }
2119                                 }
2120                             }
2121                         }
2122                     }
2123                 }
2124             } while (pa.NextPrim());
2125 
2126             if (IsIndexedT::value)
2127             {
2128                 fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2129                 fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
2130             }
2131             else
2132             {
2133                 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
2134             }
2135 
2136             i += KNOB_SIMD16_WIDTH;
2137         }
2138 
2139         pa.Reset();
2140     }
2141 
2142 #else
2143     SWR_VS_CONTEXT    vsContext;
2144     SWR_FETCH_CONTEXT fetchInfo = {0};
2145 
2146     fetchInfo.pStreams      = &state.vertexBuffers[0];
2147     fetchInfo.StartInstance = work.startInstance;
2148     fetchInfo.StartVertex   = 0;
2149 
2150     if (IsIndexedT::value)
2151     {
2152         fetchInfo.BaseVertex = work.baseVertex;
2153 
2154         // if the entire index buffer isn't being consumed, set the last index
2155         // so that fetches < a SIMD wide will be masked off
2156         fetchInfo.pLastIndex =
2157             (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
2158         if (xpLastRequestedIndex < fetchInfo.pLastIndex)
2159         {
2160             fetchInfo.pLastIndex = xpLastRequestedIndex;
2161         }
2162     }
2163     else
2164     {
2165         fetchInfo.StartVertex = work.startVertex;
2166     }
2167 
2168     const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2169 
2170     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
2171     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
2172     {
2173         simdscalari vIndex;
2174         uint32_t    i = 0;
2175 
2176         if (IsIndexedT::value)
2177         {
2178             fetchInfo.pIndices = work.pIB;
2179         }
2180         else
2181         {
2182             vIndex             = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
2183             fetchInfo.pIndices = (const int32_t*)&vIndex;
2184         }
2185 
2186         fetchInfo.CurInstance = instanceNum;
2187         vsContext.InstanceID  = instanceNum;
2188 
2189         while (pa.HasWork())
2190         {
2191             // GetNextVsOutput currently has the side effect of updating some PA state machine
2192             // state. So we need to keep this outside of (i < endVertex) check.
2193             simdmask* pvCutIndices = nullptr;
2194             if (IsIndexedT::value)
2195             {
2196                 pvCutIndices = &pa.GetNextVsIndices();
2197             }
2198 
2199             simdvertex& vout = pa.GetNextVsOutput();
2200             vsContext.pVin   = &vout;
2201             vsContext.pVout  = &vout;
2202 
2203             if (i < endVertex)
2204             {
2205                 // 1. Execute FS/VS for a single SIMD.
2206                 RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
2207                 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
2208                 RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
2209 
2210                 // forward fetch generated vertex IDs to the vertex shader
2211                 vsContext.VertexID = fetchInfo.VertexID;
2212 
2213                 // Setup active mask for vertex shader.
2214                 vsContext.mask = GenerateMask(endVertex - i);
2215 
2216                 // forward cut mask to the PA
2217                 if (IsIndexedT::value)
2218                 {
2219                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2220                 }
2221 
2222                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2223 
2224 #if KNOB_ENABLE_TOSS_POINTS
2225                 if (!KNOB_TOSS_FETCH)
2226 #endif
2227                 {
2228                     RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
2229                     state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
2230                     RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
2231 
2232                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2233                     AR_EVENT(VSStats((HANDLE)&vsContext.stats));
2234                 }
2235             }
2236 
2237             // 2. Assemble primitives given the last two SIMD.
2238             do
2239             {
2240                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
2241                 // PaAssemble returns false if there is not enough verts to assemble.
2242                 RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
2243                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2244                 RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);
2245 
2246 #if KNOB_ENABLE_TOSS_POINTS
2247                 if (!KNOB_TOSS_FETCH)
2248 #endif
2249                 {
2250 #if KNOB_ENABLE_TOSS_POINTS
2251                     if (!KNOB_TOSS_VS)
2252 #endif
2253                     {
2254                         if (assemble)
2255                         {
2256                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2257 
2258                             if (HasTessellationT::value)
2259                             {
2260                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2261                                     pDC,
2262                                     workerId,
2263                                     pa,
2264                                     &gsBuffers,
2265                                     pSoPrimData,
2266                                     pa.GetPrimID(work.startPrimID));
2267                             }
2268                             else if (HasGeometryShaderT::value)
2269                             {
2270                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
2271                                     pDC,
2272                                     workerId,
2273                                     pa,
2274                                     &gsBuffers,
2275                                     pSoPrimData,
2276                                     pa.GetPrimID(work.startPrimID));
2277                             }
2278                             else
2279                             {
2280                                 // If streamout is enabled then stream vertices out to memory.
2281                                 if (HasStreamOutT::value)
2282                                 {
2283                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2284                                 }
2285 
2286                                 if (HasRastT::value)
2287                                 {
2288                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
2289 
2290                                     // Gather data from the SVG if provided.
2291                                     simdscalari vViewportIdx = SIMD::setzero_si();
2292                                     simdscalari vRtIdx       = SIMD::setzero_si();
2293                                     SIMD::Vec4  svgAttrib[4];
2294 
2295                                     if (state.backendState.readViewportArrayIndex ||
2296                                         state.backendState.readRenderTargetArrayIndex)
2297                                     {
2298                                         pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2299                                     }
2300 
2301                                     if (state.backendState.readViewportArrayIndex)
2302                                     {
2303                                         vViewportIdx =
2304                                             SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2305 
2306                                         // OOB VPAI indices => forced to zero.
2307                                         vViewportIdx =
2308                                             SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
2309                                         simdscalari vNumViewports =
2310                                             SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2311                                         simdscalari vClearMask =
2312                                             SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
2313                                         vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
2314                                         pa.viewportArrayActive = true;
2315                                     }
2316                                     if (state.backendState.readRenderTargetArrayIndex)
2317                                     {
2318                                         vRtIdx =
2319                                             SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2320                                         pa.rtArrayActive = true;
2321                                     }
2322 
2323                                     pDC->pState->pfnProcessPrims(pDC,
2324                                                                  pa,
2325                                                                  workerId,
2326                                                                  prim,
2327                                                                  GenMask(pa.NumPrims()),
2328                                                                  pa.GetPrimID(work.startPrimID),
2329                                                                  vViewportIdx,
2330                                                                  vRtIdx);
2331                                 }
2332                             }
2333                         }
2334                     }
2335                 }
2336             } while (pa.NextPrim());
2337 
2338             if (IsIndexedT::value)
2339             {
2340                 fetchInfo.pIndices =
2341                     (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2342             }
2343             else
2344             {
2345                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2346             }
2347 
2348             i += KNOB_SIMD_WIDTH;
2349         }
2350         pa.Reset();
2351     }
2352 
2353 #endif
2354 
2355     RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
2356 }
2357 
2358 struct FEDrawChooser
2359 {
2360     typedef PFN_FE_WORK_FUNC FuncType;
2361 
2362     template <typename... ArgsB>
GetFuncFEDrawChooser2363     static FuncType GetFunc()
2364     {
2365         return ProcessDraw<ArgsB...>;
2366     }
2367 };
2368 
2369 // Selector for correct templated Draw front-end function
GetProcessDrawFunc(bool IsIndexed,bool IsCutIndexEnabled,bool HasTessellation,bool HasGeometryShader,bool HasStreamOut,bool HasRasterization)2370 PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
2371                                     bool IsCutIndexEnabled,
2372                                     bool HasTessellation,
2373                                     bool HasGeometryShader,
2374                                     bool HasStreamOut,
2375                                     bool HasRasterization)
2376 {
2377     return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
2378                                                        IsCutIndexEnabled,
2379                                                        HasTessellation,
2380                                                        HasGeometryShader,
2381                                                        HasStreamOut,
2382                                                        HasRasterization);
2383 }
2384