1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 *        primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29 
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "utils.h"
36 #include "threads.h"
37 #include "pa.h"
38 #include "clip.h"
39 #include "tilemgr.h"
40 #include "tessellator.h"
41 #include <limits>
42 #include <iostream>
43 
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
GenMask(uint32_t numBits)46 static INLINE uint32_t GenMask(uint32_t numBits)
47 {
48     SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
49     return ((1U << numBits) - 1);
50 }
51 
52 //////////////////////////////////////////////////////////////////////////
53 /// @brief FE handler for SwrSync.
54 /// @param pContext - pointer to SWR context.
55 /// @param pDC - pointer to draw context.
56 /// @param workerId - thread's worker id. Even thread has a unique id.
57 /// @param pUserData - Pointer to user data passed back to sync callback.
58 /// @todo This should go away when we switch this to use compute threading.
ProcessSync(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)59 void ProcessSync(
60     SWR_CONTEXT *pContext,
61     DRAW_CONTEXT *pDC,
62     uint32_t workerId,
63     void *pUserData)
64 {
65     BE_WORK work;
66     work.type = SYNC;
67     work.pfnWork = ProcessSyncBE;
68 
69     MacroTileMgr *pTileMgr = pDC->pTileMgr;
70     pTileMgr->enqueue(0, 0, &work);
71 }
72 
73 //////////////////////////////////////////////////////////////////////////
74 /// @brief FE handler for SwrDestroyContext.
75 /// @param pContext - pointer to SWR context.
76 /// @param pDC - pointer to draw context.
77 /// @param workerId - thread's worker id. Even thread has a unique id.
78 /// @param pUserData - Pointer to user data passed back to sync callback.
ProcessShutdown(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)79 void ProcessShutdown(
80     SWR_CONTEXT *pContext,
81     DRAW_CONTEXT *pDC,
82     uint32_t workerId,
83     void *pUserData)
84 {
85     BE_WORK work;
86     work.type = SHUTDOWN;
87     work.pfnWork = ProcessShutdownBE;
88 
89     MacroTileMgr *pTileMgr = pDC->pTileMgr;
90     // Enqueue at least 1 work item for each worker thread
91     // account for number of numa nodes
92     uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
93 
94     for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
95     {
96         for (uint32_t n = 0; n < numNumaNodes; ++n)
97         {
98             pTileMgr->enqueue(i, n, &work);
99         }
100     }
101 }
102 
103 //////////////////////////////////////////////////////////////////////////
104 /// @brief FE handler for SwrClearRenderTarget.
105 /// @param pContext - pointer to SWR context.
106 /// @param pDC - pointer to draw context.
107 /// @param workerId - thread's worker id. Even thread has a unique id.
108 /// @param pUserData - Pointer to user data passed back to clear callback.
109 /// @todo This should go away when we switch this to use compute threading.
ProcessClear(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)110 void ProcessClear(
111     SWR_CONTEXT *pContext,
112     DRAW_CONTEXT *pDC,
113     uint32_t workerId,
114     void *pUserData)
115 {
116     CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
117     MacroTileMgr *pTileMgr = pDC->pTileMgr;
118 
119     // queue a clear to each macro tile
120     // compute macro tile bounds for the specified rect
121     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
122     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
123     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
124     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
125 
126     BE_WORK work;
127     work.type = CLEAR;
128     work.pfnWork = ProcessClearBE;
129     work.desc.clear = *pDesc;
130 
131     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
132     {
133         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
134         {
135             pTileMgr->enqueue(x, y, &work);
136         }
137     }
138 }
139 
140 //////////////////////////////////////////////////////////////////////////
141 /// @brief FE handler for SwrStoreTiles.
142 /// @param pContext - pointer to SWR context.
143 /// @param pDC - pointer to draw context.
144 /// @param workerId - thread's worker id. Even thread has a unique id.
145 /// @param pUserData - Pointer to user data passed back to callback.
146 /// @todo This should go away when we switch this to use compute threading.
ProcessStoreTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)147 void ProcessStoreTiles(
148     SWR_CONTEXT *pContext,
149     DRAW_CONTEXT *pDC,
150     uint32_t workerId,
151     void *pUserData)
152 {
153     AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
154     MacroTileMgr *pTileMgr = pDC->pTileMgr;
155     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
156 
157     // queue a store to each macro tile
158     // compute macro tile bounds for the specified rect
159     uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
160     uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
161     uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
162     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
163 
164     // store tiles
165     BE_WORK work;
166     work.type = STORETILES;
167     work.pfnWork = ProcessStoreTilesBE;
168     work.desc.storeTiles = *pDesc;
169 
170     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
171     {
172         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
173         {
174             pTileMgr->enqueue(x, y, &work);
175         }
176     }
177 
178     AR_END(FEProcessStoreTiles, 0);
179 }
180 
181 //////////////////////////////////////////////////////////////////////////
182 /// @brief FE handler for SwrInvalidateTiles.
183 /// @param pContext - pointer to SWR context.
184 /// @param pDC - pointer to draw context.
185 /// @param workerId - thread's worker id. Even thread has a unique id.
186 /// @param pUserData - Pointer to user data passed back to callback.
187 /// @todo This should go away when we switch this to use compute threading.
ProcessDiscardInvalidateTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)188 void ProcessDiscardInvalidateTiles(
189     SWR_CONTEXT *pContext,
190     DRAW_CONTEXT *pDC,
191     uint32_t workerId,
192     void *pUserData)
193 {
194     AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
195     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
196     MacroTileMgr *pTileMgr = pDC->pTileMgr;
197 
198     // compute macro tile bounds for the specified rect
199     uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
200     uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
201     uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
202     uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
203 
204     if (pDesc->fullTilesOnly == false)
205     {
206         // include partial tiles
207         macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
208         macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
209         macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
210         macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
211     }
212 
213     SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
214     SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
215 
216     macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
217     macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
218 
219     // load tiles
220     BE_WORK work;
221     work.type = DISCARDINVALIDATETILES;
222     work.pfnWork = ProcessDiscardInvalidateTilesBE;
223     work.desc.discardInvalidateTiles = *pDesc;
224 
225     for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
226     {
227         for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
228         {
229             pTileMgr->enqueue(x, y, &work);
230         }
231     }
232 
233     AR_END(FEProcessInvalidateTiles, 0);
234 }
235 
236 //////////////////////////////////////////////////////////////////////////
237 /// @brief Computes the number of primitives given the number of verts.
238 /// @param mode - primitive topology for draw operation.
239 /// @param numPrims - number of vertices or indices for draw.
240 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
GetNumPrims(PRIMITIVE_TOPOLOGY mode,uint32_t numPrims)241 uint32_t GetNumPrims(
242     PRIMITIVE_TOPOLOGY mode,
243     uint32_t numPrims)
244 {
245     switch (mode)
246     {
247     case TOP_POINT_LIST: return numPrims;
248     case TOP_TRIANGLE_LIST: return numPrims / 3;
249     case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
250     case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
251     case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
252     case TOP_QUAD_LIST: return numPrims / 4;
253     case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
254     case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
255     case TOP_LINE_LIST: return numPrims / 2;
256     case TOP_LINE_LOOP: return numPrims;
257     case TOP_RECT_LIST: return numPrims / 3;
258     case TOP_LINE_LIST_ADJ: return numPrims / 4;
259     case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
260     case TOP_TRI_LIST_ADJ: return numPrims / 6;
261     case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
262 
263     case TOP_PATCHLIST_1:
264     case TOP_PATCHLIST_2:
265     case TOP_PATCHLIST_3:
266     case TOP_PATCHLIST_4:
267     case TOP_PATCHLIST_5:
268     case TOP_PATCHLIST_6:
269     case TOP_PATCHLIST_7:
270     case TOP_PATCHLIST_8:
271     case TOP_PATCHLIST_9:
272     case TOP_PATCHLIST_10:
273     case TOP_PATCHLIST_11:
274     case TOP_PATCHLIST_12:
275     case TOP_PATCHLIST_13:
276     case TOP_PATCHLIST_14:
277     case TOP_PATCHLIST_15:
278     case TOP_PATCHLIST_16:
279     case TOP_PATCHLIST_17:
280     case TOP_PATCHLIST_18:
281     case TOP_PATCHLIST_19:
282     case TOP_PATCHLIST_20:
283     case TOP_PATCHLIST_21:
284     case TOP_PATCHLIST_22:
285     case TOP_PATCHLIST_23:
286     case TOP_PATCHLIST_24:
287     case TOP_PATCHLIST_25:
288     case TOP_PATCHLIST_26:
289     case TOP_PATCHLIST_27:
290     case TOP_PATCHLIST_28:
291     case TOP_PATCHLIST_29:
292     case TOP_PATCHLIST_30:
293     case TOP_PATCHLIST_31:
294     case TOP_PATCHLIST_32:
295         return numPrims / (mode - TOP_PATCHLIST_BASE);
296 
297     case TOP_POLYGON:
298     case TOP_POINT_LIST_BF:
299     case TOP_LINE_STRIP_CONT:
300     case TOP_LINE_STRIP_BF:
301     case TOP_LINE_STRIP_CONT_BF:
302     case TOP_TRIANGLE_FAN_NOSTIPPLE:
303     case TOP_TRI_STRIP_REVERSE:
304     case TOP_PATCHLIST_BASE:
305     case TOP_UNKNOWN:
306         SWR_INVALID("Unsupported topology: %d", mode);
307         return 0;
308     }
309 
310     return 0;
311 }
312 
313 //////////////////////////////////////////////////////////////////////////
314 /// @brief Computes the number of verts given the number of primitives.
315 /// @param mode - primitive topology for draw operation.
316 /// @param numPrims - number of primitives for draw.
GetNumVerts(PRIMITIVE_TOPOLOGY mode,uint32_t numPrims)317 uint32_t GetNumVerts(
318     PRIMITIVE_TOPOLOGY mode,
319     uint32_t numPrims)
320 {
321     switch (mode)
322     {
323     case TOP_POINT_LIST: return numPrims;
324     case TOP_TRIANGLE_LIST: return numPrims * 3;
325     case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
326     case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
327     case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
328     case TOP_QUAD_LIST: return numPrims * 4;
329     case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
330     case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
331     case TOP_LINE_LIST: return numPrims * 2;
332     case TOP_LINE_LOOP: return numPrims;
333     case TOP_RECT_LIST: return numPrims * 3;
334     case TOP_LINE_LIST_ADJ: return numPrims * 4;
335     case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
336     case TOP_TRI_LIST_ADJ: return numPrims * 6;
337     case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
338 
339     case TOP_PATCHLIST_1:
340     case TOP_PATCHLIST_2:
341     case TOP_PATCHLIST_3:
342     case TOP_PATCHLIST_4:
343     case TOP_PATCHLIST_5:
344     case TOP_PATCHLIST_6:
345     case TOP_PATCHLIST_7:
346     case TOP_PATCHLIST_8:
347     case TOP_PATCHLIST_9:
348     case TOP_PATCHLIST_10:
349     case TOP_PATCHLIST_11:
350     case TOP_PATCHLIST_12:
351     case TOP_PATCHLIST_13:
352     case TOP_PATCHLIST_14:
353     case TOP_PATCHLIST_15:
354     case TOP_PATCHLIST_16:
355     case TOP_PATCHLIST_17:
356     case TOP_PATCHLIST_18:
357     case TOP_PATCHLIST_19:
358     case TOP_PATCHLIST_20:
359     case TOP_PATCHLIST_21:
360     case TOP_PATCHLIST_22:
361     case TOP_PATCHLIST_23:
362     case TOP_PATCHLIST_24:
363     case TOP_PATCHLIST_25:
364     case TOP_PATCHLIST_26:
365     case TOP_PATCHLIST_27:
366     case TOP_PATCHLIST_28:
367     case TOP_PATCHLIST_29:
368     case TOP_PATCHLIST_30:
369     case TOP_PATCHLIST_31:
370     case TOP_PATCHLIST_32:
371         return numPrims * (mode - TOP_PATCHLIST_BASE);
372 
373     case TOP_POLYGON:
374     case TOP_POINT_LIST_BF:
375     case TOP_LINE_STRIP_CONT:
376     case TOP_LINE_STRIP_BF:
377     case TOP_LINE_STRIP_CONT_BF:
378     case TOP_TRIANGLE_FAN_NOSTIPPLE:
379     case TOP_TRI_STRIP_REVERSE:
380     case TOP_PATCHLIST_BASE:
381     case TOP_UNKNOWN:
382         SWR_INVALID("Unsupported topology: %d", mode);
383         return 0;
384     }
385 
386     return 0;
387 }
388 
389 //////////////////////////////////////////////////////////////////////////
390 /// @brief Return number of verts per primitive.
391 /// @param topology - topology
392 /// @param includeAdjVerts - include adjacent verts in primitive vertices
NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology,bool includeAdjVerts)393 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
394 {
395     uint32_t numVerts = 0;
396     switch (topology)
397     {
398     case TOP_POINT_LIST:
399     case TOP_POINT_LIST_BF:
400         numVerts = 1;
401         break;
402     case TOP_LINE_LIST:
403     case TOP_LINE_STRIP:
404     case TOP_LINE_LIST_ADJ:
405     case TOP_LINE_LOOP:
406     case TOP_LINE_STRIP_CONT:
407     case TOP_LINE_STRIP_BF:
408     case TOP_LISTSTRIP_ADJ:
409         numVerts = 2;
410         break;
411     case TOP_TRIANGLE_LIST:
412     case TOP_TRIANGLE_STRIP:
413     case TOP_TRIANGLE_FAN:
414     case TOP_TRI_LIST_ADJ:
415     case TOP_TRI_STRIP_ADJ:
416     case TOP_TRI_STRIP_REVERSE:
417     case TOP_RECT_LIST:
418         numVerts = 3;
419         break;
420     case TOP_QUAD_LIST:
421     case TOP_QUAD_STRIP:
422         numVerts = 4;
423         break;
424     case TOP_PATCHLIST_1:
425     case TOP_PATCHLIST_2:
426     case TOP_PATCHLIST_3:
427     case TOP_PATCHLIST_4:
428     case TOP_PATCHLIST_5:
429     case TOP_PATCHLIST_6:
430     case TOP_PATCHLIST_7:
431     case TOP_PATCHLIST_8:
432     case TOP_PATCHLIST_9:
433     case TOP_PATCHLIST_10:
434     case TOP_PATCHLIST_11:
435     case TOP_PATCHLIST_12:
436     case TOP_PATCHLIST_13:
437     case TOP_PATCHLIST_14:
438     case TOP_PATCHLIST_15:
439     case TOP_PATCHLIST_16:
440     case TOP_PATCHLIST_17:
441     case TOP_PATCHLIST_18:
442     case TOP_PATCHLIST_19:
443     case TOP_PATCHLIST_20:
444     case TOP_PATCHLIST_21:
445     case TOP_PATCHLIST_22:
446     case TOP_PATCHLIST_23:
447     case TOP_PATCHLIST_24:
448     case TOP_PATCHLIST_25:
449     case TOP_PATCHLIST_26:
450     case TOP_PATCHLIST_27:
451     case TOP_PATCHLIST_28:
452     case TOP_PATCHLIST_29:
453     case TOP_PATCHLIST_30:
454     case TOP_PATCHLIST_31:
455     case TOP_PATCHLIST_32:
456         numVerts = topology - TOP_PATCHLIST_BASE;
457         break;
458     default:
459         SWR_INVALID("Unsupported topology: %d", topology);
460         break;
461     }
462 
463     if (includeAdjVerts)
464     {
465         switch (topology)
466         {
467         case TOP_LISTSTRIP_ADJ:
468         case TOP_LINE_LIST_ADJ: numVerts = 4; break;
469         case TOP_TRI_STRIP_ADJ:
470         case TOP_TRI_LIST_ADJ: numVerts = 6; break;
471         default: break;
472         }
473     }
474 
475     return numVerts;
476 }
477 
478 //////////////////////////////////////////////////////////////////////////
479 /// @brief Generate mask from remaining work.
480 /// @param numWorkItems - Number of items being worked on by a SIMD.
GenerateMask(uint32_t numItemsRemaining)481 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
482 {
483     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
484     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
485     return _simd_castps_si(_simd_vmask_ps(mask));
486 }
487 
GenerateMask16(uint32_t numItemsRemaining)488 static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
489 {
490     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
491     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
492     return _simd16_castps_si(_simd16_vmask_ps(mask));
493 }
494 
495 //////////////////////////////////////////////////////////////////////////
496 /// @brief StreamOut - Streams vertex data out to SO buffers.
497 ///        Generally, we are only streaming out a SIMDs worth of triangles.
498 /// @param pDC - pointer to draw context.
499 /// @param workerId - thread's worker id. Even thread has a unique id.
500 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
StreamOut(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,uint32_t * pPrimData,uint32_t streamIndex)501 static void StreamOut(
502     DRAW_CONTEXT* pDC,
503     PA_STATE& pa,
504     uint32_t workerId,
505     uint32_t* pPrimData,
506     uint32_t streamIndex)
507 {
508     SWR_CONTEXT *pContext = pDC->pContext;
509 
510     AR_BEGIN(FEStreamout, pDC->drawId);
511 
512     const API_STATE& state = GetApiState(pDC);
513     const SWR_STREAMOUT_STATE &soState = state.soState;
514 
515     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
516 
517     // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
518     uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
519 
520     SWR_STREAMOUT_CONTEXT soContext = { 0 };
521 
522     // Setup buffer state pointers.
523     for (uint32_t i = 0; i < 4; ++i)
524     {
525         soContext.pBuffer[i] = &state.soBuffer[i];
526     }
527 
528     uint32_t numPrims = pa.NumPrims();
529 
530     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
531     {
532         DWORD slot = 0;
533         uint32_t soMask = soState.streamMasks[streamIndex];
534 
535         // Write all entries into primitive data buffer for SOS.
536         while (_BitScanForward(&slot, soMask))
537         {
538             simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
539             uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
540             pa.AssembleSingle(paSlot, primIndex, attrib);
541 
542             // Attribute offset is relative offset from start of vertex.
543             // Note that attributes start at slot 1 in the PA buffer. We need to write this
544             // to prim data starting at slot 0. Which is why we do (slot - 1).
545             // Also note: GL works slightly differently, and needs slot 0
546             uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
547 
548             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
549             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
550             {
551                 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
552 
553                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
554             }
555 
556             soMask &= ~(1 << slot);
557         }
558 
559         // Update pPrimData pointer
560         soContext.pPrimData = pPrimData;
561 
562         // Call SOS
563         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
564         state.pfnSoFunc[streamIndex](soContext);
565     }
566 
567     // Update SO write offset. The driver provides memory for the update.
568     for (uint32_t i = 0; i < 4; ++i)
569     {
570         if (state.soBuffer[i].pWriteOffset)
571         {
572             *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
573         }
574 
575         if (state.soBuffer[i].soWriteEnable)
576         {
577             pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
578             pDC->dynState.SoWriteOffsetDirty[i] = true;
579         }
580     }
581 
582     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
583     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
584 
585     AR_END(FEStreamout, 1);
586 }
587 
588 #if USE_SIMD16_FRONTEND
589 //////////////////////////////////////////////////////////////////////////
590 /// Is value an even number (a multiple of two)
591 ///
592 template <typename T>
IsEven(T value)593 INLINE static bool IsEven(T value)
594 {
595     return (value & 1) == 0;
596 }
597 
598 //////////////////////////////////////////////////////////////////////////
599 /// Round up value to an even number (a multiple of two)
600 ///
601 template <typename T>
RoundUpEven(T value)602 INLINE static T RoundUpEven(T value)
603 {
604     return (value + 1) & ~1;
605 }
606 
607 //////////////////////////////////////////////////////////////////////////
608 /// Round down value to an even number (a multiple of two)
609 ///
610 template <typename T>
RoundDownEven(T value)611 INLINE static T RoundDownEven(T value)
612 {
613     return value & ~1;
614 }
615 
616 //////////////////////////////////////////////////////////////////////////
617 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
618 ///
619 /// vertexCount is in terms of the source simdvertexes and must be even
620 ///
621 /// attribCount will limit the vector copies to those attribs specified
622 ///
623 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
624 ///
PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex * vertex_simd16,const simdvertex * vertex,uint32_t vertexCount,uint32_t attribCount)625 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
626 {
627     SWR_ASSERT(vertex);
628     SWR_ASSERT(vertex_simd16);
629     SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
630 
631     simd16vertex temp;
632 
633     for (uint32_t i = 0; i < vertexCount; i += 2)
634     {
635         for (uint32_t j = 0; j < attribCount; j += 1)
636         {
637             for (uint32_t k = 0; k < 4; k += 1)
638             {
639                 temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
640 
641                 if ((i + 1) < vertexCount)
642                 {
643                     temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
644                 }
645             }
646         }
647 
648         for (uint32_t j = 0; j < attribCount; j += 1)
649         {
650             vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
651         }
652     }
653 }
654 
655 #endif
656 //////////////////////////////////////////////////////////////////////////
657 /// @brief Computes number of invocations. The current index represents
658 ///        the start of the SIMD. The max index represents how much work
659 ///        items are remaining. If there is less then a SIMD's xmin of work
660 ///        then return the remaining amount of work.
661 /// @param curIndex - The start index for the SIMD.
662 /// @param maxIndex - The last index for all work items.
GetNumInvocations(uint32_t curIndex,uint32_t maxIndex)663 static INLINE uint32_t GetNumInvocations(
664     uint32_t curIndex,
665     uint32_t maxIndex)
666 {
667     uint32_t remainder = (maxIndex - curIndex);
668 #if USE_SIMD16_FRONTEND
669     return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
670 #else
671     return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
672 #endif
673 }
674 
675 //////////////////////////////////////////////////////////////////////////
676 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
677 ///        The geometry shader will loop over each active streamout buffer, assembling
678 ///        primitives for the downstream stages. When multistream output is enabled,
679 ///        the generated stream ID buffer from the GS needs to be converted to a cut
680 ///        buffer for the primitive assembler.
681 /// @param stream - stream id to generate the cut buffer for
682 /// @param pStreamIdBase - pointer to the stream ID buffer
683 /// @param numEmittedVerts - Number of total verts emitted by the GS
684 /// @param pCutBuffer - output buffer to write cuts to
ProcessStreamIdBuffer(uint32_t stream,uint8_t * pStreamIdBase,uint32_t numEmittedVerts,uint8_t * pCutBuffer)685 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
686 {
687     SWR_ASSERT(stream < MAX_SO_STREAMS);
688 
689     uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
690     uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
691 
692     for (uint32_t b = 0; b < numOutputBytes; ++b)
693     {
694         uint8_t curInputByte = pStreamIdBase[2*b];
695         uint8_t outByte = 0;
696         for (uint32_t i = 0; i < 4; ++i)
697         {
698             if ((curInputByte & 0x3) != stream)
699             {
700                 outByte |= (1 << i);
701             }
702             curInputByte >>= 2;
703         }
704 
705         curInputByte = pStreamIdBase[2 * b + 1];
706         for (uint32_t i = 0; i < 4; ++i)
707         {
708             if ((curInputByte & 0x3) != stream)
709             {
710                 outByte |= (1 << (i + 4));
711             }
712             curInputByte >>= 2;
713         }
714 
715         *pCutBuffer++ = outByte;
716     }
717 }
718 
// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    // GS input vertex data; the GS stage reads it as an array of simdvector.
    uint8_t* pGsIn;
    // One GS output stream buffer per SIMD lane (KNOB_SIMD_WIDTH entries).
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    // NOTE(review): presumably holds the GS output after the SOA to AOS
    // transpose - confirm against the GS stage.
    uint8_t* pGsTransposed;
    // NOTE(review): presumably the cut buffer generated from the stream ID
    // buffer when multistream output is used - confirm against the GS stage.
    void* pStreamCutBuffer;
};
727 
728 //////////////////////////////////////////////////////////////////////////
729 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
730 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
731 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
732 /// @param numVerts - Number of vertices outputted by the GS
733 /// @param numAttribs - Number of attributes per vertex
734 template<typename SIMD_T, uint32_t SimdWidth>
TransposeSOAtoAOS(uint8_t * pDst,uint8_t * pSrc,uint32_t numVerts,uint32_t numAttribs)735 void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
736 {
737     uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
738     uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;
739 
740     OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
741 
742     for (uint32_t i = 0; i < SimdWidth; ++i)
743     {
744         gatherOffsets[i] = srcVertexStride * i;
745     }
746     auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);
747 
748     uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
749     uint32_t remainingVerts = numVerts;
750 
751     for (uint32_t s = 0; s < numSimd; ++s)
752     {
753         uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
754         uint8_t* pDstBase = pDst + s * dstVertexStride;
755 
756         // Compute mask to prevent src overflow
757         uint32_t mask = std::min(remainingVerts, SimdWidth);
758         mask = GenMask(mask);
759         auto vMask = SIMD_T::vmask_ps(mask);
760         auto viMask = SIMD_T::castps_si(vMask);
761 
762         for (uint32_t a = 0; a < numAttribs; ++a)
763         {
764             auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
765             auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
766             auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
767             auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);
768 
769             SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
770             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
771             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
772             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);
773 
774             pSrcBase += sizeof(float) * 4;
775             pDstBase += sizeof(typename SIMD_T::Float) * 4;
776         }
777         remainingVerts -= SimdWidth;
778     }
779 }
780 
781 
782 //////////////////////////////////////////////////////////////////////////
783 /// @brief Implements GS stage.
784 /// @param pDC - pointer to draw context.
785 /// @param workerId - thread's worker id. Even thread has a unique id.
786 /// @param pa - The primitive assembly object.
787 /// @param pGsOut - output stream for GS
788 template <
789     typename HasStreamOutT,
790     typename HasRastT>
GeometryShaderStage(DRAW_CONTEXT * pDC,uint32_t workerId,PA_STATE & pa,GsBuffers * pGsBuffers,uint32_t * pSoPrimData,uint32_t numPrims_simd8,simdscalari const & primID)791 static void GeometryShaderStage(
792     DRAW_CONTEXT *pDC,
793     uint32_t workerId,
794     PA_STATE& pa,
795     GsBuffers* pGsBuffers,
796     uint32_t* pSoPrimData,
797 #if USE_SIMD16_FRONTEND
798     uint32_t numPrims_simd8,
799 #endif
800     simdscalari const &primID)
801 {
802     SWR_CONTEXT *pContext = pDC->pContext;
803 
804     AR_BEGIN(FEGeometryShader, pDC->drawId);
805 
806     const API_STATE& state = GetApiState(pDC);
807     const SWR_GS_STATE* pState = &state.gsState;
808     SWR_GS_CONTEXT gsContext;
809 
810     static uint8_t sNullBuffer[128] = { 0 };
811 
812     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
813     {
814         gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
815     }
816     gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
817     gsContext.PrimitiveID = primID;
818 
819     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
820     simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
821 
822     // assemble all attributes for the input primitive
823     gsContext.inputVertStride = pState->inputVertStride;
824     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
825     {
826         uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
827         uint32_t attribSlot = pState->vertexAttribOffset + slot;
828         pa.Assemble(srcAttribSlot, attrib);
829 
830         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
831         {
832             gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
833         }
834     }
835 
836     // assemble position
837     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
838     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
839     {
840         gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
841     }
842 
843     // record valid prims from the frontend to avoid over binning the newly generated
844     // prims from the GS
845 #if USE_SIMD16_FRONTEND
846     uint32_t numInputPrims = numPrims_simd8;
847 #else
848     uint32_t numInputPrims = pa.NumPrims();
849 #endif
850 
851     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
852     {
853         gsContext.InstanceID = instance;
854         gsContext.mask = GenerateMask(numInputPrims);
855 
856         // execute the geometry shader
857         state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
858 
859         for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
860         {
861             gsContext.pStreams[i] += pState->allocationSize;
862         }
863     }
864 
865     // set up new binner and state for the GS output topology
866 #if USE_SIMD16_FRONTEND
867     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
868     if (HasRastT::value)
869     {
870         switch (pState->outputTopology)
871         {
872         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
873         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
874         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
875         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
876         }
877     }
878 
879 #else
880     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
881     if (HasRastT::value)
882     {
883         switch (pState->outputTopology)
884         {
885         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
886         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
887         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
888         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
889         }
890     }
891 
892 #endif
893     // foreach input prim:
894     // - setup a new PA based on the emitted verts for that prim
895     // - loop over the new verts, calling PA to assemble each prim
896     uint32_t* pPrimitiveId = (uint32_t*)&primID;
897 
898     uint32_t totalPrimsGenerated = 0;
899     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
900     {
901         uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
902 
903         // Vertex count is either emitted by shader or static
904         uint32_t vertexCount = 0;
905         if (pState->staticVertexCount)
906         {
907             vertexCount = pState->staticVertexCount;
908         }
909         else
910         {
911             // If emitted in shader, it should be the stored in the first dword of the output buffer
912             vertexCount = *(uint32_t*)pInstanceBase;
913         }
914 
915         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
916         {
917             uint32_t numEmittedVerts = vertexCount;
918             if (numEmittedVerts == 0)
919             {
920                 continue;
921             }
922 
923             uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
924             uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
925             uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
926 
927 #if USE_SIMD16_FRONTEND
928             TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
929 #else
930             TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
931 #endif
932 
933             uint32_t numAttribs = state.feNumAttributes;
934 
935             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
936             {
937                 bool processCutVerts = false;
938                 uint8_t* pCutBuffer = pCutBase;
939 
940                 // assign default stream ID, only relevant when GS is outputting a single stream
941                 uint32_t streamID = 0;
942                 if (pState->isSingleStream)
943                 {
944                     processCutVerts = true;
945                     streamID = pState->singleStreamID;
946                     if (streamID != stream) continue;
947                 }
948                 else
949                 {
950                     // early exit if this stream is not enabled for streamout
951                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
952                     {
953                         continue;
954                     }
955 
956                     // multi-stream output, need to translate StreamID buffer to a cut buffer
957                     ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
958                     pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
959                     processCutVerts = false;
960                 }
961 
962 #if USE_SIMD16_FRONTEND
963                 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
964 
965 #else
966                 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
967 
968 #endif
969                 while (gsPa.GetNextStreamOutput())
970                 {
971                     do
972                     {
973 #if USE_SIMD16_FRONTEND
974                         simd16vector attrib_simd16[3];
975 
976                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
977 
978 #else
979                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
980 
981 #endif
982                         if (assemble)
983                         {
984                             totalPrimsGenerated += gsPa.NumPrims();
985 
986                             if (HasStreamOutT::value)
987                             {
988 #if ENABLE_AVX512_SIMD16
989                                 gsPa.useAlternateOffset = false;
990 #endif
991                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
992                             }
993 
994                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
995                             {
996 #if USE_SIMD16_FRONTEND
997                                 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
998 
999                                 // Gather data from the SVG if provided.
1000                                 simd16scalari vViewportIdx = SIMD16::setzero_si();
1001                                 simd16scalari vRtIdx = SIMD16::setzero_si();
1002                                 SIMD16::Vec4 svgAttrib[4];
1003 
1004                                 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
1005                                 {
1006                                     gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1007                                 }
1008 
1009 
1010                                 if (state.backendState.readViewportArrayIndex)
1011                                 {
1012                                     vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1013                                     gsPa.viewportArrayActive = true;
1014                                 }
1015                                 if (state.backendState.readRenderTargetArrayIndex)
1016                                 {
1017                                     vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1018                                     gsPa.rtArrayActive = true;
1019                                 }
1020 
1021                                 {
1022                                     // OOB VPAI indices => forced to zero.
1023                                     vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1024                                     simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1025                                     simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1026                                     vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
1027 
1028                                     gsPa.useAlternateOffset = false;
1029                                     pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
1030                                 }
1031 #else
1032                                 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
1033 
1034                                 // Gather data from the SVG if provided.
1035                                 simdscalari vViewportIdx = SIMD16::setzero_si();
1036                                 simdscalari vRtIdx = SIMD16::setzero_si();
1037                                 SIMD8::Vec4 svgAttrib[4];
1038 
1039                                 if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
1040                                 {
1041                                     tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1042                                 }
1043 
1044 
1045                                 if (state.backendState.readViewportArrayIndex)
1046                                 {
1047                                     vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1048 
1049                                     // OOB VPAI indices => forced to zero.
1050                                     vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
1051                                     simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1052                                     simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
1053                                     vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
1054                                     tessPa.viewportArrayActive = true;
1055                                 }
1056                                 if (state.backendState.readRenderTargetArrayIndex)
1057                                 {
1058                                     vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1059                                     tessPa.rtArrayActive = true;
1060                                 }
1061 
1062                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
1063 #endif
1064                             }
1065                         }
1066                     } while (gsPa.NextPrim());
1067                 }
1068             }
1069         }
1070     }
1071 
1072     // update GS pipeline stats
1073     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
1074     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
1075     AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
1076     AR_END(FEGeometryShader, 1);
1077 }
1078 
1079 //////////////////////////////////////////////////////////////////////////
1080 /// @brief Allocate GS buffers
1081 /// @param pDC - pointer to draw context.
1082 /// @param state - API state
1083 /// @param ppGsOut - pointer to GS output buffer allocation
1084 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1085 template<typename SIMD_T, uint32_t SIMD_WIDTH>
AllocateGsBuffers(DRAW_CONTEXT * pDC,const API_STATE & state,uint32_t vertsPerPrim,GsBuffers * pGsBuffers)1086 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
1087 {
1088     auto pArena = pDC->pArena;
1089     SWR_ASSERT(pArena != nullptr);
1090     SWR_ASSERT(state.gsState.gsEnable);
1091 
1092     const SWR_GS_STATE& gsState = state.gsState;
1093 
1094     // Allocate storage for vertex inputs
1095     uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
1096     pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
1097 
1098     // Allocate arena space to hold GS output verts
1099     const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
1100 
1101     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1102     {
1103         pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
1104     }
1105 
1106     // Allocate storage for transposed GS output
1107     uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
1108     uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);
1109     pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
1110 
1111     // Allocate storage to hold temporary stream->cut buffer, if necessary
1112     if (state.gsState.isSingleStream)
1113     {
1114         pGsBuffers->pStreamCutBuffer = nullptr;
1115     }
1116     else
1117     {
1118         pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
1119     }
1120 }
1121 
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
/// The whole struct is zero-filled with memset when it is first allocated
/// (see AllocateTessellationData), so all pointers start out null and all
/// sizes start out 0.
struct TessellationThreadLocalData
{
    // HS context that TessellationStages fills in and passes to pfnHsFunc.
    SWR_HS_CONTEXT hsContext;
    // Storage that TessellationStages assigns to hsContext.pCPout; it is
    // indexed per input primitive when running the tessellator and the DS.
    ScalarPatch patchData[KNOB_SIMD_WIDTH];
    // Memory block handed to TSInitCtx; allocated by TessellationStages when
    // TSInitCtx returns nullptr.
    void* pTxCtx;
    // Size value passed to TSInitCtx together with pTxCtx and used as the
    // allocation size for pTxCtx.
    size_t tsCtxSize;

    // DS output buffer; reallocated by TessellationStages whenever a patch
    // needs more than dsOutputAllocSize bytes.
    simdscalar* pDSOutput;
    // Size in bytes of the current pDSOutput allocation.
    size_t dsOutputAllocSize;
};

// Tessellation data pointer declared with the THREAD storage qualifier;
// allocated on first use by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1137 
1138 //////////////////////////////////////////////////////////////////////////
1139 /// @brief Allocate tessellation data for this worker thread.
1140 INLINE
AllocateTessellationData(SWR_CONTEXT * pContext)1141 static void AllocateTessellationData(SWR_CONTEXT* pContext)
1142 {
1143     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
1144     if (gt_pTessellationThreadData == nullptr)
1145     {
1146         gt_pTessellationThreadData = (TessellationThreadLocalData*)
1147             AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1148         memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1149     }
1150 }
1151 
1152 //////////////////////////////////////////////////////////////////////////
1153 /// @brief Implements Tessellation Stages.
1154 /// @param pDC - pointer to draw context.
1155 /// @param workerId - thread's worker id. Even thread has a unique id.
1156 /// @param pa - The primitive assembly object.
1157 /// @param pGsOut - output stream for GS
1158 template <
1159     typename HasGeometryShaderT,
1160     typename HasStreamOutT,
1161     typename HasRastT>
TessellationStages(DRAW_CONTEXT * pDC,uint32_t workerId,PA_STATE & pa,GsBuffers * pGsBuffers,uint32_t * pSoPrimData,uint32_t numPrims_simd8,simdscalari const & primID)1162 static void TessellationStages(
1163     DRAW_CONTEXT *pDC,
1164     uint32_t workerId,
1165     PA_STATE& pa,
1166     GsBuffers* pGsBuffers,
1167     uint32_t* pSoPrimData,
1168 #if USE_SIMD16_FRONTEND
1169     uint32_t numPrims_simd8,
1170 #endif
1171     simdscalari const &primID)
1172 {
1173     SWR_CONTEXT *pContext = pDC->pContext;
1174     const API_STATE& state = GetApiState(pDC);
1175     const SWR_TS_STATE& tsState = state.tsState;
1176 
1177     SWR_ASSERT(gt_pTessellationThreadData);
1178 
1179     HANDLE tsCtx = TSInitCtx(
1180         tsState.domain,
1181         tsState.partitioning,
1182         tsState.tsOutputTopology,
1183         gt_pTessellationThreadData->pTxCtx,
1184         gt_pTessellationThreadData->tsCtxSize);
1185     if (tsCtx == nullptr)
1186     {
1187         gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
1188         tsCtx = TSInitCtx(
1189             tsState.domain,
1190             tsState.partitioning,
1191             tsState.tsOutputTopology,
1192             gt_pTessellationThreadData->pTxCtx,
1193             gt_pTessellationThreadData->tsCtxSize);
1194     }
1195     SWR_ASSERT(tsCtx);
1196 
1197 #if USE_SIMD16_FRONTEND
1198     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
1199     if (HasRastT::value)
1200     {
1201         switch (tsState.postDSTopology)
1202         {
1203         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
1204         case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
1205         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
1206         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1207         }
1208     }
1209 
1210 #else
1211     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
1212     if (HasRastT::value)
1213     {
1214         switch (tsState.postDSTopology)
1215         {
1216         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1217         case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
1218         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
1219         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
1220         }
1221     }
1222 
1223 #endif
1224     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1225     hsContext.pCPout = gt_pTessellationThreadData->patchData;
1226     hsContext.PrimitiveID = primID;
1227 
1228     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1229     // Max storage for one attribute for an entire simdprimitive
1230     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1231 
1232     // assemble all attributes for the input primitives
1233     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1234     {
1235         uint32_t attribSlot = tsState.vertexAttribOffset + slot;
1236         pa.Assemble(attribSlot, simdattrib);
1237 
1238         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1239         {
1240             hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
1241         }
1242     }
1243 
1244 #if defined(_DEBUG)
1245     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1246 #endif
1247 
1248 #if USE_SIMD16_FRONTEND
1249     uint32_t numPrims = numPrims_simd8;
1250 #else
1251     uint32_t numPrims = pa.NumPrims();
1252 #endif
1253     hsContext.mask = GenerateMask(numPrims);
1254 
1255     // Run the HS
1256     AR_BEGIN(FEHullShader, pDC->drawId);
1257     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1258     AR_END(FEHullShader, 0);
1259 
1260     UPDATE_STAT_FE(HsInvocations, numPrims);
1261 
1262     const uint32_t* pPrimId = (const uint32_t*)&primID;
1263 
1264     for (uint32_t p = 0; p < numPrims; ++p)
1265     {
1266         // Run Tessellator
1267         SWR_TS_TESSELLATED_DATA tsData = { 0 };
1268         AR_BEGIN(FETessellation, pDC->drawId);
1269         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1270         AR_EVENT(TessPrimCount(1));
1271         AR_END(FETessellation, 0);
1272 
1273         if (tsData.NumPrimitives == 0)
1274         {
1275             continue;
1276         }
1277         SWR_ASSERT(tsData.NumDomainPoints);
1278 
1279         // Allocate DS Output memory
1280         uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
1281 #if USE_SIMD16_FRONTEND
1282         size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize;      // simd8 -> simd16, padding
1283 #else
1284         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
1285         size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1286 #endif
1287         if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
1288         {
1289             AlignedFree(gt_pTessellationThreadData->pDSOutput);
1290             gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1291             gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
1292         }
1293         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1294         SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
1295 
1296 #if defined(_DEBUG)
1297         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1298 #endif
1299 
1300         // Run Domain Shader
1301         SWR_DS_CONTEXT dsContext;
1302         dsContext.PrimitiveID = pPrimId[p];
1303         dsContext.pCpIn = &hsContext.pCPout[p];
1304         dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1305         dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1306         dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1307         dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
1308 #if USE_SIMD16_FRONTEND
1309         dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
1310 #else
1311         dsContext.vectorStride = requiredDSVectorInvocations;
1312 #endif
1313 
1314         uint32_t dsInvocations = 0;
1315 
1316         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1317         {
1318             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
1319 
1320             AR_BEGIN(FEDomainShader, pDC->drawId);
1321             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1322             AR_END(FEDomainShader, 0);
1323 
1324             dsInvocations += KNOB_SIMD_WIDTH;
1325         }
1326         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
1327 
1328 #if USE_SIMD16_FRONTEND
1329         SWR_ASSERT(IsEven(dsContext.vectorStride));                             // simd8 -> simd16
1330 
1331 #endif
1332         PA_TESS tessPa(
1333             pDC,
1334 #if USE_SIMD16_FRONTEND
1335             reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),      // simd8 -> simd16
1336             dsContext.vectorStride / 2,                                         // simd8 -> simd16
1337 #else
1338             dsContext.pOutputData,
1339             dsContext.vectorStride,
1340 #endif
1341             SWR_VTX_NUM_SLOTS,
1342             tsState.numDsOutputAttribs,
1343             tsData.ppIndices,
1344             tsData.NumPrimitives,
1345             tsState.postDSTopology,
1346             numVertsPerPrim);
1347 
1348         while (tessPa.HasWork())
1349         {
1350 #if USE_SIMD16_FRONTEND
1351             const uint32_t numPrims = tessPa.NumPrims();
1352             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1353             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1354 
1355             const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
1356             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1357             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1358 
1359 #endif
1360             if (HasGeometryShaderT::value)
1361             {
1362 #if USE_SIMD16_FRONTEND
1363                 tessPa.useAlternateOffset = false;
1364                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1365 
1366                 if (numPrims_hi)
1367                 {
1368                     tessPa.useAlternateOffset = true;
1369                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1370                 }
1371 #else
1372                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1373                     pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
1374 #endif
1375             }
1376             else
1377             {
1378                 if (HasStreamOutT::value)
1379                 {
1380 #if ENABLE_AVX512_SIMD16
1381                     tessPa.useAlternateOffset = false;
1382 #endif
1383                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1384                 }
1385 
1386                 if (HasRastT::value)
1387                 {
1388 #if USE_SIMD16_FRONTEND
1389                     simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
1390 #else
1391                     simdvector      prim[3];        // Only deal with triangles, lines, or points
1392 #endif
1393                     AR_BEGIN(FEPAAssemble, pDC->drawId);
1394                     bool assemble =
1395 #if USE_SIMD16_FRONTEND
1396                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1397 #else
1398                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1399 #endif
1400                     AR_END(FEPAAssemble, 1);
1401                     SWR_ASSERT(assemble);
1402 
1403                     SWR_ASSERT(pfnClipFunc);
1404 #if USE_SIMD16_FRONTEND
1405                     // Gather data from the SVG if provided.
1406                     simd16scalari vViewportIdx = SIMD16::setzero_si();
1407                     simd16scalari vRtIdx = SIMD16::setzero_si();
1408                     SIMD16::Vec4 svgAttrib[4];
1409 
1410                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
1411                     {
1412                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1413                     }
1414 
1415 
1416                     if (state.backendState.readViewportArrayIndex)
1417                     {
1418                         vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1419                         tessPa.viewportArrayActive = true;
1420                     }
1421                     if (state.backendState.readRenderTargetArrayIndex)
1422                     {
1423                         vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1424                         tessPa.rtArrayActive = true;
1425                     }
1426 
1427 
1428                     {
1429                         // OOB VPAI indices => forced to zero.
1430                         vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
1431                         simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1432                         simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
1433                         vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
1434 
1435                         tessPa.useAlternateOffset = false;
1436                         pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx);
1437                     }
1438 #else
1439                     // Gather data from the SVG if provided.
1440                     simdscalari vViewportIdx = SIMD16::setzero_si();
1441                     simdscalari vRtIdx = SIMD16::setzero_si();
1442                     SIMD8::Vec4 svgAttrib[4];
1443 
1444                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
1445                     {
1446                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1447                     }
1448 
1449                     if (state.backendState.readViewportArrayIndex)
1450                     {
1451                         vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1452 
1453                         // OOB VPAI indices => forced to zero.
1454                         vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
1455                         simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1456                         simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
1457                         vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
1458                         tessPa.viewportArrayActive = true;
1459                     }
1460                     if (state.backendState.readRenderTargetArrayIndex)
1461                     {
1462                         vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1463                         tessPa.rtArrayActive = true;
1464                     }
1465                     pfnClipFunc(pDC, tessPa, workerId, prim,
1466                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx);
1467 #endif
1468                 }
1469             }
1470 
1471             tessPa.NextPrim();
1472 
1473         } // while (tessPa.HasWork())
1474     } // for (uint32_t p = 0; p < numPrims; ++p)
1475 
1476 #if USE_SIMD16_FRONTEND
1477     if (gt_pTessellationThreadData->pDSOutput != nullptr)
1478     {
1479         AlignedFree(gt_pTessellationThreadData->pDSOutput);
1480         gt_pTessellationThreadData->pDSOutput = nullptr;
1481     }
1482     gt_pTessellationThreadData->dsOutputAllocSize = 0;
1483 
1484 #endif
1485     TSDestroyCtx(tsCtx);
1486 }
1487 
1488 THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
1489 THREAD uint32_t gVertexStoreSize = 0;
1490 
1491 //////////////////////////////////////////////////////////////////////////
1492 /// @brief FE handler for SwrDraw.
1493 /// @tparam IsIndexedT - Is indexed drawing enabled
1494 /// @tparam HasTessellationT - Is tessellation enabled
1495 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1496 /// @tparam HasStreamOutT - Is stream-out enabled
1497 /// @tparam HasRastT - Is rasterization enabled
1498 /// @param pContext - pointer to SWR context.
1499 /// @param pDC - pointer to draw context.
1500 /// @param workerId - thread's worker id.
1501 /// @param pUserData - Pointer to DRAW_WORK
1502 template <
1503     typename IsIndexedT,
1504     typename IsCutIndexEnabledT,
1505     typename HasTessellationT,
1506     typename HasGeometryShaderT,
1507     typename HasStreamOutT,
1508     typename HasRastT>
ProcessDraw(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,void * pUserData)1509 void ProcessDraw(
1510     SWR_CONTEXT *pContext,
1511     DRAW_CONTEXT *pDC,
1512     uint32_t workerId,
1513     void *pUserData)
1514 {
1515 
1516 #if KNOB_ENABLE_TOSS_POINTS
1517     if (KNOB_TOSS_QUEUE_FE)
1518     {
1519         return;
1520     }
1521 #endif
1522 
1523     AR_BEGIN(FEProcessDraw, pDC->drawId);
1524 
1525     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
1526     const API_STATE&    state = GetApiState(pDC);
1527 
1528     uint32_t indexSize = 0;
1529     uint32_t endVertex = work.numVerts;
1530 
1531     const int32_t* pLastRequestedIndex = nullptr;
1532     if (IsIndexedT::value)
1533     {
1534         switch (work.type)
1535         {
1536         case R32_UINT:
1537             indexSize = sizeof(uint32_t);
1538             pLastRequestedIndex = &(work.pIB[endVertex]);
1539             break;
1540         case R16_UINT:
1541             indexSize = sizeof(uint16_t);
1542             // nasty address offset to last index
1543             pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1544             break;
1545         case R8_UINT:
1546             indexSize = sizeof(uint8_t);
1547             // nasty address offset to last index
1548             pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1549             break;
1550         default:
1551             SWR_INVALID("Invalid work.type: %d", work.type);
1552         }
1553     }
1554     else
1555     {
1556         // No cuts, prune partial primitives.
1557         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1558     }
1559 
1560 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1561     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
1562 #endif
1563 
1564     GsBuffers gsBuffers;
1565     if (HasGeometryShaderT::value)
1566     {
1567 #if USE_SIMD16_FRONTEND
1568         AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1569 #else
1570         AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
1571 #endif
1572     }
1573 
1574     if (HasTessellationT::value)
1575     {
1576         SWR_ASSERT(state.tsState.tsEnable == true);
1577         SWR_ASSERT(state.pfnHsFunc != nullptr);
1578         SWR_ASSERT(state.pfnDsFunc != nullptr);
1579 
1580         AllocateTessellationData(pContext);
1581     }
1582     else
1583     {
1584         SWR_ASSERT(state.tsState.tsEnable == false);
1585         SWR_ASSERT(state.pfnHsFunc == nullptr);
1586         SWR_ASSERT(state.pfnDsFunc == nullptr);
1587     }
1588 
1589     // allocate space for streamout input prim data
1590     uint32_t* pSoPrimData = nullptr;
1591     if (HasStreamOutT::value)
1592     {
1593         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1594     }
1595 
1596     const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
1597 #if USE_SIMD16_FRONTEND
1598     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
1599 #else
1600     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
1601 #endif
1602 
1603     SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
1604 
1605     // Compute storage requirements for vertex store
1606     // TODO: allocation needs to be rethought for better cut support
1607     uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
1608     uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
1609 
1610     // grow the vertex store for the PA as necessary
1611     if (gVertexStoreSize < vertexStoreSize)
1612     {
1613         if (gpVertexStore != nullptr)
1614         {
1615             AlignedFree(gpVertexStore);
1616             gpVertexStore = nullptr;
1617         }
1618 
1619         SWR_ASSERT(gpVertexStore == nullptr);
1620 
1621         gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
1622         gVertexStoreSize = vertexStoreSize;
1623 
1624         SWR_ASSERT(gpVertexStore != nullptr);
1625     }
1626 
1627     // choose primitive assembler
1628 
1629     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
1630     PA_STATE& pa = paFactory.GetPA();
1631 
1632 #if USE_SIMD16_FRONTEND
1633 #if USE_SIMD16_SHADERS
1634     simd16vertex        vin;
1635 #else
1636     simdvertex          vin_lo;
1637     simdvertex          vin_hi;
1638 #endif
1639     SWR_VS_CONTEXT      vsContext_lo;
1640     SWR_VS_CONTEXT      vsContext_hi;
1641 
1642 #if USE_SIMD16_SHADERS
1643     vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin);
1644     vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin);
1645 #else
1646     vsContext_lo.pVin = &vin_lo;
1647     vsContext_hi.pVin = &vin_hi;
1648 #endif
1649     vsContext_lo.AlternateOffset = 0;
1650     vsContext_hi.AlternateOffset = 1;
1651 
1652     SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };
1653 
1654     fetchInfo_lo.pStreams = &state.vertexBuffers[0];
1655     fetchInfo_lo.StartInstance = work.startInstance;
1656     fetchInfo_lo.StartVertex = 0;
1657 
1658     if (IsIndexedT::value)
1659     {
1660         fetchInfo_lo.BaseVertex = work.baseVertex;
1661 
1662         // if the entire index buffer isn't being consumed, set the last index
1663         // so that fetches < a SIMD wide will be masked off
1664         fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1665         if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
1666         {
1667             fetchInfo_lo.pLastIndex = pLastRequestedIndex;
1668         }
1669     }
1670     else
1671     {
1672         fetchInfo_lo.StartVertex = work.startVertex;
1673     }
1674 
1675     SWR_FETCH_CONTEXT   fetchInfo_hi = fetchInfo_lo;
1676 
1677     const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1678 
1679     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1680     {
1681         uint32_t  i = 0;
1682 
1683         simd16scalari vIndex;
1684 
1685         if (IsIndexedT::value)
1686         {
1687             fetchInfo_lo.pIndices = work.pIB;
1688             fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
1689         }
1690         else
1691         {
1692             vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
1693 
1694             fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
1695             fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
1696         }
1697 
1698         fetchInfo_lo.CurInstance = instanceNum;
1699         fetchInfo_hi.CurInstance = instanceNum;
1700 
1701         vsContext_lo.InstanceID = instanceNum;
1702         vsContext_hi.InstanceID = instanceNum;
1703 
1704         while (pa.HasWork())
1705         {
1706             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1707             // So we need to keep this outside of (i < endVertex) check.
1708 
1709             simdmask *pvCutIndices_lo = nullptr;
1710             simdmask *pvCutIndices_hi = nullptr;
1711 
1712             if (IsIndexedT::value)
1713             {
1714                 // simd16mask <=> simdmask[2]
1715 
1716                 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
1717                 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
1718             }
1719 
1720             simd16vertex &vout = pa.GetNextVsOutput();
1721 
1722             vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
1723             vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
1724 
1725             if (i < endVertex)
1726             {
1727                 if (!IsIndexedT::value)
1728                 {
1729                     fetchInfo_lo.pLastIndex = fetchInfo_lo.pIndices;
1730                     uint32_t offset;
1731                     offset = std::min(endVertex-i, (uint32_t) KNOB_SIMD16_WIDTH);
1732 #if USE_SIMD16_SHADERS
1733                     fetchInfo_lo.pLastIndex += offset;
1734 #else
1735                     fetchInfo_lo.pLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH);
1736                     uint32_t offset2 = std::min(offset, (uint32_t) KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH;
1737                     assert(offset >= 0);
1738                     fetchInfo_hi.pLastIndex = fetchInfo_hi.pIndices;
1739                     fetchInfo_hi.pLastIndex += offset2;
1740 #endif
1741                 }
1742                 // 1. Execute FS/VS for a single SIMD.
1743                 AR_BEGIN(FEFetchShader, pDC->drawId);
1744 #if USE_SIMD16_SHADERS
1745                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin);
1746 #else
1747                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin_lo);
1748 
1749                 if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
1750                 {
1751                     state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi);
1752                 }
1753 #endif
1754                 AR_END(FEFetchShader, 0);
1755 
1756                 // forward fetch generated vertex IDs to the vertex shader
1757 #if USE_SIMD16_SHADERS
1758 #if USE_SIMD16_VS
1759                 vsContext_lo.VertexID16 = _simd16_insert_si(
1760                     vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
1761                 vsContext_lo.VertexID16 = _simd16_insert_si(
1762                     vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
1763 #else
1764                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1765                 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
1766 #endif
1767 #else
1768                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
1769                 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
1770 #endif
1771 
1772                 // Setup active mask for vertex shader.
1773 #if USE_SIMD16_VS
1774                 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
1775 #else
1776                 vsContext_lo.mask = GenerateMask(endVertex - i);
1777                 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
1778 #endif
1779 
1780                 // forward cut mask to the PA
1781                 if (IsIndexedT::value)
1782                 {
1783 #if USE_SIMD16_SHADERS
1784                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1785                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
1786 #else
1787                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
1788                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
1789 #endif
1790                 }
1791 
1792                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
1793 
1794 #if KNOB_ENABLE_TOSS_POINTS
1795                 if (!KNOB_TOSS_FETCH)
1796 #endif
1797                 {
1798                     AR_BEGIN(FEVertexShader, pDC->drawId);
1799 #if USE_SIMD16_VS
1800                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1801 #else
1802                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
1803 
1804                     if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
1805                     {
1806                         state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
1807                     }
1808 #endif
1809                     AR_END(FEVertexShader, 0);
1810 
1811                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
1812                 }
1813             }
1814 
1815             // 2. Assemble primitives given the last two SIMD.
1816             do
1817             {
1818                 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
1819 
1820                 RDTSC_START(FEPAAssemble);
1821                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
1822                 RDTSC_STOP(FEPAAssemble, 1, 0);
1823 
1824 #if KNOB_ENABLE_TOSS_POINTS
1825                 if (!KNOB_TOSS_FETCH)
1826 #endif
1827                 {
1828 #if KNOB_ENABLE_TOSS_POINTS
1829                     if (!KNOB_TOSS_VS)
1830 #endif
1831                     {
1832                         if (assemble)
1833                         {
1834                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
1835 
1836                             const uint32_t numPrims = pa.NumPrims();
1837                             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
1838                             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
1839 
1840                             const simd16scalari primID = pa.GetPrimID(work.startPrimID);
1841                             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
1842                             const simdscalari primID_hi = _simd16_extract_si(primID, 1);
1843 
1844                             if (HasTessellationT::value)
1845                             {
1846                                 pa.useAlternateOffset = false;
1847                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1848 
1849                                 if (numPrims_hi)
1850                                 {
1851                                     pa.useAlternateOffset = true;
1852                                     TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1853                                 }
1854                             }
1855                             else if (HasGeometryShaderT::value)
1856                             {
1857                                 pa.useAlternateOffset = false;
1858                                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
1859 
1860                                 if (numPrims_hi)
1861                                 {
1862                                     pa.useAlternateOffset = true;
1863                                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
1864                                 }
1865                             }
1866                             else
1867                             {
1868                                 // If streamout is enabled then stream vertices out to memory.
1869                                 if (HasStreamOutT::value)
1870                                 {
1871                                     pa.useAlternateOffset = false;
1872                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1873                                 }
1874 
1875                                 if (HasRastT::value)
1876                                 {
1877                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
1878                                     // Gather data from the SVG if provided.
1879                                     simd16scalari vpai = SIMD16::setzero_si();
1880                                     simd16scalari rtai = SIMD16::setzero_si();
1881                                     SIMD16::Vec4 svgAttrib[4];
1882 
1883                                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
1884                                     {
1885                                         pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
1886                                     }
1887 
1888 
1889                                     if (state.backendState.readViewportArrayIndex)
1890                                     {
1891                                         vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
1892                                         pa.viewportArrayActive = true;
1893                                     }
1894                                     if (state.backendState.readRenderTargetArrayIndex)
1895                                     {
1896                                         rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
1897                                         pa.rtArrayActive = true;
1898                                     }
1899 
1900                                     {
1901                                         // OOB VPAI indices => forced to zero.
1902                                         vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
1903                                         simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1904                                         simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports);
1905                                         vpai = SIMD16::and_si(vClearMask, vpai);
1906 
1907                                         pa.useAlternateOffset = false;
1908                                         pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai);
1909                                     }
1910                                 }
1911                             }
1912                         }
1913                     }
1914                 }
1915             } while (pa.NextPrim());
1916 
1917             if (IsIndexedT::value)
1918             {
1919                 fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1920                 fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
1921             }
1922             else
1923             {
1924                 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
1925             }
1926 
1927             i += KNOB_SIMD16_WIDTH;
1928         }
1929 
1930         pa.Reset();
1931     }
1932 
1933 #else
1934     SWR_VS_CONTEXT      vsContext;
1935     SWR_FETCH_CONTEXT   fetchInfo = { 0 };
1936 
1937     fetchInfo.pStreams = &state.vertexBuffers[0];
1938     fetchInfo.StartInstance = work.startInstance;
1939     fetchInfo.StartVertex = 0;
1940 
1941     if (IsIndexedT::value)
1942     {
1943         fetchInfo.BaseVertex = work.baseVertex;
1944 
1945         // if the entire index buffer isn't being consumed, set the last index
1946         // so that fetches < a SIMD wide will be masked off
1947         fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1948         if (pLastRequestedIndex < fetchInfo.pLastIndex)
1949         {
1950             fetchInfo.pLastIndex = pLastRequestedIndex;
1951         }
1952     }
1953     else
1954     {
1955         fetchInfo.StartVertex = work.startVertex;
1956     }
1957 
1958     const simdscalari   vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1959 
1960     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1961     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1962     {
1963         simdscalari vIndex;
1964         uint32_t  i = 0;
1965 
1966         if (IsIndexedT::value)
1967         {
1968             fetchInfo.pIndices = work.pIB;
1969         }
1970         else
1971         {
1972             vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
1973             fetchInfo.pIndices = (const int32_t*)&vIndex;
1974         }
1975 
1976         fetchInfo.CurInstance = instanceNum;
1977         vsContext.InstanceID = instanceNum;
1978 
1979         while (pa.HasWork())
1980         {
1981             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1982             // So we need to keep this outside of (i < endVertex) check.
1983             simdmask* pvCutIndices = nullptr;
1984             if (IsIndexedT::value)
1985             {
1986                 pvCutIndices = &pa.GetNextVsIndices();
1987             }
1988 
1989             simdvertex& vout = pa.GetNextVsOutput();
1990             vsContext.pVin = &vout;
1991             vsContext.pVout = &vout;
1992 
1993             if (i < endVertex)
1994             {
1995 
1996                 // 1. Execute FS/VS for a single SIMD.
1997                 AR_BEGIN(FEFetchShader, pDC->drawId);
1998                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout);
1999                 AR_END(FEFetchShader, 0);
2000 
2001                 // forward fetch generated vertex IDs to the vertex shader
2002                 vsContext.VertexID = fetchInfo.VertexID;
2003 
2004                 // Setup active mask for vertex shader.
2005                 vsContext.mask = GenerateMask(endVertex - i);
2006 
2007                 // forward cut mask to the PA
2008                 if (IsIndexedT::value)
2009                 {
2010                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
2011                 }
2012 
2013                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
2014 
2015 #if KNOB_ENABLE_TOSS_POINTS
2016                 if (!KNOB_TOSS_FETCH)
2017 #endif
2018                 {
2019                     AR_BEGIN(FEVertexShader, pDC->drawId);
2020                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
2021                     AR_END(FEVertexShader, 0);
2022 
2023                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
2024                 }
2025             }
2026 
2027             // 2. Assemble primitives given the last two SIMD.
2028             do
2029             {
2030                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
2031                 // PaAssemble returns false if there is not enough verts to assemble.
2032                 AR_BEGIN(FEPAAssemble, pDC->drawId);
2033                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
2034                 AR_END(FEPAAssemble, 1);
2035 
2036 #if KNOB_ENABLE_TOSS_POINTS
2037                 if (!KNOB_TOSS_FETCH)
2038 #endif
2039                 {
2040 #if KNOB_ENABLE_TOSS_POINTS
2041                     if (!KNOB_TOSS_VS)
2042 #endif
2043                     {
2044                         if (assemble)
2045                         {
2046                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
2047 
2048                             if (HasTessellationT::value)
2049                             {
2050                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
2051                                     pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
2052                             }
2053                             else if (HasGeometryShaderT::value)
2054                             {
2055                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
2056                                     pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
2057                             }
2058                             else
2059                             {
2060                                 // If streamout is enabled then stream vertices out to memory.
2061                                 if (HasStreamOutT::value)
2062                                 {
2063                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
2064                                 }
2065 
2066                                 if (HasRastT::value)
2067                                 {
2068                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
2069 
2070                                     // Gather data from the SVG if provided.
2071                                     simdscalari vViewportIdx = SIMD16::setzero_si();
2072                                     simdscalari vRtIdx = SIMD16::setzero_si();
2073                                     SIMD8::Vec4 svgAttrib[4];
2074 
2075                                     if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
2076                                     {
2077                                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
2078                                     }
2079 
2080                                     if (state.backendState.readViewportArrayIndex)
2081                                     {
2082                                         vViewportIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
2083 
2084                                         // OOB VPAI indices => forced to zero.
2085                                         vViewportIdx = SIMD8::max_epi32(vViewportIdx, SIMD8::setzero_si());
2086                                         simd16scalari vNumViewports = SIMD8::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2087                                         simd16scalari vClearMask = SIMD8::cmplt_epi32(vViewportIdx, vNumViewports);
2088                                         vViewportIdx = SIMD8::and_si(vClearMask, vViewportIdx);
2089                                         tessPa.viewportArrayActive = true;
2090                                     }
2091                                     if (state.backendState.readRenderTargetArrayIndex)
2092                                     {
2093                                         vRtIdx = SIMD8::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
2094                                         tessPa.rtArrayActive = true;
2095                                     }
2096 
2097                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
2098                                         GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx);
2099                                 }
2100                             }
2101                         }
2102                     }
2103                 }
2104             } while (pa.NextPrim());
2105 
2106             if (IsIndexedT::value)
2107             {
2108                 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
2109             }
2110             else
2111             {
2112                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
2113             }
2114 
2115             i += KNOB_SIMD_WIDTH;
2116         }
2117         pa.Reset();
2118     }
2119 
2120 #endif
2121 
2122     AR_END(FEProcessDraw, numPrims * work.numInstances);
2123 }
2124 
2125 struct FEDrawChooser
2126 {
2127     typedef PFN_FE_WORK_FUNC FuncType;
2128 
2129     template <typename... ArgsB>
GetFuncFEDrawChooser2130     static FuncType GetFunc()
2131     {
2132         return ProcessDraw<ArgsB...>;
2133     }
2134 };
2135 
2136 
2137 // Selector for correct templated Draw front-end function
GetProcessDrawFunc(bool IsIndexed,bool IsCutIndexEnabled,bool HasTessellation,bool HasGeometryShader,bool HasStreamOut,bool HasRasterization)2138 PFN_FE_WORK_FUNC GetProcessDrawFunc(
2139     bool IsIndexed,
2140     bool IsCutIndexEnabled,
2141     bool HasTessellation,
2142     bool HasGeometryShader,
2143     bool HasStreamOut,
2144     bool HasRasterization)
2145 {
2146     return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
2147 }
2148