1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 *        N primitives are assembled at a time, where N is the SIMD width.
27 *        A state machine, that is specific for a given topology, drives the
28 *        assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32 
33 #include "frontend.h"
34 
35 struct PA_STATE
36 {
37 #if USE_SIMD16_FRONTEND
38     enum
39     {
40         SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
41         SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
42         SIMD_WIDTH_LOG2 = 4
43     };
44 
45     typedef         simd16mask          SIMDMASK;
46 
47     typedef         simd16scalar        SIMDSCALAR;
48     typedef         simd16vector        SIMDVECTOR;
49     typedef         simd16vertex        SIMDVERTEX;
50 
51     typedef         simd16scalari       SIMDSCALARI;
52 
53 #else
54     enum
55     {
56         SIMD_WIDTH      = KNOB_SIMD_WIDTH,
57         SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
58         SIMD_WIDTH_LOG2 = 3
59     };
60 
61     typedef         simdmask            SIMDMASK;
62 
63     typedef         simdscalar          SIMDSCALAR;
64     typedef         simdvector          SIMDVECTOR;
65     typedef         simdvertex          SIMDVERTEX;
66 
67     typedef         simdscalari         SIMDSCALARI;
68 
69 #endif
70     DRAW_CONTEXT *pDC{ nullptr };       // draw context
71     uint8_t* pStreamBase{ nullptr };    // vertex stream
72     uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
73     uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units
74 
75     // The topology the binner will use. In some cases the FE changes the topology from the api state.
76     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
77 
78 #if ENABLE_AVX512_SIMD16
79     bool useAlternateOffset{ false };
80 #endif
81 
82     bool viewportArrayActive{ false };
83     bool rtArrayActive { false };
84     uint32_t numVertsPerPrim{ 0 };
85 
PA_STATEPA_STATE86     PA_STATE(){}
PA_STATEPA_STATE87     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) :
88         pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {}
89 
90     virtual bool HasWork() = 0;
91     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
92 #if ENABLE_AVX512_SIMD16
93     virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
94 #endif
95     virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
96 #if ENABLE_AVX512_SIMD16
97     virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
98 #endif
99     virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
100     virtual bool NextPrim() = 0;
101     virtual SIMDVERTEX& GetNextVsOutput() = 0;
102     virtual bool GetNextStreamOutput() = 0;
103     virtual SIMDMASK& GetNextVsIndices() = 0;
104     virtual uint32_t NumPrims() = 0;
105     virtual void Reset() = 0;
106     virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
107 };
108 
109 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
110 // output. Here is the sequence
111 //    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
112 //    2. Execute PA function to assemble and bin triangles.
113 //        a.    The PA function is a set of functions that collectively make up the
114 //            state machine for a given topology.
115 //                1.    We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
117 //                1.    We call this the current and previous simd vertex.
118 //                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
119 //                    order to assemble the second triangle, for a triangle list, we'll need the
120 //                    last vertex from the previous simd and the first 2 vertices from the current simd.
121 //                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
122 //
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
// without cuts.
125 struct PA_STATE_OPT : public PA_STATE
126 {
127     uint32_t numPrims{ 0 };              // Total number of primitives for draw.
128     uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
129 
130     uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
131 
132     uint32_t cur{ 0 };                   // index to current VS output.
133     uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
134     const uint32_t first{ 0 };           // index to first VS output. Used for tri fan and line loop.
135 
136     uint32_t counter{ 0 };               // state counter
137     bool reset{ false };                 // reset state
138 
139     uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
140     SIMDSCALARI primID;
141 
142     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
143 #if ENABLE_AVX512_SIMD16
144     typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
145 #endif
146     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
147 
148     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
149 #if ENABLE_AVX512_SIMD16
150     PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
151 #endif
152     PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
153     PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
154 #if ENABLE_AVX512_SIMD16
155     PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
156 #endif
157 
158     // state used to advance the PA when Next is called
159     PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
160 #if ENABLE_AVX512_SIMD16
161     PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
162 #endif
163     uint32_t           nextNumSimdPrims{ 0 };
164     uint32_t           nextNumPrimsIncrement{ 0 };
165     bool               nextReset{ false };
166     bool               isStreaming{ false };
167 
168     SIMDMASK           junkIndices  { 0 };          // temporary index store for unused virtual function
169 
PA_STATE_OPTPA_STATE_OPT170     PA_STATE_OPT() {}
171     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
172         uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
173 
HasWorkPA_STATE_OPT174     bool HasWork()
175     {
176         return (this->numPrimsComplete < this->numPrims) ? true : false;
177     }
178 
GetSimdVectorPA_STATE_OPT179     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
180     {
181         SWR_ASSERT(slot < vertexStride);
182         uint32_t offset = index * vertexStride + slot;
183         simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
184         return vertexSlot;
185     }
186 
187 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_STATE_OPT188     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
189     {
190         SWR_ASSERT(slot < vertexStride);
191         uint32_t offset = index * vertexStride + slot;
192         simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
193         return vertexSlot;
194     }
195 
196 #endif
197     // Assembles 4 triangles. Each simdvector is a single vertex from 4
198     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
AssemblePA_STATE_OPT199     bool Assemble(uint32_t slot, simdvector verts[])
200     {
201         return this->pfnPaFunc(*this, slot, verts);
202     }
203 
204 #if ENABLE_AVX512_SIMD16
AssemblePA_STATE_OPT205     bool Assemble(uint32_t slot, simd16vector verts[])
206     {
207         return this->pfnPaFunc_simd16(*this, slot, verts);
208     }
209 
210 #endif
211     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
AssembleSinglePA_STATE_OPT212     void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
213     {
214         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
215     }
216 
NextPrimPA_STATE_OPT217     bool NextPrim()
218     {
219         this->pfnPaFunc = this->pfnPaNextFunc;
220 #if ENABLE_AVX512_SIMD16
221         this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
222 #endif
223         this->numSimdPrims = this->nextNumSimdPrims;
224         this->numPrimsComplete += this->nextNumPrimsIncrement;
225         this->reset = this->nextReset;
226 
227         if (this->isStreaming)
228         {
229             this->reset = false;
230         }
231 
232         bool morePrims = false;
233 
234         if (this->numSimdPrims > 0)
235         {
236             morePrims = true;
237             this->numSimdPrims--;
238         }
239         else
240         {
241             this->counter = (this->reset) ? 0 : (this->counter + 1);
242             this->reset = false;
243         }
244 
245         if (!HasWork())
246         {
247             morePrims = false;    // no more to do
248         }
249 
250         return morePrims;
251     }
252 
GetNextVsOutputPA_STATE_OPT253     SIMDVERTEX& GetNextVsOutput()
254     {
255         const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
256 
257         // increment cur and prev indices
258         if (counter < numSimdVerts)
259         {
260             // prev undefined for first state
261             prev = cur;
262             cur = counter;
263         }
264         else
265         {
266             // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
267             uint32_t temp = prev;
268 
269             prev = cur;
270             cur = temp;
271         }
272 
273         SWR_ASSERT(cur < numSimdVerts);
274         SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
275 
276         return *(SIMDVERTEX*)pVertex;
277     }
278 
GetNextVsIndicesPA_STATE_OPT279     SIMDMASK& GetNextVsIndices()
280     {
281         // unused in optimized PA, pass tmp buffer back
282         return junkIndices;
283     }
284 
GetNextStreamOutputPA_STATE_OPT285     bool GetNextStreamOutput()
286     {
287         this->prev = this->cur;
288         this->cur = this->counter;
289 
290         return HasWork();
291     }
292 
NumPrimsPA_STATE_OPT293     uint32_t NumPrims()
294     {
295         return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
296             (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
297     }
298 
299     void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
300         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
301         uint32_t numSimdPrims = 0,
302         uint32_t numPrimsIncrement = 0,
303         bool reset = false)
304     {
305         this->pfnPaNextFunc = pfnPaNextFunc;
306         this->nextNumSimdPrims = numSimdPrims;
307         this->nextNumPrimsIncrement = numPrimsIncrement;
308         this->nextReset = reset;
309 
310         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
311     }
312 
313 #if ENABLE_AVX512_SIMD16
314     void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
315         PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
316         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
317         uint32_t numSimdPrims = 0,
318         uint32_t numPrimsIncrement = 0,
319         bool reset = false)
320     {
321         this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
322         this->pfnPaNextFunc = pfnPaNextFunc;
323         this->nextNumSimdPrims = numSimdPrims;
324         this->nextNumPrimsIncrement = numPrimsIncrement;
325         this->nextReset = reset;
326 
327         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
328     }
329 
330 #endif
ResetPA_STATE_OPT331     void Reset()
332     {
333 #if ENABLE_AVX512_SIMD16
334         useAlternateOffset = false;
335 
336 #endif
337         this->pfnPaFunc = this->pfnPaFuncReset;
338 #if ENABLE_AVX512_SIMD16
339         this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
340 #endif
341         this->numPrimsComplete = 0;
342         this->numSimdPrims = 0;
343         this->cur = 0;
344         this->prev = 0;
345         this->counter = 0;
346         this->reset = false;
347     }
348 
GetPrimIDPA_STATE_OPT349     SIMDSCALARI GetPrimID(uint32_t startID)
350     {
351 #if USE_SIMD16_FRONTEND
352         return _simd16_add_epi32(this->primID,
353             _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
354 #else
355         return _simd_add_epi32(this->primID,
356             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
357 #endif
358     }
359 };
360 
361 // helper C wrappers to avoid having to rewrite all the PA topology state functions
362 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
363     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
364     uint32_t numSimdPrims = 0,
365     uint32_t numPrimsIncrement = 0,
366     bool reset = false)
367 {
368     return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
369 }
370 
371 #if ENABLE_AVX512_SIMD16
372 INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
373     PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
374     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
375     uint32_t numSimdPrims = 0,
376     uint32_t numPrimsIncrement = 0,
377     bool reset = false)
378 {
379     return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
380 }
381 
382 #endif
PaGetSimdVector(PA_STATE & pa,uint32_t index,uint32_t slot)383 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
384 {
385     return pa.GetSimdVector(index, slot);
386 }
387 
388 #if ENABLE_AVX512_SIMD16
PaGetSimdVector_simd16(PA_STATE & pa,uint32_t index,uint32_t slot)389 INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
390 {
391     return pa.GetSimdVector_simd16(index, slot);
392 }
393 
394 #endif
395 // Cut-aware primitive assembler.
396 struct PA_STATE_CUT : public PA_STATE
397 {
398     SIMDMASK* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
399     uint32_t numVerts{ 0 };              // number of vertices available in buffer store
400     uint32_t numAttribs{ 0 };            // number of attributes
401     int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
402     uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
403 #if ENABLE_AVX512_SIMD16
404     OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
405 #else
406     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
407 #endif
408     SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
409     uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
410     uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
411     uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
412     uint32_t curVertex{ 0 };             // current unprocessed vertex
413     uint32_t startPrimId{ 0 };           // starting prim id
414     SIMDSCALARI vPrimId;                 // vector of prim ID
415     bool needOffsets{ false };           // need to compute gather offsets for current SIMD
416     uint32_t vertsPerPrim{ 0 };
417     bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
418                                          // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
419                                          // while the GS sends valid verts for every index
420 
421     simdvector      junkVector;          // junk simdvector for unimplemented API
422 #if ENABLE_AVX512_SIMD16
423     simd16vector    junkVector_simd16;   // junk simd16vector for unimplemented API
424 #endif
425 
426     // Topology state tracking
427     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
428     uint32_t curIndex{ 0 };
429     bool reverseWinding{ false };        // indicates reverse winding for strips
430     int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
431 
432     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
433     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
434 
PA_STATE_CUTPA_STATE_CUT435     PA_STATE_CUT() {}
PA_STATE_CUTPA_STATE_CUT436     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
437         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim)
438         : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
439     {
440         numVerts = in_streamSizeInVerts;
441         numAttribs = in_numAttribs;
442         binTopology = topo;
443         needOffsets = false;
444         processCutVerts = in_processCutVerts;
445 
446         numVertsToAssemble = numRemainingVerts = in_numVerts;
447         numPrimsAssembled = 0;
448         headVertex = tailVertex = curVertex = 0;
449 
450         curIndex = 0;
451         pCutIndices = in_pIndices;
452         memset(indices, 0, sizeof(indices));
453 #if USE_SIMD16_FRONTEND
454         vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
455 #else
456         vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
457 #endif
458         reverseWinding = false;
459         adjExtraVert = -1;
460 
461         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
462         vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
463 
464         switch (topo)
465         {
466         case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
467         case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
468         case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
469         case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
470                                     {
471                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
472                                     }
473                                     else
474                                     {
475                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
476                                     }
477                                     break;
478 
479         case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
480         case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
481         case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
482         case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
483         case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
484         default: assert(0 && "Unimplemented topology");
485         }
486     }
487 
GetNextVsOutputPA_STATE_CUT488     SIMDVERTEX& GetNextVsOutput()
489     {
490         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
491         this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
492         this->needOffsets = true;
493         SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
494 
495         return *(SIMDVERTEX*)pVertex;
496     }
497 
GetNextVsIndicesPA_STATE_CUT498     SIMDMASK& GetNextVsIndices()
499     {
500         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
501         SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
502         return *pCurCutIndex;
503     }
504 
GetSimdVectorPA_STATE_CUT505     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
506     {
507         // unused
508         SWR_ASSERT(0 && "Not implemented");
509         return junkVector;
510     }
511 
512 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_STATE_CUT513     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
514     {
515         // unused
516         SWR_ASSERT(0 && "Not implemented");
517         return junkVector_simd16;
518     }
519 
520 #endif
GetNextStreamOutputPA_STATE_CUT521     bool GetNextStreamOutput()
522     {
523         this->headVertex += SIMD_WIDTH;
524         this->needOffsets = true;
525         return HasWork();
526     }
527 
GetPrimIDPA_STATE_CUT528     SIMDSCALARI GetPrimID(uint32_t startID)
529     {
530 #if USE_SIMD16_FRONTEND
531         return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
532 #else
533         return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
534 #endif
535     }
536 
ResetPA_STATE_CUT537     void Reset()
538     {
539 #if ENABLE_AVX512_SIMD16
540         useAlternateOffset = false;
541 
542 #endif
543         this->numRemainingVerts = this->numVertsToAssemble;
544         this->numPrimsAssembled = 0;
545         this->curIndex = 0;
546         this->curVertex = 0;
547         this->tailVertex = 0;
548         this->headVertex = 0;
549         this->reverseWinding = false;
550         this->adjExtraVert = -1;
551 #if USE_SIMD16_FRONTEND
552         this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
553 #else
554         this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
555 #endif
556     }
557 
HasWorkPA_STATE_CUT558     bool HasWork()
559     {
560         return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
561     }
562 
IsVertexStoreFullPA_STATE_CUT563     bool IsVertexStoreFull()
564     {
565         return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
566     }
567 
RestartTopologyPA_STATE_CUT568     void RestartTopology()
569     {
570         this->curIndex = 0;
571         this->reverseWinding = false;
572         this->adjExtraVert = -1;
573     }
574 
IsCutIndexPA_STATE_CUT575     bool IsCutIndex(uint32_t vertex)
576     {
577         uint32_t vertexIndex = vertex / SIMD_WIDTH;
578         uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
579         return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
580     }
581 
582     // iterates across the unprocessed verts until we hit the end or we
583     // have assembled SIMD prims
ProcessVertsPA_STATE_CUT584     void ProcessVerts()
585     {
586         while (this->numPrimsAssembled != SIMD_WIDTH &&
587             this->numRemainingVerts > 0 &&
588             this->curVertex != this->headVertex)
589         {
590             // if cut index, restart topology
591             if (IsCutIndex(this->curVertex))
592             {
593                 if (this->processCutVerts)
594                 {
595                     (this->*pfnPa)(this->curVertex, false);
596                 }
597                 // finish off tri strip w/ adj before restarting topo
598                 if (this->adjExtraVert != -1)
599                 {
600                     (this->*pfnPa)(this->curVertex, true);
601                 }
602                 RestartTopology();
603             }
604             else
605             {
606                 (this->*pfnPa)(this->curVertex, false);
607             }
608 
609             this->curVertex++;
610             if (this->curVertex >= this->numVerts) {
611                this->curVertex = 0;
612             }
613             this->numRemainingVerts--;
614         }
615 
616         // special case last primitive for tri strip w/ adj
617         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
618         {
619             (this->*pfnPa)(this->curVertex, true);
620         }
621     }
622 
AdvancePA_STATE_CUT623     void Advance()
624     {
625         // done with current batch
626         // advance tail to the current unsubmitted vertex
627         this->tailVertex = this->curVertex;
628         this->numPrimsAssembled = 0;
629 #if USE_SIMD16_FRONTEND
630         this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
631 #else
632         this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
633 #endif
634     }
635 
NextPrimPA_STATE_CUT636     bool NextPrim()
637     {
638         // if we've assembled enough prims, we can advance to the next set of verts
639         if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
640         {
641             Advance();
642         }
643         return false;
644     }
645 
ComputeOffsetsPA_STATE_CUT646     void ComputeOffsets()
647     {
648         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
649         {
650             uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
651             SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
652 
653             // step to simdvertex batch
654             const uint32_t simdShift = SIMD_WIDTH_LOG2;
655 #if USE_SIMD16_FRONTEND
656             SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
657             this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
658 #else
659             SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
660             this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
661 #endif
662 
663             // step to index
664             const uint32_t simdMask = SIMD_WIDTH - 1;
665 #if USE_SIMD16_FRONTEND
666             SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
667             this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
668 #else
669             SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
670             this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
671 #endif
672         }
673     }
674 
AssemblePA_STATE_CUT675     bool Assemble(uint32_t slot, simdvector *verts)
676     {
677         // process any outstanding verts
678         ProcessVerts();
679 
680         // return false if we don't have enough prims assembled
681         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
682         {
683             return false;
684         }
685 
686         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
687         if (this->needOffsets)
688         {
689             ComputeOffsets();
690             this->needOffsets = false;
691         }
692 
693         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
694         {
695             SIMDSCALARI offsets = this->vOffsets[v];
696 
697             // step to attribute
698 #if USE_SIMD16_FRONTEND
699             offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
700 #else
701             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
702 #endif
703 
704             float* pBase = (float*)this->pStreamBase;
705             for (uint32_t c = 0; c < 4; ++c)
706             {
707 #if USE_SIMD16_FRONTEND
708                 simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
709 
710                 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
711                 simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
712                 verts[v].v[c] = t;
713 #else
714                 verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
715 #endif
716 
717                 // move base to next component
718                 pBase += SIMD_WIDTH;
719             }
720         }
721 
722         return true;
723     }
724 
725 #if ENABLE_AVX512_SIMD16
AssemblePA_STATE_CUT726     bool Assemble(uint32_t slot, simd16vector verts[])
727     {
728         // process any outstanding verts
729         ProcessVerts();
730 
731         // return false if we don't have enough prims assembled
732         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
733         {
734             return false;
735         }
736 
737         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
738         if (this->needOffsets)
739         {
740             ComputeOffsets();
741             this->needOffsets = false;
742         }
743 
744         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
745         {
746             SIMDSCALARI offsets = this->vOffsets[v];
747 
748             // step to attribute
749 #if USE_SIMD16_FRONTEND
750             offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
751 #else
752             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
753 #endif
754 
755             float* pBase = (float*)this->pStreamBase;
756             for (uint32_t c = 0; c < 4; ++c)
757             {
758 #if USE_SIMD16_FRONTEND
759                 verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
760 #else
761                 verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
762 #endif
763 
764                 // move base to next component
765                 pBase += SIMD_WIDTH;
766             }
767         }
768 
769         return true;
770     }
771 
772 #endif
AssembleSinglePA_STATE_CUT773     void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
774     {
775         // move to slot
776         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
777         {
778             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
779 #if USE_SIMD16_FRONTEND
780             uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
781 #else
782             uint32_t offset = pOffset[triIndex];
783 #endif
784             offset += sizeof(SIMDVECTOR) * slot;
785             float* pVert = (float*)&tri[v];
786             for (uint32_t c = 0; c < 4; ++c)
787             {
788                 float* pComponent = (float*)(this->pStreamBase + offset);
789                 pVert[c] = *pComponent;
790                 offset += SIMD_WIDTH * sizeof(float);
791             }
792         }
793     }
794 
NumPrimsPA_STATE_CUT795     uint32_t NumPrims()
796     {
797         return this->numPrimsAssembled;
798     }
799 
800     // Per-topology functions
ProcessVertTriStripPA_STATE_CUT801     void ProcessVertTriStrip(uint32_t index, bool finish)
802     {
803         this->vert[this->curIndex] = index;
804         this->curIndex++;
805         if (this->curIndex == 3)
806         {
807             // assembled enough verts for prim, add to gather indices
808             this->indices[0][this->numPrimsAssembled] = this->vert[0];
809             if (reverseWinding)
810             {
811                 this->indices[1][this->numPrimsAssembled] = this->vert[2];
812                 this->indices[2][this->numPrimsAssembled] = this->vert[1];
813             }
814             else
815             {
816                 this->indices[1][this->numPrimsAssembled] = this->vert[1];
817                 this->indices[2][this->numPrimsAssembled] = this->vert[2];
818             }
819 
820             // increment numPrimsAssembled
821             this->numPrimsAssembled++;
822 
823             // set up next prim state
824             this->vert[0] = this->vert[1];
825             this->vert[1] = this->vert[2];
826             this->curIndex = 2;
827             this->reverseWinding ^= 1;
828         }
829     }
830 
831     template<bool gsEnabled>
AssembleTriStripAdjPA_STATE_CUT832     void AssembleTriStripAdj()
833     {
834         if (!gsEnabled)
835         {
836             this->vert[1] = this->vert[2];
837             this->vert[2] = this->vert[4];
838 
839             this->indices[0][this->numPrimsAssembled] = this->vert[0];
840             this->indices[1][this->numPrimsAssembled] = this->vert[1];
841             this->indices[2][this->numPrimsAssembled] = this->vert[2];
842 
843             this->vert[4] = this->vert[2];
844             this->vert[2] = this->vert[1];
845         }
846         else
847         {
848             this->indices[0][this->numPrimsAssembled] = this->vert[0];
849             this->indices[1][this->numPrimsAssembled] = this->vert[1];
850             this->indices[2][this->numPrimsAssembled] = this->vert[2];
851             this->indices[3][this->numPrimsAssembled] = this->vert[3];
852             this->indices[4][this->numPrimsAssembled] = this->vert[4];
853             this->indices[5][this->numPrimsAssembled] = this->vert[5];
854         }
855         this->numPrimsAssembled++;
856     }
857 
858 
859     template<bool gsEnabled>
ProcessVertTriStripAdjPA_STATE_CUT860     void ProcessVertTriStripAdj(uint32_t index, bool finish)
861     {
862         // handle last primitive of tristrip
863         if (finish && this->adjExtraVert != -1)
864         {
865             this->vert[3] = this->adjExtraVert;
866             AssembleTriStripAdj<gsEnabled>();
867             this->adjExtraVert = -1;
868             return;
869         }
870 
871         switch (this->curIndex)
872         {
873         case 0:
874         case 1:
875         case 2:
876         case 4:
877             this->vert[this->curIndex] = index;
878             this->curIndex++;
879             break;
880         case 3:
881             this->vert[5] = index;
882             this->curIndex++;
883             break;
884         case 5:
885             if (this->adjExtraVert == -1)
886             {
887                 this->adjExtraVert = index;
888             }
889             else
890             {
891                 this->vert[3] = index;
892                 if (!gsEnabled)
893                 {
894                     AssembleTriStripAdj<gsEnabled>();
895 
896                     uint32_t nextTri[6];
897                     if (this->reverseWinding)
898                     {
899                         nextTri[0] = this->vert[4];
900                         nextTri[1] = this->vert[0];
901                         nextTri[2] = this->vert[2];
902                         nextTri[4] = this->vert[3];
903                         nextTri[5] = this->adjExtraVert;
904                     }
905                     else
906                     {
907                         nextTri[0] = this->vert[2];
908                         nextTri[1] = this->adjExtraVert;
909                         nextTri[2] = this->vert[3];
910                         nextTri[4] = this->vert[4];
911                         nextTri[5] = this->vert[0];
912                     }
913                     for (uint32_t i = 0; i < 6; ++i)
914                     {
915                         this->vert[i] = nextTri[i];
916                     }
917 
918                     this->adjExtraVert = -1;
919                     this->reverseWinding ^= 1;
920                 }
921                 else
922                 {
923                     this->curIndex++;
924                 }
925             }
926             break;
927         case 6:
928             SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
929             AssembleTriStripAdj<gsEnabled>();
930 
931             uint32_t nextTri[6];
932             if (this->reverseWinding)
933             {
934                 nextTri[0] = this->vert[4];
935                 nextTri[1] = this->vert[0];
936                 nextTri[2] = this->vert[2];
937                 nextTri[4] = this->vert[3];
938                 nextTri[5] = this->adjExtraVert;
939             }
940             else
941             {
942                 nextTri[0] = this->vert[2];
943                 nextTri[1] = this->adjExtraVert;
944                 nextTri[2] = this->vert[3];
945                 nextTri[4] = this->vert[4];
946                 nextTri[5] = this->vert[0];
947             }
948             for (uint32_t i = 0; i < 6; ++i)
949             {
950                 this->vert[i] = nextTri[i];
951             }
952             this->reverseWinding ^= 1;
953             this->adjExtraVert = index;
954             this->curIndex--;
955             break;
956         }
957     }
958 
ProcessVertTriListPA_STATE_CUT959     void ProcessVertTriList(uint32_t index, bool finish)
960     {
961         this->vert[this->curIndex] = index;
962         this->curIndex++;
963         if (this->curIndex == 3)
964         {
965             // assembled enough verts for prim, add to gather indices
966             this->indices[0][this->numPrimsAssembled] = this->vert[0];
967             this->indices[1][this->numPrimsAssembled] = this->vert[1];
968             this->indices[2][this->numPrimsAssembled] = this->vert[2];
969 
970             // increment numPrimsAssembled
971             this->numPrimsAssembled++;
972 
973             // set up next prim state
974             this->curIndex = 0;
975         }
976     }
977 
ProcessVertTriListAdjPA_STATE_CUT978     void ProcessVertTriListAdj(uint32_t index, bool finish)
979     {
980         this->vert[this->curIndex] = index;
981         this->curIndex++;
982         if (this->curIndex == 6)
983         {
984             // assembled enough verts for prim, add to gather indices
985             this->indices[0][this->numPrimsAssembled] = this->vert[0];
986             this->indices[1][this->numPrimsAssembled] = this->vert[1];
987             this->indices[2][this->numPrimsAssembled] = this->vert[2];
988             this->indices[3][this->numPrimsAssembled] = this->vert[3];
989             this->indices[4][this->numPrimsAssembled] = this->vert[4];
990             this->indices[5][this->numPrimsAssembled] = this->vert[5];
991 
992             // increment numPrimsAssembled
993             this->numPrimsAssembled++;
994 
995             // set up next prim state
996             this->curIndex = 0;
997         }
998     }
999 
ProcessVertTriListAdjNoGsPA_STATE_CUT1000     void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
1001     {
1002         this->vert[this->curIndex] = index;
1003         this->curIndex++;
1004         if (this->curIndex == 6)
1005         {
1006             // assembled enough verts for prim, add to gather indices
1007             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1008             this->indices[1][this->numPrimsAssembled] = this->vert[2];
1009             this->indices[2][this->numPrimsAssembled] = this->vert[4];
1010 
1011             // increment numPrimsAssembled
1012             this->numPrimsAssembled++;
1013 
1014             // set up next prim state
1015             this->curIndex = 0;
1016         }
1017     }
1018 
1019 
ProcessVertLineListPA_STATE_CUT1020     void ProcessVertLineList(uint32_t index, bool finish)
1021     {
1022         this->vert[this->curIndex] = index;
1023         this->curIndex++;
1024         if (this->curIndex == 2)
1025         {
1026             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1027             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1028 
1029             this->numPrimsAssembled++;
1030             this->curIndex = 0;
1031         }
1032     }
1033 
ProcessVertLineStripPA_STATE_CUT1034     void ProcessVertLineStrip(uint32_t index, bool finish)
1035     {
1036         this->vert[this->curIndex] = index;
1037         this->curIndex++;
1038         if (this->curIndex == 2)
1039         {
1040             // assembled enough verts for prim, add to gather indices
1041             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1042             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1043 
1044             // increment numPrimsAssembled
1045             this->numPrimsAssembled++;
1046 
1047             // set up next prim state
1048             this->vert[0] = this->vert[1];
1049             this->curIndex = 1;
1050         }
1051     }
1052 
ProcessVertLineStripAdjPA_STATE_CUT1053     void ProcessVertLineStripAdj(uint32_t index, bool finish)
1054     {
1055         this->vert[this->curIndex] = index;
1056         this->curIndex++;
1057         if (this->curIndex == 4)
1058         {
1059             // assembled enough verts for prim, add to gather indices
1060             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1061             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1062             this->indices[2][this->numPrimsAssembled] = this->vert[2];
1063             this->indices[3][this->numPrimsAssembled] = this->vert[3];
1064 
1065             // increment numPrimsAssembled
1066             this->numPrimsAssembled++;
1067 
1068             // set up next prim state
1069             this->vert[0] = this->vert[1];
1070             this->vert[1] = this->vert[2];
1071             this->vert[2] = this->vert[3];
1072             this->curIndex = 3;
1073         }
1074     }
1075 
ProcessVertLineStripAdjNoGsPA_STATE_CUT1076     void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1077     {
1078         this->vert[this->curIndex] = index;
1079         this->curIndex++;
1080         if (this->curIndex == 4)
1081         {
1082             // assembled enough verts for prim, add to gather indices
1083             this->indices[0][this->numPrimsAssembled] = this->vert[1];
1084             this->indices[1][this->numPrimsAssembled] = this->vert[2];
1085 
1086             // increment numPrimsAssembled
1087             this->numPrimsAssembled++;
1088 
1089             // set up next prim state
1090             this->vert[0] = this->vert[1];
1091             this->vert[1] = this->vert[2];
1092             this->vert[2] = this->vert[3];
1093             this->curIndex = 3;
1094         }
1095     }
1096 
ProcessVertLineListAdjPA_STATE_CUT1097     void ProcessVertLineListAdj(uint32_t index, bool finish)
1098     {
1099         this->vert[this->curIndex] = index;
1100         this->curIndex++;
1101         if (this->curIndex == 4)
1102         {
1103             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1104             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1105             this->indices[2][this->numPrimsAssembled] = this->vert[2];
1106             this->indices[3][this->numPrimsAssembled] = this->vert[3];
1107 
1108             this->numPrimsAssembled++;
1109             this->curIndex = 0;
1110         }
1111     }
1112 
ProcessVertLineListAdjNoGsPA_STATE_CUT1113     void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1114     {
1115         this->vert[this->curIndex] = index;
1116         this->curIndex++;
1117         if (this->curIndex == 4)
1118         {
1119             this->indices[0][this->numPrimsAssembled] = this->vert[1];
1120             this->indices[1][this->numPrimsAssembled] = this->vert[2];
1121 
1122             this->numPrimsAssembled++;
1123             this->curIndex = 0;
1124         }
1125     }
1126 
ProcessVertPointListPA_STATE_CUT1127     void ProcessVertPointList(uint32_t index, bool finish)
1128     {
1129         this->vert[this->curIndex] = index;
1130         this->curIndex++;
1131         if (this->curIndex == 1)
1132         {
1133             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1134             this->numPrimsAssembled++;
1135             this->curIndex = 0;
1136         }
1137     }
1138 };
1139 
1140 // Primitive Assembly for data output from the DomainShader.
1141 struct PA_TESS : PA_STATE
1142 {
PA_TESSPA_TESS1143     PA_TESS(
1144         DRAW_CONTEXT *in_pDC,
1145         const SIMDSCALAR* in_pVertData,
1146         uint32_t in_attributeStrideInVectors,
1147         uint32_t in_vertexStride,
1148         uint32_t in_numAttributes,
1149         uint32_t* (&in_ppIndices)[3],
1150         uint32_t in_numPrims,
1151         PRIMITIVE_TOPOLOGY in_binTopology,
1152         uint32_t numVertsPerPrim) :
1153 
1154         PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
1155         m_pVertexData(in_pVertData),
1156         m_attributeStrideInVectors(in_attributeStrideInVectors),
1157         m_numAttributes(in_numAttributes),
1158         m_numPrims(in_numPrims)
1159     {
1160 #if USE_SIMD16_FRONTEND
1161         m_vPrimId = _simd16_setzero_si();
1162 #else
1163         m_vPrimId = _simd_setzero_si();
1164 #endif
1165         binTopology = in_binTopology;
1166         m_ppIndices[0] = in_ppIndices[0];
1167         m_ppIndices[1] = in_ppIndices[1];
1168         m_ppIndices[2] = in_ppIndices[2];
1169 
1170         switch (binTopology)
1171         {
1172         case TOP_POINT_LIST:
1173             m_numVertsPerPrim = 1;
1174             break;
1175 
1176         case TOP_LINE_LIST:
1177             m_numVertsPerPrim = 2;
1178             break;
1179 
1180         case TOP_TRIANGLE_LIST:
1181             m_numVertsPerPrim = 3;
1182             break;
1183 
1184         default:
1185             SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1186             break;
1187         }
1188     }
1189 
HasWorkPA_TESS1190     bool HasWork()
1191     {
1192         return m_numPrims != 0;
1193     }
1194 
GetSimdVectorPA_TESS1195     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1196     {
1197         SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
1198         return junkVector;
1199     }
1200 
1201 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_TESS1202     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
1203     {
1204         SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
1205         return junkVector_simd16;
1206     }
1207 
1208 #endif
GenPrimMaskPA_TESS1209     static SIMDSCALARI GenPrimMask(uint32_t numPrims)
1210     {
1211         SWR_ASSERT(numPrims <= SIMD_WIDTH);
1212 #if USE_SIMD16_FRONTEND
1213         static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
1214         {
1215             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1216             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1217         };
1218 
1219         return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
1220 #else
1221         static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
1222         {
1223             -1, -1, -1, -1, -1, -1, -1, -1,
1224             0,  0,  0,  0,  0,  0,  0,  0
1225         };
1226 
1227         return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
1228 #endif
1229     }
1230 
AssemblePA_TESS1231     bool Assemble(uint32_t slot, simdvector verts[])
1232     {
1233         SWR_ASSERT(slot < m_numAttributes);
1234 
1235         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1236         if (0 == numPrimsToAssemble)
1237         {
1238             return false;
1239         }
1240 
1241         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
1242 
1243         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1244         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1245         {
1246 #if USE_SIMD16_FRONTEND
1247             SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1248 #else
1249             SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1250 #endif
1251 
1252             const float* pBase = pBaseAttrib;
1253             for (uint32_t c = 0; c < 4; ++c)
1254             {
1255 #if USE_SIMD16_FRONTEND
1256                 simd16scalar temp = _simd16_mask_i32gather_ps(
1257                     _simd16_setzero_ps(),
1258                     pBase,
1259                     indices,
1260                     _simd16_castsi_ps(mask),
1261                     4 /* gcc doesn't like sizeof(float) */);
1262 
1263                 verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
1264 #else
1265                 verts[i].v[c] = _simd_mask_i32gather_ps(
1266                     _simd_setzero_ps(),
1267                     pBase,
1268                     indices,
1269                     _simd_castsi_ps(mask),
1270                     4); // gcc doesn't like sizeof(float)
1271 #endif
1272                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
1273             }
1274         }
1275 
1276         return true;
1277     }
1278 
1279 #if ENABLE_AVX512_SIMD16
AssemblePA_TESS1280     bool Assemble(uint32_t slot, simd16vector verts[])
1281     {
1282         SWR_ASSERT(slot < m_numAttributes);
1283 
1284         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1285         if (0 == numPrimsToAssemble)
1286         {
1287             return false;
1288         }
1289 
1290         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
1291 
1292         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1293         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1294         {
1295 #if USE_SIMD16_FRONTEND
1296             SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1297 #else
1298             SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1299 #endif
1300 
1301             const float* pBase = pBaseAttrib;
1302             for (uint32_t c = 0; c < 4; ++c)
1303             {
1304 #if USE_SIMD16_FRONTEND
1305                 verts[i].v[c] = _simd16_mask_i32gather_ps(
1306                     _simd16_setzero_ps(),
1307                     pBase,
1308                     indices,
1309                     _simd16_castsi_ps(mask),
1310                     4 /* gcc doesn't like sizeof(float) */);
1311 #else
1312                 simdscalar temp = _simd_mask_i32gather_ps(
1313                     _simd_setzero_ps(),
1314                     pBase,
1315                     indices,
1316                     _simd_castsi_ps(mask),
1317                     4 /* gcc doesn't like sizeof(float) */);
1318                 verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
1319 #endif
1320                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
1321             }
1322         }
1323 
1324         return true;
1325     }
1326 
1327 #endif
AssembleSinglePA_TESS1328     void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1329     {
1330         SWR_ASSERT(slot < m_numAttributes);
1331         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1332 
1333         const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1334         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1335         {
1336 #if USE_SIMD16_FRONTEND
1337             uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
1338 #else
1339             uint32_t index = m_ppIndices[i][primIndex];
1340 #endif
1341             const float* pVertData = pVertDataBase;
1342             float* pVert = (float*)&verts[i];
1343 
1344             for (uint32_t c = 0; c < 4; ++c)
1345             {
1346                 pVert[c] = pVertData[index];
1347                 pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
1348             }
1349         }
1350     }
1351 
NextPrimPA_TESS1352     bool NextPrim()
1353     {
1354         uint32_t numPrims = PA_TESS::NumPrims();
1355         m_numPrims -= numPrims;
1356         m_ppIndices[0] += numPrims;
1357         m_ppIndices[1] += numPrims;
1358         m_ppIndices[2] += numPrims;
1359 
1360         return HasWork();
1361     }
1362 
GetNextVsOutputPA_TESS1363     SIMDVERTEX& GetNextVsOutput()
1364     {
1365         SWR_NOT_IMPL;
1366         return junkVertex;
1367     }
1368 
GetNextStreamOutputPA_TESS1369     bool GetNextStreamOutput()
1370     {
1371         SWR_NOT_IMPL;
1372         return false;
1373     }
1374 
GetNextVsIndicesPA_TESS1375     SIMDMASK& GetNextVsIndices()
1376     {
1377         SWR_NOT_IMPL;
1378         return junkIndices;
1379     }
1380 
NumPrimsPA_TESS1381     uint32_t NumPrims()
1382     {
1383         return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
1384     }
1385 
ResetPA_TESS1386     void Reset()
1387     {
1388         SWR_NOT_IMPL;
1389     }
1390 
GetPrimIDPA_TESS1391     SIMDSCALARI GetPrimID(uint32_t startID)
1392     {
1393 #if USE_SIMD16_FRONTEND
1394         return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
1395 #else
1396         return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1397 #endif
1398     }
1399 
1400 private:
1401     const SIMDSCALAR*   m_pVertexData = nullptr;
1402     uint32_t            m_attributeStrideInVectors = 0;
1403     uint32_t            m_numAttributes = 0;
1404     uint32_t            m_numPrims = 0;
1405     uint32_t*           m_ppIndices[3];
1406 
1407     uint32_t            m_numVertsPerPrim = 0;
1408 
1409     SIMDSCALARI         m_vPrimId;
1410 
1411     simdvector          junkVector;         // junk simdvector for unimplemented API
1412 #if ENABLE_AVX512_SIMD16
1413     simd16vector        junkVector_simd16;  // junk simd16vector for unimplemented API
1414 #endif
1415     SIMDVERTEX          junkVertex;         // junk SIMDVERTEX for unimplemented API
1416     SIMDMASK            junkIndices;        // temporary index store for unused virtual function
1417 };
1418 
1419 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1420 // based on state.
1421 template <typename IsIndexedT, typename IsCutIndexEnabledT>
1422 struct PA_FACTORY
1423 {
PA_FACTORYPA_FACTORY1424     PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo)
1425     {
1426 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1427         const API_STATE& state = GetApiState(pDC);
1428         if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
1429             topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
1430             topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
1431             topo == TOP_TRIANGLE_LIST)) ||
1432 
1433             // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1434             // for them in the optimized PA
1435             (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
1436         {
1437             memset(&indexStore, 0, sizeof(indexStore));
1438             uint32_t numAttribs = state.feNumAttributes;
1439 
1440             new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
1441                 vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim);
1442             cutPA = true;
1443         }
1444         else
1445 #endif
1446         {
1447             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1448             new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim);
1449             cutPA = false;
1450         }
1451 
1452     }
1453 
GetPAPA_FACTORY1454     PA_STATE& GetPA()
1455     {
1456 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1457         if (cutPA)
1458         {
1459             return this->paCut;
1460         }
1461         else
1462 #endif
1463         {
1464             return this->paOpt;
1465         }
1466     }
1467 
1468     PA_STATE_OPT paOpt;
1469     PA_STATE_CUT paCut;
1470 
1471     bool cutPA{ false };
1472 
1473     PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
1474 
1475     PA_STATE::SIMDMASK      indexStore[MAX_NUM_VERTS_PER_PRIM];
1476 };
1477