1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file pa.h
24  *
25  * @brief Definitions for primitive assembly.
26  *        N primitives are assembled at a time, where N is the SIMD width.
27  *        A state machine, that is specific for a given topology, drives the
28  *        assembly of vertices into triangles.
29  *
30  ******************************************************************************/
31 #pragma once
32 
33 #include "frontend.h"
34 
35 struct PA_STATE
36 {
37 #if USE_SIMD16_FRONTEND
38     enum
39     {
40         SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
41         SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
42         SIMD_WIDTH_LOG2 = 4
43     };
44 
45     typedef simd16mask SIMDMASK;
46 
47     typedef simd16scalar SIMDSCALAR;
48     typedef simd16vector SIMDVECTOR;
49     typedef simd16vertex SIMDVERTEX;
50 
51     typedef simd16scalari SIMDSCALARI;
52 
53 #else
54     enum
55     {
56         SIMD_WIDTH      = KNOB_SIMD_WIDTH,
57         SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
58         SIMD_WIDTH_LOG2 = 3
59     };
60 
61     typedef simdmask SIMDMASK;
62 
63     typedef simdscalar SIMDSCALAR;
64     typedef simdvector SIMDVECTOR;
65     typedef simdvertex SIMDVERTEX;
66 
67     typedef simdscalari SIMDSCALARI;
68 
69 #endif
70     DRAW_CONTEXT* pDC{nullptr};         // draw context
71     uint8_t*      pStreamBase{nullptr}; // vertex stream
72     uint32_t      streamSizeInVerts{0}; // total size of the input stream in verts
73     uint32_t      vertexStride{0};      // stride of a vertex in simdvector units
74 
75     // The topology the binner will use. In some cases the FE changes the topology from the api
76     // state.
77     PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};
78 
79 #if ENABLE_AVX512_SIMD16
80     bool useAlternateOffset{false};
81 #endif
82 
83     bool     viewportArrayActive{false};
84     bool     rtArrayActive{false};
85     uint32_t numVertsPerPrim{0};
86 
PA_STATEPA_STATE87     PA_STATE() {}
PA_STATEPA_STATE88     PA_STATE(DRAW_CONTEXT* in_pDC,
89              uint8_t*      in_pStreamBase,
90              uint32_t      in_streamSizeInVerts,
91              uint32_t      in_vertexStride,
92              uint32_t      in_numVertsPerPrim) :
93         pDC(in_pDC),
94         pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
95         vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
96     {
97     }
98 
99     virtual bool        HasWork()                                    = 0;
100     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
101 #if ENABLE_AVX512_SIMD16
102     virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
103 #endif
104     virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
105 #if ENABLE_AVX512_SIMD16
106     virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
107 #endif
108     virtual void        AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
109     virtual bool        NextPrim()                                                             = 0;
110     virtual SIMDVERTEX& GetNextVsOutput()                                                      = 0;
111     virtual bool        GetNextStreamOutput()                                                  = 0;
112     virtual SIMDMASK&   GetNextVsIndices()                                                     = 0;
113     virtual uint32_t    NumPrims()                                                             = 0;
114     virtual void        Reset()                                                                = 0;
115     virtual SIMDSCALARI GetPrimID(uint32_t startID)                                            = 0;
116 };
117 
118 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
119 // output. Here is the sequence
120 //    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
121 //    2. Execute PA function to assemble and bin triangles.
122 //        a.    The PA function is a set of functions that collectively make up the
123 //            state machine for a given topology.
124 //                1.    We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
126 //                1.    We call this the current and previous simd vertex.
127 //                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
128 //                    order to assemble the second triangle, for a triangle list, we'll need the
129 //                    last vertex from the previous simd and the first 2 vertices from the current
130 //                    simd.
131 //                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
132 //
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or by
// draws without cuts.
135 struct PA_STATE_OPT : public PA_STATE
136 {
137     uint32_t numPrims{0};         // Total number of primitives for draw.
138     uint32_t numPrimsComplete{0}; // Total number of complete primitives.
139 
140     uint32_t numSimdPrims{0}; // Number of prims in current simd.
141 
142     uint32_t       cur{0};   // index to current VS output.
143     uint32_t       prev{0};  // index to prev VS output. Not really needed in the state.
144     const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.
145 
146     uint32_t counter{0};   // state counter
147     bool     reset{false}; // reset state
148 
149     uint32_t    primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
150     SIMDSCALARI primID;
151 
152     typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
153 #if ENABLE_AVX512_SIMD16
154     typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
155 #endif
156     typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
157                                        uint32_t      slot,
158                                        uint32_t      primIndex,
159                                        simd4scalar   verts[]);
160 
161     PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
162 #if ENABLE_AVX512_SIMD16
163     PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
164 #endif
165     PFN_PA_SINGLE_FUNC pfnPaSingleFunc{
166         nullptr}; // PA state machine function for assembling single triangle.
167     PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
168 #if ENABLE_AVX512_SIMD16
169     PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
170 #endif
171 
172     // state used to advance the PA when Next is called
173     PFN_PA_FUNC pfnPaNextFunc{nullptr};
174 #if ENABLE_AVX512_SIMD16
175     PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
176 #endif
177     uint32_t nextNumSimdPrims{0};
178     uint32_t nextNumPrimsIncrement{0};
179     bool     nextReset{false};
180     bool     isStreaming{false};
181 
182     SIMDMASK junkIndices{0}; // temporary index store for unused virtual function
183 
PA_STATE_OPTPA_STATE_OPT184     PA_STATE_OPT() {}
185     PA_STATE_OPT(DRAW_CONTEXT*      pDC,
186                  uint32_t           numPrims,
187                  uint8_t*           pStream,
188                  uint32_t           streamSizeInVerts,
189                  uint32_t           vertexStride,
190                  bool               in_isStreaming,
191                  uint32_t           numVertsPerPrim,
192                  PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
193 
HasWorkPA_STATE_OPT194     bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; }
195 
GetSimdVectorPA_STATE_OPT196     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
197     {
198         SWR_ASSERT(slot < vertexStride);
199         uint32_t    offset     = index * vertexStride + slot;
200         simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
201         return vertexSlot;
202     }
203 
204 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_STATE_OPT205     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
206     {
207         SWR_ASSERT(slot < vertexStride);
208         uint32_t      offset     = index * vertexStride + slot;
209         simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
210         return vertexSlot;
211     }
212 
213 #endif
214     // Assembles 4 triangles. Each simdvector is a single vertex from 4
215     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
AssemblePA_STATE_OPT216     bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }
217 
218 #if ENABLE_AVX512_SIMD16
AssemblePA_STATE_OPT219     bool Assemble(uint32_t slot, simd16vector verts[])
220     {
221         return this->pfnPaFunc_simd16(*this, slot, verts);
222     }
223 
224 #endif
225     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
AssembleSinglePA_STATE_OPT226     void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
227     {
228         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
229     }
230 
NextPrimPA_STATE_OPT231     bool NextPrim()
232     {
233         this->pfnPaFunc = this->pfnPaNextFunc;
234 #if ENABLE_AVX512_SIMD16
235         this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
236 #endif
237         this->numSimdPrims = this->nextNumSimdPrims;
238         this->numPrimsComplete += this->nextNumPrimsIncrement;
239         this->reset = this->nextReset;
240 
241         if (this->isStreaming)
242         {
243             this->reset = false;
244         }
245 
246         bool morePrims = false;
247 
248         if (this->numSimdPrims > 0)
249         {
250             morePrims = true;
251             this->numSimdPrims--;
252         }
253         else
254         {
255             this->counter = (this->reset) ? 0 : (this->counter + 1);
256             this->reset   = false;
257         }
258 
259         if (!HasWork())
260         {
261             morePrims = false; // no more to do
262         }
263 
264         return morePrims;
265     }
266 
GetNextVsOutputPA_STATE_OPT267     SIMDVERTEX& GetNextVsOutput()
268     {
269         const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
270 
271         // increment cur and prev indices
272         if (counter < numSimdVerts)
273         {
274             // prev undefined for first state
275             prev = cur;
276             cur  = counter;
277         }
278         else
279         {
280             // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
281             // the buffer
282             uint32_t temp = prev;
283 
284             prev = cur;
285             cur  = temp;
286         }
287 
288         SWR_ASSERT(cur < numSimdVerts);
289         SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
290 
291         return *(SIMDVERTEX*)pVertex;
292     }
293 
GetNextVsIndicesPA_STATE_OPT294     SIMDMASK& GetNextVsIndices()
295     {
296         // unused in optimized PA, pass tmp buffer back
297         return junkIndices;
298     }
299 
GetNextStreamOutputPA_STATE_OPT300     bool GetNextStreamOutput()
301     {
302         this->prev = this->cur;
303         this->cur  = this->counter;
304 
305         return HasWork();
306     }
307 
NumPrimsPA_STATE_OPT308     uint32_t NumPrims()
309     {
310         return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
311                    ? (SIMD_WIDTH -
312                       (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
313                    : SIMD_WIDTH;
314     }
315 
316     void SetNextState(PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
317                       PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
318                       uint32_t                         numSimdPrims      = 0,
319                       uint32_t                         numPrimsIncrement = 0,
320                       bool                             reset             = false)
321     {
322         this->pfnPaNextFunc         = pfnPaNextFunc;
323         this->nextNumSimdPrims      = numSimdPrims;
324         this->nextNumPrimsIncrement = numPrimsIncrement;
325         this->nextReset             = reset;
326 
327         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
328     }
329 
330 #if ENABLE_AVX512_SIMD16
331     void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
332                              PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
333                              PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
334                              uint32_t                         numSimdPrims      = 0,
335                              uint32_t                         numPrimsIncrement = 0,
336                              bool                             reset             = false)
337     {
338         this->pfnPaNextFunc_simd16  = pfnPaNextFunc_simd16;
339         this->pfnPaNextFunc         = pfnPaNextFunc;
340         this->nextNumSimdPrims      = numSimdPrims;
341         this->nextNumPrimsIncrement = numPrimsIncrement;
342         this->nextReset             = reset;
343 
344         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
345     }
346 
347 #endif
ResetPA_STATE_OPT348     void Reset()
349     {
350 #if ENABLE_AVX512_SIMD16
351         useAlternateOffset = false;
352 
353 #endif
354         this->pfnPaFunc = this->pfnPaFuncReset;
355 #if ENABLE_AVX512_SIMD16
356         this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
357 #endif
358         this->numPrimsComplete = 0;
359         this->numSimdPrims     = 0;
360         this->cur              = 0;
361         this->prev             = 0;
362         this->counter          = 0;
363         this->reset            = false;
364     }
365 
GetPrimIDPA_STATE_OPT366     SIMDSCALARI GetPrimID(uint32_t startID)
367     {
368 #if USE_SIMD16_FRONTEND
369         return _simd16_add_epi32(
370             this->primID,
371             _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
372 #else
373         return _simd_add_epi32(
374             this->primID,
375             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
376 #endif
377     }
378 };
379 
380 // helper C wrappers to avoid having to rewrite all the PA topology state functions
381 INLINE void SetNextPaState(PA_STATE_OPT&                    pa,
382                            PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
383                            PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
384                            uint32_t                         numSimdPrims      = 0,
385                            uint32_t                         numPrimsIncrement = 0,
386                            bool                             reset             = false)
387 {
388     return pa.SetNextState(
389         pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
390 }
391 
392 #if ENABLE_AVX512_SIMD16
393 INLINE void SetNextPaState_simd16(PA_STATE_OPT&                    pa,
394                                   PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
395                                   PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
396                                   PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
397                                   uint32_t                         numSimdPrims      = 0,
398                                   uint32_t                         numPrimsIncrement = 0,
399                                   bool                             reset             = false)
400 {
401     return pa.SetNextState_simd16(pfnPaNextFunc_simd16,
402                                   pfnPaNextFunc,
403                                   pfnPaNextSingleFunc,
404                                   numSimdPrims,
405                                   numPrimsIncrement,
406                                   reset);
407 }
408 
409 #endif
PaGetSimdVector(PA_STATE & pa,uint32_t index,uint32_t slot)410 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
411 {
412     return pa.GetSimdVector(index, slot);
413 }
414 
415 #if ENABLE_AVX512_SIMD16
PaGetSimdVector_simd16(PA_STATE & pa,uint32_t index,uint32_t slot)416 INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
417 {
418     return pa.GetSimdVector_simd16(index, slot);
419 }
420 
421 #endif
422 // Cut-aware primitive assembler.
423 struct PA_STATE_CUT : public PA_STATE
424 {
425     SIMDMASK* pCutIndices{nullptr};  // cut indices buffer, 1 bit per vertex
426     uint32_t  numVerts{0};           // number of vertices available in buffer store
427     uint32_t  numAttribs{0};         // number of attributes
428     int32_t   numRemainingVerts{0};  // number of verts remaining to be assembled
429     uint32_t  numVertsToAssemble{0}; // total number of verts to assemble for the draw
430 #if ENABLE_AVX512_SIMD16
431     OSALIGNSIMD16(uint32_t)
432     indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
433 #else
434     OSALIGNSIMD(uint32_t)
435     indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
436 #endif
437     SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
438     uint32_t    numPrimsAssembled{0};             // number of primitives that are fully assembled
439     uint32_t    headVertex{0};      // current unused vertex slot in vertex buffer store
440     uint32_t    tailVertex{0};      // beginning vertex currently assembling
441     uint32_t    curVertex{0};       // current unprocessed vertex
442     uint32_t    startPrimId{0};     // starting prim id
443     SIMDSCALARI vPrimId;            // vector of prim ID
444     bool        needOffsets{false}; // need to compute gather offsets for current SIMD
445     uint32_t    vertsPerPrim{0};
446     bool        processCutVerts{
447         false}; // vertex indices with cuts should be processed as normal, otherwise they
448                 // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
449                 // while the GS sends valid verts for every index
450 
451     simdvector junkVector; // junk simdvector for unimplemented API
452 #if ENABLE_AVX512_SIMD16
453     simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
454 #endif
455 
456     // Topology state tracking
457     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
458     uint32_t curIndex{0};
459     bool     reverseWinding{false}; // indicates reverse winding for strips
460     int32_t  adjExtraVert{0};       // extra vert uses for tristrip w/ adj
461 
462     typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
463     PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert
464 
PA_STATE_CUTPA_STATE_CUT465     PA_STATE_CUT() {}
PA_STATE_CUTPA_STATE_CUT466     PA_STATE_CUT(DRAW_CONTEXT*      pDC,
467                  uint8_t*           in_pStream,
468                  uint32_t           in_streamSizeInVerts,
469                  uint32_t           in_vertexStride,
470                  SIMDMASK*          in_pIndices,
471                  uint32_t           in_numVerts,
472                  uint32_t           in_numAttribs,
473                  PRIMITIVE_TOPOLOGY topo,
474                  bool               in_processCutVerts,
475                  uint32_t           in_numVertsPerPrim) :
476         PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
477     {
478         numVerts        = in_streamSizeInVerts;
479         numAttribs      = in_numAttribs;
480         binTopology     = topo;
481         needOffsets     = false;
482         processCutVerts = in_processCutVerts;
483 
484         numVertsToAssemble = numRemainingVerts = in_numVerts;
485         numPrimsAssembled                      = 0;
486         headVertex = tailVertex = curVertex = 0;
487 
488         curIndex    = 0;
489         pCutIndices = in_pIndices;
490         memset(indices, 0, sizeof(indices));
491 #if USE_SIMD16_FRONTEND
492         vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
493 #else
494         vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
495 #endif
496         reverseWinding = false;
497         adjExtraVert   = -1;
498 
499         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
500         vertsPerPrim   = NumVertsPerPrim(topo, gsEnabled);
501 
502         switch (topo)
503         {
504         case TOP_TRIANGLE_LIST:
505             pfnPa = &PA_STATE_CUT::ProcessVertTriList;
506             break;
507         case TOP_TRI_LIST_ADJ:
508             pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj
509                               : &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
510             break;
511         case TOP_TRIANGLE_STRIP:
512             pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
513             break;
514         case TOP_TRI_STRIP_ADJ:
515             if (gsEnabled)
516             {
517                 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
518             }
519             else
520             {
521                 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
522             }
523             break;
524 
525         case TOP_POINT_LIST:
526             pfnPa = &PA_STATE_CUT::ProcessVertPointList;
527             break;
528         case TOP_LINE_LIST:
529             pfnPa = &PA_STATE_CUT::ProcessVertLineList;
530             break;
531         case TOP_LINE_LIST_ADJ:
532             pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj
533                               : &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
534             break;
535         case TOP_LINE_STRIP:
536             pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
537             break;
538         case TOP_LISTSTRIP_ADJ:
539             pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj
540                               : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
541             break;
542         case TOP_RECT_LIST:
543             pfnPa = &PA_STATE_CUT::ProcessVertRectList;
544             break;
545         default:
546             assert(0 && "Unimplemented topology");
547         }
548     }
549 
GetNextVsOutputPA_STATE_CUT550     SIMDVERTEX& GetNextVsOutput()
551     {
552         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
553         this->headVertex     = (this->headVertex + SIMD_WIDTH) % this->numVerts;
554         this->needOffsets    = true;
555         SIMDVECTOR* pVertex  = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
556 
557         return *(SIMDVERTEX*)pVertex;
558     }
559 
GetNextVsIndicesPA_STATE_CUT560     SIMDMASK& GetNextVsIndices()
561     {
562         uint32_t  vertexIndex  = this->headVertex / SIMD_WIDTH;
563         SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
564         return *pCurCutIndex;
565     }
566 
GetSimdVectorPA_STATE_CUT567     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
568     {
569         // unused
570         SWR_ASSERT(0 && "Not implemented");
571         return junkVector;
572     }
573 
574 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_STATE_CUT575     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
576     {
577         // unused
578         SWR_ASSERT(0 && "Not implemented");
579         return junkVector_simd16;
580     }
581 
582 #endif
GetNextStreamOutputPA_STATE_CUT583     bool GetNextStreamOutput()
584     {
585         this->headVertex += SIMD_WIDTH;
586         this->needOffsets = true;
587         return HasWork();
588     }
589 
GetPrimIDPA_STATE_CUT590     SIMDSCALARI GetPrimID(uint32_t startID)
591     {
592 #if USE_SIMD16_FRONTEND
593         return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
594 #else
595         return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
596 #endif
597     }
598 
ResetPA_STATE_CUT599     void Reset()
600     {
601 #if ENABLE_AVX512_SIMD16
602         useAlternateOffset = false;
603 
604 #endif
605         this->numRemainingVerts = this->numVertsToAssemble;
606         this->numPrimsAssembled = 0;
607         this->curIndex          = 0;
608         this->curVertex         = 0;
609         this->tailVertex        = 0;
610         this->headVertex        = 0;
611         this->reverseWinding    = false;
612         this->adjExtraVert      = -1;
613 #if USE_SIMD16_FRONTEND
614         this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
615 #else
616         this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
617 #endif
618     }
619 
HasWorkPA_STATE_CUT620     bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; }
621 
IsVertexStoreFullPA_STATE_CUT622     bool IsVertexStoreFull()
623     {
624         return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
625     }
626 
RestartTopologyPA_STATE_CUT627     void RestartTopology()
628     {
629         this->curIndex       = 0;
630         this->reverseWinding = false;
631         this->adjExtraVert   = -1;
632     }
633 
IsCutIndexPA_STATE_CUT634     bool IsCutIndex(uint32_t vertex)
635     {
636         uint32_t vertexIndex  = vertex / SIMD_WIDTH;
637         uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
638         return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
639     }
640 
641     // iterates across the unprocessed verts until we hit the end or we
642     // have assembled SIMD prims
ProcessVertsPA_STATE_CUT643     void ProcessVerts()
644     {
645         while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
646                this->curVertex != this->headVertex)
647         {
648             // if cut index, restart topology
649             if (IsCutIndex(this->curVertex))
650             {
651                 if (this->processCutVerts)
652                 {
653                     (this->*pfnPa)(this->curVertex, false);
654                 }
655                 // finish off tri strip w/ adj before restarting topo
656                 if (this->adjExtraVert != -1)
657                 {
658                     (this->*pfnPa)(this->curVertex, true);
659                 }
660                 RestartTopology();
661             }
662             else
663             {
664                 (this->*pfnPa)(this->curVertex, false);
665             }
666 
667             this->curVertex++;
668             if (this->curVertex >= this->numVerts)
669             {
670                 this->curVertex = 0;
671             }
672             this->numRemainingVerts--;
673         }
674 
675         // special case last primitive for tri strip w/ adj
676         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
677             this->adjExtraVert != -1)
678         {
679             (this->*pfnPa)(this->curVertex, true);
680         }
681     }
682 
AdvancePA_STATE_CUT683     void Advance()
684     {
685         // done with current batch
686         // advance tail to the current unsubmitted vertex
687         this->tailVertex        = this->curVertex;
688         this->numPrimsAssembled = 0;
689 #if USE_SIMD16_FRONTEND
690         this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
691 #else
692         this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
693 #endif
694     }
695 
NextPrimPA_STATE_CUT696     bool NextPrim()
697     {
698         // if we've assembled enough prims, we can advance to the next set of verts
699         if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
700         {
701             Advance();
702         }
703         return false;
704     }
705 
ComputeOffsetsPA_STATE_CUT706     void ComputeOffsets()
707     {
708         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
709         {
710             uint32_t    vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
711             SIMDSCALARI vIndices          = *(SIMDSCALARI*)&this->indices[v][0];
712 
713             // step to simdvertex batch
714             const uint32_t simdShift = SIMD_WIDTH_LOG2;
715 #if USE_SIMD16_FRONTEND
716             SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
717             this->vOffsets[v] =
718                 _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
719 #else
720             SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
721             this->vOffsets[v] =
722                 _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
723 #endif
724 
725             // step to index
726             const uint32_t simdMask = SIMD_WIDTH - 1;
727 #if USE_SIMD16_FRONTEND
728             SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
729             this->vOffsets[v]        = _simd16_add_epi32(
730                 this->vOffsets[v],
731                 _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
732 #else
733             SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
734             this->vOffsets[v] =
735                 _simd_add_epi32(this->vOffsets[v],
736                                 _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
737 #endif
738         }
739     }
740 
AssemblePA_STATE_CUT741     bool Assemble(uint32_t slot, simdvector* verts)
742     {
743         // process any outstanding verts
744         ProcessVerts();
745 
746         // return false if we don't have enough prims assembled
747         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
748         {
749             return false;
750         }
751 
752         // cache off gather offsets given the current SIMD set of indices the first time we get an
753         // assemble
754         if (this->needOffsets)
755         {
756             ComputeOffsets();
757             this->needOffsets = false;
758         }
759 
760         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
761         {
762             SIMDSCALARI offsets = this->vOffsets[v];
763 
764             // step to attribute
765 #if USE_SIMD16_FRONTEND
766             offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
767 #else
768             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
769 #endif
770 
771             float* pBase = (float*)this->pStreamBase;
772             for (uint32_t c = 0; c < 4; ++c)
773             {
774 #if USE_SIMD16_FRONTEND
775                 simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
776 
777                 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
778                 simdscalar t =
779                     useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
780                 verts[v].v[c] = t;
781 #else
782                 verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
783 #endif
784 
785                 // move base to next component
786                 pBase += SIMD_WIDTH;
787             }
788         }
789 
790         // compute the implied 4th vertex, v3
791         if (this->binTopology == TOP_RECT_LIST)
792         {
793             for (uint32_t c = 0; c < 4; ++c)
794             {
795                 // v1, v3 = v1 + v2 - v0, v2
796                 // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
797                 simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
798                 temp              = _simd16_sub_ps(temp, verts[1].v[c]);
799                 temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
800                 verts[1].v[c] = _simd16_extract_ps(temp, 0);
801             }
802         }
803 
804         return true;
805     }
806 
807 #if ENABLE_AVX512_SIMD16
AssemblePA_STATE_CUT808     bool Assemble(uint32_t slot, simd16vector verts[])
809     {
810        // process any outstanding verts
811         ProcessVerts();
812 
813         // return false if we don't have enough prims assembled
814         if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
815         {
816             return false;
817         }
818 
819         // cache off gather offsets given the current SIMD set of indices the first time we get an
820         // assemble
821         if (this->needOffsets)
822         {
823             ComputeOffsets();
824             this->needOffsets = false;
825         }
826 
827         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
828         {
829             SIMDSCALARI offsets = this->vOffsets[v];
830 
831             // step to attribute
832 #if USE_SIMD16_FRONTEND
833             offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
834 #else
835             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
836 #endif
837 
838             float* pBase = (float*)this->pStreamBase;
839             for (uint32_t c = 0; c < 4; ++c)
840             {
841 #if USE_SIMD16_FRONTEND
842                 verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
843 #else
844                 verts[v].v[c] = _simd16_insert_ps(
845                     _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
846 #endif
847 
848                 // move base to next component
849                 pBase += SIMD_WIDTH;
850             }
851         }
852 
853         // compute the implied 4th vertex, v3
854         if (this->binTopology == TOP_RECT_LIST)
855         {
856             for (uint32_t c = 0; c < 4; ++c)
857             {
858                 // v1, v3 = v1 + v2 - v0, v2
859                 // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
860                 simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
861                 temp              = _simd16_sub_ps(temp, verts[1].v[c]);
862                 verts[1].v[c] =
863                     _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
864             }
865         }
866 
867         return true;
868     }
869 
870 #endif
AssembleSinglePA_STATE_CUT871     void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
872     {
873        // move to slot
874         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
875         {
876             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
877 #if USE_SIMD16_FRONTEND
878             uint32_t offset =
879                 useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
880 #else
881             uint32_t offset = pOffset[triIndex];
882 #endif
883             offset += sizeof(SIMDVECTOR) * slot;
884             float* pVert = (float*)&tri[v];
885             for (uint32_t c = 0; c < 4; ++c)
886             {
887                 float* pComponent = (float*)(this->pStreamBase + offset);
888                 pVert[c]          = *pComponent;
889                 offset += SIMD_WIDTH * sizeof(float);
890             }
891         }
892 
893         // compute the implied 4th vertex, v3
894         if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1))
895         {
896             // v1, v3 = v1 + v2 - v0, v2
897             // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2]
898             float* pVert0 = (float*)&tri[1];
899             float* pVert1 = (float*)&tri[0];
900             float* pVert2 = (float*)&tri[2];
901             float* pVert3 = (float*)&tri[1];
902             for (uint32_t c = 0; c < 4; ++c)
903             {
904                 pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c];
905             }
906         }
907     }
908 
NumPrimsPA_STATE_CUT909     uint32_t NumPrims() { return this->numPrimsAssembled; }
910 
911     // Per-topology functions
ProcessVertTriStripPA_STATE_CUT912     void ProcessVertTriStrip(uint32_t index, bool finish)
913     {
914         this->vert[this->curIndex] = index;
915         this->curIndex++;
916         if (this->curIndex == 3)
917         {
918             // assembled enough verts for prim, add to gather indices
919             this->indices[0][this->numPrimsAssembled] = this->vert[0];
920             if (reverseWinding)
921             {
922                 this->indices[1][this->numPrimsAssembled] = this->vert[2];
923                 this->indices[2][this->numPrimsAssembled] = this->vert[1];
924             }
925             else
926             {
927                 this->indices[1][this->numPrimsAssembled] = this->vert[1];
928                 this->indices[2][this->numPrimsAssembled] = this->vert[2];
929             }
930 
931             // increment numPrimsAssembled
932             this->numPrimsAssembled++;
933 
934             // set up next prim state
935             this->vert[0]  = this->vert[1];
936             this->vert[1]  = this->vert[2];
937             this->curIndex = 2;
938             this->reverseWinding ^= 1;
939         }
940     }
941 
942     template <bool gsEnabled>
AssembleTriStripAdjPA_STATE_CUT943     void AssembleTriStripAdj()
944     {
945         if (!gsEnabled)
946         {
947             this->vert[1] = this->vert[2];
948             this->vert[2] = this->vert[4];
949 
950             this->indices[0][this->numPrimsAssembled] = this->vert[0];
951             this->indices[1][this->numPrimsAssembled] = this->vert[1];
952             this->indices[2][this->numPrimsAssembled] = this->vert[2];
953 
954             this->vert[4] = this->vert[2];
955             this->vert[2] = this->vert[1];
956         }
957         else
958         {
959             this->indices[0][this->numPrimsAssembled] = this->vert[0];
960             this->indices[1][this->numPrimsAssembled] = this->vert[1];
961             this->indices[2][this->numPrimsAssembled] = this->vert[2];
962             this->indices[3][this->numPrimsAssembled] = this->vert[3];
963             this->indices[4][this->numPrimsAssembled] = this->vert[4];
964             this->indices[5][this->numPrimsAssembled] = this->vert[5];
965         }
966         this->numPrimsAssembled++;
967     }
968 
969     template <bool gsEnabled>
ProcessVertTriStripAdjPA_STATE_CUT970     void ProcessVertTriStripAdj(uint32_t index, bool finish)
971     {
972         // handle last primitive of tristrip
973         if (finish && this->adjExtraVert != -1)
974         {
975             this->vert[3] = this->adjExtraVert;
976             AssembleTriStripAdj<gsEnabled>();
977             this->adjExtraVert = -1;
978             return;
979         }
980 
981         switch (this->curIndex)
982         {
983         case 0:
984         case 1:
985         case 2:
986         case 4:
987             this->vert[this->curIndex] = index;
988             this->curIndex++;
989             break;
990         case 3:
991             this->vert[5] = index;
992             this->curIndex++;
993             break;
994         case 5:
995             if (this->adjExtraVert == -1)
996             {
997                 this->adjExtraVert = index;
998             }
999             else
1000             {
1001                 this->vert[3] = index;
1002                 if (!gsEnabled)
1003                 {
1004                     AssembleTriStripAdj<gsEnabled>();
1005 
1006                     uint32_t nextTri[6];
1007                     if (this->reverseWinding)
1008                     {
1009                         nextTri[0] = this->vert[4];
1010                         nextTri[1] = this->vert[0];
1011                         nextTri[2] = this->vert[2];
1012                         nextTri[4] = this->vert[3];
1013                         nextTri[5] = this->adjExtraVert;
1014                     }
1015                     else
1016                     {
1017                         nextTri[0] = this->vert[2];
1018                         nextTri[1] = this->adjExtraVert;
1019                         nextTri[2] = this->vert[3];
1020                         nextTri[4] = this->vert[4];
1021                         nextTri[5] = this->vert[0];
1022                     }
1023                     for (uint32_t i = 0; i < 6; ++i)
1024                     {
1025                         this->vert[i] = nextTri[i];
1026                     }
1027 
1028                     this->adjExtraVert = -1;
1029                     this->reverseWinding ^= 1;
1030                 }
1031                 else
1032                 {
1033                     this->curIndex++;
1034                 }
1035             }
1036             break;
1037         case 6:
1038             SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
1039             AssembleTriStripAdj<gsEnabled>();
1040 
1041             uint32_t nextTri[6];
1042             if (this->reverseWinding)
1043             {
1044                 nextTri[0] = this->vert[4];
1045                 nextTri[1] = this->vert[0];
1046                 nextTri[2] = this->vert[2];
1047                 nextTri[4] = this->vert[3];
1048                 nextTri[5] = this->adjExtraVert;
1049             }
1050             else
1051             {
1052                 nextTri[0] = this->vert[2];
1053                 nextTri[1] = this->adjExtraVert;
1054                 nextTri[2] = this->vert[3];
1055                 nextTri[4] = this->vert[4];
1056                 nextTri[5] = this->vert[0];
1057             }
1058             for (uint32_t i = 0; i < 6; ++i)
1059             {
1060                 this->vert[i] = nextTri[i];
1061             }
1062             this->reverseWinding ^= 1;
1063             this->adjExtraVert = index;
1064             this->curIndex--;
1065             break;
1066         }
1067     }
1068 
ProcessVertTriListPA_STATE_CUT1069     void ProcessVertTriList(uint32_t index, bool finish)
1070     {
1071         this->vert[this->curIndex] = index;
1072         this->curIndex++;
1073         if (this->curIndex == 3)
1074         {
1075             // assembled enough verts for prim, add to gather indices
1076             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1077             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1078             this->indices[2][this->numPrimsAssembled] = this->vert[2];
1079 
1080             // increment numPrimsAssembled
1081             this->numPrimsAssembled++;
1082 
1083             // set up next prim state
1084             this->curIndex = 0;
1085         }
1086     }
1087 
ProcessVertTriListAdjPA_STATE_CUT1088     void ProcessVertTriListAdj(uint32_t index, bool finish)
1089     {
1090         this->vert[this->curIndex] = index;
1091         this->curIndex++;
1092         if (this->curIndex == 6)
1093         {
1094             // assembled enough verts for prim, add to gather indices
1095             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1096             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1097             this->indices[2][this->numPrimsAssembled] = this->vert[2];
1098             this->indices[3][this->numPrimsAssembled] = this->vert[3];
1099             this->indices[4][this->numPrimsAssembled] = this->vert[4];
1100             this->indices[5][this->numPrimsAssembled] = this->vert[5];
1101 
1102             // increment numPrimsAssembled
1103             this->numPrimsAssembled++;
1104 
1105             // set up next prim state
1106             this->curIndex = 0;
1107         }
1108     }
1109 
ProcessVertTriListAdjNoGsPA_STATE_CUT1110     void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
1111     {
1112         this->vert[this->curIndex] = index;
1113         this->curIndex++;
1114         if (this->curIndex == 6)
1115         {
1116             // assembled enough verts for prim, add to gather indices
1117             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1118             this->indices[1][this->numPrimsAssembled] = this->vert[2];
1119             this->indices[2][this->numPrimsAssembled] = this->vert[4];
1120 
1121             // increment numPrimsAssembled
1122             this->numPrimsAssembled++;
1123 
1124             // set up next prim state
1125             this->curIndex = 0;
1126         }
1127     }
1128 
ProcessVertLineListPA_STATE_CUT1129     void ProcessVertLineList(uint32_t index, bool finish)
1130     {
1131         this->vert[this->curIndex] = index;
1132         this->curIndex++;
1133         if (this->curIndex == 2)
1134         {
1135             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1136             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1137 
1138             this->numPrimsAssembled++;
1139             this->curIndex = 0;
1140         }
1141     }
1142 
ProcessVertLineStripPA_STATE_CUT1143     void ProcessVertLineStrip(uint32_t index, bool finish)
1144     {
1145         this->vert[this->curIndex] = index;
1146         this->curIndex++;
1147         if (this->curIndex == 2)
1148         {
1149             // assembled enough verts for prim, add to gather indices
1150             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1151             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1152 
1153             // increment numPrimsAssembled
1154             this->numPrimsAssembled++;
1155 
1156             // set up next prim state
1157             this->vert[0]  = this->vert[1];
1158             this->curIndex = 1;
1159         }
1160     }
1161 
ProcessVertLineStripAdjPA_STATE_CUT1162     void ProcessVertLineStripAdj(uint32_t index, bool finish)
1163     {
1164         this->vert[this->curIndex] = index;
1165         this->curIndex++;
1166         if (this->curIndex == 4)
1167         {
1168             // assembled enough verts for prim, add to gather indices
1169             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1170             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1171             this->indices[2][this->numPrimsAssembled] = this->vert[2];
1172             this->indices[3][this->numPrimsAssembled] = this->vert[3];
1173 
1174             // increment numPrimsAssembled
1175             this->numPrimsAssembled++;
1176 
1177             // set up next prim state
1178             this->vert[0]  = this->vert[1];
1179             this->vert[1]  = this->vert[2];
1180             this->vert[2]  = this->vert[3];
1181             this->curIndex = 3;
1182         }
1183     }
1184 
ProcessVertLineStripAdjNoGsPA_STATE_CUT1185     void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1186     {
1187         this->vert[this->curIndex] = index;
1188         this->curIndex++;
1189         if (this->curIndex == 4)
1190         {
1191             // assembled enough verts for prim, add to gather indices
1192             this->indices[0][this->numPrimsAssembled] = this->vert[1];
1193             this->indices[1][this->numPrimsAssembled] = this->vert[2];
1194 
1195             // increment numPrimsAssembled
1196             this->numPrimsAssembled++;
1197 
1198             // set up next prim state
1199             this->vert[0]  = this->vert[1];
1200             this->vert[1]  = this->vert[2];
1201             this->vert[2]  = this->vert[3];
1202             this->curIndex = 3;
1203         }
1204     }
1205 
ProcessVertLineListAdjPA_STATE_CUT1206     void ProcessVertLineListAdj(uint32_t index, bool finish)
1207     {
1208         this->vert[this->curIndex] = index;
1209         this->curIndex++;
1210         if (this->curIndex == 4)
1211         {
1212             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1213             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1214             this->indices[2][this->numPrimsAssembled] = this->vert[2];
1215             this->indices[3][this->numPrimsAssembled] = this->vert[3];
1216 
1217             this->numPrimsAssembled++;
1218             this->curIndex = 0;
1219         }
1220     }
1221 
ProcessVertLineListAdjNoGsPA_STATE_CUT1222     void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1223     {
1224         this->vert[this->curIndex] = index;
1225         this->curIndex++;
1226         if (this->curIndex == 4)
1227         {
1228             this->indices[0][this->numPrimsAssembled] = this->vert[1];
1229             this->indices[1][this->numPrimsAssembled] = this->vert[2];
1230 
1231             this->numPrimsAssembled++;
1232             this->curIndex = 0;
1233         }
1234     }
1235 
ProcessVertPointListPA_STATE_CUT1236     void ProcessVertPointList(uint32_t index, bool finish)
1237     {
1238         this->vert[this->curIndex] = index;
1239         this->curIndex++;
1240         if (this->curIndex == 1)
1241         {
1242             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1243             this->numPrimsAssembled++;
1244             this->curIndex = 0;
1245         }
1246     }
1247 
ProcessVertRectListPA_STATE_CUT1248     void ProcessVertRectList(uint32_t index, bool finish)
1249     {
1250         this->vert[this->curIndex] = index;
1251         this->curIndex++;
1252         if (this->curIndex == 3)
1253         {
1254             // assembled enough verts for prim, add to gather indices
1255             this->indices[0][this->numPrimsAssembled] = this->vert[0];
1256             this->indices[1][this->numPrimsAssembled] = this->vert[1];
1257             this->indices[2][this->numPrimsAssembled] = this->vert[2];
1258 
1259             // second triangle in the rectangle
1260             // v1, v3 = v1 + v2 - v0, v2
1261             this->indices[0][this->numPrimsAssembled + 1] = this->vert[1];
1262             this->indices[1][this->numPrimsAssembled + 1] = this->vert[0];
1263             this->indices[2][this->numPrimsAssembled + 1] = this->vert[2];
1264 
1265             // increment numPrimsAssembled
1266             this->numPrimsAssembled += 2;
1267 
1268             // set up next prim state
1269             this->curIndex = 0;
1270         }
1271     }
1272 };
1273 
1274 // Primitive Assembly for data output from the DomainShader.
1275 struct PA_TESS : PA_STATE
1276 {
1277     PA_TESS(DRAW_CONTEXT*     in_pDC,
1278             const SIMDSCALAR* in_pVertData,
1279             uint32_t          in_attributeStrideInVectors,
1280             uint32_t          in_vertexStride,
1281             uint32_t          in_numAttributes,
1282             uint32_t* (&in_ppIndices)[3],
1283             uint32_t           in_numPrims,
1284             PRIMITIVE_TOPOLOGY in_binTopology,
1285             uint32_t           numVertsPerPrim,
1286             bool               SOA = true) :
1287 
1288         PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
1289         m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
1290         m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA)
1291     {
1292 #if USE_SIMD16_FRONTEND
1293         m_vPrimId = _simd16_setzero_si();
1294 #else
1295         m_vPrimId = _simd_setzero_si();
1296 #endif
1297         binTopology    = in_binTopology;
1298         m_ppIndices[0] = in_ppIndices[0];
1299         m_ppIndices[1] = in_ppIndices[1];
1300         m_ppIndices[2] = in_ppIndices[2];
1301 
1302         switch (binTopology)
1303         {
1304         case TOP_POINT_LIST:
1305             m_numVertsPerPrim = 1;
1306             break;
1307 
1308         case TOP_LINE_LIST:
1309             m_numVertsPerPrim = 2;
1310             break;
1311 
1312         case TOP_TRIANGLE_LIST:
1313             m_numVertsPerPrim = 3;
1314             break;
1315 
1316         default:
1317             SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1318             break;
1319         }
1320     }
1321 
HasWorkPA_TESS1322     bool HasWork() { return m_numPrims != 0; }
1323 
GetSimdVectorPA_TESS1324     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1325     {
1326         SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
1327         return junkVector;
1328     }
1329 
1330 #if ENABLE_AVX512_SIMD16
GetSimdVector_simd16PA_TESS1331     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
1332     {
1333         SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
1334         return junkVector_simd16;
1335     }
1336 
1337 #endif
GenPrimMaskPA_TESS1338     static SIMDSCALARI GenPrimMask(uint32_t numPrims)
1339     {
1340         SWR_ASSERT(numPrims <= SIMD_WIDTH);
1341 #if USE_SIMD16_FRONTEND
1342         static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {
1343             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1344             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
1345 
1346         return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
1347 #else
1348         static const OSALIGNLINE(int32_t)
1349             maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
1350 
1351         return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
1352 #endif
1353     }
1354 
AssemblePA_TESS1355     bool Assemble(uint32_t slot, simdvector verts[])
1356     {
1357         SWR_ASSERT(slot < m_numAttributes);
1358 
1359         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1360         if (0 == numPrimsToAssemble)
1361         {
1362             return false;
1363         }
1364 
1365         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
1366 
1367         const float* pBaseAttrib;
1368         if (m_SOA)
1369         {
1370             pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1371         }
1372         else
1373         {
1374             const float* pVertData = (const float*)m_pVertexData;
1375             pBaseAttrib            = pVertData + slot * 4;
1376         }
1377 
1378         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1379         {
1380 #if USE_SIMD16_FRONTEND
1381             SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1382 #else
1383             SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1384 #endif
1385 
1386             const float* pBase = pBaseAttrib;
1387             for (uint32_t c = 0; c < 4; ++c)
1388             {
1389 #if USE_SIMD16_FRONTEND
1390                 simd16scalar temp =
1391                     _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
1392                                               pBase,
1393                                               indices,
1394                                               _simd16_castsi_ps(mask),
1395                                               4 /* gcc doesn't like sizeof(float) */);
1396 
1397                 verts[i].v[c] =
1398                     useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
1399 #else
1400                 verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
1401                                                         pBase,
1402                                                         indices,
1403                                                         _simd_castsi_ps(mask),
1404                                                         4); // gcc doesn't like sizeof(float)
1405 #endif
1406                 if (m_SOA)
1407                 {
1408                     pBase += m_attributeStrideInVectors * SIMD_WIDTH;
1409                 }
1410                 else
1411                 {
1412                     pBase += sizeof(float);
1413                 }
1414             }
1415         }
1416 
1417         return true;
1418     }
1419 
1420 #if ENABLE_AVX512_SIMD16
AssemblePA_TESS1421     bool Assemble(uint32_t slot, simd16vector verts[])
1422     {
1423         SWR_ASSERT(slot < m_numAttributes);
1424 
1425         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1426         if (0 == numPrimsToAssemble)
1427         {
1428             return false;
1429         }
1430 
1431         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
1432 
1433         const float* pBaseAttrib;
1434         if (m_SOA)
1435         {
1436             pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1437         }
1438         else
1439         {
1440             const float* pVertData = (const float*)m_pVertexData;
1441             pBaseAttrib            = pVertData + slot * 4;
1442         }
1443 
1444         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1445         {
1446 #if USE_SIMD16_FRONTEND
1447             SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1448             if (!m_SOA)
1449             {
1450                 indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
1451             }
1452 #else
1453             SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
1454 #endif
1455 
1456             const float* pBase = pBaseAttrib;
1457             for (uint32_t c = 0; c < 4; ++c)
1458             {
1459 #if USE_SIMD16_FRONTEND
1460                 verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
1461                                                           pBase,
1462                                                           indices,
1463                                                           _simd16_castsi_ps(mask),
1464                                                           4 /* gcc doesn't like sizeof(float) */);
1465 #else
1466                 simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
1467                                                           pBase,
1468                                                           indices,
1469                                                           _simd_castsi_ps(mask),
1470                                                           4 /* gcc doesn't like sizeof(float) */);
1471                 verts[i].v[c]   = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
1472 #endif
1473                 if (m_SOA)
1474                 {
1475                     pBase += m_attributeStrideInVectors * SIMD_WIDTH;
1476                 }
1477                 else
1478                 {
1479                     pBase++;
1480                 }
1481             }
1482         }
1483 
1484         return true;
1485     }
1486 
1487 #endif
AssembleSinglePA_TESS1488     void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1489     {
1490         SWR_ASSERT(slot < m_numAttributes);
1491 
1492 
1493         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1494 
1495         const float* pVertDataBase;
1496         if (m_SOA)
1497         {
1498             pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1499         }
1500         else
1501         {
1502             const float* pVertData = (const float*)m_pVertexData;
1503             pVertDataBase          = pVertData + slot * 4;
1504         };
1505         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1506         {
1507 #if USE_SIMD16_FRONTEND
1508             uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
1509                                                 : m_ppIndices[i][primIndex];
1510             if (!m_SOA)
1511             {
1512                 index *= (vertexStride / 4);
1513             }
1514 #else
1515             uint32_t index = m_ppIndices[i][primIndex];
1516 #endif
1517             const float* pVertData = pVertDataBase;
1518             float*       pVert     = (float*)&verts[i];
1519 
1520             for (uint32_t c = 0; c < 4; ++c)
1521             {
1522                 pVert[c] = pVertData[index];
1523                 if (m_SOA)
1524                 {
1525                     pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
1526                 }
1527                 else
1528                 {
1529                     pVertData++;
1530                 }
1531             }
1532 
1533         }
1534     }
1535 
NextPrimPA_TESS1536     bool NextPrim()
1537     {
1538         uint32_t numPrims = PA_TESS::NumPrims();
1539         m_numPrims -= numPrims;
1540         m_ppIndices[0] += numPrims;
1541         m_ppIndices[1] += numPrims;
1542         m_ppIndices[2] += numPrims;
1543 
1544         return HasWork();
1545     }
1546 
GetNextVsOutputPA_TESS1547     SIMDVERTEX& GetNextVsOutput()
1548     {
1549         SWR_NOT_IMPL;
1550         return junkVertex;
1551     }
1552 
GetNextStreamOutputPA_TESS1553     bool GetNextStreamOutput()
1554     {
1555         SWR_NOT_IMPL;
1556         return false;
1557     }
1558 
GetNextVsIndicesPA_TESS1559     SIMDMASK& GetNextVsIndices()
1560     {
1561         SWR_NOT_IMPL;
1562         return junkIndices;
1563     }
1564 
NumPrimsPA_TESS1565     uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }
1566 
ResetPA_TESS1567     void Reset() { SWR_NOT_IMPL; }
1568 
GetPrimIDPA_TESS1569     SIMDSCALARI GetPrimID(uint32_t startID)
1570     {
1571 #if USE_SIMD16_FRONTEND
1572         return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
1573 #else
1574         return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1575 #endif
1576     }
1577 
1578 private:
1579     const SIMDSCALAR* m_pVertexData              = nullptr;
1580     uint32_t          m_attributeStrideInVectors = 0;
1581     uint32_t          m_numAttributes            = 0;
1582     uint32_t          m_numPrims                 = 0;
1583     uint32_t*         m_ppIndices[3];
1584 
1585     uint32_t m_numVertsPerPrim = 0;
1586 
1587     SIMDSCALARI m_vPrimId;
1588 
1589     simdvector junkVector; // junk simdvector for unimplemented API
1590 #if ENABLE_AVX512_SIMD16
1591     simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
1592 #endif
1593     SIMDVERTEX junkVertex;  // junk SIMDVERTEX for unimplemented API
1594     SIMDMASK   junkIndices; // temporary index store for unused virtual function
1595 
1596     bool m_SOA;
1597 };
1598 
1599 // Primitive Assembler factory class, responsible for creating and initializing the correct
1600 // assembler based on state.
1601 template <typename IsIndexedT, typename IsCutIndexEnabledT>
1602 struct PA_FACTORY
1603 {
PA_FACTORYPA_FACTORY1604     PA_FACTORY(DRAW_CONTEXT*         pDC,
1605                PRIMITIVE_TOPOLOGY    in_topo,
1606                uint32_t              numVerts,
1607                PA_STATE::SIMDVERTEX* pVertexStore,
1608                uint32_t              vertexStoreSize,
1609                uint32_t              vertexStride,
1610                uint32_t              numVertsPerPrim) :
1611         topo(in_topo)
1612     {
1613 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1614         const API_STATE& state = GetApiState(pDC);
1615         if ((IsIndexedT::value && IsCutIndexEnabledT::value &&
1616              (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST ||
1617               topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) ||
1618 
1619             // non-indexed draws with adjacency topologies must use cut-aware PA until we add
1620             // support for them in the optimized PA
1621             (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
1622              topo == TOP_TRI_STRIP_ADJ))
1623         {
1624             memset(&indexStore, 0, sizeof(indexStore));
1625             uint32_t numAttribs = state.feNumAttributes;
1626 
1627             new (&this->paCut) PA_STATE_CUT(pDC,
1628                                             reinterpret_cast<uint8_t*>(pVertexStore),
1629                                             vertexStoreSize * PA_STATE::SIMD_WIDTH,
1630                                             vertexStride,
1631                                             &this->indexStore[0],
1632                                             numVerts,
1633                                             numAttribs,
1634                                             state.topology,
1635                                             false,
1636                                             numVertsPerPrim);
1637             cutPA = true;
1638         }
1639         else
1640 #endif
1641         {
1642             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1643             new (&this->paOpt) PA_STATE_OPT(pDC,
1644                                             numPrims,
1645                                             reinterpret_cast<uint8_t*>(pVertexStore),
1646                                             vertexStoreSize * PA_STATE::SIMD_WIDTH,
1647                                             vertexStride,
1648                                             false,
1649                                             numVertsPerPrim);
1650             cutPA = false;
1651         }
1652     }
1653 
GetPAPA_FACTORY1654     PA_STATE& GetPA()
1655     {
1656 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1657         if (cutPA)
1658         {
1659             return this->paCut;
1660         }
1661         else
1662 #endif
1663         {
1664             return this->paOpt;
1665         }
1666     }
1667 
1668     PA_STATE_OPT paOpt;
1669     PA_STATE_CUT paCut;
1670 
1671     bool cutPA{false};
1672 
1673     PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};
1674 
1675     PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
1676 };
1677