1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file pa_avx.cpp
24  *
25  * @brief AVX implementation for primitive assembly.
26  *        N primitives are assembled at a time, where N is the SIMD width.
27  *        A state machine, that is specific for a given topology, drives the
28  *        assembly of vertices into triangles.
29  *
30  ******************************************************************************/
31 #include "context.h"
32 #include "pa.h"
33 #include "frontend.h"
34 
35 #if (KNOB_SIMD_WIDTH == 8)
36 
swizzleLane0(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)37 INLINE simd4scalar swizzleLane0(const simdscalar& x,
38                                 const simdscalar& y,
39                                 const simdscalar& z,
40                                 const simdscalar& w)
41 {
42     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
43     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
44     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
45 }
46 
swizzleLane1(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)47 INLINE simd4scalar swizzleLane1(const simdscalar& x,
48                                 const simdscalar& y,
49                                 const simdscalar& z,
50                                 const simdscalar& w)
51 {
52     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
53     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
54     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
55 }
56 
swizzleLane2(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)57 INLINE simd4scalar swizzleLane2(const simdscalar& x,
58                                 const simdscalar& y,
59                                 const simdscalar& z,
60                                 const simdscalar& w)
61 {
62     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
63     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
64     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
65 }
66 
swizzleLane3(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)67 INLINE simd4scalar swizzleLane3(const simdscalar& x,
68                                 const simdscalar& y,
69                                 const simdscalar& z,
70                                 const simdscalar& w)
71 {
72     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
73     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
74     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
75 }
76 
swizzleLane4(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)77 INLINE simd4scalar swizzleLane4(const simdscalar& x,
78                                 const simdscalar& y,
79                                 const simdscalar& z,
80                                 const simdscalar& w)
81 {
82     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
83     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
84     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
85 }
86 
swizzleLane5(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)87 INLINE simd4scalar swizzleLane5(const simdscalar& x,
88                                 const simdscalar& y,
89                                 const simdscalar& z,
90                                 const simdscalar& w)
91 {
92     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
93     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
94     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
95 }
96 
swizzleLane6(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)97 INLINE simd4scalar swizzleLane6(const simdscalar& x,
98                                 const simdscalar& y,
99                                 const simdscalar& z,
100                                 const simdscalar& w)
101 {
102     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
103     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
104     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
105 }
106 
swizzleLane7(const simdscalar & x,const simdscalar & y,const simdscalar & z,const simdscalar & w)107 INLINE simd4scalar swizzleLane7(const simdscalar& x,
108                                 const simdscalar& y,
109                                 const simdscalar& z,
110                                 const simdscalar& w)
111 {
112     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
113     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
114     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
115 }
116 
swizzleLane0(const simdvector & v)117 INLINE simd4scalar swizzleLane0(const simdvector& v)
118 {
119     return swizzleLane0(v.x, v.y, v.z, v.w);
120 }
121 
swizzleLane1(const simdvector & v)122 INLINE simd4scalar swizzleLane1(const simdvector& v)
123 {
124     return swizzleLane1(v.x, v.y, v.z, v.w);
125 }
126 
swizzleLane2(const simdvector & v)127 INLINE simd4scalar swizzleLane2(const simdvector& v)
128 {
129     return swizzleLane2(v.x, v.y, v.z, v.w);
130 }
131 
swizzleLane3(const simdvector & v)132 INLINE simd4scalar swizzleLane3(const simdvector& v)
133 {
134     return swizzleLane3(v.x, v.y, v.z, v.w);
135 }
136 
swizzleLane4(const simdvector & v)137 INLINE simd4scalar swizzleLane4(const simdvector& v)
138 {
139     return swizzleLane4(v.x, v.y, v.z, v.w);
140 }
141 
swizzleLane5(const simdvector & v)142 INLINE simd4scalar swizzleLane5(const simdvector& v)
143 {
144     return swizzleLane5(v.x, v.y, v.z, v.w);
145 }
146 
swizzleLane6(const simdvector & v)147 INLINE simd4scalar swizzleLane6(const simdvector& v)
148 {
149     return swizzleLane6(v.x, v.y, v.z, v.w);
150 }
151 
swizzleLane7(const simdvector & v)152 INLINE simd4scalar swizzleLane7(const simdvector& v)
153 {
154     return swizzleLane7(v.x, v.y, v.z, v.w);
155 }
156 
swizzleLaneN(const simdvector & v,int lane)157 INLINE simd4scalar swizzleLaneN(const simdvector& v, int lane)
158 {
159     switch (lane)
160     {
161     case 0:
162         return swizzleLane0(v);
163     case 1:
164         return swizzleLane1(v);
165     case 2:
166         return swizzleLane2(v);
167     case 3:
168         return swizzleLane3(v);
169     case 4:
170         return swizzleLane4(v);
171     case 5:
172         return swizzleLane5(v);
173     case 6:
174         return swizzleLane6(v);
175     case 7:
176         return swizzleLane7(v);
177     default:
178         return _mm_setzero_ps();
179     }
180 }
181 
182 #if ENABLE_AVX512_SIMD16
swizzleLane0(const simd16vector & v)183 INLINE simd4scalar swizzleLane0(const simd16vector& v)
184 {
185     return swizzleLane0(_simd16_extract_ps(v.x, 0),
186                         _simd16_extract_ps(v.y, 0),
187                         _simd16_extract_ps(v.z, 0),
188                         _simd16_extract_ps(v.w, 0));
189 }
190 
swizzleLane1(const simd16vector & v)191 INLINE simd4scalar swizzleLane1(const simd16vector& v)
192 {
193     return swizzleLane1(_simd16_extract_ps(v.x, 0),
194                         _simd16_extract_ps(v.y, 0),
195                         _simd16_extract_ps(v.z, 0),
196                         _simd16_extract_ps(v.w, 0));
197 }
198 
swizzleLane2(const simd16vector & v)199 INLINE simd4scalar swizzleLane2(const simd16vector& v)
200 {
201     return swizzleLane2(_simd16_extract_ps(v.x, 0),
202                         _simd16_extract_ps(v.y, 0),
203                         _simd16_extract_ps(v.z, 0),
204                         _simd16_extract_ps(v.w, 0));
205 }
206 
swizzleLane3(const simd16vector & v)207 INLINE simd4scalar swizzleLane3(const simd16vector& v)
208 {
209     return swizzleLane3(_simd16_extract_ps(v.x, 0),
210                         _simd16_extract_ps(v.y, 0),
211                         _simd16_extract_ps(v.z, 0),
212                         _simd16_extract_ps(v.w, 0));
213 }
214 
swizzleLane4(const simd16vector & v)215 INLINE simd4scalar swizzleLane4(const simd16vector& v)
216 {
217     return swizzleLane4(_simd16_extract_ps(v.x, 0),
218                         _simd16_extract_ps(v.y, 0),
219                         _simd16_extract_ps(v.z, 0),
220                         _simd16_extract_ps(v.w, 0));
221 }
222 
swizzleLane5(const simd16vector & v)223 INLINE simd4scalar swizzleLane5(const simd16vector& v)
224 {
225     return swizzleLane5(_simd16_extract_ps(v.x, 0),
226                         _simd16_extract_ps(v.y, 0),
227                         _simd16_extract_ps(v.z, 0),
228                         _simd16_extract_ps(v.w, 0));
229 }
230 
swizzleLane6(const simd16vector & v)231 INLINE simd4scalar swizzleLane6(const simd16vector& v)
232 {
233     return swizzleLane6(_simd16_extract_ps(v.x, 0),
234                         _simd16_extract_ps(v.y, 0),
235                         _simd16_extract_ps(v.z, 0),
236                         _simd16_extract_ps(v.w, 0));
237 }
238 
swizzleLane7(const simd16vector & v)239 INLINE simd4scalar swizzleLane7(const simd16vector& v)
240 {
241     return swizzleLane7(_simd16_extract_ps(v.x, 0),
242                         _simd16_extract_ps(v.y, 0),
243                         _simd16_extract_ps(v.z, 0),
244                         _simd16_extract_ps(v.w, 0));
245 }
246 
swizzleLane8(const simd16vector & v)247 INLINE simd4scalar swizzleLane8(const simd16vector& v)
248 {
249     return swizzleLane0(_simd16_extract_ps(v.x, 1),
250                         _simd16_extract_ps(v.y, 1),
251                         _simd16_extract_ps(v.z, 1),
252                         _simd16_extract_ps(v.w, 1));
253 }
254 
swizzleLane9(const simd16vector & v)255 INLINE simd4scalar swizzleLane9(const simd16vector& v)
256 {
257     return swizzleLane1(_simd16_extract_ps(v.x, 1),
258                         _simd16_extract_ps(v.y, 1),
259                         _simd16_extract_ps(v.z, 1),
260                         _simd16_extract_ps(v.w, 1));
261 }
262 
swizzleLaneA(const simd16vector & v)263 INLINE simd4scalar swizzleLaneA(const simd16vector& v)
264 {
265     return swizzleLane2(_simd16_extract_ps(v.x, 1),
266                         _simd16_extract_ps(v.y, 1),
267                         _simd16_extract_ps(v.z, 1),
268                         _simd16_extract_ps(v.w, 1));
269 }
270 
swizzleLaneB(const simd16vector & v)271 INLINE simd4scalar swizzleLaneB(const simd16vector& v)
272 {
273     return swizzleLane3(_simd16_extract_ps(v.x, 1),
274                         _simd16_extract_ps(v.y, 1),
275                         _simd16_extract_ps(v.z, 1),
276                         _simd16_extract_ps(v.w, 1));
277 }
278 
swizzleLaneC(const simd16vector & v)279 INLINE simd4scalar swizzleLaneC(const simd16vector& v)
280 {
281     return swizzleLane4(_simd16_extract_ps(v.x, 1),
282                         _simd16_extract_ps(v.y, 1),
283                         _simd16_extract_ps(v.z, 1),
284                         _simd16_extract_ps(v.w, 1));
285 }
286 
swizzleLaneD(const simd16vector & v)287 INLINE simd4scalar swizzleLaneD(const simd16vector& v)
288 {
289     return swizzleLane5(_simd16_extract_ps(v.x, 1),
290                         _simd16_extract_ps(v.y, 1),
291                         _simd16_extract_ps(v.z, 1),
292                         _simd16_extract_ps(v.w, 1));
293 }
294 
swizzleLaneE(const simd16vector & v)295 INLINE simd4scalar swizzleLaneE(const simd16vector& v)
296 {
297     return swizzleLane6(_simd16_extract_ps(v.x, 1),
298                         _simd16_extract_ps(v.y, 1),
299                         _simd16_extract_ps(v.z, 1),
300                         _simd16_extract_ps(v.w, 1));
301 }
302 
swizzleLaneF(const simd16vector & v)303 INLINE simd4scalar swizzleLaneF(const simd16vector& v)
304 {
305     return swizzleLane7(_simd16_extract_ps(v.x, 1),
306                         _simd16_extract_ps(v.y, 1),
307                         _simd16_extract_ps(v.z, 1),
308                         _simd16_extract_ps(v.w, 1));
309 }
310 
swizzleLaneN(const simd16vector & v,int lane)311 INLINE simd4scalar swizzleLaneN(const simd16vector& v, int lane)
312 {
313     switch (lane)
314     {
315     case 0:
316         return swizzleLane0(v);
317     case 1:
318         return swizzleLane1(v);
319     case 2:
320         return swizzleLane2(v);
321     case 3:
322         return swizzleLane3(v);
323     case 4:
324         return swizzleLane4(v);
325     case 5:
326         return swizzleLane5(v);
327     case 6:
328         return swizzleLane6(v);
329     case 7:
330         return swizzleLane7(v);
331     case 8:
332         return swizzleLane8(v);
333     case 9:
334         return swizzleLane9(v);
335     case 10:
336         return swizzleLaneA(v);
337     case 11:
338         return swizzleLaneB(v);
339     case 12:
340         return swizzleLaneC(v);
341     case 13:
342         return swizzleLaneD(v);
343     case 14:
344         return swizzleLaneE(v);
345     case 15:
346         return swizzleLaneF(v);
347     default:
348         return _mm_setzero_ps();
349     }
350 }
351 
352 #endif
// Forward declarations of the per-topology primitive-assembly state functions.
// For each topology there are numbered SIMD state functions (returning bool),
// optional *_simd16 variants when ENABLE_AVX512_SIMD16 is set, and a *Single0
// function that takes a primIndex and writes simd4scalar vertices.
// NOTE(review): the numeric suffix appears to be the state index within the
// topology's state machine (PaTriList0 hands off to PaTriList1, etc.).

// Triangle list
bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Triangle strip
bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Triangle fan
bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Quad list
bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Line loop
bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Line list
bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Line strip
bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Points (single state)
bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

// Rect list
bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
426 
427 template <uint32_t TotalControlPoints>
PaPatchListSingle(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])428 void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
429 {
430     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
431     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
432     // Each attribute has 4 components.
433 
434     /// @todo Optimize this
435 
436 #if USE_SIMD16_FRONTEND
437     if (pa.useAlternateOffset)
438     {
439         primIndex += KNOB_SIMD_WIDTH;
440     }
441 
442 #endif
443     float* pOutVec = (float*)verts;
444 
445     for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
446     {
447         uint32_t input_cp = primIndex * TotalControlPoints + cp;
448 #if USE_SIMD16_FRONTEND
449         uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
450         uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
451 
452 #else
453         uint32_t input_vec  = input_cp / KNOB_SIMD_WIDTH;
454         uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
455 
456 #endif
457         // Loop over all components of the attribute
458         for (uint32_t i = 0; i < 4; ++i)
459         {
460 #if USE_SIMD16_FRONTEND
461             const float* pInputVec =
462                 (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
463 #else
464             const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
465 #endif
466             pOutVec[cp * 4 + i] = pInputVec[input_lane];
467         }
468     }
469 }
470 
471 template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
PaPatchList(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])472 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
473 {
474     SetNextPaState(pa,
475                    PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
476                    PaPatchListSingle<TotalControlPoints>);
477 
478     return false;
479 }
480 
481 template <uint32_t TotalControlPoints>
PaPatchListTerm(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])482 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
483 {
484     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
485     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
486     // Each attribute has 4 components.
487 
488     /// @todo Optimize this
489 
490 #if USE_SIMD16_FRONTEND
491     uint32_t lane_offset = 0;
492 
493     if (pa.useAlternateOffset)
494     {
495         lane_offset = KNOB_SIMD_WIDTH;
496     }
497 
498 #endif
499     // Loop over all components of the attribute
500     for (uint32_t i = 0; i < 4; ++i)
501     {
502         for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
503         {
504             float vec[KNOB_SIMD_WIDTH];
505             for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
506             {
507 #if USE_SIMD16_FRONTEND
508                 uint32_t input_cp   = (lane + lane_offset) * TotalControlPoints + cp;
509                 uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
510                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
511 
512                 const float* pInputVec =
513                     (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
514 #else
515                 uint32_t input_cp   = lane * TotalControlPoints + cp;
516                 uint32_t input_vec  = input_cp / KNOB_SIMD_WIDTH;
517                 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
518 
519                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
520 #endif
521                 vec[lane] = pInputVec[input_lane];
522             }
523             verts[cp][i] = _simd_loadu_ps(vec);
524         }
525     }
526 
527     SetNextPaState(pa,
528                    PaPatchList<TotalControlPoints>,
529                    PaPatchListSingle<TotalControlPoints>,
530                    0,
531                    PA_STATE_OPT::SIMD_WIDTH,
532                    true);
533 
534     return true;
535 }
536 
537 #if ENABLE_AVX512_SIMD16
538 template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
PaPatchList_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])539 static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
540 {
541     SetNextPaState_simd16(pa,
542                           PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
543                           PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
544                           PaPatchListSingle<TotalControlPoints>);
545 
546     return false;
547 }
548 
549 template <uint32_t TotalControlPoints>
PaPatchListTerm_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])550 static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
551 {
552     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
553     // KNOB_SIMD16_WIDTH * 1 patch.  This function is called once per attribute.
554     // Each attribute has 4 components.
555 
556     /// @todo Optimize this
557 
558     // Loop over all components of the attribute
559     for (uint32_t i = 0; i < 4; ++i)
560     {
561         for (uint32_t cp = 0; cp < TotalControlPoints; ++cp)
562         {
563             float vec[KNOB_SIMD16_WIDTH];
564             for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
565             {
566                 uint32_t input_cp   = lane * TotalControlPoints + cp;
567                 uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
568                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
569 
570                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
571                 vec[lane]              = pInputVec[input_lane];
572             }
573             verts[cp][i] = _simd16_loadu_ps(vec);
574         }
575     }
576 
577     SetNextPaState_simd16(pa,
578                           PaPatchList_simd16<TotalControlPoints>,
579                           PaPatchList<TotalControlPoints>,
580                           PaPatchListSingle<TotalControlPoints>,
581                           0,
582                           PA_STATE_OPT::SIMD_WIDTH,
583                           true);
584 
585     return true;
586 }
587 
588 #endif
589 #define PA_PATCH_LIST_TERMINATOR(N)                                              \
590     template <>                                                                  \
591     bool PaPatchList<N, N>(PA_STATE_OPT & pa, uint32_t slot, simdvector verts[]) \
592     {                                                                            \
593         return PaPatchListTerm<N>(pa, slot, verts);                              \
594     }
595 PA_PATCH_LIST_TERMINATOR(1)
596 PA_PATCH_LIST_TERMINATOR(2)
597 PA_PATCH_LIST_TERMINATOR(3)
598 PA_PATCH_LIST_TERMINATOR(4)
599 PA_PATCH_LIST_TERMINATOR(5)
600 PA_PATCH_LIST_TERMINATOR(6)
601 PA_PATCH_LIST_TERMINATOR(7)
602 PA_PATCH_LIST_TERMINATOR(8)
603 PA_PATCH_LIST_TERMINATOR(9)
604 PA_PATCH_LIST_TERMINATOR(10)
605 PA_PATCH_LIST_TERMINATOR(11)
606 PA_PATCH_LIST_TERMINATOR(12)
607 PA_PATCH_LIST_TERMINATOR(13)
608 PA_PATCH_LIST_TERMINATOR(14)
609 PA_PATCH_LIST_TERMINATOR(15)
610 PA_PATCH_LIST_TERMINATOR(16)
611 PA_PATCH_LIST_TERMINATOR(17)
612 PA_PATCH_LIST_TERMINATOR(18)
613 PA_PATCH_LIST_TERMINATOR(19)
614 PA_PATCH_LIST_TERMINATOR(20)
615 PA_PATCH_LIST_TERMINATOR(21)
616 PA_PATCH_LIST_TERMINATOR(22)
617 PA_PATCH_LIST_TERMINATOR(23)
618 PA_PATCH_LIST_TERMINATOR(24)
619 PA_PATCH_LIST_TERMINATOR(25)
620 PA_PATCH_LIST_TERMINATOR(26)
621 PA_PATCH_LIST_TERMINATOR(27)
622 PA_PATCH_LIST_TERMINATOR(28)
623 PA_PATCH_LIST_TERMINATOR(29)
624 PA_PATCH_LIST_TERMINATOR(30)
625 PA_PATCH_LIST_TERMINATOR(31)
626 PA_PATCH_LIST_TERMINATOR(32)
627 #undef PA_PATCH_LIST_TERMINATOR
628 
#if ENABLE_AVX512_SIMD16
// Explicit specializations PaPatchList_simd16<N, N> for N = 1..32.  Each one
// ends the SIMD16 chain by forwarding to PaPatchListTerm_simd16<N>.
#define PA_PATCH_LIST_TERMINATOR_SIMD16(CP_COUNT)                                    \
    template <>                                                                      \
    bool PaPatchList_simd16<CP_COUNT, CP_COUNT>(PA_STATE_OPT & pa,                   \
                                                uint32_t       slot,                 \
                                                simd16vector   verts[])              \
    {                                                                                \
        return PaPatchListTerm_simd16<CP_COUNT>(pa, slot, verts);                    \
    }
PA_PATCH_LIST_TERMINATOR_SIMD16(1)  PA_PATCH_LIST_TERMINATOR_SIMD16(2)
PA_PATCH_LIST_TERMINATOR_SIMD16(3)  PA_PATCH_LIST_TERMINATOR_SIMD16(4)
PA_PATCH_LIST_TERMINATOR_SIMD16(5)  PA_PATCH_LIST_TERMINATOR_SIMD16(6)
PA_PATCH_LIST_TERMINATOR_SIMD16(7)  PA_PATCH_LIST_TERMINATOR_SIMD16(8)
PA_PATCH_LIST_TERMINATOR_SIMD16(9)  PA_PATCH_LIST_TERMINATOR_SIMD16(10)
PA_PATCH_LIST_TERMINATOR_SIMD16(11) PA_PATCH_LIST_TERMINATOR_SIMD16(12)
PA_PATCH_LIST_TERMINATOR_SIMD16(13) PA_PATCH_LIST_TERMINATOR_SIMD16(14)
PA_PATCH_LIST_TERMINATOR_SIMD16(15) PA_PATCH_LIST_TERMINATOR_SIMD16(16)
PA_PATCH_LIST_TERMINATOR_SIMD16(17) PA_PATCH_LIST_TERMINATOR_SIMD16(18)
PA_PATCH_LIST_TERMINATOR_SIMD16(19) PA_PATCH_LIST_TERMINATOR_SIMD16(20)
PA_PATCH_LIST_TERMINATOR_SIMD16(21) PA_PATCH_LIST_TERMINATOR_SIMD16(22)
PA_PATCH_LIST_TERMINATOR_SIMD16(23) PA_PATCH_LIST_TERMINATOR_SIMD16(24)
PA_PATCH_LIST_TERMINATOR_SIMD16(25) PA_PATCH_LIST_TERMINATOR_SIMD16(26)
PA_PATCH_LIST_TERMINATOR_SIMD16(27) PA_PATCH_LIST_TERMINATOR_SIMD16(28)
PA_PATCH_LIST_TERMINATOR_SIMD16(29) PA_PATCH_LIST_TERMINATOR_SIMD16(30)
PA_PATCH_LIST_TERMINATOR_SIMD16(31) PA_PATCH_LIST_TERMINATOR_SIMD16(32)
#undef PA_PATCH_LIST_TERMINATOR_SIMD16

#endif
PaTriList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])671 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
672 {
673     SetNextPaState(pa, PaTriList1, PaTriListSingle0);
674     return false; // Not enough vertices to assemble 4 or 8 triangles.
675 }
676 
PaTriList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])677 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
678 {
679     SetNextPaState(pa, PaTriList2, PaTriListSingle0);
680     return false; // Not enough vertices to assemble 8 triangles.
681 }
682 
PaTriList2(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])683 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
684 {
685 #if KNOB_ARCH == KNOB_ARCH_AVX
686 #if USE_SIMD16_FRONTEND
687     simdvector a;
688     simdvector b;
689     simdvector c;
690 
691     if (!pa.useAlternateOffset)
692     {
693         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
694         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
695 
696         for (uint32_t i = 0; i < 4; i += 1)
697         {
698             a[i] = _simd16_extract_ps(a_16[i], 0);
699             b[i] = _simd16_extract_ps(a_16[i], 1);
700             c[i] = _simd16_extract_ps(b_16[i], 0);
701         }
702     }
703     else
704     {
705         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
706         const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
707 
708         for (uint32_t i = 0; i < 4; i += 1)
709         {
710             a[i] = _simd16_extract_ps(b_16[i], 1);
711             b[i] = _simd16_extract_ps(c_16[i], 0);
712             c[i] = _simd16_extract_ps(c_16[i], 1);
713         }
714     }
715 
716 #else
717     simdvector& a = PaGetSimdVector(pa, 0, slot);
718     simdvector& b = PaGetSimdVector(pa, 1, slot);
719     simdvector& c = PaGetSimdVector(pa, 2, slot);
720 
721 #endif
722     simdscalar s;
723 
724     // Tri Pattern - provoking vertex is always v0
725     //  v0 -> 0 3 6 9  12 15 18 21
726     //  v1 -> 1 4 7 10 13 16 19 22
727     //  v2 -> 2 5 8 11 14 17 20 23
728 
729     for (int i = 0; i < 4; ++i)
730     {
731         simdvector& v0 = verts[0];
732         v0[i]          = _simd_blend_ps(a[i], b[i], 0x92);
733         v0[i]          = _simd_blend_ps(v0[i], c[i], 0x24);
734         v0[i]          = _simd_permute_ps_i(v0[i], 0x6C);
735         s              = _simd_permute2f128_ps(v0[i], v0[i], 0x21);
736         v0[i]          = _simd_blend_ps(v0[i], s, 0x44);
737 
738         simdvector& v1 = verts[1];
739         v1[i]          = _simd_blend_ps(a[i], b[i], 0x24);
740         v1[i]          = _simd_blend_ps(v1[i], c[i], 0x49);
741         v1[i]          = _simd_permute_ps_i(v1[i], 0xB1);
742         s              = _simd_permute2f128_ps(v1[i], v1[i], 0x21);
743         v1[i]          = _simd_blend_ps(v1[i], s, 0x66);
744 
745         simdvector& v2 = verts[2];
746         v2[i]          = _simd_blend_ps(a[i], b[i], 0x49);
747         v2[i]          = _simd_blend_ps(v2[i], c[i], 0x92);
748         v2[i]          = _simd_permute_ps_i(v2[i], 0xC6);
749         s              = _simd_permute2f128_ps(v2[i], v2[i], 0x21);
750         v2[i]          = _simd_blend_ps(v2[i], s, 0x22);
751     }
752 
753 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
754     const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
755     const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
756     const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
757 
758 #if USE_SIMD16_FRONTEND
759     simdvector a;
760     simdvector b;
761     simdvector c;
762 
763     if (!pa.useAlternateOffset)
764     {
765         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
766         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
767 
768         for (uint32_t i = 0; i < 4; i += 1)
769         {
770             a[i] = _simd16_extract_ps(a_16[i], 0);
771             b[i] = _simd16_extract_ps(a_16[i], 1);
772             c[i] = _simd16_extract_ps(b_16[i], 0);
773         }
774     }
775     else
776     {
777         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
778         const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
779 
780         for (uint32_t i = 0; i < 4; i += 1)
781         {
782             a[i] = _simd16_extract_ps(b_16[i], 1);
783             b[i] = _simd16_extract_ps(c_16[i], 0);
784             c[i] = _simd16_extract_ps(c_16[i], 1);
785         }
786     }
787 
788 #else
789     const simdvector& a = PaGetSimdVector(pa, 0, slot);
790     const simdvector& b = PaGetSimdVector(pa, 1, slot);
791     const simdvector& c = PaGetSimdVector(pa, 2, slot);
792 
793 #endif
794     //  v0 -> a0 a3 a6 b1 b4 b7 c2 c5
795     //  v1 -> a1 a4 a7 b2 b5 c0 c3 c6
796     //  v2 -> a2 a5 b0 b3 b6 c1 c4 c7
797 
798     simdvector& v0 = verts[0];
799     simdvector& v1 = verts[1];
800     simdvector& v2 = verts[2];
801 
802     // for simd x, y, z, and w
803     for (int i = 0; i < 4; ++i)
804     {
805         simdscalar temp0 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
806         simdscalar temp1 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
807         simdscalar temp2 = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
808 
809         v0[i] = _simd_permute_ps(temp0, perm0);
810         v1[i] = _simd_permute_ps(temp1, perm1);
811         v2[i] = _simd_permute_ps(temp2, perm2);
812     }
813 
814 #endif
815     SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
816     return true;
817 }
818 
819 #if ENABLE_AVX512_SIMD16
PaTriList0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])820 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
821 {
822     SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
823     return false; // Not enough vertices to assemble 16 triangles
824 }
825 
PaTriList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])826 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
827 {
828     SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
829     return false; // Not enough vertices to assemble 16 triangles
830 }
831 
// Final step of the 16-wide triangle-list sequence.
//
// Reads the vertex data stored at indices 0, 1 and 2 (16 vertices each, 48 in
// total) and de-interleaves it so that lane N of verts[k] holds vertex k of
// triangle N, for each of the four components. Afterwards the sequence is
// restarted at PaTriList0_simd16. Always returns true.
//
// Bit patterns in the trailing comments below are written lane 0 first.
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    // clang-format off

#if KNOB_ARCH >= KNOB_ARCH_AVX2
    // Full 16-lane gather indices (arguments are listed from lane 15 down to lane 0).
    const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11,  8, 5, 2, 15, 12,  9, 6, 3, 0);
    const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12,  9, 6, 3,  0, 13, 10, 7, 4, 1);
    const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3,  0, 13, 10, 7, 4,  1, 14, 11, 8, 5, 2);
#else // KNOB_ARCH == KNOB_ARCH_AVX
    // On plain AVX these are only scratch registers for the permute/blend sequence in the loop.
    simd16scalar perm0 = _simd16_setzero_ps();
    simd16scalar perm1 = _simd16_setzero_ps();
    simd16scalar perm2 = _simd16_setzero_ps();
#endif

    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
    const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);

    // Lane-select masks: mask0 = lanes 2,5,8,11,14; mask1 = lanes 1,4,7,10,13; mask2 = lanes 0,3,6,9,12,15.
    const simd16mask mask0 = 0x4924;
    const simd16mask mask1 = 0x2492;
    const simd16mask mask2 = 0x9249;

    //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
    //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
    //  v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF

    simd16vector& v0 = verts[0];
    simd16vector& v1 = verts[1];
    simd16vector& v2 = verts[2];

    // for simd16 x, y, z, and w
    for (int i = 0; i < 4; i += 1)
    {
        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
        simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));

        // After blending, lane j of tempK holds element j of whichever input (a, b or c)
        // supplies that lane to vK; only the lane order is still wrong.
        simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask0), tempc, mask1);
        simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask2), tempc, mask0);
        simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask1), tempc, mask2);

#if KNOB_ARCH >= KNOB_ARCH_AVX2
        v0[i] = _simd16_permute_ps(temp0, perm0);
        v1[i] = _simd16_permute_ps(temp1, perm1);
        v2[i] = _simd16_permute_ps(temp2, perm2);
#else // #if KNOB_ARCH == KNOB_ARCH_AVX

        // the general permutes (above) are prohibitively slow to emulate on AVX (it's scalar code)

        temp0 = _simd16_permute_ps_i(temp0, 0x6C);           // (0, 3, 2, 1) => 00 11 10 01 => 0x6C
        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
        temp0 = _simd16_blend_ps(temp0, perm0, 0x4444);      // 0010 0010 0010 0010
        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
        v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838);      // 0001 1100 0001 1100

        temp1 = _simd16_permute_ps_i(temp1, 0xB1);           // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
        temp1 = _simd16_blend_ps(temp1, perm1, 0x6666);      // 0110 0110 0110 0110
        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
        v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818);      // 0001 1000 0001 1000

        temp2 = _simd16_permute_ps_i(temp2, 0xC6);           // (2, 1, 0, 3) => 10 01 00 11 => 0xC6
        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
        temp2 = _simd16_blend_ps(temp2, perm2, 0x2222);      // 0100 0100 0100 0100
        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
        v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C);      // 0011 1000 0011 1000
#endif
    }

    SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
    return true;

    // clang-format on
}
906 
907 #endif
PaTriListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])908 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
909 {
910 #if USE_SIMD16_FRONTEND
911     const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
912     const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
913     const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);
914 
915     if (pa.useAlternateOffset)
916     {
917         primIndex += KNOB_SIMD_WIDTH;
918     }
919 
920     //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
921     //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
922     //  v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
923 
924     switch (primIndex)
925     {
926     case 0:
927         verts[0] = swizzleLane0(a);
928         verts[1] = swizzleLane1(a);
929         verts[2] = swizzleLane2(a);
930         break;
931     case 1:
932         verts[0] = swizzleLane3(a);
933         verts[1] = swizzleLane4(a);
934         verts[2] = swizzleLane5(a);
935         break;
936     case 2:
937         verts[0] = swizzleLane6(a);
938         verts[1] = swizzleLane7(a);
939         verts[2] = swizzleLane8(a);
940         break;
941     case 3:
942         verts[0] = swizzleLane9(a);
943         verts[1] = swizzleLaneA(a);
944         verts[2] = swizzleLaneB(a);
945         break;
946     case 4:
947         verts[0] = swizzleLaneC(a);
948         verts[1] = swizzleLaneD(a);
949         verts[2] = swizzleLaneE(a);
950         break;
951     case 5:
952         verts[0] = swizzleLaneF(a);
953         verts[1] = swizzleLane0(b);
954         verts[2] = swizzleLane1(b);
955         break;
956     case 6:
957         verts[0] = swizzleLane2(b);
958         verts[1] = swizzleLane3(b);
959         verts[2] = swizzleLane4(b);
960         break;
961     case 7:
962         verts[0] = swizzleLane5(b);
963         verts[1] = swizzleLane6(b);
964         verts[2] = swizzleLane7(b);
965         break;
966     case 8:
967         verts[0] = swizzleLane8(b);
968         verts[1] = swizzleLane9(b);
969         verts[2] = swizzleLaneA(b);
970         break;
971     case 9:
972         verts[0] = swizzleLaneB(b);
973         verts[1] = swizzleLaneC(b);
974         verts[2] = swizzleLaneD(b);
975         break;
976     case 10:
977         verts[0] = swizzleLaneE(b);
978         verts[1] = swizzleLaneF(b);
979         verts[2] = swizzleLane0(c);
980         break;
981     case 11:
982         verts[0] = swizzleLane1(c);
983         verts[1] = swizzleLane2(c);
984         verts[2] = swizzleLane3(c);
985         break;
986     case 12:
987         verts[0] = swizzleLane4(c);
988         verts[1] = swizzleLane5(c);
989         verts[2] = swizzleLane6(c);
990         break;
991     case 13:
992         verts[0] = swizzleLane7(c);
993         verts[1] = swizzleLane8(c);
994         verts[2] = swizzleLane9(c);
995         break;
996     case 14:
997         verts[0] = swizzleLaneA(c);
998         verts[1] = swizzleLaneB(c);
999         verts[2] = swizzleLaneC(c);
1000         break;
1001     case 15:
1002         verts[0] = swizzleLaneD(c);
1003         verts[1] = swizzleLaneE(c);
1004         verts[2] = swizzleLaneF(c);
1005         break;
1006     };
1007 #else
1008     // We have 12 simdscalars contained within 3 simdvectors which
1009     // hold at least 8 triangles worth of data. We want to assemble a single
1010     // triangle with data in horizontal form.
1011 
1012     const simdvector& a = PaGetSimdVector(pa, 0, slot);
1013     const simdvector& b = PaGetSimdVector(pa, 1, slot);
1014     const simdvector& c = PaGetSimdVector(pa, 2, slot);
1015 
1016     // Convert from vertical to horizontal.
1017     // Tri Pattern - provoking vertex is always v0
1018     //  v0 -> 0 3 6 9  12 15 18 21
1019     //  v1 -> 1 4 7 10 13 16 19 22
1020     //  v2 -> 2 5 8 11 14 17 20 23
1021 
1022     switch (primIndex)
1023     {
1024     case 0:
1025         verts[0] = swizzleLane0(a);
1026         verts[1] = swizzleLane1(a);
1027         verts[2] = swizzleLane2(a);
1028         break;
1029     case 1:
1030         verts[0] = swizzleLane3(a);
1031         verts[1] = swizzleLane4(a);
1032         verts[2] = swizzleLane5(a);
1033         break;
1034     case 2:
1035         verts[0] = swizzleLane6(a);
1036         verts[1] = swizzleLane7(a);
1037         verts[2] = swizzleLane0(b);
1038         break;
1039     case 3:
1040         verts[0] = swizzleLane1(b);
1041         verts[1] = swizzleLane2(b);
1042         verts[2] = swizzleLane3(b);
1043         break;
1044     case 4:
1045         verts[0] = swizzleLane4(b);
1046         verts[1] = swizzleLane5(b);
1047         verts[2] = swizzleLane6(b);
1048         break;
1049     case 5:
1050         verts[0] = swizzleLane7(b);
1051         verts[1] = swizzleLane0(c);
1052         verts[2] = swizzleLane1(c);
1053         break;
1054     case 6:
1055         verts[0] = swizzleLane2(c);
1056         verts[1] = swizzleLane3(c);
1057         verts[2] = swizzleLane4(c);
1058         break;
1059     case 7:
1060         verts[0] = swizzleLane5(c);
1061         verts[1] = swizzleLane6(c);
1062         verts[2] = swizzleLane7(c);
1063         break;
1064     };
1065 #endif
1066 }
1067 
PaTriStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1068 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1069 {
1070     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
1071     return false; // Not enough vertices to assemble 8 triangles.
1072 }
1073 
PaTriStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1074 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1075 {
1076 #if USE_SIMD16_FRONTEND
1077     simdvector a;
1078     simdvector b;
1079 
1080     if (!pa.useAlternateOffset)
1081     {
1082         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1083 
1084         for (uint32_t i = 0; i < 4; i += 1)
1085         {
1086             a[i] = _simd16_extract_ps(a_16[i], 0);
1087             b[i] = _simd16_extract_ps(a_16[i], 1);
1088         }
1089     }
1090     else
1091     {
1092         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
1093 
1094         for (uint32_t i = 0; i < 4; i += 1)
1095         {
1096             a[i] = _simd16_extract_ps(b_16[i], 0);
1097             b[i] = _simd16_extract_ps(b_16[i], 1);
1098         }
1099     }
1100 
1101 #else
1102     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
1103     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
1104 
1105 #endif
1106     simdscalar s;
1107 
1108     for (int i = 0; i < 4; ++i)
1109     {
1110         simdscalar a0 = a[i];
1111         simdscalar b0 = b[i];
1112 
1113         // Tri Pattern - provoking vertex is always v0
1114         //  v0 -> 01234567
1115         //  v1 -> 13355779
1116         //  v2 -> 22446688
1117         simdvector& v0 = verts[0];
1118         v0[i]          = a0;
1119 
1120         //  s -> 4567891011
1121         s = _simd_permute2f128_ps(a0, b0, 0x21);
1122         //  s -> 23456789
1123         s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1124 
1125         simdvector& v1 = verts[1];
1126         //  v1 -> 13355779
1127         v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1));
1128 
1129         simdvector& v2 = verts[2];
1130         //  v2 -> 22446688
1131         v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2));
1132     }
1133 
1134     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1135     return true;
1136 }
1137 
1138 #if ENABLE_AVX512_SIMD16
PaTriStrip0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1139 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1140 {
1141     SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
1142     return false; // Not enough vertices to assemble 16 triangles.
1143 }
1144 
PaTriStrip1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1145 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1146 {
1147     // clang-format off
1148 
1149     const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1150     const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1151 
1152     const simd16mask mask0 = 0xF000;
1153 
1154     //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1155     //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1156     //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1157 
1158     simd16vector& v0 = verts[0];
1159     simd16vector& v1 = verts[1];
1160     simd16vector& v2 = verts[2];
1161 
1162     // for simd16 x, y, z, and w
1163     for (int i = 0; i < 4; i += 1)
1164     {
1165         simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1166         simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1167 
1168         simd16scalar perm0 = _simd16_permute2f128_ps(tempa, tempa, 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
1169         simd16scalar perm1 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1170 
1171         simd16scalar blend = _simd16_blend_ps(perm0, perm1, mask0);                                  // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
1172         simd16scalar shuff = _simd16_shuffle_ps(tempa, blend, _MM_SHUFFLE(1, 0, 3, 2));              // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
1173 
1174         v0[i] = tempa;                                                                               // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1175         v1[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1176         v2[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(2, 2, 2, 2));                           // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1177     }
1178 
1179     SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1180     return true;
1181 
1182     // clang-format on
1183 }
1184 
1185 #endif
PaTriStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1186 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1187 {
1188 #if USE_SIMD16_FRONTEND
1189     const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
1190     const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
1191 
1192     if (pa.useAlternateOffset)
1193     {
1194         primIndex += KNOB_SIMD_WIDTH;
1195     }
1196 
1197     //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
1198     //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
1199     //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
1200 
1201     switch (primIndex)
1202     {
1203     case 0:
1204         verts[0] = swizzleLane0(a);
1205         verts[1] = swizzleLane1(a);
1206         verts[2] = swizzleLane2(a);
1207         break;
1208     case 1:
1209         verts[0] = swizzleLane1(a);
1210         verts[1] = swizzleLane3(a);
1211         verts[2] = swizzleLane2(a);
1212         break;
1213     case 2:
1214         verts[0] = swizzleLane2(a);
1215         verts[1] = swizzleLane3(a);
1216         verts[2] = swizzleLane4(a);
1217         break;
1218     case 3:
1219         verts[0] = swizzleLane3(a);
1220         verts[1] = swizzleLane5(a);
1221         verts[2] = swizzleLane4(a);
1222         break;
1223     case 4:
1224         verts[0] = swizzleLane4(a);
1225         verts[1] = swizzleLane5(a);
1226         verts[2] = swizzleLane6(a);
1227         break;
1228     case 5:
1229         verts[0] = swizzleLane5(a);
1230         verts[1] = swizzleLane7(a);
1231         verts[2] = swizzleLane6(a);
1232         break;
1233     case 6:
1234         verts[0] = swizzleLane6(a);
1235         verts[1] = swizzleLane7(a);
1236         verts[2] = swizzleLane8(a);
1237         break;
1238     case 7:
1239         verts[0] = swizzleLane7(a);
1240         verts[1] = swizzleLane9(a);
1241         verts[2] = swizzleLane8(a);
1242         break;
1243     case 8:
1244         verts[0] = swizzleLane8(a);
1245         verts[1] = swizzleLane9(a);
1246         verts[2] = swizzleLaneA(a);
1247         break;
1248     case 9:
1249         verts[0] = swizzleLane9(a);
1250         verts[1] = swizzleLaneB(a);
1251         verts[2] = swizzleLaneA(a);
1252         break;
1253     case 10:
1254         verts[0] = swizzleLaneA(a);
1255         verts[1] = swizzleLaneB(a);
1256         verts[2] = swizzleLaneC(a);
1257         break;
1258     case 11:
1259         verts[0] = swizzleLaneB(a);
1260         verts[1] = swizzleLaneD(a);
1261         verts[2] = swizzleLaneC(a);
1262         break;
1263     case 12:
1264         verts[0] = swizzleLaneC(a);
1265         verts[1] = swizzleLaneD(a);
1266         verts[2] = swizzleLaneE(a);
1267         break;
1268     case 13:
1269         verts[0] = swizzleLaneD(a);
1270         verts[1] = swizzleLaneF(a);
1271         verts[2] = swizzleLaneE(a);
1272         break;
1273     case 14:
1274         verts[0] = swizzleLaneE(a);
1275         verts[1] = swizzleLaneF(a);
1276         verts[2] = swizzleLane0(b);
1277         break;
1278     case 15:
1279         verts[0] = swizzleLaneF(a);
1280         verts[1] = swizzleLane1(b);
1281         verts[2] = swizzleLane0(b);
1282         break;
1283     };
1284 #else
1285     const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
1286     const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
1287 
1288     // Convert from vertical to horizontal.
1289     // Tri Pattern - provoking vertex is always v0
1290     //  v0 -> 01234567
1291     //  v1 -> 13355779
1292     //  v2 -> 22446688
1293 
1294     switch (primIndex)
1295     {
1296     case 0:
1297         verts[0] = swizzleLane0(a);
1298         verts[1] = swizzleLane1(a);
1299         verts[2] = swizzleLane2(a);
1300         break;
1301     case 1:
1302         verts[0] = swizzleLane1(a);
1303         verts[1] = swizzleLane3(a);
1304         verts[2] = swizzleLane2(a);
1305         break;
1306     case 2:
1307         verts[0] = swizzleLane2(a);
1308         verts[1] = swizzleLane3(a);
1309         verts[2] = swizzleLane4(a);
1310         break;
1311     case 3:
1312         verts[0] = swizzleLane3(a);
1313         verts[1] = swizzleLane5(a);
1314         verts[2] = swizzleLane4(a);
1315         break;
1316     case 4:
1317         verts[0] = swizzleLane4(a);
1318         verts[1] = swizzleLane5(a);
1319         verts[2] = swizzleLane6(a);
1320         break;
1321     case 5:
1322         verts[0] = swizzleLane5(a);
1323         verts[1] = swizzleLane7(a);
1324         verts[2] = swizzleLane6(a);
1325         break;
1326     case 6:
1327         verts[0] = swizzleLane6(a);
1328         verts[1] = swizzleLane7(a);
1329         verts[2] = swizzleLane0(b);
1330         break;
1331     case 7:
1332         verts[0] = swizzleLane7(a);
1333         verts[1] = swizzleLane1(b);
1334         verts[2] = swizzleLane0(b);
1335         break;
1336     };
1337 #endif
1338 }
1339 
PaTriFan0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1340 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1341 {
1342     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
1343     return false; // Not enough vertices to assemble 8 triangles.
1344 }
1345 
PaTriFan1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1346 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1347 {
1348 #if USE_SIMD16_FRONTEND
1349     simdvector leadVert;
1350     simdvector a;
1351     simdvector b;
1352 
1353     const simd16vector& leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1354 
1355     if (!pa.useAlternateOffset)
1356     {
1357         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
1358 
1359         for (uint32_t i = 0; i < 4; i += 1)
1360         {
1361             leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1362 
1363             a[i] = _simd16_extract_ps(a_16[i], 0);
1364             b[i] = _simd16_extract_ps(a_16[i], 1);
1365         }
1366     }
1367     else
1368     {
1369         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
1370 
1371         for (uint32_t i = 0; i < 4; i += 1)
1372         {
1373             leadVert[i] = _simd16_extract_ps(leadvert_16[i], 0);
1374 
1375             a[i] = _simd16_extract_ps(b_16[i], 0);
1376             b[i] = _simd16_extract_ps(b_16[i], 1);
1377         }
1378     }
1379 
1380 #else
1381     const simdvector& leadVert = PaGetSimdVector(pa, pa.first, slot);
1382     const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
1383     const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
1384 
1385 #endif
1386     simdscalar s;
1387 
1388     // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
1389     for (int i = 0; i < 4; ++i)
1390     {
1391         simdscalar a0 = a[i];
1392         simdscalar b0 = b[i];
1393 
1394         simdscalar comp = leadVert[i];
1395 
1396         simdvector& v0 = verts[0];
1397         v0[i]          = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
1398         v0[i]          = _simd_permute2f128_ps(v0[i], comp, 0x00);
1399 
1400         simdvector& v2 = verts[2];
1401         s              = _simd_permute2f128_ps(a0, b0, 0x21);
1402         v2[i]          = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
1403 
1404         simdvector& v1 = verts[1];
1405         v1[i]          = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
1406     }
1407 
1408     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1409     return true;
1410 }
1411 
1412 #if ENABLE_AVX512_SIMD16
PaTriFan0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1413 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1414 {
1415     SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
1416     return false; // Not enough vertices to assemble 16 triangles.
1417 }
1418 
PaTriFan1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1419 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1420 {
1421     // clang-format off
1422 
1423     const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
1424     const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1425     const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1426 
1427     const simd16mask mask0 = 0xF000;
1428 
1429     //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1430     //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1431     //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1432 
1433     simd16vector& v0 = verts[0];
1434     simd16vector& v1 = verts[1];
1435     simd16vector& v2 = verts[2];
1436 
1437     // for simd16 x, y, z, and w
1438     for (uint32_t i = 0; i < 4; i += 1)
1439     {
1440         simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1441         simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1442         simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));
1443 
1444         simd16scalar shuff = _simd16_shuffle_ps(tempa, tempa, _MM_SHUFFLE(0, 0, 0, 0));              // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
1445 
1446         v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00);                                         // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1447 
1448         simd16scalar temp0 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
1449         simd16scalar temp1 = _simd16_permute2f128_ps(tempc, tempc, 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
1450 
1451         simd16scalar blend = _simd16_blend_ps(temp0, temp1, mask0);                                  // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
1452 
1453         simd16scalar temp2 = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2));              // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1454 
1455         v1[i] = _simd16_shuffle_ps(tempb, temp2, _MM_SHUFFLE(2, 1, 2, 1));                           // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1456         v2[i] = temp2;                                                                               // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1457     }
1458 
1459     SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1460     return true;
1461 
1462     // clang-format on
1463 }
1464 
1465 #endif
PaTriFanSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1466 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1467 {
1468 #if USE_SIMD16_FRONTEND
1469     const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
1470     const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
1471     const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
1472 
1473     if (pa.useAlternateOffset)
1474     {
1475         primIndex += KNOB_SIMD_WIDTH;
1476     }
1477 
1478     //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
1479     //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
1480     //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
1481 
1482     // vert 0 from leading vertex
1483     verts[0] = swizzleLane0(a);
1484 
1485     // vert 1
1486     if (primIndex < 15)
1487     {
1488         verts[1] = swizzleLaneN(b, primIndex + 1);
1489     }
1490     else
1491     {
1492         verts[1] = swizzleLane0(c);
1493     }
1494 
1495     // vert 2
1496     if (primIndex < 14)
1497     {
1498         verts[2] = swizzleLaneN(b, primIndex + 2);
1499     }
1500     else
1501     {
1502         verts[2] = swizzleLaneN(c, primIndex - 14);
1503     }
1504 #else
1505     const simdvector& a = PaGetSimdVector(pa, pa.first, slot);
1506     const simdvector& b = PaGetSimdVector(pa, pa.prev, slot);
1507     const simdvector& c = PaGetSimdVector(pa, pa.cur, slot);
1508 
1509     // vert 0 from leading vertex
1510     verts[0] = swizzleLane0(a);
1511 
1512     // vert 1
1513     if (primIndex < 7)
1514     {
1515         verts[1] = swizzleLaneN(b, primIndex + 1);
1516     }
1517     else
1518     {
1519         verts[1] = swizzleLane0(c);
1520     }
1521 
1522     // vert 2
1523     if (primIndex < 6)
1524     {
1525         verts[2] = swizzleLaneN(b, primIndex + 2);
1526     }
1527     else
1528     {
1529         verts[2] = swizzleLaneN(c, primIndex - 6);
1530     }
1531 #endif
1532 }
1533 
PaQuadList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1534 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1535 {
1536     SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
1537     return false; // Not enough vertices to assemble 8 triangles.
1538 }
1539 
PaQuadList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1540 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1541 {
1542 #if USE_SIMD16_FRONTEND
1543     simdvector a;
1544     simdvector b;
1545 
1546     if (!pa.useAlternateOffset)
1547     {
1548         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1549 
1550         for (uint32_t i = 0; i < 4; i += 1)
1551         {
1552             a[i] = _simd16_extract_ps(a_16[i], 0);
1553             b[i] = _simd16_extract_ps(a_16[i], 1);
1554         }
1555     }
1556     else
1557     {
1558         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1559 
1560         for (uint32_t i = 0; i < 4; i += 1)
1561         {
1562             a[i] = _simd16_extract_ps(b_16[i], 0);
1563             b[i] = _simd16_extract_ps(b_16[i], 1);
1564         }
1565     }
1566 
1567 #else
1568     simdvector& a = PaGetSimdVector(pa, 0, slot);
1569     simdvector& b = PaGetSimdVector(pa, 1, slot);
1570 
1571 #endif
1572     simdscalar s1, s2;
1573 
1574     for (int i = 0; i < 4; ++i)
1575     {
1576         simdscalar a0 = a[i];
1577         simdscalar b0 = b[i];
1578 
1579         s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
1580         s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
1581 
1582         simdvector& v0 = verts[0];
1583         v0[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
1584 
1585         simdvector& v1 = verts[1];
1586         v1[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
1587 
1588         simdvector& v2 = verts[2];
1589         v2[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
1590     }
1591 
1592     SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1593     return true;
1594 }
1595 
1596 #if ENABLE_AVX512_SIMD16
PaQuadList0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1597 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1598 {
1599     SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
1600     return false; // Not enough vertices to assemble 16 triangles.
1601 }
1602 
PaQuadList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1603 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1604 {
1605     // clang-format off
1606 
1607     const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
1608     const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
1609 
1610     //  v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC
1611     //  v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
1612     //  v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1613 
1614     simd16vector& v0 = verts[0];
1615     simd16vector& v1 = verts[1];
1616     simd16vector& v2 = verts[2];
1617 
1618     // for simd16 x, y, z, and w
1619     for (uint32_t i = 0; i < 4; i += 1)
1620     {
1621         simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1622         simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1623 
1624         simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
1625         simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1626 
1627         v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0));                           // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
1628         v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1));                           // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE
1629         v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2));                           // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
1630     }
1631 
1632     SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1633     return true;
1634 
1635     // clang-format on
1636 }
1637 
1638 #endif
PaQuadListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1639 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1640 {
1641 #if USE_SIMD16_FRONTEND
1642     const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
1643     const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
1644 
1645     if (pa.useAlternateOffset)
1646     {
1647         primIndex += KNOB_SIMD_WIDTH;
1648     }
1649 
1650     switch (primIndex)
1651     {
1652     case 0:
1653         // triangle 0 - 0 1 2
1654         verts[0] = swizzleLane0(a);
1655         verts[1] = swizzleLane1(a);
1656         verts[2] = swizzleLane2(a);
1657         break;
1658     case 1:
1659         // triangle 1 - 0 2 3
1660         verts[0] = swizzleLane0(a);
1661         verts[1] = swizzleLane2(a);
1662         verts[2] = swizzleLane3(a);
1663         break;
1664     case 2:
1665         // triangle 2 - 4 5 6
1666         verts[0] = swizzleLane4(a);
1667         verts[1] = swizzleLane5(a);
1668         verts[2] = swizzleLane6(a);
1669         break;
1670     case 3:
1671         // triangle 3 - 4 6 7
1672         verts[0] = swizzleLane4(a);
1673         verts[1] = swizzleLane6(a);
1674         verts[2] = swizzleLane7(a);
1675         break;
1676     case 4:
1677         // triangle 4 - 8 9 A
1678         verts[0] = swizzleLane8(a);
1679         verts[1] = swizzleLane9(a);
1680         verts[2] = swizzleLaneA(a);
1681         break;
1682     case 5:
1683         // triangle 5 - 8 A B
1684         verts[0] = swizzleLane8(a);
1685         verts[1] = swizzleLaneA(a);
1686         verts[2] = swizzleLaneB(a);
1687         break;
1688     case 6:
1689         // triangle 6 - C D E
1690         verts[0] = swizzleLaneC(a);
1691         verts[1] = swizzleLaneD(a);
1692         verts[2] = swizzleLaneE(a);
1693         break;
1694     case 7:
1695         // triangle 7 - C E F
1696         verts[0] = swizzleLaneC(a);
1697         verts[1] = swizzleLaneE(a);
1698         verts[2] = swizzleLaneF(a);
1699         break;
1700     case 8:
1701         // triangle 0 - 0 1 2
1702         verts[0] = swizzleLane0(b);
1703         verts[1] = swizzleLane1(b);
1704         verts[2] = swizzleLane2(b);
1705         break;
1706     case 9:
1707         // triangle 1 - 0 2 3
1708         verts[0] = swizzleLane0(b);
1709         verts[1] = swizzleLane2(b);
1710         verts[2] = swizzleLane3(b);
1711         break;
1712     case 10:
1713         // triangle 2 - 4 5 6
1714         verts[0] = swizzleLane4(b);
1715         verts[1] = swizzleLane5(b);
1716         verts[2] = swizzleLane6(b);
1717         break;
1718     case 11:
1719         // triangle 3 - 4 6 7
1720         verts[0] = swizzleLane4(b);
1721         verts[1] = swizzleLane6(b);
1722         verts[2] = swizzleLane7(b);
1723         break;
1724     case 12:
1725         // triangle 4 - 8 9 A
1726         verts[0] = swizzleLane8(b);
1727         verts[1] = swizzleLane9(b);
1728         verts[2] = swizzleLaneA(b);
1729         break;
1730     case 13:
1731         // triangle 5 - 8 A B
1732         verts[0] = swizzleLane8(b);
1733         verts[1] = swizzleLaneA(b);
1734         verts[2] = swizzleLaneB(b);
1735         break;
1736     case 14:
1737         // triangle 6 - C D E
1738         verts[0] = swizzleLaneC(b);
1739         verts[1] = swizzleLaneD(b);
1740         verts[2] = swizzleLaneE(b);
1741         break;
1742     case 15:
1743         // triangle 7 - C E F
1744         verts[0] = swizzleLaneC(b);
1745         verts[1] = swizzleLaneE(b);
1746         verts[2] = swizzleLaneF(b);
1747         break;
1748     }
1749 #else
1750     const simdvector& a = PaGetSimdVector(pa, 0, slot);
1751     const simdvector& b = PaGetSimdVector(pa, 1, slot);
1752 
1753     switch (primIndex)
1754     {
1755     case 0:
1756         // triangle 0 - 0 1 2
1757         verts[0] = swizzleLane0(a);
1758         verts[1] = swizzleLane1(a);
1759         verts[2] = swizzleLane2(a);
1760         break;
1761     case 1:
1762         // triangle 1 - 0 2 3
1763         verts[0] = swizzleLane0(a);
1764         verts[1] = swizzleLane2(a);
1765         verts[2] = swizzleLane3(a);
1766         break;
1767     case 2:
1768         // triangle 2 - 4 5 6
1769         verts[0] = swizzleLane4(a);
1770         verts[1] = swizzleLane5(a);
1771         verts[2] = swizzleLane6(a);
1772         break;
1773     case 3:
1774         // triangle 3 - 4 6 7
1775         verts[0] = swizzleLane4(a);
1776         verts[1] = swizzleLane6(a);
1777         verts[2] = swizzleLane7(a);
1778         break;
1779     case 4:
1780         // triangle 4 - 8 9 10 (0 1 2)
1781         verts[0] = swizzleLane0(b);
1782         verts[1] = swizzleLane1(b);
1783         verts[2] = swizzleLane2(b);
1784         break;
1785     case 5:
1786         // triangle 1 - 0 2 3
1787         verts[0] = swizzleLane0(b);
1788         verts[1] = swizzleLane2(b);
1789         verts[2] = swizzleLane3(b);
1790         break;
1791     case 6:
1792         // triangle 2 - 4 5 6
1793         verts[0] = swizzleLane4(b);
1794         verts[1] = swizzleLane5(b);
1795         verts[2] = swizzleLane6(b);
1796         break;
1797     case 7:
1798         // triangle 3 - 4 6 7
1799         verts[0] = swizzleLane4(b);
1800         verts[1] = swizzleLane6(b);
1801         verts[2] = swizzleLane7(b);
1802         break;
1803     }
1804 #endif
1805 }
1806 
PaLineLoop0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1807 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1808 {
1809     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0);
1810     return false;
1811 }
1812 
PaLineLoop1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1813 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1814 {
1815     PaLineStrip1(pa, slot, verts);
1816 
1817     if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1)
1818     {
1819         // loop reconnect now
1820         const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1821 
1822 #if USE_SIMD16_FRONTEND
1823         simdvector first;
1824 
1825         const simd16vector& first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
1826 
1827         if (!pa.useAlternateOffset)
1828         {
1829             for (uint32_t i = 0; i < 4; i += 1)
1830             {
1831                 first[i] = _simd16_extract_ps(first_16[i], 0);
1832             }
1833         }
1834         else
1835         {
1836             for (uint32_t i = 0; i < 4; i += 1)
1837             {
1838                 first[i] = _simd16_extract_ps(first_16[i], 1);
1839             }
1840         }
1841 
1842 #else
1843         simdvector& first = PaGetSimdVector(pa, pa.first, slot);
1844 
1845 #endif
1846         for (int i = 0; i < 4; i++)
1847         {
1848             float* firstVtx  = (float*)&(first[i]);
1849             float* targetVtx = (float*)&(verts[1][i]);
1850             targetVtx[lane]  = firstVtx[0];
1851         }
1852     }
1853 
1854     SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1855     return true;
1856 }
1857 
1858 #if ENABLE_AVX512_SIMD16
PaLineLoop0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1859 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1860 {
1861     SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
1862     return false;
1863 }
1864 
PaLineLoop1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1865 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1866 {
1867     PaLineStrip1_simd16(pa, slot, verts);
1868 
1869     if (pa.numPrimsComplete + KNOB_SIMD16_WIDTH > pa.numPrims - 1)
1870     {
1871         // loop reconnect now
1872         const int lane = pa.numPrims - pa.numPrimsComplete - 1;
1873 
1874         const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
1875 
1876         for (int i = 0; i < 4; i++)
1877         {
1878             float* firstVtx  = (float*)&(first[i]);
1879             float* targetVtx = (float*)&(verts[1][i]);
1880             targetVtx[lane]  = firstVtx[0];
1881         }
1882     }
1883 
1884     SetNextPaState_simd16(
1885         pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
1886     return true;
1887 }
1888 
1889 #endif
PaLineLoopSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])1890 void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
1891 {
1892     PaLineStripSingle0(pa, slot, primIndex, verts);
1893 
1894     if (pa.numPrimsComplete + primIndex == pa.numPrims - 1)
1895     {
1896 #if USE_SIMD16_FRONTEND
1897         const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
1898 
1899         verts[1] = swizzleLane0(first);
1900 #else
1901         const simdvector& first = PaGetSimdVector(pa, pa.first, slot);
1902 
1903         verts[1] = swizzleLane0(first);
1904 #endif
1905     }
1906 }
1907 
PaLineList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1908 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1909 {
1910     SetNextPaState(pa, PaLineList1, PaLineListSingle0);
1911     return false; // Not enough vertices to assemble 8 lines
1912 }
1913 
PaLineList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])1914 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
1915 {
1916 #if USE_SIMD16_FRONTEND
1917     simdvector a;
1918     simdvector b;
1919 
1920     if (!pa.useAlternateOffset)
1921     {
1922         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
1923 
1924         for (uint32_t i = 0; i < 4; i += 1)
1925         {
1926             a[i] = _simd16_extract_ps(a_16[i], 0);
1927             b[i] = _simd16_extract_ps(a_16[i], 1);
1928         }
1929     }
1930     else
1931     {
1932         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
1933 
1934         for (uint32_t i = 0; i < 4; i += 1)
1935         {
1936             a[i] = _simd16_extract_ps(b_16[i], 0);
1937             b[i] = _simd16_extract_ps(b_16[i], 1);
1938         }
1939     }
1940 
1941 #else
1942     simdvector& a = PaGetSimdVector(pa, 0, slot);
1943     simdvector& b = PaGetSimdVector(pa, 1, slot);
1944 
1945 #endif
1946     /// @todo: verify provoking vertex is correct
1947     // Line list 0  1  2  3  4  5  6  7
1948     //           8  9 10 11 12 13 14 15
1949 
1950     // shuffle:
1951     //           0 2 4 6 8 10 12 14
1952     //           1 3 5 7 9 11 13 15
1953 
1954     for (uint32_t i = 0; i < 4; ++i)
1955     {
1956         // 0 1 2 3 8 9 10 11
1957         __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20);
1958         // 4 5 6 7 12 13 14 15
1959         __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31);
1960 
1961         // 0 2 4 6 8 10 12 14
1962         verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0));
1963         // 1 3 5 7 9 11 13 15
1964         verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1));
1965     }
1966 
1967     SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
1968     return true;
1969 }
1970 
1971 #if ENABLE_AVX512_SIMD16
PaLineList0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1972 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1973 {
1974     SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
1975     return false; // Not enough vertices to assemble 16 lines
1976 }
1977 
PaLineList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])1978 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
1979 {
1980     // clang-format off
1981 
1982     const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
1983     const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
1984 
1985     // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
1986     // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b4 b7 b9 bB bD bF
1987 
1988     simd16vector& v0 = verts[0];
1989     simd16vector& v1 = verts[1];
1990 
1991     // for simd16 x, y, z, and w
1992     for (int i = 0; i < 4; i += 1)
1993     {
1994         simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
1995         simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
1996 
1997         simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) 10 00 10 00   // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB
1998         simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) 11 01 11 01   // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
1999 
2000         v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));                           // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
2001         v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
2002     }
2003 
2004     SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2005     return true;
2006 
2007     // clang-format on
2008 }
2009 
2010 #endif
PaLineListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])2011 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2012 {
2013 #if USE_SIMD16_FRONTEND
2014     const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
2015     const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
2016 
2017     if (pa.useAlternateOffset)
2018     {
2019         primIndex += KNOB_SIMD_WIDTH;
2020     }
2021 
2022     switch (primIndex)
2023     {
2024     case 0:
2025         verts[0] = swizzleLane0(a);
2026         verts[1] = swizzleLane1(a);
2027         break;
2028     case 1:
2029         verts[0] = swizzleLane2(a);
2030         verts[1] = swizzleLane3(a);
2031         break;
2032     case 2:
2033         verts[0] = swizzleLane4(a);
2034         verts[1] = swizzleLane5(a);
2035         break;
2036     case 3:
2037         verts[0] = swizzleLane6(a);
2038         verts[1] = swizzleLane7(a);
2039         break;
2040     case 4:
2041         verts[0] = swizzleLane8(a);
2042         verts[1] = swizzleLane9(a);
2043         break;
2044     case 5:
2045         verts[0] = swizzleLaneA(a);
2046         verts[1] = swizzleLaneB(a);
2047         break;
2048     case 6:
2049         verts[0] = swizzleLaneC(a);
2050         verts[1] = swizzleLaneD(a);
2051         break;
2052     case 7:
2053         verts[0] = swizzleLaneE(a);
2054         verts[1] = swizzleLaneF(a);
2055         break;
2056     case 8:
2057         verts[0] = swizzleLane0(b);
2058         verts[1] = swizzleLane1(b);
2059         break;
2060     case 9:
2061         verts[0] = swizzleLane2(b);
2062         verts[1] = swizzleLane3(b);
2063         break;
2064     case 10:
2065         verts[0] = swizzleLane4(b);
2066         verts[1] = swizzleLane5(b);
2067         break;
2068     case 11:
2069         verts[0] = swizzleLane6(b);
2070         verts[1] = swizzleLane7(b);
2071         break;
2072     case 12:
2073         verts[0] = swizzleLane8(b);
2074         verts[1] = swizzleLane9(b);
2075         break;
2076     case 13:
2077         verts[0] = swizzleLaneA(b);
2078         verts[1] = swizzleLaneB(b);
2079         break;
2080     case 14:
2081         verts[0] = swizzleLaneC(b);
2082         verts[1] = swizzleLaneD(b);
2083         break;
2084     case 15:
2085         verts[0] = swizzleLaneE(b);
2086         verts[1] = swizzleLaneF(b);
2087         break;
2088     }
2089 #else
2090     const simdvector& a = PaGetSimdVector(pa, 0, slot);
2091     const simdvector& b = PaGetSimdVector(pa, 1, slot);
2092 
2093     switch (primIndex)
2094     {
2095     case 0:
2096         verts[0] = swizzleLane0(a);
2097         verts[1] = swizzleLane1(a);
2098         break;
2099     case 1:
2100         verts[0] = swizzleLane2(a);
2101         verts[1] = swizzleLane3(a);
2102         break;
2103     case 2:
2104         verts[0] = swizzleLane4(a);
2105         verts[1] = swizzleLane5(a);
2106         break;
2107     case 3:
2108         verts[0] = swizzleLane6(a);
2109         verts[1] = swizzleLane7(a);
2110         break;
2111     case 4:
2112         verts[0] = swizzleLane0(b);
2113         verts[1] = swizzleLane1(b);
2114         break;
2115     case 5:
2116         verts[0] = swizzleLane2(b);
2117         verts[1] = swizzleLane3(b);
2118         break;
2119     case 6:
2120         verts[0] = swizzleLane4(b);
2121         verts[1] = swizzleLane5(b);
2122         break;
2123     case 7:
2124         verts[0] = swizzleLane6(b);
2125         verts[1] = swizzleLane7(b);
2126         break;
2127     }
2128 #endif
2129 }
2130 
PaLineStrip0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2131 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2132 {
2133     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
2134     return false; // Not enough vertices to assemble 8 lines
2135 }
2136 
PaLineStrip1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2137 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2138 {
2139 #if USE_SIMD16_FRONTEND
2140     simdvector a;
2141     simdvector b;
2142 
2143     if (!pa.useAlternateOffset)
2144     {
2145         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
2146 
2147         for (uint32_t i = 0; i < 4; i += 1)
2148         {
2149             a[i] = _simd16_extract_ps(a_16[i], 0);
2150             b[i] = _simd16_extract_ps(a_16[i], 1);
2151         }
2152     }
2153     else
2154     {
2155         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
2156 
2157         for (uint32_t i = 0; i < 4; i += 1)
2158         {
2159             a[i] = _simd16_extract_ps(b_16[i], 0);
2160             b[i] = _simd16_extract_ps(b_16[i], 1);
2161         }
2162     }
2163 
2164 #else
2165     simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
2166     simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
2167 
2168 #endif
2169     /// @todo: verify provoking vertex is correct
2170     // Line list 0  1  2  3  4  5  6  7
2171     //           8  9 10 11 12 13 14 15
2172 
2173     // shuffle:
2174     //           0  1  2  3  4  5  6  7
2175     //           1  2  3  4  5  6  7  8
2176 
2177     verts[0] = a;
2178 
2179     for (uint32_t i = 0; i < 4; ++i)
2180     {
2181         // 1 2 3 x 5 6 7 x
2182         __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
2183         // 4 5 6 7 8 9 10 11
2184         __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
2185 
2186         // x x x 4 x x x 8
2187         __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low  (0 0 0 0)
2188 
2189         verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
2190     }
2191 
2192     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2193     return true;
2194 }
2195 
2196 #if ENABLE_AVX512_SIMD16
PaLineStrip0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2197 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2198 {
2199     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
2200     return false; // Not enough vertices to assemble 16 lines
2201 }
2202 
PaLineStrip1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2203 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2204 {
2205     // clang-format off
2206 
2207     const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
2208 
2209     const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2210     const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2211 
2212     const simd16mask mask0 = 0x0001;
2213 
2214     // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2215     // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2216 
2217     simd16vector& v0 = verts[0];
2218     simd16vector& v1 = verts[1];
2219 
2220     v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2221 
2222     // for simd16 x, y, z, and w
2223     for (int i = 0; i < 4; i += 1)
2224     {
2225         simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
2226         simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
2227 
2228         simd16scalar temp = _simd16_blend_ps(tempa, tempb, mask0); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
2229 
2230         v1[i] = _simd16_permute_ps(temp, perm);                    // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
2231     }
2232 
2233     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
2234     return true;
2235 
2236     // clang-format on
2237 }
2238 
2239 #endif
PaLineStripSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])2240 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2241 {
2242 #if USE_SIMD16_FRONTEND
2243     const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
2244     const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
2245 
2246     if (pa.useAlternateOffset)
2247     {
2248         primIndex += KNOB_SIMD_WIDTH;
2249     }
2250 
2251     switch (primIndex)
2252     {
2253     case 0:
2254         verts[0] = swizzleLane0(a);
2255         verts[1] = swizzleLane1(a);
2256         break;
2257     case 1:
2258         verts[0] = swizzleLane1(a);
2259         verts[1] = swizzleLane2(a);
2260         break;
2261     case 2:
2262         verts[0] = swizzleLane2(a);
2263         verts[1] = swizzleLane3(a);
2264         break;
2265     case 3:
2266         verts[0] = swizzleLane3(a);
2267         verts[1] = swizzleLane4(a);
2268         break;
2269     case 4:
2270         verts[0] = swizzleLane4(a);
2271         verts[1] = swizzleLane5(a);
2272         break;
2273     case 5:
2274         verts[0] = swizzleLane5(a);
2275         verts[1] = swizzleLane6(a);
2276         break;
2277     case 6:
2278         verts[0] = swizzleLane6(a);
2279         verts[1] = swizzleLane7(a);
2280         break;
2281     case 7:
2282         verts[0] = swizzleLane7(a);
2283         verts[1] = swizzleLane8(a);
2284         break;
2285     case 8:
2286         verts[0] = swizzleLane8(a);
2287         verts[1] = swizzleLane9(a);
2288         break;
2289     case 9:
2290         verts[0] = swizzleLane9(a);
2291         verts[1] = swizzleLaneA(a);
2292         break;
2293     case 10:
2294         verts[0] = swizzleLaneA(a);
2295         verts[1] = swizzleLaneB(a);
2296         break;
2297     case 11:
2298         verts[0] = swizzleLaneB(a);
2299         verts[1] = swizzleLaneC(a);
2300         break;
2301     case 12:
2302         verts[0] = swizzleLaneC(a);
2303         verts[1] = swizzleLaneD(a);
2304         break;
2305     case 13:
2306         verts[0] = swizzleLaneD(a);
2307         verts[1] = swizzleLaneE(a);
2308         break;
2309     case 14:
2310         verts[0] = swizzleLaneE(a);
2311         verts[1] = swizzleLaneF(a);
2312         break;
2313     case 15:
2314         verts[0] = swizzleLaneF(a);
2315         verts[1] = swizzleLane0(b);
2316         break;
2317     }
2318 #else
2319     const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
2320     const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
2321 
2322     switch (primIndex)
2323     {
2324     case 0:
2325         verts[0] = swizzleLane0(a);
2326         verts[1] = swizzleLane1(a);
2327         break;
2328     case 1:
2329         verts[0] = swizzleLane1(a);
2330         verts[1] = swizzleLane2(a);
2331         break;
2332     case 2:
2333         verts[0] = swizzleLane2(a);
2334         verts[1] = swizzleLane3(a);
2335         break;
2336     case 3:
2337         verts[0] = swizzleLane3(a);
2338         verts[1] = swizzleLane4(a);
2339         break;
2340     case 4:
2341         verts[0] = swizzleLane4(a);
2342         verts[1] = swizzleLane5(a);
2343         break;
2344     case 5:
2345         verts[0] = swizzleLane5(a);
2346         verts[1] = swizzleLane6(a);
2347         break;
2348     case 6:
2349         verts[0] = swizzleLane6(a);
2350         verts[1] = swizzleLane7(a);
2351         break;
2352     case 7:
2353         verts[0] = swizzleLane7(a);
2354         verts[1] = swizzleLane0(b);
2355         break;
2356     }
2357 #endif
2358 }
2359 
PaPoints0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2360 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2361 {
2362 #if USE_SIMD16_FRONTEND
2363     simdvector a;
2364 
2365     const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2366 
2367     if (!pa.useAlternateOffset)
2368     {
2369         for (uint32_t i = 0; i < 4; i += 1)
2370         {
2371             a[i] = _simd16_extract_ps(a_16[i], 0);
2372         }
2373     }
2374     else
2375     {
2376         for (uint32_t i = 0; i < 4; i += 1)
2377         {
2378             a[i] = _simd16_extract_ps(a_16[i], 1);
2379         }
2380     }
2381 
2382 #else
2383     simdvector& a = PaGetSimdVector(pa, 0, slot);
2384 
2385 #endif
2386     verts[0] = a; // points only have 1 vertex.
2387 
2388     SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2389     return true;
2390 }
2391 
2392 #if ENABLE_AVX512_SIMD16
PaPoints0_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2393 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2394 {
2395     simd16vector& a = PaGetSimdVector_simd16(pa, pa.cur, slot);
2396 
2397     verts[0] = a; // points only have 1 vertex.
2398 
2399     SetNextPaState_simd16(
2400         pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2401     return true;
2402 }
2403 
2404 #endif
PaPointsSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])2405 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2406 {
2407 #if USE_SIMD16_FRONTEND
2408     const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
2409 
2410     if (pa.useAlternateOffset)
2411     {
2412         primIndex += KNOB_SIMD_WIDTH;
2413     }
2414 
2415     verts[0] = swizzleLaneN(a, primIndex);
2416 #else
2417     const simdvector& a = PaGetSimdVector(pa, 0, slot);
2418 
2419     verts[0] = swizzleLaneN(a, primIndex);
2420 #endif
2421 }
2422 
2423 //////////////////////////////////////////////////////////////////////////
2424 /// @brief State 1 for RECT_LIST topology.
2425 ///        There is not enough to assemble 8 triangles.
PaRectList0(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2426 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2427 {
2428     SetNextPaState(pa, PaRectList1, PaRectListSingle0);
2429     return false;
2430 }
2431 
2432 //////////////////////////////////////////////////////////////////////////
2433 /// @brief State 1 for RECT_LIST topology.
2434 ///   Rect lists has the following format.
2435 ///             w          x          y           z
2436 ///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
2437 ///         | \ |      | \ |      | \ |       | \ |
2438 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
2439 ///            v0         v3         v6          v9
2440 ///
2441 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2442 ///
2443 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2444 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2445 ///   etc.
2446 ///
2447 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2448 ///   where v0 contains all the first vertices for 8 triangles.
2449 ///
2450 ///     Result:
2451 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2452 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2453 ///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2454 ///
2455 /// @param pa - State for PA state machine.
2456 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2457 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2458 /// etc.
PaRectList1(PA_STATE_OPT & pa,uint32_t slot,simdvector verts[])2459 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
2460 {
2461 // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
2462 #if USE_SIMD16_FRONTEND
2463     simdvector a;
2464     simdvector b;
2465 
2466     if (!pa.useAlternateOffset)
2467     {
2468         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2469 
2470         for (uint32_t i = 0; i < 4; i += 1)
2471         {
2472             a[i] = _simd16_extract_ps(a_16[i], 0);
2473             b[i] = _simd16_extract_ps(a_16[i], 1);
2474         }
2475     }
2476     else
2477     {
2478         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2479 
2480         for (uint32_t i = 0; i < 4; i += 1)
2481         {
2482             a[i] = _simd16_extract_ps(b_16[i], 0);
2483             b[i] = _simd16_extract_ps(b_16[i], 1);
2484             ;
2485         }
2486     }
2487 
2488 #else
2489     simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7 }
2490     simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
2491 
2492 #endif
2493     __m256 tmp0, tmp1, tmp2;
2494 
2495     // Loop over each component in the simdvector.
2496     for (int i = 0; i < 4; ++i)
2497     {
2498         simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2499         tmp0           = _mm256_permute2f128_ps(
2500             b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2501         v0[i] = _mm256_blend_ps(
2502             a[i],
2503             tmp0,
2504             0x20); //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6, * } where * is don't care.
2505         tmp1  = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *, * }
2506         v0[i] = _mm256_permute_ps(v0[i], 0x5A); //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
2507         v0[i] =
2508             _mm256_blend_ps(tmp1, v0[i], 0xF0); //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
2509 
2510         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2511         ///      AVX2 should make this much cheaper.
2512         simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2513         v1[i]          = _mm256_permute_ps(a[i], 0x09);  //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
2514         tmp1           = _mm256_permute_ps(a[i], 0x43);  // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
2515         tmp2  = _mm256_blend_ps(v1[i], tmp1, 0xF0);      // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
2516         tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7,  *, v4,  v5, *, *,  *,  * }
2517         v1[i] = _mm256_permute_ps(tmp0, 0xE0);      //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
2518         v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
2519         v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2520 
2521         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2522         simdvector& v2 = verts[2]; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
2523         v2[i]          = _mm256_permute_ps(tmp0, 0x30); //   v2 = { *, *, *, *, v8, *, v11, * }
2524         tmp1           = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
2525         v2[i]          = _mm256_blend_ps(tmp1, v2[i], 0xF0);
2526 
2527         // Need to compute 4th implied vertex for the rectangle.
2528         tmp2  = _mm256_sub_ps(v0[i], v1[i]);
2529         tmp2  = _mm256_add_ps(tmp2, v2[i]);         // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
2530         tmp2  = _mm256_permute_ps(tmp2, 0xA0);      // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
2531         v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
2532     }
2533 
2534     SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2535     return true;
2536 }
2537 
//////////////////////////////////////////////////////////////////////////
/// @brief State 2 for RECT_LIST topology.
///        Not implemented unless there is a use case for more than 8 rects.
///        Reaching this state is reported through SWR_INVALID; no vertices are assembled
///        and the state machine is pointed back at PaRectList0.
/// @param pa - State for PA state machine.
/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
///               Not read by this function.
/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
/// etc. Not written by this function.
/// @return Always true.
bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
{
    SWR_INVALID("Is rect list used for anything other then clears?");
    // Restart the RECT_LIST state machine at state 0.
    SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
    return true;
}
2551 
2552 #if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief State 0 for RECT_LIST topology (SIMD16 variant).
///        There is not enough to assemble 8 triangles.
///        Nothing is written to verts; the state machine is advanced to state 1.
/// @param pa - State for PA state machine.
/// @param slot - Index into VS output. Not read by this function.
/// @param verts - triangle output for binner. Not written by this function.
/// @return Always false.
bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    // Hand the state-1 functions (SIMD16, SIMD8 and single-primitive) to the state machine.
    SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
    return false;
}
2561 
2562 //////////////////////////////////////////////////////////////////////////
2563 /// @brief State 1 for RECT_LIST topology.
2564 ///   Rect lists has the following format.
2565 ///             w          x          y           z
2566 ///      v2 o---o   v5 o---o   v8 o---o   v11 o---o
2567 ///         | \ |      | \ |      | \ |       | \ |
2568 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
2569 ///            v0         v3         v6          v9
2570 ///
2571 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
2572 ///
2573 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
2574 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
2575 ///   etc.
2576 ///
2577 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
2578 ///   where v0 contains all the first vertices for 8 triangles.
2579 ///
2580 ///     Result:
2581 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
2582 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
2583 ///      verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2584 ///
2585 /// @param pa - State for PA state machine.
2586 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
2587 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2588 /// etc.
PaRectList1_simd16(PA_STATE_OPT & pa,uint32_t slot,simd16vector verts[])2589 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
2590 {
2591     // clang-format off
2592 
2593     simdvector a;
2594     simdvector b;
2595 
2596     if (!pa.useAlternateOffset)
2597     {
2598         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7,
2599                                                                         //         v8, v9, v10, v11, v12, v13, v14, v15 }
2600 
2601         for (uint32_t i = 0; i < 4; i += 1)
2602         {
2603             a[i] = _simd16_extract_ps(a_16[i], 0);
2604             b[i] = _simd16_extract_ps(a_16[i], 1);
2605         }
2606     }
2607     else
2608     {
2609         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. }
2610 
2611         for (uint32_t i = 0; i < 4; i += 1)
2612         {
2613             a[i] = _simd16_extract_ps(b_16[i], 0);
2614             b[i] = _simd16_extract_ps(b_16[i], 1);
2615         }
2616     }
2617 
2618     simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6,  v9,  v9 }
2619     simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2620     simd16vector& v2 = verts[2]; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11,   z }
2621 
2622     // Loop over each component in the simdvector.
2623     for (int i = 0; i < 4; i += 1)
2624     {
2625         simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
2626         simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
2627         simdscalar v2_lo; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
2628 
2629         __m256 tmp0, tmp1, tmp2;
2630 
2631         tmp0  = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
2632         v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20);        //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,   * } where * is don't care.
2633         tmp1  = _mm256_permute_ps(v0_lo, 0xF0);           // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,   *,   * }
2634         v0_lo = _mm256_permute_ps(v0_lo, 0x5A);           //   v0 = {   *,   *,   *,   *,  v6, v6, v9,  v9 }
2635         v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0);       //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9,  v9 }
2636 
2637         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
2638         ///      AVX2 should make this much cheaper.
2639         v1_lo = _mm256_permute_ps(a[i], 0x09);            //   v1 = { v1, v2,  *,  *,  *,  *,   *,   * }
2640         tmp1  = _mm256_permute_ps(a[i], 0x43);            // tmp1 = {  *,  *,  *,  *, v7,  *,  v4,  v5 }
2641         tmp2  = _mm256_blend_ps(v1_lo, tmp1, 0xF0);       // tmp2 = { v1, v2,  *,  *, v7,  *,  v4,  v5 }
2642         tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);  // tmp1 = { v7,  *, v4,  v5, *,  *,   *,   * }
2643         v1_lo = _mm256_permute_ps(tmp0, 0xE0);            //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
2644         v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0);       //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
2645         v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C);       //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
2646 
2647         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
2648         v2_lo = _mm256_permute_ps(tmp0, 0x30);            //   v2 = { *,  *,  *, *, v8, *, v11, * }
2649         tmp1  = _mm256_permute_ps(tmp2, 0x31);            // tmp1 = { v2, *, v5, *,  *, *,   *, * }
2650         v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);
2651 
2652         // Need to compute 4th implied vertex for the rectangle.
2653         tmp2  = _mm256_sub_ps(v0_lo, v1_lo);
2654         tmp2  = _mm256_add_ps(tmp2, v2_lo);               // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
2655         tmp2  = _mm256_permute_ps(tmp2, 0xA0);            // tmp2 = {  *,  w,  *, x, *,  y,  *,  z }
2656         v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA);       //   v2 = { v2,  w, v5, x, v8, y, v11, z }
2657 
2658         v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0);
2659         v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
2660         v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
2661     }
2662 
2663     SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
2664     return true;
2665 
2666     // clang-format on
2667 }
2668 
//////////////////////////////////////////////////////////////////////////
/// @brief State 2 for RECT_LIST topology (SIMD16 variant).
///        Not implemented unless there is a use case for more than 8 rects.
///        Reaching this state is reported through SWR_INVALID; no vertices are assembled
///        and the state machine is pointed back at the state-0 functions.
/// @param pa - State for PA state machine.
/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
///               Not read by this function.
/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
/// etc. Not written by this function.
/// @return Always true.
bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
    SWR_INVALID("Is rect list used for anything other then clears?");
    // Restart the RECT_LIST state machine at state 0.
    SetNextPaState_simd16(
        pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
    return true;
}
2683 
2684 #endif
2685 //////////////////////////////////////////////////////////////////////////
2686 /// @brief This procedure is called by the Binner to assemble the attributes.
2687 ///        Unlike position, which is stored vertically, the attributes are
2688 ///        stored horizontally. The outputs from the VS, labeled as 'a' and
2689 ///        'b' are vertical. This function needs to transpose the lanes
2690 ///        containing the vertical attribute data into horizontal form.
2691 /// @param pa - State for PA state machine.
2692 /// @param slot - Index into VS output for a given attribute.
2693 /// @param primIndex - Binner processes each triangle individually.
2694 /// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
2695 /// etc.
PaRectListSingle0(PA_STATE_OPT & pa,uint32_t slot,uint32_t primIndex,simd4scalar verts[])2696 void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
2697 {
2698 // We have 12 simdscalars contained within 3 simdvectors which
2699 // hold at least 8 triangles worth of data. We want to assemble a single
2700 // triangle with data in horizontal form.
2701 #if USE_SIMD16_FRONTEND
2702     simdvector a;
2703     simdvector b;
2704 
2705     if (!pa.useAlternateOffset)
2706     {
2707         const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
2708 
2709         for (uint32_t i = 0; i < 4; i += 1)
2710         {
2711             a[i] = _simd16_extract_ps(a_16[i], 0);
2712             b[i] = _simd16_extract_ps(a_16[i], 1);
2713         }
2714     }
2715     else
2716     {
2717         const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
2718 
2719         for (uint32_t i = 0; i < 4; i += 1)
2720         {
2721             a[i] = _simd16_extract_ps(b_16[i], 0);
2722             b[i] = _simd16_extract_ps(b_16[i], 1);
2723             ;
2724         }
2725     }
2726 
2727 #else
2728     simdvector& a = PaGetSimdVector(pa, 0, slot);
2729 
2730 #endif
2731     // Convert from vertical to horizontal.
2732     switch (primIndex)
2733     {
2734     case 0:
2735         verts[0] = swizzleLane0(a);
2736         verts[1] = swizzleLane1(a);
2737         verts[2] = swizzleLane2(a);
2738         break;
2739     case 1:
2740         verts[0] = swizzleLane0(a);
2741         verts[1] = swizzleLane2(a);
2742         verts[2] = _mm_blend_ps(verts[0], verts[1], 0xA);
2743         break;
2744     case 2:
2745     case 3:
2746     case 4:
2747     case 5:
2748     case 6:
2749     case 7:
2750         SWR_INVALID("Invalid primIndex: %d", primIndex);
2751         break;
2752     };
2753 }
2754 
PA_STATE_OPT(DRAW_CONTEXT * in_pDC,uint32_t in_numPrims,uint8_t * pStream,uint32_t in_streamSizeInVerts,uint32_t in_vertexStride,bool in_isStreaming,uint32_t numVertsPerPrim,PRIMITIVE_TOPOLOGY topo)2755 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT*      in_pDC,
2756                            uint32_t           in_numPrims,
2757                            uint8_t*           pStream,
2758                            uint32_t           in_streamSizeInVerts,
2759                            uint32_t           in_vertexStride,
2760                            bool               in_isStreaming,
2761                            uint32_t           numVertsPerPrim,
2762                            PRIMITIVE_TOPOLOGY topo) :
2763     PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim),
2764     numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), cur(0), prev(0), first(0),
2765     counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
2766 {
2767     const API_STATE& state = GetApiState(pDC);
2768 
2769     this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
2770 
2771 #if ENABLE_AVX512_SIMD16
2772     pfnPaFunc_simd16 = nullptr;
2773 
2774 #endif
2775     switch (this->binTopology)
2776     {
2777     case TOP_TRIANGLE_LIST:
2778         this->pfnPaFunc = PaTriList0;
2779 #if ENABLE_AVX512_SIMD16
2780         this->pfnPaFunc_simd16 = PaTriList0_simd16;
2781 #endif
2782         break;
2783     case TOP_TRIANGLE_STRIP:
2784         this->pfnPaFunc = PaTriStrip0;
2785 #if ENABLE_AVX512_SIMD16
2786         this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2787 #endif
2788         break;
2789     case TOP_TRIANGLE_FAN:
2790         this->pfnPaFunc = PaTriFan0;
2791 #if ENABLE_AVX512_SIMD16
2792         this->pfnPaFunc_simd16 = PaTriFan0_simd16;
2793 #endif
2794         break;
2795     case TOP_QUAD_LIST:
2796         this->pfnPaFunc = PaQuadList0;
2797 #if ENABLE_AVX512_SIMD16
2798         this->pfnPaFunc_simd16 = PaQuadList0_simd16;
2799 #endif
2800         this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
2801         break;
2802     case TOP_QUAD_STRIP:
2803         // quad strip pattern when decomposed into triangles is the same as verts strips
2804         this->pfnPaFunc = PaTriStrip0;
2805 #if ENABLE_AVX512_SIMD16
2806         this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
2807 #endif
2808         this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
2809         break;
2810     case TOP_LINE_LIST:
2811         this->pfnPaFunc = PaLineList0;
2812 #if ENABLE_AVX512_SIMD16
2813         this->pfnPaFunc_simd16 = PaLineList0_simd16;
2814 #endif
2815         this->numPrims = in_numPrims;
2816         break;
2817     case TOP_LINE_STRIP:
2818         this->pfnPaFunc = PaLineStrip0;
2819 #if ENABLE_AVX512_SIMD16
2820         this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
2821 #endif
2822         this->numPrims = in_numPrims;
2823         break;
2824     case TOP_LINE_LOOP:
2825         this->pfnPaFunc = PaLineLoop0;
2826 #if ENABLE_AVX512_SIMD16
2827         this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
2828 #endif
2829         this->numPrims = in_numPrims;
2830         break;
2831     case TOP_POINT_LIST:
2832         this->pfnPaFunc = PaPoints0;
2833 #if ENABLE_AVX512_SIMD16
2834         this->pfnPaFunc_simd16 = PaPoints0_simd16;
2835 #endif
2836         this->numPrims = in_numPrims;
2837         break;
2838     case TOP_RECT_LIST:
2839         this->pfnPaFunc = PaRectList0;
2840 #if ENABLE_AVX512_SIMD16
2841         this->pfnPaFunc_simd16 = PaRectList0_simd16;
2842 #endif
2843         this->numPrims = in_numPrims * 2;
2844         break;
2845 
2846     case TOP_PATCHLIST_1:
2847         this->pfnPaFunc = PaPatchList<1>;
2848 #if ENABLE_AVX512_SIMD16
2849         this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
2850 #endif
2851         break;
2852     case TOP_PATCHLIST_2:
2853         this->pfnPaFunc = PaPatchList<2>;
2854 #if ENABLE_AVX512_SIMD16
2855         this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
2856 #endif
2857         break;
2858     case TOP_PATCHLIST_3:
2859         this->pfnPaFunc = PaPatchList<3>;
2860 #if ENABLE_AVX512_SIMD16
2861         this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
2862 #endif
2863         break;
2864     case TOP_PATCHLIST_4:
2865         this->pfnPaFunc = PaPatchList<4>;
2866 #if ENABLE_AVX512_SIMD16
2867         this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
2868 #endif
2869         break;
2870     case TOP_PATCHLIST_5:
2871         this->pfnPaFunc = PaPatchList<5>;
2872 #if ENABLE_AVX512_SIMD16
2873         this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
2874 #endif
2875         break;
2876     case TOP_PATCHLIST_6:
2877         this->pfnPaFunc = PaPatchList<6>;
2878 #if ENABLE_AVX512_SIMD16
2879         this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
2880 #endif
2881         break;
2882     case TOP_PATCHLIST_7:
2883         this->pfnPaFunc = PaPatchList<7>;
2884 #if ENABLE_AVX512_SIMD16
2885         this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
2886 #endif
2887         break;
2888     case TOP_PATCHLIST_8:
2889         this->pfnPaFunc = PaPatchList<8>;
2890 #if ENABLE_AVX512_SIMD16
2891         this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
2892 #endif
2893         break;
2894     case TOP_PATCHLIST_9:
2895         this->pfnPaFunc = PaPatchList<9>;
2896 #if ENABLE_AVX512_SIMD16
2897         this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
2898 #endif
2899         break;
2900     case TOP_PATCHLIST_10:
2901         this->pfnPaFunc = PaPatchList<10>;
2902 #if ENABLE_AVX512_SIMD16
2903         this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
2904 #endif
2905         break;
2906     case TOP_PATCHLIST_11:
2907         this->pfnPaFunc = PaPatchList<11>;
2908 #if ENABLE_AVX512_SIMD16
2909         this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
2910 #endif
2911         break;
2912     case TOP_PATCHLIST_12:
2913         this->pfnPaFunc = PaPatchList<12>;
2914 #if ENABLE_AVX512_SIMD16
2915         this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
2916 #endif
2917         break;
2918     case TOP_PATCHLIST_13:
2919         this->pfnPaFunc = PaPatchList<13>;
2920 #if ENABLE_AVX512_SIMD16
2921         this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
2922 #endif
2923         break;
2924     case TOP_PATCHLIST_14:
2925         this->pfnPaFunc = PaPatchList<14>;
2926 #if ENABLE_AVX512_SIMD16
2927         this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
2928 #endif
2929         break;
2930     case TOP_PATCHLIST_15:
2931         this->pfnPaFunc = PaPatchList<15>;
2932 #if ENABLE_AVX512_SIMD16
2933         this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
2934 #endif
2935         break;
2936     case TOP_PATCHLIST_16:
2937         this->pfnPaFunc = PaPatchList<16>;
2938 #if ENABLE_AVX512_SIMD16
2939         this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
2940 #endif
2941         break;
2942     case TOP_PATCHLIST_17:
2943         this->pfnPaFunc = PaPatchList<17>;
2944 #if ENABLE_AVX512_SIMD16
2945         this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
2946 #endif
2947         break;
2948     case TOP_PATCHLIST_18:
2949         this->pfnPaFunc = PaPatchList<18>;
2950 #if ENABLE_AVX512_SIMD16
2951         this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
2952 #endif
2953         break;
2954     case TOP_PATCHLIST_19:
2955         this->pfnPaFunc = PaPatchList<19>;
2956 #if ENABLE_AVX512_SIMD16
2957         this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
2958 #endif
2959         break;
2960     case TOP_PATCHLIST_20:
2961         this->pfnPaFunc = PaPatchList<20>;
2962 #if ENABLE_AVX512_SIMD16
2963         this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
2964 #endif
2965         break;
2966     case TOP_PATCHLIST_21:
2967         this->pfnPaFunc = PaPatchList<21>;
2968 #if ENABLE_AVX512_SIMD16
2969         this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
2970 #endif
2971         break;
2972     case TOP_PATCHLIST_22:
2973         this->pfnPaFunc = PaPatchList<22>;
2974 #if ENABLE_AVX512_SIMD16
2975         this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
2976 #endif
2977         break;
2978     case TOP_PATCHLIST_23:
2979         this->pfnPaFunc = PaPatchList<23>;
2980 #if ENABLE_AVX512_SIMD16
2981         this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
2982 #endif
2983         break;
2984     case TOP_PATCHLIST_24:
2985         this->pfnPaFunc = PaPatchList<24>;
2986 #if ENABLE_AVX512_SIMD16
2987         this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
2988 #endif
2989         break;
2990     case TOP_PATCHLIST_25:
2991         this->pfnPaFunc = PaPatchList<25>;
2992 #if ENABLE_AVX512_SIMD16
2993         this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
2994 #endif
2995         break;
2996     case TOP_PATCHLIST_26:
2997         this->pfnPaFunc = PaPatchList<26>;
2998 #if ENABLE_AVX512_SIMD16
2999         this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
3000 #endif
3001         break;
3002     case TOP_PATCHLIST_27:
3003         this->pfnPaFunc = PaPatchList<27>;
3004 #if ENABLE_AVX512_SIMD16
3005         this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
3006 #endif
3007         break;
3008     case TOP_PATCHLIST_28:
3009         this->pfnPaFunc = PaPatchList<28>;
3010 #if ENABLE_AVX512_SIMD16
3011         this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
3012 #endif
3013         break;
3014     case TOP_PATCHLIST_29:
3015         this->pfnPaFunc = PaPatchList<29>;
3016 #if ENABLE_AVX512_SIMD16
3017         this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
3018 #endif
3019         break;
3020     case TOP_PATCHLIST_30:
3021         this->pfnPaFunc = PaPatchList<30>;
3022 #if ENABLE_AVX512_SIMD16
3023         this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
3024 #endif
3025         break;
3026     case TOP_PATCHLIST_31:
3027         this->pfnPaFunc = PaPatchList<31>;
3028 #if ENABLE_AVX512_SIMD16
3029         this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
3030 #endif
3031         break;
3032     case TOP_PATCHLIST_32:
3033         this->pfnPaFunc = PaPatchList<32>;
3034 #if ENABLE_AVX512_SIMD16
3035         this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
3036 #endif
3037         break;
3038 
3039     default:
3040         SWR_INVALID("Invalid topology: %d", this->binTopology);
3041         break;
3042     };
3043 
3044     this->pfnPaFuncReset = this->pfnPaFunc;
3045 #if ENABLE_AVX512_SIMD16
3046     this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
3047 #endif
3048 
3049 #if USE_SIMD16_FRONTEND
3050     simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
3051     simd16scalari id82 = _simd16_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
3052 
3053 #else
3054     simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
3055     simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
3056 
3057 #endif
3058     switch (this->binTopology)
3059     {
3060     case TOP_TRIANGLE_LIST:
3061     case TOP_TRIANGLE_STRIP:
3062     case TOP_TRIANGLE_FAN:
3063     case TOP_LINE_STRIP:
3064     case TOP_LINE_LIST:
3065     case TOP_LINE_LOOP:
3066 #if USE_SIMD16_FRONTEND
3067         this->primIDIncr = 16;
3068         this->primID     = id16;
3069 #else
3070         this->primIDIncr = 8;
3071         this->primID = id8;
3072 #endif
3073         break;
3074     case TOP_QUAD_LIST:
3075     case TOP_QUAD_STRIP:
3076     case TOP_RECT_LIST:
3077 #if USE_SIMD16_FRONTEND
3078         this->primIDIncr = 8;
3079         this->primID     = id82;
3080 #else
3081         this->primIDIncr = 4;
3082         this->primID = id4;
3083 #endif
3084         break;
3085     case TOP_POINT_LIST:
3086 #if USE_SIMD16_FRONTEND
3087         this->primIDIncr = 16;
3088         this->primID     = id16;
3089 #else
3090         this->primIDIncr = 8;
3091         this->primID = id8;
3092 #endif
3093         break;
3094     case TOP_PATCHLIST_1:
3095     case TOP_PATCHLIST_2:
3096     case TOP_PATCHLIST_3:
3097     case TOP_PATCHLIST_4:
3098     case TOP_PATCHLIST_5:
3099     case TOP_PATCHLIST_6:
3100     case TOP_PATCHLIST_7:
3101     case TOP_PATCHLIST_8:
3102     case TOP_PATCHLIST_9:
3103     case TOP_PATCHLIST_10:
3104     case TOP_PATCHLIST_11:
3105     case TOP_PATCHLIST_12:
3106     case TOP_PATCHLIST_13:
3107     case TOP_PATCHLIST_14:
3108     case TOP_PATCHLIST_15:
3109     case TOP_PATCHLIST_16:
3110     case TOP_PATCHLIST_17:
3111     case TOP_PATCHLIST_18:
3112     case TOP_PATCHLIST_19:
3113     case TOP_PATCHLIST_20:
3114     case TOP_PATCHLIST_21:
3115     case TOP_PATCHLIST_22:
3116     case TOP_PATCHLIST_23:
3117     case TOP_PATCHLIST_24:
3118     case TOP_PATCHLIST_25:
3119     case TOP_PATCHLIST_26:
3120     case TOP_PATCHLIST_27:
3121     case TOP_PATCHLIST_28:
3122     case TOP_PATCHLIST_29:
3123     case TOP_PATCHLIST_30:
3124     case TOP_PATCHLIST_31:
3125     case TOP_PATCHLIST_32:
3126         // Always run KNOB_SIMD_WIDTH number of patches at a time.
3127 #if USE_SIMD16_FRONTEND
3128         this->primIDIncr = 16;
3129         this->primID     = id16;
3130 #else
3131         this->primIDIncr = 8;
3132         this->primID = id8;
3133 #endif
3134         break;
3135 
3136     default:
3137         SWR_INVALID("Invalid topology: %d", this->binTopology);
3138         break;
3139     };
3140 }
3141 #endif
3142