1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file clip.h
24 *
25 * @brief Definitions for clipping
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34 
35 // Temp storage used by the clipper
36 extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
37 #if USE_SIMD16_FRONTEND
38 extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
39 #endif
40 
// Clip codes are shifted left by this many bits so that they sit above the
// 23-bit float mantissa. A code reinterpreted as a float is then never a
// denormalized value when it is used in a float compare.
#define CLIPCODE_SHIFT 23

// Per-vertex clip code bits, one bit per clipping plane the vertex lies
// outside of.
enum SWR_CLIPCODES
{
    // view frustum planes
    FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
    FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
    FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
    FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),

    FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
    FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),

    // w <= 0
    NEGW            = (0x40 << CLIPCODE_SHIFT),

    // Guardband is able to use a single high-bit with 4 separate LSBs, because
    // it computes a union, rather than intersection, of clipcodes.
    GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
    GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
    GUARDBAND_RIGHT  = (0x80 << CLIPCODE_SHIFT | 0x4),
    GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
};

// Every code bit that forces a primitive through the clipper: near/far, the
// four guardband planes and negative w (0x7800000F).
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
63 
64 template<typename SIMD_T>
ComputeClipCodes(const API_STATE & state,const typename SIMD_T::Vec4 & vertex,typename SIMD_T::Float & clipCodes,typename SIMD_T::Integer const & viewportIndexes)65 void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
66 {
67     clipCodes = SIMD_T::setzero_ps();
68 
69     // -w
70     typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
71 
72     // FRUSTUM_LEFT
73     typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
74     clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
75 
76     // FRUSTUM_TOP
77     vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
78     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
79 
80     // FRUSTUM_RIGHT
81     vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
82     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
83 
84     // FRUSTUM_BOTTOM
85     vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
86     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
87 
88     if (state.rastState.depthClipEnable)
89     {
90         // FRUSTUM_NEAR
91         // DX clips depth [0..w], GL clips [-w..w]
92         if (state.rastState.clipHalfZ)
93         {
94             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
95         }
96         else
97         {
98             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
99         }
100         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
101 
102         // FRUSTUM_FAR
103         vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
104         clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
105     }
106 
107     // NEGW
108     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
109     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
110 
111     // GUARDBAND_LEFT
112     typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
113     vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
114     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
115 
116     // GUARDBAND_TOP
117     gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
118     vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
119     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
120 
121     // GUARDBAND_RIGHT
122     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
123     vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
124     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
125 
126     // GUARDBAND_BOTTOM
127     gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
128     vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
129     clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
130 }
131 
132 template<typename SIMD_T>
133 struct BinnerChooser
134 {
135 };
136 
137 template<>
138 struct BinnerChooser<SIMD256>
139 {
140     PFN_PROCESS_PRIMS pfnBinFunc;
141 
142     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
143         :pfnBinFunc(nullptr)
144     {
145         if (numVertsPerPrim == 3)
146         {
147             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
148 
149         }
150         else if (numVertsPerPrim == 2)
151         {
152             pfnBinFunc = BinLines;
153         }
154         else
155         {
156             SWR_ASSERT(0 && "Unexpected points in clipper.");
157         }
158     }
159 
160     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
161         :pfnBinFunc(nullptr)
162     {
163         switch (topology)
164         {
165         case TOP_POINT_LIST:
166             pfnBinFunc = BinPoints;
167             break;
168         case TOP_LINE_LIST:
169         case TOP_LINE_STRIP:
170         case TOP_LINE_LOOP:
171         case TOP_LINE_LIST_ADJ:
172         case TOP_LISTSTRIP_ADJ:
173             pfnBinFunc = BinLines;
174             break;
175         default:
176             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
177             break;
178         };
179     }
180 
181     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
182     {
183         SWR_ASSERT(pfnBinFunc != nullptr);
184 
185         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
186     }
187 };
188 
189 #if USE_SIMD16_FRONTEND
190 template<>
191 struct BinnerChooser<SIMD512>
192 {
193     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
194 
195     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
196         :pfnBinFunc(nullptr)
197     {
198         if (numVertsPerPrim == 3)
199         {
200             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
201 
202         }
203         else if (numVertsPerPrim == 2)
204         {
205             pfnBinFunc = BinLines_simd16;
206         }
207         else
208         {
209             SWR_ASSERT(0 && "Unexpected points in clipper.");
210         }
211     }
212 
213     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
214         :pfnBinFunc(nullptr)
215     {
216         switch (topology)
217         {
218         case TOP_POINT_LIST:
219             pfnBinFunc = BinPoints_simd16;
220             break;
221         case TOP_LINE_LIST:
222         case TOP_LINE_STRIP:
223         case TOP_LINE_LOOP:
224         case TOP_LINE_LIST_ADJ:
225         case TOP_LISTSTRIP_ADJ:
226             pfnBinFunc = BinLines_simd16;
227             break;
228         default:
229             pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
230             break;
231         };
232     }
233 
234     void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
235     {
236         SWR_ASSERT(pfnBinFunc != nullptr);
237 
238         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
239     }
240 };
241 
242 #endif
243 template<typename SIMD_T>
244 struct SimdHelper
245 {
246 };
247 
248 template<>
249 struct SimdHelper<SIMD256>
250 {
251     static SIMD256::Float insert_lo_ps(SIMD256::Float a)
252     {
253         return a;
254     }
255 
256     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
257     {
258         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
259     }
260 };
261 
262 #if USE_SIMD16_FRONTEND
263 template<>
264 struct SimdHelper<SIMD512>
265 {
266     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
267     {
268         return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
269     }
270 
271     static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
272     {
273         return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
274     }
275 };
276 
277 #endif
278 // Temp storage used by the clipper
279 template<typename SIMD_T>
280 struct ClipHelper
281 {
282 };
283 
284 template<>
285 struct ClipHelper<SIMD256>
286 {
287     static SIMDVERTEX_T<SIMD256> *GetTempVertices()
288     {
289         return tlsTempVertices;
290     }
291 };
292 
293 #if USE_SIMD16_FRONTEND
294 template<>
295 struct ClipHelper<SIMD512>
296 {
297     static SIMDVERTEX_T<SIMD512> *GetTempVertices()
298     {
299         return tlsTempVertices_simd16;
300     }
301 };
302 
303 #endif
304 template<typename SIMD_T, uint32_t NumVertsPerPrim>
305 class Clipper
306 {
307 public:
308     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
309         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
310     {
311         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
312     }
313 
314     void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
315     {
316         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
317         {
318             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
319         }
320     }
321 
322     typename SIMD_T::Float ComputeClipCodeIntersection()
323     {
324         typename SIMD_T::Float result = clipCodes[0];
325 
326         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
327         {
328             result = SIMD_T::and_ps(result, clipCodes[i]);
329         }
330 
331         return result;
332     }
333 
334     typename SIMD_T::Float ComputeClipCodeUnion()
335     {
336         typename SIMD_T::Float result = clipCodes[0];
337 
338         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
339         {
340             result = SIMD_T::or_ps(result, clipCodes[i]);
341         }
342 
343         return result;
344     }
345 
346     int ComputeClipMask()
347     {
348         typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
349 
350         clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
351 
352         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
353     }
354 
355     // clipper is responsible for culling any prims with NAN coordinates
356     int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
357     {
358         typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
359 
360         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
361         {
362             typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
363             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
364 
365             typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
366             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
367         }
368 
369         return SIMD_T::movemask_ps(vNanMask);
370     }
371 
372     int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
373     {
374         uint8_t cullMask = state.backendState.cullDistanceMask;
375         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
376 
377         typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
378 
379         typename SIMD_T::Vec4 vClipCullDistLo[3];
380         typename SIMD_T::Vec4 vClipCullDistHi[3];
381 
382         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
383         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
384 
385         DWORD index;
386         while (_BitScanForward(&index, cullMask))
387         {
388             cullMask &= ~(1 << index);
389             uint32_t slot = index >> 2;
390             uint32_t component = index & 0x3;
391 
392             typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
393             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
394             {
395                 typename SIMD_T::Float vCullComp;
396                 if (slot == 0)
397                 {
398                     vCullComp = vClipCullDistLo[e][component];
399                 }
400                 else
401                 {
402                     vCullComp = vClipCullDistHi[e][component];
403                 }
404 
405                 // cull if cull distance < 0 || NAN
406                 typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
407                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
408             }
409             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
410         }
411 
412         // clipper should also discard any primitive with NAN clip distance
413         uint8_t clipMask = state.backendState.clipDistanceMask;
414         while (_BitScanForward(&index, clipMask))
415         {
416             clipMask &= ~(1 << index);
417             uint32_t slot = index >> 2;
418             uint32_t component = index & 0x3;
419 
420             for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
421             {
422                 typename SIMD_T::Float vClipComp;
423                 if (slot == 0)
424                 {
425                     vClipComp = vClipCullDistLo[e][component];
426                 }
427                 else
428                 {
429                     vClipComp = vClipCullDistHi[e][component];
430                 }
431 
432                 typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
433                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
434             }
435         }
436 
437         return SIMD_T::movemask_ps(vClipCullMask);
438     }
439 
440     void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
441                   const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
442     {
443         // input/output vertex store for clipper
444         SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
445 
446         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
447         uint32_t provokingVertex = 0;
448         if (pa.binTopology == TOP_TRIANGLE_FAN)
449         {
450             provokingVertex = state.frontendState.provokingVertex.triFan;
451         }
452         ///@todo: line topology for wireframe?
453 
454         // assemble pos
455         typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
456         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
457         {
458             vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
459         }
460 
461         // assemble attribs
462         const SWR_BACKEND_STATE& backendState = state.backendState;
463 
464         int32_t maxSlot = -1;
465         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
466         {
467             // Compute absolute attrib slot in vertex array
468             uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
469             maxSlot = std::max<int32_t>(maxSlot, mapSlot);
470             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
471 
472             pa.Assemble(inputSlot, tmpVector);
473 
474             // if constant interpolation enabled for this attribute, assign the provoking
475             // vertex values to all edges
476             if (CheckBit(constantInterpMask, slot))
477             {
478                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
479                 {
480                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
481                 }
482             }
483             else
484             {
485                 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
486                 {
487                     vertices[i].attrib[inputSlot] = tmpVector[i];
488                 }
489             }
490         }
491 
492         // assemble user clip distances if enabled
493         uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
494         if (state.backendState.clipDistanceMask & 0xf)
495         {
496             pa.Assemble(vertexClipCullSlot, tmpVector);
497             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
498             {
499                 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
500             }
501         }
502 
503         if (state.backendState.clipDistanceMask & 0xf0)
504         {
505             pa.Assemble(vertexClipCullSlot + 1, tmpVector);
506             for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
507             {
508                 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
509             }
510         }
511 
512         uint32_t numAttribs = maxSlot + 1;
513 
514         typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
515 
516         BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
517 
518         // set up new PA for binning clipped primitives
519         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
520         if (NumVertsPerPrim == 3)
521         {
522             clipTopology = TOP_TRIANGLE_FAN;
523 
524             // so that the binner knows to bloat wide points later
525             if (pa.binTopology == TOP_POINT_LIST)
526             {
527                 clipTopology = TOP_POINT_LIST;
528             }
529         }
530         else if (NumVertsPerPrim == 2)
531         {
532             clipTopology = TOP_LINE_LIST;
533         }
534         else
535         {
536             SWR_ASSERT(0 && "Unexpected points in clipper.");
537         }
538 
539         const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
540         const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
541         const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
542         const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);
543 
544         const SIMD256::Integer vOffsets = SIMD256::set_epi32(
545             0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
546             6 * sizeof(SIMDVERTEX_T<SIMD_T>),
547             5 * sizeof(SIMDVERTEX_T<SIMD_T>),
548             4 * sizeof(SIMDVERTEX_T<SIMD_T>),
549             3 * sizeof(SIMDVERTEX_T<SIMD_T>),
550             2 * sizeof(SIMDVERTEX_T<SIMD_T>),
551             1 * sizeof(SIMDVERTEX_T<SIMD_T>),
552             0 * sizeof(SIMDVERTEX_T<SIMD_T>));
553 
554         // only need to gather 7 verts
555         // @todo dynamic mask based on actual # of verts generated per lane
556         const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
557 
558         uint32_t numClippedPrims = 0;
559 
560         // tranpose clipper output so that each lane's vertices are in SIMD order
561         // set aside space for 2 vertices, as the PA will try to read up to 16 verts
562         // for triangle fan
563 
564 #if defined(_DEBUG)
565         // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
566         SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
567 
568 #else
569         SIMDVERTEX_T<SIMD_T> transposedPrims[2];
570 
571 #endif
572         uint32_t numInputPrims = pa.NumPrims();
573         for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
574         {
575             uint32_t numEmittedVerts = pVertexCount[inputPrim];
576             if (numEmittedVerts < NumVertsPerPrim)
577             {
578                 continue;
579             }
580             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
581 
582             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
583             SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
584 
585             numClippedPrims += numEmittedPrims;
586 
587             // tranpose clipper output so that each lane's vertices are in SIMD order
588             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
589             // for triangle fan
590 
591             // transpose pos
592             uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
593 
594 #if 0
595             // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
596             static const float *dummy = reinterpret_cast<const float *>(pBase);
597 
598 #endif
599             for (uint32_t c = 0; c < 4; ++c)
600             {
601                 SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
602                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
603                 pBase += sizeof(typename SIMD_T::Float);
604             }
605 
606             // transpose attribs
607             pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
608 
609             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
610             {
611                 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
612 
613                 for (uint32_t c = 0; c < 4; ++c)
614                 {
615                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
616                     transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
617                     pBase += sizeof(typename SIMD_T::Float);
618                 }
619             }
620 
621             // transpose user clip distances if enabled
622             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
623             if (state.backendState.clipDistanceMask & 0x0f)
624             {
625                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
626 
627                 for (uint32_t c = 0; c < 4; ++c)
628                 {
629                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
630                     transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
631                     pBase += sizeof(typename SIMD_T::Float);
632                 }
633             }
634 
635             if (state.backendState.clipDistanceMask & 0xf0)
636             {
637                 pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
638 
639                 for (uint32_t c = 0; c < 4; ++c)
640                 {
641                     SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
642                     transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
643                     pBase += sizeof(typename SIMD_T::Float);
644                 }
645             }
646 
647             PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
648             clipPA.viewportArrayActive = pa.viewportArrayActive;
649             clipPA.rtArrayActive = pa.rtArrayActive;
650 
651             static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
652 
653             const uint32_t primMask = primMaskMap[numEmittedPrims];
654 
655             const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
656             const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
657             const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
658 
659 
660             while (clipPA.GetNextStreamOutput())
661             {
662                 do
663                 {
664                     typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
665 
666                     bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
667 
668                     if (assemble)
669                     {
670                         binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
671                     }
672 
673                 } while (clipPA.NextPrim());
674             }
675         }
676 
677 #if defined(_DEBUG)
678         AlignedFree(transposedPrims);
679 
680 #endif
681         // update global pipeline stat
682         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
683     }
684 
685     void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
686                       typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
687     {
688         SWR_ASSERT(pa.pDC != nullptr);
689 
690         SWR_CONTEXT *pContext = pa.pDC->pContext;
691 
692         BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
693 
694         // update clipper invocations pipeline stat
695         uint32_t numInvoc = _mm_popcnt_u32(primMask);
696         UPDATE_STAT_FE(CInvocations, numInvoc);
697 
698         ComputeClipCodes(prim, viewportIdx);
699 
700         // cull prims with NAN coords
701         primMask &= ~ComputeNaNMask(prim);
702 
703         // user cull distance cull
704         if (state.backendState.cullDistanceMask)
705         {
706             primMask &= ~ComputeUserClipCullMask(pa, prim);
707         }
708 
709         // cull prims outside view frustum
710         typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
711         int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
712 
713         // skip clipping for points
714         uint32_t clipMask = 0;
715         if (NumVertsPerPrim != 1)
716         {
717             clipMask = primMask & ComputeClipMask();
718         }
719 
720         if (clipMask)
721         {
722             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
723             // we have to clip tris, execute the clipper, which will also
724             // call the binner
725             ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
726             AR_END(FEGuardbandClip, 1);
727         }
728         else if (validMask)
729         {
730             // update CPrimitives pipeline state
731             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
732 
733             // forward valid prims directly to binner
734             binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
735         }
736     }
737 
738 private:
739     typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
740     {
741         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
742     }
743 
744     typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
745     {
746         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
747         const uint32_t componentStride  = sizeof(typename SIMD_T::Float);
748         const uint32_t attribStride     = sizeof(typename SIMD_T::Vec4);
749 
750         static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
751         {
752             0 * sizeof(float),
753             1 * sizeof(float),
754             2 * sizeof(float),
755             3 * sizeof(float),
756             4 * sizeof(float),
757             5 * sizeof(float),
758             6 * sizeof(float),
759             7 * sizeof(float),
760             8 * sizeof(float),
761             9 * sizeof(float),
762             10 * sizeof(float),
763             11 * sizeof(float),
764             12 * sizeof(float),
765             13 * sizeof(float),
766             14 * sizeof(float),
767             15 * sizeof(float),
768         };
769 
770         static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
771 
772         typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
773 
774         // step to the simdvertex
775         typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
776 
777         // step to the attribute and component
778         vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
779 
780         // step to the lane
781         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
782 
783         return vOffsets;
784     }
785 
786     typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
787     {
788         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
789         typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
790 
791         return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
792     }
793 
794     void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
795     {
796         typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
797 
798         const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
799         const float *pSrc = reinterpret_cast<const float *>(&vSrc);
800         uint32_t mask = SIMD_T::movemask_ps(vMask);
801         DWORD lane;
802         while (_BitScanForward(&lane, mask))
803         {
804             mask &= ~(1 << lane);
805             const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
806             *(float *)pBuf = pSrc[lane];
807         }
808     }
809 
810     template<SWR_CLIPCODES ClippingPlane>
811     void intersect(
812         const typename SIMD_T::Float &vActiveMask,  // active lanes to operate on
813         const typename SIMD_T::Integer &s,          // index to first edge vertex v0 in pInPts.
814         const typename SIMD_T::Integer &p,          // index to second edge vertex v1 in pInPts.
815         const typename SIMD_T::Vec4 &v1,            // vertex 0 position
816         const typename SIMD_T::Vec4 &v2,            // vertex 1 position
817         typename SIMD_T::Integer &outIndex,         // output index.
818         const float *pInVerts,                      // array of all the input positions.
819         uint32_t numInAttribs,                      // number of attributes per vertex.
820         float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
821     {
822         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
823         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
824 
825         // compute interpolation factor
826         typename SIMD_T::Float t;
827         switch (ClippingPlane)
828         {
829         case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
830         case FRUSTUM_RIGHT:     t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
831         case FRUSTUM_TOP:       t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
832         case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
833         case FRUSTUM_NEAR:
834             // DX Znear plane is 0, GL is -w
835             if (this->state.rastState.clipHalfZ)
836             {
837                 t = ComputeInterpFactor(v1[2], v2[2]);
838             }
839             else
840             {
841                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
842             }
843             break;
844         case FRUSTUM_FAR:       t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
845         default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
846         };
847 
848         // interpolate position and store
849         for (uint32_t c = 0; c < 4; ++c)
850         {
851             typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
852             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
853         }
854 
855         // interpolate attributes and store
856         for (uint32_t a = 0; a < numInAttribs; ++a)
857         {
858             uint32_t attribSlot = vertexAttribOffset + a;
859             for (uint32_t c = 0; c < 4; ++c)
860             {
861                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
862                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
863                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
864                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
865             }
866         }
867 
868         // interpolate clip distance if enabled
869         if (this->state.backendState.clipDistanceMask & 0xf)
870         {
871             uint32_t attribSlot = vertexClipCullOffset;
872             for (uint32_t c = 0; c < 4; ++c)
873             {
874                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
875                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
876                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
877                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
878             }
879         }
880 
881         if (this->state.backendState.clipDistanceMask & 0xf0)
882         {
883             uint32_t attribSlot = vertexClipCullOffset + 1;
884             for (uint32_t c = 0; c < 4; ++c)
885             {
886                 typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
887                 typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
888                 typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
889                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
890             }
891         }
892     }
893 
894     template<SWR_CLIPCODES ClippingPlane>
895     typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
896     {
897         switch (ClippingPlane)
898         {
899         case FRUSTUM_LEFT:      return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
900         case FRUSTUM_RIGHT:     return SIMD_T::cmple_ps(v[0], v[3]);
901         case FRUSTUM_TOP:       return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
902         case FRUSTUM_BOTTOM:    return SIMD_T::cmple_ps(v[1], v[3]);
903         case FRUSTUM_NEAR:      return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
904         case FRUSTUM_FAR:       return SIMD_T::cmple_ps(v[2], v[3]);
905         default:
906             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
907             return SIMD_T::setzero_ps();
908         }
909     }
910 
911     template<SWR_CLIPCODES ClippingPlane>
912     typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
913     {
914         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
915 
916         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
917         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
918         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
919 
920         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
921         {
922             typename SIMD_T::Integer s = vCurIndex;
923             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
924             typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
925             p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
926 
927             // gather position
928             typename SIMD_T::Vec4 vInPos0, vInPos1;
929             for (uint32_t c = 0; c < 4; ++c)
930             {
931                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
932                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
933             }
934 
935             // compute inside mask
936             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
937             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
938 
939             // compute intersection mask (s_in != p_in)
940             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
941             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
942 
943             // store s if inside
944             s_in = SIMD_T::and_ps(s_in, vActiveMask);
945             if (!SIMD_T::testz_ps(s_in, s_in))
946             {
947                 // store position
948                 for (uint32_t c = 0; c < 4; ++c)
949                 {
950                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
951                 }
952 
953                 // store attribs
954                 for (uint32_t a = 0; a < numInAttribs; ++a)
955                 {
956                     uint32_t attribSlot = vertexAttribOffset + a;
957                     for (uint32_t c = 0; c < 4; ++c)
958                     {
959                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
960                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
961                     }
962                 }
963 
964                 // store clip distance if enabled
965                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
966                 if (this->state.backendState.clipDistanceMask & 0xf)
967                 {
968                     uint32_t attribSlot = vertexClipCullSlot;
969                     for (uint32_t c = 0; c < 4; ++c)
970                     {
971                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
972                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
973                     }
974                 }
975 
976                 if (this->state.backendState.clipDistanceMask & 0xf0)
977                 {
978                     uint32_t attribSlot = vertexClipCullSlot + 1;
979                     for (uint32_t c = 0; c < 4; ++c)
980                     {
981                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
982                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
983                     }
984                 }
985 
986                 // increment outIndex
987                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
988             }
989 
990             // compute and store intersection
991             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
992             {
993                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
994 
995                 // increment outIndex for active lanes
996                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
997             }
998 
999             // increment loop index and update active mask
1000             vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1001             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1002         }
1003 
1004         return vOutIndex;
1005     }
1006 
1007     template<SWR_CLIPCODES ClippingPlane>
1008     typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
1009     {
1010         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1011 
1012         typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
1013         typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
1014         typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1015 
1016         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1017         {
1018             typename SIMD_T::Integer s = vCurIndex;
1019             typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1020 
1021             // gather position
1022             typename SIMD_T::Vec4 vInPos0, vInPos1;
1023             for (uint32_t c = 0; c < 4; ++c)
1024             {
1025                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1026                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1027             }
1028 
1029             // compute inside mask
1030             typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
1031             typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
1032 
1033             // compute intersection mask (s_in != p_in)
1034             typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
1035             intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
1036 
1037             // store s if inside
1038             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1039             if (!SIMD_T::testz_ps(s_in, s_in))
1040             {
1041                 for (uint32_t c = 0; c < 4; ++c)
1042                 {
1043                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1044                 }
1045 
1046                 // interpolate attributes and store
1047                 for (uint32_t a = 0; a < numInAttribs; ++a)
1048                 {
1049                     uint32_t attribSlot = vertexAttribOffset + a;
1050                     for (uint32_t c = 0; c < 4; ++c)
1051                     {
1052                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1053                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1054                     }
1055                 }
1056 
1057                 // increment outIndex
1058                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1059             }
1060 
1061             // compute and store intersection
1062             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1063             {
1064                 intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
1065 
1066                 // increment outIndex for active lanes
1067                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1068             }
1069 
1070             // store p if inside
1071             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1072             if (!SIMD_T::testz_ps(p_in, p_in))
1073             {
1074                 for (uint32_t c = 0; c < 4; ++c)
1075                 {
1076                     ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1077                 }
1078 
1079                 // interpolate attributes and store
1080                 for (uint32_t a = 0; a < numInAttribs; ++a)
1081                 {
1082                     uint32_t attribSlot = vertexAttribOffset + a;
1083                     for (uint32_t c = 0; c < 4; ++c)
1084                     {
1085                         typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1086                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1087                     }
1088                 }
1089 
1090                 // increment outIndex
1091                 vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1092             }
1093         }
1094 
1095         return vOutIndex;
1096     }
1097 
1098     typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
1099     {
1100         // temp storage
1101         float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
1102 
1103         // zero out num input verts for non-active lanes
1104         typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
1105         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1106 
1107         // clip prims to frustum
1108         typename SIMD_T::Integer vNumOutPts;
1109         if (NumVertsPerPrim == 3)
1110         {
1111             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1112             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1113             vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1114             vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1115             vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1116             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1117         }
1118         else
1119         {
1120             SWR_ASSERT(NumVertsPerPrim == 2);
1121             vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1122             vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1123             vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1124             vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1125             vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1126             vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1127         }
1128 
1129         // restore num verts for non-clipped, active lanes
1130         typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1131         vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
1132 
1133         return vNumOutPts;
1134     }
1135 
    // Worker id; forwarded to the binner callback together with pDC.
    const uint32_t workerId{ 0 };
    // Draw context; forwarded to the binner callback.
    DRAW_CONTEXT *pDC{ nullptr };
    // API state consulted by the clipper: backendState (attribute / clip-cull
    // slot offsets, clipDistanceMask) and rastState (clipHalfZ).
    const API_STATE &state;
    // One SIMD value per primitive vertex; presumably the per-vertex clip
    // codes computed earlier in the class - not referenced in this section.
    typename SIMD_T::Float clipCodes[NumVertsPerPrim];
1140 };
1141 
1142 
// pipeline stage functions
// Clip entry points for triangles, lines and points. All three share one
// parameter list: draw context, primitive assembly state, worker id, the
// primitive vertex data, a primitive mask, and per-primitive id / viewport
// index / render-target index vectors. Only the declarations live here.
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
#if USE_SIMD16_FRONTEND
// SIMD16 variants of the entry points above, taking simd16 vector types;
// only declared when the SIMD16 frontend is enabled.
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
#endif
1152 
1153