1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28 
#include "binner.h"
#include "context.h"
#include "frontend.h"
#include "conservativeRast.h"
#include "pa.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"

#include <cstring>
37 
38 // Function Prototype
39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
40 void BinPostSetupLinesImpl(
41     DRAW_CONTEXT *pDC,
42     PA_STATE &pa,
43     uint32_t workerId,
44     typename SIMD_T::Vec4 prim[],
45     typename SIMD_T::Float recipW[],
46     uint32_t primMask,
47     typename SIMD_T::Integer const &primID,
48     typename SIMD_T::Integer const &viewportIdx,
49     typename SIMD_T::Integer const &rtIdx);
50 
51 template <typename SIMD_T, uint32_t SIMD_WIDTH>
52 void BinPostSetupPointsImpl(
53     DRAW_CONTEXT *pDC,
54     PA_STATE &pa,
55     uint32_t workerId,
56     typename SIMD_T::Vec4 prim[],
57     uint32_t primMask,
58     typename SIMD_T::Integer const &primID,
59     typename SIMD_T::Integer const &viewportIdx,
60     typename SIMD_T::Integer const &rtIdx);
61 
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief Processes attributes for the backend based on linkage mask and
64 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
65 /// @param pDC - Draw context
66 /// @param pa - Primitive Assembly state
67 /// @param linkageMask - Specifies which VS outputs are routed to PS.
68 /// @param pLinkageMap - maps VS attribute slot to PS slot
69 /// @param triIndex - Triangle to process attributes for
70 /// @param pBuffer - Output result
71 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
ProcessAttributes(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t triIndex,uint32_t primId,float * pBuffer)72 INLINE void ProcessAttributes(
73     DRAW_CONTEXT *pDC,
74     PA_STATE&pa,
75     uint32_t triIndex,
76     uint32_t primId,
77     float *pBuffer)
78 {
79     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
80     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
81     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
82     uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
83     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
84     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
85 
86     static const float constTable[3][4] = {
87         { 0.0f, 0.0f, 0.0f, 0.0f },
88         { 0.0f, 0.0f, 0.0f, 1.0f },
89         { 1.0f, 1.0f, 1.0f, 1.0f }
90     };
91 
92     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
93     {
94         uint32_t inputSlot;
95         if (IsSwizzledT::value)
96         {
97             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
98             inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
99 
100         }
101         else
102         {
103             inputSlot = backendState.vertexAttribOffset + i;
104         }
105 
106         simd4scalar attrib[3];    // triangle attribs (always 4 wide)
107         float* pAttribStart = pBuffer;
108 
109         if (HasConstantInterpT::value || IsDegenerate::value)
110         {
111             if (CheckBit(constantInterpMask, i))
112             {
113                 uint32_t vid;
114                 uint32_t adjustedTriIndex;
115                 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
116                 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
117                 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
118                 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
119                 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
120 
121                 switch (topo) {
122                 case TOP_QUAD_LIST:
123                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
124                     vid = quadProvokingVertex[triIndex & 1][provokingVertex];
125                     break;
126                 case TOP_QUAD_STRIP:
127                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
128                     vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
129                     break;
130                 case TOP_TRIANGLE_STRIP:
131                     adjustedTriIndex = triIndex;
132                     vid = (triIndex & 1)
133                         ? tristripProvokingVertex[provokingVertex]
134                         : provokingVertex;
135                     break;
136                 default:
137                     adjustedTriIndex = triIndex;
138                     vid = provokingVertex;
139                     break;
140                 }
141 
142                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
143 
144                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
145                 {
146                     SIMD128::store_ps(pBuffer, attrib[vid]);
147                     pBuffer += 4;
148                 }
149             }
150             else
151             {
152                 pa.AssembleSingle(inputSlot, triIndex, attrib);
153 
154                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
155                 {
156                     SIMD128::store_ps(pBuffer, attrib[i]);
157                     pBuffer += 4;
158                 }
159             }
160         }
161         else
162         {
163             pa.AssembleSingle(inputSlot, triIndex, attrib);
164 
165             for (uint32_t i = 0; i < NumVertsT::value; ++i)
166             {
167                 SIMD128::store_ps(pBuffer, attrib[i]);
168                 pBuffer += 4;
169             }
170         }
171 
172         // pad out the attrib buffer to 3 verts to ensure the triangle
173         // interpolation code in the pixel shader works correctly for the
174         // 3 topologies - point, line, tri.  This effectively zeros out the
175         // effect of the missing vertices in the triangle interpolation.
176         for (uint32_t v = NumVertsT::value; v < 3; ++v)
177         {
178             SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
179             pBuffer += 4;
180         }
181 
182         // check for constant source overrides
183         if (IsSwizzledT::value)
184         {
185             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
186             if (mask)
187             {
188                 DWORD comp;
189                 while (_BitScanForward(&comp, mask))
190                 {
191                     mask &= ~(1 << comp);
192 
193                     float constantValue = 0.0f;
194                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
195                     {
196                     case SWR_CONSTANT_SOURCE_CONST_0000:
197                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
198                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
199                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
200                         break;
201                     case SWR_CONSTANT_SOURCE_PRIM_ID:
202                         constantValue = *(float*)&primId;
203                         break;
204                     }
205 
206                     // apply constant value to all 3 vertices
207                     for (uint32_t v = 0; v < 3; ++v)
208                     {
209                         pAttribStart[comp + v * 4] = constantValue;
210                     }
211                 }
212             }
213         }
214     }
215 }
216 
217 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
218 
219 struct ProcessAttributesChooser
220 {
221     typedef PFN_PROCESS_ATTRIBUTES FuncType;
222 
223     template <typename... ArgsB>
GetFuncProcessAttributesChooser224     static FuncType GetFunc()
225     {
226         return ProcessAttributes<ArgsB...>;
227     }
228 };
229 
GetProcessAttributesFunc(uint32_t NumVerts,bool IsSwizzled,bool HasConstantInterp,bool IsDegenerate=false)230 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
231 {
232     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
233 }
234 
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Processes enabled user clip distances. Loads the active clip
237 ///        distances from the PA, sets up barycentric equations, and
238 ///        stores the results to the output buffer
239 /// @param pa - Primitive Assembly state
240 /// @param primIndex - primitive index to process
241 /// @param clipDistMask - mask of enabled clip distances
242 /// @param pUserClipBuffer - buffer to store results
243 template<uint32_t NumVerts>
ProcessUserClipDist(const SWR_BACKEND_STATE & state,PA_STATE & pa,uint32_t primIndex,float * pRecipW,float * pUserClipBuffer)244 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
245 {
246     DWORD clipDist;
247     uint32_t clipDistMask = state.clipDistanceMask;
248     while (_BitScanForward(&clipDist, clipDistMask))
249     {
250         clipDistMask &= ~(1 << clipDist);
251         uint32_t clipSlot = clipDist >> 2;
252         uint32_t clipComp = clipDist & 0x3;
253         uint32_t clipAttribSlot = clipSlot == 0 ?
254             state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
255 
256         simd4scalar primClipDist[3];
257         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
258 
259         float vertClipDist[NumVerts];
260         for (uint32_t e = 0; e < NumVerts; ++e)
261         {
262             OSALIGNSIMD(float) aVertClipDist[4];
263             SIMD128::store_ps(aVertClipDist, primClipDist[e]);
264             vertClipDist[e] = aVertClipDist[clipComp];
265         };
266 
267         // setup plane equations for barycentric interpolation in the backend
268         float baryCoeff[NumVerts];
269         float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
270         for (uint32_t e = 0; e < NumVerts - 1; ++e)
271         {
272             baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
273         }
274         baryCoeff[NumVerts - 1] = last;
275 
276         for (uint32_t e = 0; e < NumVerts; ++e)
277         {
278             *(pUserClipBuffer++) = baryCoeff[e];
279         }
280     }
281 }
282 
283 INLINE
TransposeVertices(simd4scalar (& dst)[8],const simdscalar & src0,const simdscalar & src1,const simdscalar & src2)284 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
285 {
286     vTranspose3x8(dst, src0, src1, src2);
287 }
288 
289 INLINE
TransposeVertices(simd4scalar (& dst)[16],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2)290 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
291 {
292     vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
293 }
294 
295 
296 #if KNOB_ENABLE_EARLY_RAST
297 
298 #define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
299 #define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)
300 
301 
302 template<typename SIMD_T>
303 struct EarlyRastHelper
304 {
305 };
306 
307 template<>
308 struct EarlyRastHelper<SIMD256>
309 {
InitShiftCntrlEarlyRastHelper310     static SIMD256::Integer InitShiftCntrl()
311     {
312         return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
313     }
314 };
315 
316 #if USE_SIMD16_FRONTEND
317 template<>
318 struct EarlyRastHelper<SIMD512>
319 {
InitShiftCntrlEarlyRastHelper320     static SIMD512::Integer InitShiftCntrl()
321     {
322         return SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
323     }
324 };
325 
326 #endif
327 //////////////////////////////////////////////////////////////////////////
328 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
329 ///        (ER tile) can be rasterized as early as in binner to check if
330 ///        they cover any  pixels. If not - the triangles can be
331 ///        culled in binner.
332 ///
333 /// @param er_bbox - coordinates of ER tile for each triangle
334 /// @param vAi - A coefficients of triangle edges
335 /// @param vBi - B coefficients of triangle edges
336 /// @param vXi - X coordinates of triangle vertices
337 /// @param vYi - Y coordinates of triangle vertices
338 /// @param frontWindingTris - mask indicating CCW/CW triangles
339 /// @param triMask - mask for valid SIMD lanes (triangles)
340 /// @param oneTileMask - defines triangles for ER to work on
341 ///                      (tris that fit into ER tile)
342 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
EarlyRasterizer(SIMDBBOX_T<SIMD_T> & er_bbox,typename SIMD_T::Integer (& vAi)[3],typename SIMD_T::Integer (& vBi)[3],typename SIMD_T::Integer (& vXi)[3],typename SIMD_T::Integer (& vYi)[3],uint32_t cwTrisMask,uint32_t triMask,uint32_t oneTileMask)343 uint32_t SIMDCALL EarlyRasterizer(
344         SIMDBBOX_T<SIMD_T> &er_bbox,
345         typename SIMD_T::Integer (&vAi)[3],
346         typename SIMD_T::Integer (&vBi)[3],
347         typename SIMD_T::Integer (&vXi)[3],
348         typename SIMD_T::Integer (&vYi)[3],
349         uint32_t cwTrisMask,
350         uint32_t triMask,
351         uint32_t oneTileMask)
352 {
353     // step to pixel center of top-left pixel of the triangle bbox
354     typename SIMD_T::Integer vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
355     vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
356 
357     typename SIMD_T::Integer vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
358     vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
359 
360     // negate A and B for CW tris
361     typename SIMD_T::Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
362     typename SIMD_T::Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
363     typename SIMD_T::Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
364     typename SIMD_T::Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
365     typename SIMD_T::Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
366     typename SIMD_T::Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
367 
368     RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);
369 
370     typename SIMD_T::Integer vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
371     typename SIMD_T::Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask);
372     typename SIMD_T::Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
373 
374     vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
375     vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
376     vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
377     vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
378     vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
379     vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
380 
381     // evaluate edge equations at top-left pixel
382     typename SIMD_T::Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
383     typename SIMD_T::Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
384     typename SIMD_T::Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
385 
386     typename SIMD_T::Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
387     typename SIMD_T::Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
388     typename SIMD_T::Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
389 
390     typename SIMD_T::Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
391     typename SIMD_T::Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
392     typename SIMD_T::Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
393 
394     typename SIMD_T::Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
395     typename SIMD_T::Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
396     typename SIMD_T::Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
397 
398     typename SIMD_T::Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
399     typename SIMD_T::Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
400     typename SIMD_T::Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
401 
402     vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
403     vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
404     vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
405 
406     // top left rule
407     typename SIMD_T::Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
408     typename SIMD_T::Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
409     typename SIMD_T::Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
410 
411     // vA < 0
412     vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
413     vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
414     vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
415 
416     // vA == 0 && vB < 0
417     typename SIMD_T::Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
418     typename SIMD_T::Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
419     typename SIMD_T::Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
420 
421     vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
422     vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
423     vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
424 
425     vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
426     vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
427     vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
428 
429 
430 #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
431     // Go down
432     // coverage pixel 0
433     typename SIMD_T::Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
434     vMask0 = SIMD_T::and_si(vMask0, vEdge2);
435 
436     // coverage pixel 1
437     typename SIMD_T::Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
438     typename SIMD_T::Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
439     typename SIMD_T::Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
440     typename SIMD_T::Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
441     vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
442 
443     // coverage pixel 2
444     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
445     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
446     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
447     typename SIMD_T::Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
448     vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
449 
450     // coverage pixel 3
451     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
452     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
453     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
454     typename SIMD_T::Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
455     vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
456 
457     // One step to the right and then up
458 
459     // coverage pixel 4
460     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
461     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
462     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
463     typename SIMD_T::Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
464     vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
465 
466     // coverage pixel 5
467     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
468     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
469     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
470     typename SIMD_T::Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
471     vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
472 
473     // coverage pixel 6
474     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
475     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
476     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
477     typename SIMD_T::Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
478     vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
479 
480     // coverage pixel 7
481     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
482     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
483     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
484     typename SIMD_T::Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
485     vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
486 
487     typename SIMD_T::Integer vLit1 = SIMD_T::or_si(vMask0, vMask1);
488     vLit1 = SIMD_T::or_si(vLit1, vMask2);
489     vLit1 = SIMD_T::or_si(vLit1, vMask3);
490     vLit1 = SIMD_T::or_si(vLit1, vMask4);
491     vLit1 = SIMD_T::or_si(vLit1, vMask5);
492     vLit1 = SIMD_T::or_si(vLit1, vMask6);
493     vLit1 = SIMD_T::or_si(vLit1, vMask7);
494 
495     // Step to the right and go down again
496 
497     // coverage pixel 0
498     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
499     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
500     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
501     vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
502     vMask0 = SIMD_T::and_si(vMask0, vEdge2N);
503 
504     // coverage pixel 1
505     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
506     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
507     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
508     vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
509     vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
510 
511     // coverage pixel 2
512     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
513     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
514     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
515     vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
516     vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
517 
518     // coverage pixel 3
519     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
520     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
521     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
522     vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
523     vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
524 
525     // And for the last time - to the right and up
526 
527     // coverage pixel 4
528     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
529     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
530     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
531     vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
532     vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
533 
534     // coverage pixel 5
535     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
536     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
537     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
538     vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
539     vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
540 
541     // coverage pixel 6
542     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
543     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
544     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
545     vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
546     vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
547 
548     // coverage pixel 7
549     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
550     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
551     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
552     vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
553     vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
554 
555     typename SIMD_T::Integer vLit2 = SIMD_T::or_si(vMask0, vMask1);
556     vLit2 = SIMD_T::or_si(vLit2, vMask2);
557     vLit2 = SIMD_T::or_si(vLit2, vMask3);
558     vLit2 = SIMD_T::or_si(vLit2, vMask4);
559     vLit2 = SIMD_T::or_si(vLit2, vMask5);
560     vLit2 = SIMD_T::or_si(vLit2, vMask6);
561     vLit2 = SIMD_T::or_si(vLit2, vMask7);
562 
563     typename SIMD_T::Integer vLit = SIMD_T::or_si(vLit1, vLit2);
564 
565 #else
566     // Generic algorithm sweeping in row by row order
567     typename SIMD_T::Integer vRowMask[ER_SIMD_TILE_Y_DIM];
568 
569     typename SIMD_T::Integer vEdge0N = vEdge0;
570     typename SIMD_T::Integer vEdge1N = vEdge1;
571     typename SIMD_T::Integer vEdge2N = vEdge2;
572 
573     for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
574     {
575         // Store edge values at the beginning of the row
576         typename SIMD_T::Integer vRowEdge0 = vEdge0N;
577         typename SIMD_T::Integer vRowEdge1 = vEdge1N;
578         typename SIMD_T::Integer vRowEdge2 = vEdge2N;
579 
580         typename SIMD_T::Integer vColMask[ER_SIMD_TILE_X_DIM];
581 
582         for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
583         {
584             vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
585             vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);
586 
587             vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
588             vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
589             vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
590         }
591         vRowMask[row] = vColMask[0];
592         for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
593         {
594             vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);
595         }
596         // Restore values and go to the next row
597         vEdge0N = vRowEdge0;
598         vEdge1N = vRowEdge1;
599         vEdge2N = vRowEdge2;
600 
601         vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
602         vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
603         vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
604     }
605 
606     // compress all masks
607     typename SIMD_T::Integer vLit = vRowMask[0];
608     for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
609     {
610         vLit = SIMD_T::or_si(vLit, vRowMask[row]);
611     }
612 
613 #endif
614     // Check which triangles has any pixel lit
615     uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
616     uint32_t maskUnlit = ~maskLit & oneTileMask;
617 
618     uint32_t oldTriMask = triMask;
619     triMask &= ~maskUnlit;
620 
621     if (triMask ^ oldTriMask)
622     {
623         RDTSC_EVENT(FEEarlyRastExit, _mm_popcnt_u32(triMask & oneTileMask), 0);
624     }
625     return triMask;
626 }
627 
628 #endif // Early rasterizer
629 
630 //////////////////////////////////////////////////////////////////////////
631 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
632 ///        culling, viewport transform, etc.
633 /// @param pDC - pointer to draw context.
634 /// @param pa - The primitive assembly object.
635 /// @param workerId - thread's worker id. Every thread has a unique id.
636 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
637 /// @param primID - Primitive ID for each triangle.
638 /// @param viewportIdx - viewport array index for each triangle.
639 /// @tparam CT - ConservativeRastFETraits
640 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
BinTrianglesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 tri[3],uint32_t triMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)641 void SIMDCALL BinTrianglesImpl(
642     DRAW_CONTEXT *pDC,
643     PA_STATE &pa,
644     uint32_t workerId,
645     typename SIMD_T::Vec4 tri[3],
646     uint32_t triMask,
647     typename SIMD_T::Integer const &primID,
648     typename SIMD_T::Integer const &viewportIdx,
649     typename SIMD_T::Integer const &rtIdx)
650 {
651     SWR_CONTEXT *pContext = pDC->pContext;
652     const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
653 
654     AR_BEGIN(FEBinTriangles, pDC->drawId);
655 
656     const API_STATE& state = GetApiState(pDC);
657     const SWR_RASTSTATE& rastState = state.rastState;
658     const SWR_FRONTEND_STATE& feState = state.frontendState;
659 
660     MacroTileMgr *pTileMgr = pDC->pTileMgr;
661 
662     typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
663     typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
664     typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
665 
666     if (feState.vpTransformDisable)
667     {
668         // RHW is passed in directly when VP transform is disabled
669         vRecipW0 = tri[0].v[3];
670         vRecipW1 = tri[1].v[3];
671         vRecipW2 = tri[2].v[3];
672     }
673     else
674     {
675         // Perspective divide
676         vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
677         vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
678         vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
679 
680         tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
681         tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
682         tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
683 
684         tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
685         tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
686         tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
687 
688         tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
689         tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
690         tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
691 
692         // Viewport transform to screen space coords
693         if (pa.viewportArrayActive)
694         {
695             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
696         }
697         else
698         {
699             viewportTransform<3>(tri, state.vpMatrices);
700         }
701     }
702 
703     // Adjust for pixel center location
704     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
705 
706     tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
707     tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
708 
709     tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
710     tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
711 
712     tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
713     tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
714 
715     // Set vXi, vYi to required fixed point precision
716     typename SIMD_T::Integer vXi[3], vYi[3];
717     FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
718 
719     // triangle setup
720     typename SIMD_T::Integer vAi[3], vBi[3];
721     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
722 
723     // determinant
724     typename SIMD_T::Integer vDet[2];
725     calcDeterminantIntVertical(vAi, vBi, vDet);
726 
727     // cull zero area
728     uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
729     uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
730 
731     uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
732 
733     // don't cull degenerate triangles if we're conservatively rasterizing
734     uint32_t origTriMask = triMask;
735     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
736     {
737         triMask &= ~cullZeroAreaMask;
738     }
739 
740     // determine front winding tris
741     // CW  +det
742     // CCW det < 0;
743     // 0 area triangles are marked as backfacing regardless of winding order,
744     // which is required behavior for conservative rast and wireframe rendering
745     uint32_t frontWindingTris;
746     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
747     {
748         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
749         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
750     }
751     else
752     {
753         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
754         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
755     }
756     frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
757 
758     // cull
759     uint32_t cullTris;
760     switch ((SWR_CULLMODE)rastState.cullMode)
761     {
762     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
763     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
764     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
765         // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
766     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
767     default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
768     }
769 
770     triMask &= ~cullTris;
771 
772     if (origTriMask ^ triMask)
773     {
774         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
775     }
776 
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
778     // compute per tri backface
779     uint32_t frontFaceMask = frontWindingTris;
780     uint32_t *pPrimID = (uint32_t *)&primID;
781     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
782     DWORD triIndex = 0;
783 
784     uint32_t edgeEnable;
785     PFN_WORK_FUNC pfnWork;
786     if (CT::IsConservativeT::value)
787     {
788         // determine which edges of the degenerate tri, if any, are valid to rasterize.
789         // used to call the appropriate templated rasterizer function
790         if (cullZeroAreaMask > 0)
791         {
792             // e0 = v1-v0
793             const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
794             const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
795 
796             uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
797 
798             // e1 = v2-v1
799             const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
800             const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
801 
802             uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
803 
804             // e2 = v0-v2
805             // if v0 == v1 & v1 == v2, v0 == v2
806             uint32_t e2Mask = e0Mask & e1Mask;
807             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
808 
809             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
810             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
811             e0Mask = pdep_u32(e0Mask, 0x00249249);
812 
813             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
814             e1Mask = pdep_u32(e1Mask, 0x00492492);
815 
816             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
817             e2Mask = pdep_u32(e2Mask, 0x00924924);
818 
819             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
820         }
821         else
822         {
823             edgeEnable = 0x00FFFFFF;
824         }
825     }
826     else
827     {
828         // degenerate triangles won't be sent to rasterizer; just enable all edges
829         pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
830             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
831     }
832 
833     SIMDBBOX_T<SIMD_T> bbox;
834 
835     if (!triMask)
836     {
837         goto endBinTriangles;
838     }
839 
840     // Calc bounding box of triangles
841     calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
842 
843     // determine if triangle falls between pixel centers and discard
844     // only discard for non-MSAA case and when conservative rast is disabled
845     // (xmin + 127) & ~255
846     // (xmax + 128) & ~255
847     if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
848         (!CT::IsConservativeT::value))
849     {
850         origTriMask = triMask;
851 
852         int cullCenterMask;
853 
854         {
855             typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
856             xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
857             typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
858             xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
859 
860             typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
861 
862             typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
863             ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
864             typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
865             ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
866 
867             typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
868 
869             vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
870             cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
871         }
872 
873         triMask &= ~cullCenterMask;
874 
875         if (origTriMask ^ triMask)
876         {
877             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
878         }
879     }
880 
881     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
882     // Gather the AOS effective scissor rects based on the per-prim VP index.
883     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
884     {
885         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
886         if (pa.viewportArrayActive)
887 
888         {
889             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
890         }
891         else // broadcast fast path for non-VPAI case.
892         {
893             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
894             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
895             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
896             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
897         }
898 
899         // Make triangle bbox inclusive
900         bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
901         bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
902 
903         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
904         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
905         bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
906         bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
907     }
908 
909     if (CT::IsConservativeT::value)
910     {
911         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
912         // some area. Bump the xmax/ymax edges out
913 
914         typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
915         bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
916 
917         typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
918         bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
919     }
920 
921     // Cull tris completely outside scissor
922     {
923         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
924         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
925         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
926         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
927         triMask = triMask & ~maskOutsideScissor;
928     }
929 
930 #if KNOB_ENABLE_EARLY_RAST
931     if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value)
932     {
933         // Try early rasterization - culling small triangles which do not cover any pixels
934 
935         // convert to ER tiles
936         SIMDBBOX_T<SIMD_T> er_bbox;
937 
938         er_bbox.xmin = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
939         er_bbox.xmax = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
940         er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
941         er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
942 
943         typename SIMD_T::Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
944         typename SIMD_T::Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
945 
946         // Take only triangles that fit into ER tile
947         uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
948 
949         if (oneTileMask)
950         {
951             // determine CW tris (det > 0)
952             uint32_t maskCwLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
953             uint32_t maskCwHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
954             uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
955 
956             // Try early rasterization
957             triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
958 
959             if (!triMask)
960             {
961                 AR_END(FEBinTriangles, 1);
962                 return;
963             }
964         }
965 
966     }
967 #endif
968 
969 endBinTriangles:
970 
971 
972     // Send surviving triangles to the line or point binner based on fill mode
973     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
974     {
975         // Simple non-conformant wireframe mode, useful for debugging
976         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
977         typename SIMD_T::Vec4 line[2];
978         typename SIMD_T::Float recipW[2];
979 
980         line[0] = tri[0];
981         line[1] = tri[1];
982         recipW[0] = vRecipW0;
983         recipW[1] = vRecipW1;
984 
985         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
986 
987         line[0] = tri[1];
988         line[1] = tri[2];
989         recipW[0] = vRecipW1;
990         recipW[1] = vRecipW2;
991 
992         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
993 
994         line[0] = tri[2];
995         line[1] = tri[0];
996         recipW[0] = vRecipW2;
997         recipW[1] = vRecipW0;
998 
999         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1000 
1001         AR_END(FEBinTriangles, 1);
1002         return;
1003     }
1004     else if (rastState.fillMode == SWR_FILLMODE_POINT)
1005     {
1006         // Bin 3 points
1007         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
1008         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
1009         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
1010 
1011         AR_END(FEBinTriangles, 1);
1012         return;
1013     }
1014 
1015     // Convert triangle bbox to macrotile units.
1016     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1017     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1018     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1019     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1020 
1021     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1022 
1023     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1024     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1025     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1026     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1027 
1028     // transpose verts needed for backend
1029     /// @todo modify BE to take non-transformed verts
1030     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1031     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1032     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1033     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1034 
1035     TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1036     TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1037     TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1038     TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1039 
1040     // scan remaining valid triangles and bin each separately
1041     while (_BitScanForward(&triIndex, triMask))
1042     {
1043         uint32_t linkageCount = state.backendState.numAttributes;
1044         uint32_t numScalarAttribs = linkageCount * 4;
1045 
1046         BE_WORK work;
1047         work.type = DRAW;
1048 
1049         bool isDegenerate;
1050         if (CT::IsConservativeT::value)
1051         {
1052             // only rasterize valid edges if we have a degenerate primitive
1053             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1054             work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1055                 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1056 
1057             // Degenerate triangles are required to be constant interpolated
1058             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1059         }
1060         else
1061         {
1062             isDegenerate = false;
1063             work.pfnWork = pfnWork;
1064         }
1065 
1066         // Select attribute processor
1067         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1068             state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1069 
1070         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1071 
1072         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1073         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1074         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1075 
1076         auto pArena = pDC->pArena;
1077         SWR_ASSERT(pArena != nullptr);
1078 
1079         // store active attribs
1080         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1081         desc.pAttribs = pAttribs;
1082         desc.numAttribs = linkageCount;
1083         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1084 
1085         // store triangle vertex data
1086         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1087 
1088         SIMD128::store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
1089         SIMD128::store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
1090         SIMD128::store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
1091         SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1092 
1093         // store user clip distances
1094         if (state.backendState.clipDistanceMask)
1095         {
1096             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1097             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1098             ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1099         }
1100 
1101         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1102         {
1103             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1104             {
1105 #if KNOB_ENABLE_TOSS_POINTS
1106                 if (!KNOB_TOSS_SETUP_TRIS)
1107 #endif
1108                 {
1109                     pTileMgr->enqueue(x, y, &work);
1110                 }
1111             }
1112         }
1113 
1114                      triMask &= ~(1 << triIndex);
1115     }
1116 
1117     AR_END(FEBinTriangles, 1);
1118 }
1119 
1120 template <typename CT>
BinTriangles(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector tri[3],uint32_t triMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1121 void BinTriangles(
1122     DRAW_CONTEXT *pDC,
1123     PA_STATE &pa,
1124     uint32_t workerId,
1125     simdvector tri[3],
1126     uint32_t triMask,
1127     simdscalari const &primID,
1128     simdscalari const &viewportIdx,
1129     simdscalari const &rtIdx)
1130 {
1131     BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1132 }
1133 
1134 #if USE_SIMD16_FRONTEND
1135 template <typename CT>
BinTriangles_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector tri[3],uint32_t triMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1136 void SIMDCALL BinTriangles_simd16(
1137     DRAW_CONTEXT *pDC,
1138     PA_STATE &pa,
1139     uint32_t workerId,
1140     simd16vector tri[3],
1141     uint32_t triMask,
1142     simd16scalari const &primID,
1143     simd16scalari const &viewportIdx,
1144     simd16scalari const &rtIdx)
1145 {
1146     BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1147 }
1148 
1149 #endif
1150 struct FEBinTrianglesChooser
1151 {
1152     typedef PFN_PROCESS_PRIMS FuncType;
1153 
1154     template <typename... ArgsB>
GetFuncFEBinTrianglesChooser1155     static FuncType GetFunc()
1156     {
1157         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1158     }
1159 };
1160 
1161 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc(bool IsConservative)1162 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1163 {
1164     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1165 }
1166 
1167 #if USE_SIMD16_FRONTEND
1168 struct FEBinTrianglesChooser_simd16
1169 {
1170     typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1171 
1172     template <typename... ArgsB>
GetFuncFEBinTrianglesChooser_simd161173     static FuncType GetFunc()
1174     {
1175         return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1176     }
1177 };
1178 
1179 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc_simd16(bool IsConservative)1180 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1181 {
1182     return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1183 }
1184 
1185 #endif
1186 
1187 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1188 void BinPostSetupPointsImpl(
1189     DRAW_CONTEXT *pDC,
1190     PA_STATE &pa,
1191     uint32_t workerId,
1192     typename SIMD_T::Vec4 prim[],
1193     uint32_t primMask,
1194     typename SIMD_T::Integer const &primID,
1195     typename SIMD_T::Integer const &viewportIdx,
1196     typename SIMD_T::Integer const &rtIdx)
1197 {
1198     SWR_CONTEXT *pContext = pDC->pContext;
1199 
1200     AR_BEGIN(FEBinPoints, pDC->drawId);
1201 
1202     typename SIMD_T::Vec4 &primVerts = prim[0];
1203 
1204     const API_STATE& state = GetApiState(pDC);
1205     const SWR_RASTSTATE& rastState = state.rastState;
1206     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1207 
1208     // Select attribute processor
1209     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1210         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1211 
1212     // convert to fixed point
1213     typename SIMD_T::Integer vXi, vYi;
1214 
1215     vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1216     vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1217 
1218     if (CanUseSimplePoints(pDC))
1219     {
1220         // adjust for ymin-xmin rule
1221         vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1222         vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1223 
1224         // cull points off the ymin-xmin edge of the viewport
1225         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1226         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1227 
1228         // compute macro tile coordinates
1229         typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1230         typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1231 
1232         OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1233 
1234         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1235         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1236 
1237         // compute raster tile coordinates
1238         typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1239         typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1240 
1241         // compute raster tile relative x,y for coverage mask
1242         typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1243         typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1244 
1245         typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1246         typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1247 
1248         OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1249         OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1250 
1251         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1252         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1253 
1254         OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1255         OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1256 
1257         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1258         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1259 
1260         OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1261         SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1262 
1263         // store render target array index
1264         const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1265 
1266         uint32_t *pPrimID = (uint32_t *)&primID;
1267         DWORD primIndex = 0;
1268 
1269         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1270 
        // scan remaining valid points and bin each separately
1272         while (_BitScanForward(&primIndex, primMask))
1273         {
1274             uint32_t linkageCount = backendState.numAttributes;
1275             uint32_t numScalarAttribs = linkageCount * 4;
1276 
1277             BE_WORK work;
1278             work.type = DRAW;
1279 
1280             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1281 
1282             // points are always front facing
1283             desc.triFlags.frontFacing = 1;
1284             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1285             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1286 
1287             work.pfnWork = RasterizeSimplePoint;
1288 
1289             auto pArena = pDC->pArena;
1290             SWR_ASSERT(pArena != nullptr);
1291 
1292             // store attributes
1293             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1294             desc.pAttribs = pAttribs;
1295             desc.numAttribs = linkageCount;
1296 
1297             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1298 
1299             // store raster tile aligned x, y, perspective correct z
1300             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1301             desc.pTriBuffer = pTriBuffer;
1302             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1303             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1304             *pTriBuffer = aZ[primIndex];
1305 
1306             uint32_t tX = aTileRelativeX[primIndex];
1307             uint32_t tY = aTileRelativeY[primIndex];
1308 
1309             // pack the relative x,y into the coverageMask, the rasterizer will
1310             // generate the true coverage mask from it
1311             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1312 
1313             // bin it
1314             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1315 #if KNOB_ENABLE_TOSS_POINTS
1316             if (!KNOB_TOSS_SETUP_TRIS)
1317 #endif
1318             {
1319                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1320             }
1321 
1322             primMask &= ~(1 << primIndex);
1323         }
1324     }
1325     else
1326     {
1327         // non simple points need to be potentially binned to multiple macro tiles
1328         typename SIMD_T::Float vPointSize;
1329 
1330         if (rastState.pointParam)
1331         {
1332             typename SIMD_T::Vec4 size[3];
1333             pa.Assemble(VERTEX_SGV_SLOT, size);
1334             vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1335         }
1336         else
1337         {
1338             vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1339         }
1340 
1341         // bloat point to bbox
1342         SIMDBBOX_T<SIMD_T> bbox;
1343 
1344         bbox.xmin = bbox.xmax = vXi;
1345         bbox.ymin = bbox.ymax = vYi;
1346 
1347         typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1348         typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1349 
1350         bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1351         bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1352         bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1353         bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1354 
1355         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1356         // Gather the AOS effective scissor rects based on the per-prim VP index.
1357         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
1358         {
1359             typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1360 
1361             if (pa.viewportArrayActive)
1362             {
1363                 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1364             }
1365             else // broadcast fast path for non-VPAI case.
1366             {
1367                 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1368                 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1369                 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1370                 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1371             }
1372 
1373             bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1374             bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1375             bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1376             bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1377         }
1378 
1379         // Cull bloated points completely outside scissor
1380         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1381         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1382         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1383         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1384         primMask = primMask & ~maskOutsideScissor;
1385 
1386         // Convert bbox to macrotile units.
1387         bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1388         bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1389         bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1390         bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1391 
1392         OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1393 
1394         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1395         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1396         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1397         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1398 
1399         // store render target array index
1400         const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1401 
1402         OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1403         SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1404 
1405         uint32_t *pPrimID = (uint32_t *)&primID;
1406 
1407         OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1408         OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1409         OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1410 
1411         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1412         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1413         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1414 
1415         // scan remaining valid prims and bin each separately
1416         const SWR_BACKEND_STATE& backendState = state.backendState;
1417         DWORD primIndex;
1418         while (_BitScanForward(&primIndex, primMask))
1419         {
1420             uint32_t linkageCount = backendState.numAttributes;
1421             uint32_t numScalarAttribs = linkageCount * 4;
1422 
1423             BE_WORK work;
1424             work.type = DRAW;
1425 
1426             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1427 
1428             desc.triFlags.frontFacing = 1;
1429             desc.triFlags.pointSize = aPointSize[primIndex];
1430             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1431             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1432 
1433             work.pfnWork = RasterizeTriPoint;
1434 
1435             auto pArena = pDC->pArena;
1436             SWR_ASSERT(pArena != nullptr);
1437 
1438             // store active attribs
1439             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1440             desc.numAttribs = linkageCount;
1441             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1442 
1443             // store point vertex data
1444             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1445             desc.pTriBuffer = pTriBuffer;
1446             *pTriBuffer++ = aPrimVertsX[primIndex];
1447             *pTriBuffer++ = aPrimVertsY[primIndex];
1448             *pTriBuffer = aPrimVertsZ[primIndex];
1449 
1450             // store user clip distances
1451             if (backendState.clipDistanceMask)
1452             {
1453                 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1454                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1455                 float dists[8];
1456                 float one = 1.0f;
1457                 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1458                 for (uint32_t i = 0; i < numClipDist; i++) {
1459                     desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1460                     desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1461                     desc.pUserClipBuffer[3 * i + 2] = dists[i];
1462                 }
1463             }
1464 
1465             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1466             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1467             {
1468                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1469                 {
1470 #if KNOB_ENABLE_TOSS_POINTS
1471                     if (!KNOB_TOSS_SETUP_TRIS)
1472 #endif
1473                     {
1474                         pTileMgr->enqueue(x, y, &work);
1475                     }
1476                 }
1477             }
1478 
1479             primMask &= ~(1 << primIndex);
1480         }
1481     }
1482 
1483     AR_END(FEBinPoints, 1);
1484 }
1485 
1486 //////////////////////////////////////////////////////////////////////////
1487 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
1488 /// @param pDC - pointer to draw context.
1489 /// @param pa - The primitive assembly object.
1490 /// @param workerId - thread's worker id. Even thread has a unique id.
1491 /// @param tri - Contains point position data for SIMDs worth of points.
1492 /// @param primID - Primitive ID for each point.
1493 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[3],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1494 void BinPointsImpl(
1495     DRAW_CONTEXT *pDC,
1496     PA_STATE &pa,
1497     uint32_t workerId,
1498     typename SIMD_T::Vec4 prim[3],
1499     uint32_t primMask,
1500     typename SIMD_T::Integer const &primID,
1501     typename SIMD_T::Integer const &viewportIdx,
1502     typename SIMD_T::Integer const &rtIdx)
1503 {
1504     const API_STATE& state = GetApiState(pDC);
1505     const SWR_FRONTEND_STATE& feState = state.frontendState;
1506     const SWR_RASTSTATE& rastState = state.rastState;
1507 
1508     if (!feState.vpTransformDisable)
1509     {
1510         // perspective divide
1511         typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1512 
1513         prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1514         prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1515         prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1516 
1517         // viewport transform to screen coords
1518         if (pa.viewportArrayActive)
1519         {
1520             viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1521         }
1522         else
1523         {
1524             viewportTransform<1>(prim, state.vpMatrices);
1525         }
1526     }
1527 
1528     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1529 
1530     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1531     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1532 
1533     BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1534         pDC,
1535         pa,
1536         workerId,
1537         prim,
1538         primMask,
1539         primID,
1540         viewportIdx,
1541         rtIdx);
1542 }
1543 
BinPoints(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[3],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1544 void BinPoints(
1545     DRAW_CONTEXT *pDC,
1546     PA_STATE &pa,
1547     uint32_t workerId,
1548     simdvector prim[3],
1549     uint32_t primMask,
1550     simdscalari const &primID,
1551     simdscalari const &viewportIdx,
1552     simdscalari const &rtIdx)
1553 {
1554     BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1555         pDC,
1556         pa,
1557         workerId,
1558         prim,
1559         primMask,
1560         primID,
1561         viewportIdx,
1562         rtIdx);
1563 }
1564 
1565 #if USE_SIMD16_FRONTEND
BinPoints_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector prim[3],uint32_t primMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1566 void SIMDCALL BinPoints_simd16(
1567     DRAW_CONTEXT *pDC,
1568     PA_STATE &pa,
1569     uint32_t workerId,
1570     simd16vector prim[3],
1571     uint32_t primMask,
1572     simd16scalari const &primID,
1573     simd16scalari const &viewportIdx,
1574     simd16scalari const & rtIdx)
1575 {
1576     BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1577         pDC,
1578         pa,
1579         workerId,
1580         prim,
1581         primMask,
1582         primID,
1583         viewportIdx,
1584         rtIdx);
1585 }
1586 
1587 #endif
1588 //////////////////////////////////////////////////////////////////////////
1589 /// @brief Bin SIMD lines to the backend.
1590 /// @param pDC - pointer to draw context.
1591 /// @param pa - The primitive assembly object.
1592 /// @param workerId - thread's worker id. Even thread has a unique id.
1593 /// @param tri - Contains line position data for SIMDs worth of points.
1594 /// @param primID - Primitive ID for each line.
1595 /// @param viewportIdx - Viewport Array Index for each line.
1596 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[],typename SIMD_T::Float recipW[],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1597 void BinPostSetupLinesImpl(
1598     DRAW_CONTEXT *pDC,
1599     PA_STATE &pa,
1600     uint32_t workerId,
1601     typename SIMD_T::Vec4 prim[],
1602     typename SIMD_T::Float recipW[],
1603     uint32_t primMask,
1604     typename SIMD_T::Integer const &primID,
1605     typename SIMD_T::Integer const &viewportIdx,
1606     typename SIMD_T::Integer const &rtIdx)
1607 {
1608     SWR_CONTEXT *pContext = pDC->pContext;
1609     const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1610 
1611     AR_BEGIN(FEBinLines, pDC->drawId);
1612 
1613     const API_STATE &state = GetApiState(pDC);
1614     const SWR_RASTSTATE &rastState = state.rastState;
1615 
1616     // Select attribute processor
1617     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1618         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1619 
1620     typename SIMD_T::Float &vRecipW0 = recipW[0];
1621     typename SIMD_T::Float &vRecipW1 = recipW[1];
1622 
1623     // convert to fixed point
1624     typename SIMD_T::Integer vXi[2], vYi[2];
1625 
1626     vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1627     vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1628     vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1629     vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1630 
1631     // compute x-major vs y-major mask
1632     typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1633     typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1634     typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1635     uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1636 
1637     // cull zero-length lines
1638     typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1639     vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1640 
1641     primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1642 
1643     uint32_t *pPrimID = (uint32_t *)&primID;
1644     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1645 
1646     // Calc bounding box of lines
1647     SIMDBBOX_T<SIMD_T> bbox;
1648     bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1649     bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1650     bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1651     bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1652 
1653     // bloat bbox by line width along minor axis
1654     typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1655     typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1656 
1657     SIMDBBOX_T<SIMD_T> bloatBox;
1658 
1659     bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1660     bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1661     bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1662     bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1663 
1664     bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1665     bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1666     bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1667     bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1668 
1669     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1670     {
1671         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1672 
1673         if (pa.viewportArrayActive)
1674         {
1675             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1676         }
1677         else // broadcast fast path for non-VPAI case.
1678         {
1679             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1680             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1681             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1682             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1683         }
1684 
1685         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1686         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1687         bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1688         bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1689     }
1690 
1691     // Cull prims completely outside scissor
1692     {
1693         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1694         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1695         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1696         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1697         primMask = primMask & ~maskOutsideScissor;
1698     }
1699 
1700     // transpose verts needed for backend
1701     /// @todo modify BE to take non-transformed verts
1702     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1703     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1704     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1705     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1706 
1707     if (!primMask)
1708     {
1709         goto endBinLines;
1710     }
1711 
1712     // Convert triangle bbox to macrotile units.
1713     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1714     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1715     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1716     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1717 
1718     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1719 
1720     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1721     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1722     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1723     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1724 
1725     TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1726     TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1727     TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1728     TransposeVertices(vHorizW, vRecipW0,  vRecipW1,  SIMD_T::setzero_ps());
1729 
1730     // scan remaining valid prims and bin each separately
1731     DWORD primIndex;
1732     while (_BitScanForward(&primIndex, primMask))
1733     {
1734         uint32_t linkageCount = state.backendState.numAttributes;
1735         uint32_t numScalarAttribs = linkageCount * 4;
1736 
1737         BE_WORK work;
1738         work.type = DRAW;
1739 
1740         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1741 
1742         desc.triFlags.frontFacing = 1;
1743         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1744         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1745         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1746 
1747         work.pfnWork = RasterizeLine;
1748 
1749         auto pArena = pDC->pArena;
1750         SWR_ASSERT(pArena != nullptr);
1751 
1752         // store active attribs
1753         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1754         desc.numAttribs = linkageCount;
1755         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1756 
1757         // store line vertex data
1758         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1759 
1760         _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[primIndex]);
1761         _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[primIndex]);
1762         _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[primIndex]);
1763         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1764 
1765         // store user clip distances
1766         if (state.backendState.clipDistanceMask)
1767         {
1768             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1769             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1770             ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1771         }
1772 
1773         MacroTileMgr *pTileMgr = pDC->pTileMgr;
1774         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1775         {
1776             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1777             {
1778 #if KNOB_ENABLE_TOSS_POINTS
1779                 if (!KNOB_TOSS_SETUP_TRIS)
1780 #endif
1781                 {
1782                     pTileMgr->enqueue(x, y, &work);
1783                 }
1784             }
1785         }
1786 
1787         primMask &= ~(1 << primIndex);
1788     }
1789 
1790 endBinLines:
1791 
1792     AR_END(FEBinLines, 1);
1793 }
1794 
1795 //////////////////////////////////////////////////////////////////////////
1796 /// @brief Bin SIMD lines to the backend.
1797 /// @param pDC - pointer to draw context.
1798 /// @param pa - The primitive assembly object.
1799 /// @param workerId - thread's worker id. Even thread has a unique id.
1800 /// @param tri - Contains line position data for SIMDs worth of points.
1801 /// @param primID - Primitive ID for each line.
1802 /// @param viewportIdx - Viewport Array Index for each line.
1803 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,typename SIMD_T::Vec4 prim[3],uint32_t primMask,typename SIMD_T::Integer const & primID,typename SIMD_T::Integer const & viewportIdx,typename SIMD_T::Integer const & rtIdx)1804 void SIMDCALL BinLinesImpl(
1805     DRAW_CONTEXT *pDC,
1806     PA_STATE &pa,
1807     uint32_t workerId,
1808     typename SIMD_T::Vec4 prim[3],
1809     uint32_t primMask,
1810     typename SIMD_T::Integer const &primID,
1811     typename SIMD_T::Integer const &viewportIdx,
1812     typename SIMD_T::Integer const & rtIdx)
1813 {
1814     const API_STATE& state = GetApiState(pDC);
1815     const SWR_RASTSTATE& rastState = state.rastState;
1816     const SWR_FRONTEND_STATE& feState = state.frontendState;
1817 
1818     typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1819 
1820     if (!feState.vpTransformDisable)
1821     {
1822         // perspective divide
1823         vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1824         vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1825 
1826         prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1827         prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1828 
1829         prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1830         prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1831 
1832         prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1833         prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1834 
1835         // viewport transform to screen coords
1836         if (pa.viewportArrayActive)
1837         {
1838             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1839         }
1840         else
1841         {
1842             viewportTransform<2>(prim, state.vpMatrices);
1843         }
1844     }
1845 
1846     // adjust for pixel center location
1847     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1848 
1849     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1850     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1851 
1852     prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1853     prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1854 
1855     BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1856         pDC,
1857         pa,
1858         workerId,
1859         prim,
1860         vRecipW,
1861         primMask,
1862         primID,
1863         viewportIdx,
1864         rtIdx);
1865 }
1866 
BinLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1867 void BinLines(
1868     DRAW_CONTEXT *pDC,
1869     PA_STATE &pa,
1870     uint32_t workerId,
1871     simdvector prim[],
1872     uint32_t primMask,
1873     simdscalari const &primID,
1874     simdscalari const &viewportIdx,
1875     simdscalari const &rtIdx)
1876 {
1877     BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1878 }
1879 
1880 #if USE_SIMD16_FRONTEND
BinLines_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector prim[3],uint32_t primMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1881 void SIMDCALL BinLines_simd16(
1882     DRAW_CONTEXT *pDC,
1883     PA_STATE &pa,
1884     uint32_t workerId,
1885     simd16vector prim[3],
1886     uint32_t primMask,
1887     simd16scalari const &primID,
1888     simd16scalari const &viewportIdx,
1889     simd16scalari const &rtIdx)
1890 {
1891     BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1892 }
1893 
1894 #endif
1895