1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28 
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
39 
// Table of rasterizer work-function pointers; only declared here (extern), defined elsewhere.
// It has six dimensions: the first, fourth and fifth are sized by SWR_MULTISAMPLE_TYPE_COUNT,
// SWR_INPUT_COVERAGE_COUNT and STATE_VALID_TRI_EDGE_COUNT respectively.
// NOTE(review): the meaning of the three size-2 dimensions is not visible in this part of the
// file - confirm against the code that fills in and indexes this table.
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];

// Forward declarations of function templates; their definitions are not in this part of the file.
template <uint32_t numSamples = 1>
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
template <typename RT>
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
template <typename RT>
void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
48 
// Lookup table that expands a 4-bit mask (the table index) into a 4-lane double vector.
// It is used in this file as the selector operand of _mm256_blendv_pd, which picks per
// lane based on the selector's sign bit.
//
// MASKTOVEC takes the bits from most significant (i3) to least significant (i0) and emits
// them negated in lane order {lane0, lane1, lane2, lane3}, so bit k of the index controls
// lane k: a 1 bit becomes -1.0 (sign bit set), a 0 bit becomes 0.0 (sign bit clear).
// The arguments must remain integer literals: integer -0 is plain 0 and converts to +0.0,
// whereas a floating-point -0.0 would have its sign bit set.
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
static const __m256d gMaskToVecpd[] =
{
    MASKTOVEC(0, 0, 0, 0),
    MASKTOVEC(0, 0, 0, 1),
    MASKTOVEC(0, 0, 1, 0),
    MASKTOVEC(0, 0, 1, 1),
    MASKTOVEC(0, 1, 0, 0),
    MASKTOVEC(0, 1, 0, 1),
    MASKTOVEC(0, 1, 1, 0),
    MASKTOVEC(0, 1, 1, 1),
    MASKTOVEC(1, 0, 0, 0),
    MASKTOVEC(1, 0, 0, 1),
    MASKTOVEC(1, 0, 1, 0),
    MASKTOVEC(1, 0, 1, 1),
    MASKTOVEC(1, 1, 0, 0),
    MASKTOVEC(1, 1, 0, 1),
    MASKTOVEC(1, 1, 1, 0),
    MASKTOVEC(1, 1, 1, 1),
};
69 
/// Integer 2D position (x, y). Used in this file as the end points handed to
/// ComputeEdgeData when building edges.
struct POS
{
    int32_t x;
    int32_t y;
};
74 
75 struct EDGE
76 {
77     double a, b;                // a, b edge coefficients in fix8
78     double stepQuadX;           // step to adjacent horizontal quad in fix16
79     double stepQuadY;           // step to adjacent vertical quad in fix16
80     double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
81     double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
82 
83     __m256d vQuadOffsets;       // offsets for 4 samples of a quad
84     __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
85 };
86 
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterize a raster tile partially covered by the triangle and
/// return its per-sample coverage mask.
///
/// Every edge value is stepped quad by quad (2x2 pixels) across the tile. For
/// each quad, _mm256_movemask_pd gathers the sign bit of the 4 lanes, so a lane
/// contributes a set bit when its edge value is negative. The per-edge masks
/// are ANDed together and the resulting 4 bits are ORed into the coverage mask.
///
/// @tparam NumEdges - number of entries of startEdges / pRastEdges to process
/// @tparam EdgeMaskT - its ::value is forwarded to UnrollerLMask in the unrolled
///         8x8 path to choose which edges are evaluated/stepped.
///         NOTE(review): the generic (non 8x8) path does not consult EdgeMaskT and
///         always seeds the mask from edge 0 - confirm this is intended.
/// @param pDC - draw context; not referenced by this function body
/// @param startEdges - per-edge starting value; vQuadOffsets is added to it to
///        reach the 4 sample locations of the first quad
/// @param pRastEdges - per-edge data supplying vQuadOffsets, stepQuadX and stepQuadY
/// @return coverage mask with 4 bits per quad, quads laid out in row-major order
///         (unrolled path: quad column qx, quad row qy -> bit 4 * (qy * 4 + qx))
template<uint32_t NumEdges, typename EdgeMaskT>
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
{
    uint64_t coverageMask = 0;

    __m256d vEdges[NumEdges];
    __m256d vStepX[NumEdges];
    __m256d vStepY[NumEdges];

    for (uint32_t e = 0; e < NumEdges; ++e)
    {
        // Step to the pixel sample locations of the 1st quad
        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);

        // broadcast the precomputed steps to the next quad in x and in y
        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
    }

    // fast unrolled version for 8x8 tile
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
    int edgeMask[NumEdges];
    uint64_t mask;

    // per-edge helpers invoked through UnrollerLMask by the macros below
    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
    auto update_lambda = [&](int e){mask &= edgeMask[e];};
    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};

// NOTE(review): the helper macros below are not #undef'd within the visible part of the file.

// evaluate which pixels in the quad are covered
#define EVAL \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);

    // update coverage mask
    // for E1E2ValidT / NoEdgesValidT the mask is seeded with all 4 bits set instead of
    // edgeMask[0] (edge 0 is degenerate and skipped); the remaining edges are then ANDed in
    // and the quad's 4 bits are placed at position 'bit'
#define UPDATE_MASK(bit) \
            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
                mask = 0xf;\
            }\
            else{\
                mask = edgeMask[0]; \
            }\
            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
            coverageMask |= (mask << bit);

    // step in the +x direction to the next quad
#define INCX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);

    // step in the +y direction to the next quad
#define INCY \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);

    // step in the -x direction to the next quad
#define DECX \
            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);

    // sweep 2x2 quad back and forth through the raster tile,
    // computing coverage masks for the entire tile

    // raster tile
    // 0  1  2  3  4  5  6  7
    // x  x
    // x  x ------------------>
    //                   x  x  |
    // <-----------------x  x  V
    // ..

    // row 0 (left to right)
    EVAL;
    UPDATE_MASK(0);
    INCX;
    EVAL;
    UPDATE_MASK(4);
    INCX;
    EVAL;
    UPDATE_MASK(8);
    INCX;
    EVAL;
    UPDATE_MASK(12);
    INCY;

    // row 1 (right to left, hence the descending bit positions)
    EVAL;
    UPDATE_MASK(28);
    DECX;
    EVAL;
    UPDATE_MASK(24);
    DECX;
    EVAL;
    UPDATE_MASK(20);
    DECX;
    EVAL;
    UPDATE_MASK(16);
    INCY;

    // row 2 (left to right)
    EVAL;
    UPDATE_MASK(32);
    INCX;
    EVAL;
    UPDATE_MASK(36);
    INCX;
    EVAL;
    UPDATE_MASK(40);
    INCX;
    EVAL;
    UPDATE_MASK(44);
    INCY;

    // row 3 (right to left)
    EVAL;
    UPDATE_MASK(60);
    DECX;
    EVAL;
    UPDATE_MASK(56);
    DECX;
    EVAL;
    UPDATE_MASK(52);
    DECX;
    EVAL;
    UPDATE_MASK(48);
#else
    // generic path: walk the quads in row-major order, 4 coverage bits per quad
    uint32_t bit = 0;
    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
    {
        // remember the row start so the next row can be reached with a single y step
        __m256d vStartOfRowEdge[NumEdges];
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vStartOfRowEdge[e] = vEdges[e];
        }

        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
        {
            int edgeMask[NumEdges];
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
            }

            // a sample is covered only if every edge reports it
            uint64_t mask = edgeMask[0];
            for (uint32_t e = 1; e < NumEdges; ++e)
            {
                mask &= edgeMask[e];
            }
            coverageMask |= (mask << bit);

            // step to the next quad in the x
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
            }
            bit+=4;
        }

        // step to the next row
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
        }
    }
#endif
    return coverageMask;

}
259 // Top left rule:
260 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
261 // Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
262 // Top left: a sample is in if it is a top or left edge.
263 // Out: !(horizontal && above) = !horizontal && below
264 // Out: !horizontal && left = !(!horizontal && left) = horizontal and right
adjustTopLeftRuleIntFix16(const __m128i vA,const __m128i vB,__m256d & vEdge)265 INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge)
266 {
267     // if vA < 0, vC--
268     // if vA == 0 && vB < 0, vC--
269 
270     __m256d vEdgeOut = vEdge;
271     __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
272 
273     // if vA < 0 (line is not horizontal and below)
274     int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
275 
276     // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
277     __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
278     int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
279     msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
280 
281     // if either of these are true and we're on the line (edge == 0), bump it outside the line
282     vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
283 }
284 
//////////////////////////////////////////////////////////////////////////
/// @brief Number of fractional bits by which the result of the manh
/// calculation exceeds the edge precision, derived from compile time traits.
/// @tparam RT - rasterizer traits; must expose PrecisionT, ConservativePrecisionT
///         and EdgePrecisionT, each with a BitsT::value
/// @return (PrecisionT bits + ConservativePrecisionT bits) - EdgePrecisionT bits;
///         compilation fails if this would be negative
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    // fractional bits of the product (rasterizer precision * conservative precision)
    typedef std::integral_constant<int64_t, RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value> ManhBitsT;
    // fractional bits of the edge equations
    typedef std::integral_constant<int64_t, RT::EdgePrecisionT::BitsT::value> EdgeBitsT;

    static_assert(ManhBitsT::value >= EdgeBitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return ManhBitsT::value - EdgeBitsT::value;
}
295 
296 //////////////////////////////////////////////////////////////////////////
297 /// @struct adjustEdgeConservative
298 /// @brief Primary template definition used for partially specializing
299 /// the adjustEdgeConservative function. This struct should never
300 /// be instantiated.
301 /// @tparam RT: rasterizer traits
302 /// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
303 template <typename RT, typename ConservativeEdgeOffsetT>
304 struct adjustEdgeConservative
305 {
306     //////////////////////////////////////////////////////////////////////////
307     /// @brief Performs calculations to adjust each edge of a triangle away
308     /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
309     /// direction.
310     ///
311     /// Uncertainty regions arise from fixed point rounding, which
312     /// can snap a vertex +/- by min fixed point value.
313     /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
314     /// This allows the rasterizer to test for coverage only at the pixel center,
315     /// instead of having to test individual pixel corners for conservative coverage
adjustEdgeConservativeadjustEdgeConservative316     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
317     {
318         // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away
319         // from the pixel center (in the direction of the edge normal A/B)
320 
321         // edge = Ax + Bx + C - (manh/e)
322         // manh = manhattan distance = abs(A) + abs(B)
323         // e = absolute rounding error from snapping from float to fixed point precision
324 
325         // 'fixed point' multiply (in double to be avx1 friendly)
326         // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
327         __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
328         __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
329                                      _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
330 
331         static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
332                       "Inadequate precision of result of manh calculation ");
333 
334         // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
335         // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
336         manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
337 
338         // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
339         // this allows the rasterizer to do a single conservative coverage test to see if the primitive
340         // intersects the pixel at all
341         vEdge = _mm256_sub_pd(vEdge, manh);
342     };
343 };
344 
345 //////////////////////////////////////////////////////////////////////////
346 /// @brief adjustEdgeConservative specialization where no edge offset is needed
347 template <typename RT>
348 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
349 {
350     INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
351 };
352 
353 //////////////////////////////////////////////////////////////////////////
354 /// @brief calculates the distance a degenerate BBox needs to be adjusted
355 /// for conservative rast based on compile time trait values
356 template<typename RT>
357 constexpr int64_t ConservativeScissorOffset()
358 {
359     static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
360     // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
361     typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
362     // 1/2 pixel edge offset + conservative offset - degenerateTriangle
363     return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
364 }
365 
366 //////////////////////////////////////////////////////////////////////////
367 /// @brief Performs calculations to adjust each a vector of evaluated edges out
368 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
369 /// direction.
370 template <typename RT>
371 INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
372 {
373     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
374     int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
375     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
376 };
377 
378 //////////////////////////////////////////////////////////////////////////
379 /// @brief Performs calculations to adjust each a scalar evaluated edge out
380 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
381 /// direction.
382 template <typename RT, typename OffsetT>
383 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
384 {
385     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
386     int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
387     return (Edge - manh);
388 };
389 
390 //////////////////////////////////////////////////////////////////////////
391 /// @brief Perform any needed adjustments to evaluated triangle edges
392 template <typename RT, typename EdgeOffsetT>
393 struct adjustEdgesFix16
394 {
395     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
396     {
397         static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
398                       "Edge equation expected to be in x.16 fixed point");
399 
400         static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
401 
402         // need to apply any edge offsets before applying the top-left rule
403         adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
404 
405         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
406     }
407 };
408 
409 //////////////////////////////////////////////////////////////////////////
410 /// @brief Perform top left adjustments to evaluated triangle edges
411 template <typename RT>
412 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
413 {
414     INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
415     {
416         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
417     }
418 };
419 
420 // max(abs(dz/dx), abs(dz,dy)
421 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
422 {
423     /*
424     // evaluate i,j at (0,0)
425     float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
426     float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
427 
428     // evaluate i,j at (1,0)
429     float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
430     float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
431 
432     // compute dz/dx
433     float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
434     float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
435     float dzdx = abs(d10 - d00);
436 
437     // evaluate i,j at (0,1)
438     float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
439     float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
440 
441     float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
442     float dzdy = abs(d01 - d00);
443     */
444 
445     // optimized version of above
446     float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
447     float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
448 
449     return std::max(dzdx, dzdy);
450 }
451 
452 INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
453 {
454     if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
455     {
456         return (1.0f / (1 << 24));
457     }
458     else if (pState->depthFormat == R16_UNORM)
459     {
460         return (1.0f / (1 << 16));
461     }
462     else
463     {
464         SWR_ASSERT(pState->depthFormat == R32_FLOAT);
465 
466         // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
467         float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
468         uint32_t zMaxInt = *(uint32_t*)&zMax;
469         zMaxInt &= 0x7f800000;
470         zMax = *(float*)&zMaxInt;
471 
472         return zMax * (1.0f / (1 << 23));
473     }
474 }
475 
476 INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
477 {
478     if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
479     {
480         return 0.0f;
481     }
482 
483     float scale = pState->slopeScaledDepthBias;
484     if (scale != 0.0f)
485     {
486         scale *= ComputeMaxDepthSlope(pTri);
487     }
488 
489     float bias = pState->depthBias;
490     if (!pState->depthBiasPreAdjusted)
491     {
492         bias *= ComputeBiasFactor(pState, pTri, z);
493     }
494     bias += scale;
495 
496     if (pState->depthBiasClamp > 0.0f)
497     {
498         bias = std::min(bias, pState->depthBiasClamp);
499     }
500     else if (pState->depthBiasClamp < 0.0f)
501     {
502         bias = std::max(bias, pState->depthBiasClamp);
503     }
504 
505     return bias;
506 }
507 
// Thread-local volatile sink, only compiled when KNOB_ENABLE_TOSS_POINTS is set.
// Writing the rasterizer's coverage mask here is meant to keep the compiler from
// dead-code-eliminating (DCE) the rasterizer; the writes themselves are not in this
// part of the file.
#if KNOB_ENABLE_TOSS_POINTS
__declspec(thread) volatile uint64_t gToss;
#endif

// Sizing constants for the scratch buffer below: 3 vertices per triangle and
// 4 components per attribute.
static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
// Scratch float buffer holding vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib values.
// Declared thread local (rather than on the stack) to try to avoid _chkstk insertions.
static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
516 
517 INLINE
518 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
519 {
520     edge.a = a;
521     edge.b = b;
522 
523     // compute constant steps to adjacent quads
524     edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
525     edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
526 
527     // compute constant steps to adjacent raster tiles
528     edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
529     edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
530 
531     // compute quad offsets
532     const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
533     const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
534 
535     __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
536     __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
537     edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
538 
539     // compute raster tile offsets
540     const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
541     const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
542 
543     __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
544     __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
545     edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
546 }
547 
548 INLINE
549 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
550 {
551     ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
552 }
553 
554 //////////////////////////////////////////////////////////////////////////
555 /// @brief Primary template definition used for partially specializing
556 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
557 /// corner to sample position, and test for coverage
558 /// @tparam sampleCount: multisample count
559 template <typename NumSamplesT>
560 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
561                             int32_t &mask0, int32_t &mask1, int32_t &mask2)
562 {
563     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
564     // evaluate edge equations at the tile multisample bounding box
565     vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
566     vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
567     vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
568     mask0 = _mm256_movemask_pd(vSampleBboxTest0);
569     mask1 = _mm256_movemask_pd(vSampleBboxTest1);
570     mask2 = _mm256_movemask_pd(vSampleBboxTest2);
571 }
572 
573 //////////////////////////////////////////////////////////////////////////
574 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
575 /// when only rasterizing a single coverage test point
576 template <>
577 INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
578                                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
579 {
580     mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
581     mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
582     mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
583 }
584 
585 //////////////////////////////////////////////////////////////////////////
586 /// @struct ComputeScissorEdges
587 /// @brief Primary template definition. Allows the function to be generically
588 /// called. When paired with below specializations, will result in an empty
589 /// inlined function if scissor is not enabled
590 /// @tparam RasterScissorEdgesT: is scissor enabled?
591 /// @tparam IsConservativeT: is conservative rast enabled?
592 /// @tparam RT: rasterizer traits
593 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
594 struct ComputeScissorEdges
595 {
596     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
597                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
598 };
599 
600 //////////////////////////////////////////////////////////////////////////
601 /// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
602 /// specialization. Instantiated when conservative rast and scissor are enabled
603 template <typename RT>
604 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
605 {
606     //////////////////////////////////////////////////////////////////////////
607     /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
608     /// evaluate edge equations and offset them away from pixel center.
609     INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
610                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
611     {
612         // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
613         SWR_RECT scissor;
614         scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
615         scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
616         scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
617         scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
618 
619         POS topLeft{scissor.xmin, scissor.ymin};
620         POS bottomLeft{scissor.xmin, scissor.ymax};
621         POS topRight{scissor.xmax, scissor.ymin};
622         POS bottomRight{scissor.xmax, scissor.ymax};
623 
624         // construct 4 scissor edges in ccw direction
625         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
626         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
627         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
628         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
629 
630         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
631         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
632         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
633         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
634 
635         // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
636         adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
637         adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
638         adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
639         adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
640 
641         // Upper left rule for scissor
642         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
643         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
644     }
645 };
646 
647 //////////////////////////////////////////////////////////////////////////
648 /// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
649 /// specialization. Instantiated when scissor is enabled and conservative rast
650 /// is disabled.
651 template <typename RT>
652 struct ComputeScissorEdges<std::true_type, std::false_type, RT>
653 {
654     //////////////////////////////////////////////////////////////////////////
655     /// @brief Compute scissor edge vectors and evaluate edge equations
656     INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
657                               EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
658     {
659         const SWR_RECT &scissor = scissorBBox;
660         POS topLeft{scissor.xmin, scissor.ymin};
661         POS bottomLeft{scissor.xmin, scissor.ymax};
662         POS topRight{scissor.xmax, scissor.ymin};
663         POS bottomRight{scissor.xmax, scissor.ymax};
664 
665         // construct 4 scissor edges in ccw direction
666         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
667         ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
668         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
669         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
670 
671         vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
672         vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
673         vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
674         vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
675 
676         // Upper left rule for scissor
677         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
678         vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
679     }
680 };
681 
682 //////////////////////////////////////////////////////////////////////////
683 /// @brief Primary function template for TrivialRejectTest. Should
684 /// never be called, but TemplateUnroller instantiates a few unused values,
685 /// so it calls a runtime assert instead of a static_assert.
686 template <typename ValidEdgeMaskT>
687 INLINE bool TrivialRejectTest(const int, const int, const int)
688 {
689     SWR_INVALID("Primary templated function should never be called");
690     return false;
691 };
692 
693 //////////////////////////////////////////////////////////////////////////
694 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
695 /// and edge 1 for trivial coverage reject
696 template <>
697 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
698 {
699     return (!(mask0 && mask1)) ? true : false;
700 };
701 
702 //////////////////////////////////////////////////////////////////////////
703 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
704 /// and edge 2 for trivial coverage reject
705 template <>
706 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
707 {
708     return (!(mask0 && mask2)) ? true : false;
709 };
710 
711 //////////////////////////////////////////////////////////////////////////
712 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
713 /// and edge 2 for trivial coverage reject
714 template <>
715 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
716 {
717     return (!(mask1 && mask2)) ? true : false;
718 };
719 
720 //////////////////////////////////////////////////////////////////////////
721 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
722 /// primitive edges for trivial coverage reject
723 template <>
724 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
725 {
726     return (!(mask0 && mask1 && mask2)) ? true : false;;
727 };
728 
729 //////////////////////////////////////////////////////////////////////////
730 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
731 /// point, so return false and rasterize against conservative BBox
732 template <>
733 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
734 {
735     return false;
736 };
737 
738 //////////////////////////////////////////////////////////////////////////
739 /// @brief Primary function template for TrivialAcceptTest. Always returns
740 /// false, since it will only be called for degenerate tris, and as such
741 /// will never cover the entire raster tile
742 template <typename ScissorEnableT>
743 INLINE bool TrivialAcceptTest(const int, const int, const int)
744 {
745     return false;
746 };
747 
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
750 /// edge masks for a fully covered raster tile
751 template <>
752 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
753 {
754     return ((mask0 & mask1 & mask2) == 0xf);
755 };
756 
757 //////////////////////////////////////////////////////////////////////////
758 /// @brief Primary function template for GenerateSVInnerCoverage. Results
759 /// in an empty function call if SVInnerCoverage isn't requested
760 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
761 struct GenerateSVInnerCoverage
762 {
763     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
764 };
765 
766 //////////////////////////////////////////////////////////////////////////
767 /// @brief Specialization of GenerateSVInnerCoverage where all edges
768 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
769 /// edge values from OuterConservative to InnerConservative and rasterizes.
770 template <typename RT>
771 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
772 {
773     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
774     {
775         SWR_CONTEXT *pContext = pDC->pContext;
776 
777         double startQuadEdgesAdj[RT::NumEdgesT::value];
778         for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
779         {
780             startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
781         }
782 
783         // not trivial accept or reject, must rasterize full tile
784         AR_BEGIN(BERasterizePartial, pDC->drawId);
785         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
786         AR_END(BERasterizePartial, 0);
787     }
788 };
789 
790 //////////////////////////////////////////////////////////////////////////
791 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
792 /// in an empty function call if SVInnerCoverage isn't requested
793 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
794 struct UpdateEdgeMasksInnerConservative
795 {
796     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
797                                            const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
798 };
799 
800 //////////////////////////////////////////////////////////////////////////
801 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
802 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
803 /// evaluated at raster tile corners to inner conservative position and
804 /// updates edge masks
805 template <typename RT>
806 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
807 {
808     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
809                                            const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
810     {
811         __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
812 
813         // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
814         // conservative evaluated edge when adjusting the edge in for inner conservative tests
815         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
816         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
817         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
818 
819         UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
820     }
821 };
822 
823 //////////////////////////////////////////////////////////////////////////
824 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
825 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
826 /// cover an entire raster tile, set mask0 to 0 to force it down the
827 /// rastierizePartialTile path
828 template <typename RT, typename ValidEdgeMaskT>
829 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
830 {
831     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
832                                    const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
833     {
834         // set one mask to zero to force the triangle down the rastierizePartialTile path
835         mask0 = 0;
836     }
837 };
838 
//////////////////////////////////////////////////////////////////////////
/// @brief Rasterizes one binned triangle against a single macrotile:
/// performs triangle setup, walks the raster tiles touched by the scissored
/// bounding box, builds a coverage mask per sample and invokes the pixel
/// backend for every raster tile that ends up with covered samples.
/// @tparam RT rasterizer traits; supplies the edge count, valid edge mask,
/// coverage sample count, conservative rast and input coverage types used below
/// @param pDC draw context the triangle belongs to
/// @param workerId id of the calling worker; forwarded to the backend
/// @param macroTile id of the macrotile being rasterized
/// @param pDesc pointer to the TRIANGLE_WORK_DESC describing the triangle
template <typename RT>
void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
{
    // NOTE(review): pContext is not referenced directly in this function;
    // presumably it is consumed by the AR_*/RDTSC profiling macros - confirm
    SWR_CONTEXT *pContext = pDC->pContext;
    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
#if KNOB_ENABLE_TOSS_POINTS
    // toss point: drop the triangle before doing any rasterization work
    if (KNOB_TOSS_BIN_TRIS)
    {
        return;
    }
#endif
    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
    AR_BEGIN(BETriangleSetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);
    const SWR_RASTSTATE &rastState = state.rastState;
    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;

    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;

    __m128 vX, vY, vZ, vRecipW;

    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
    // eg: vX = [x0 x1 x2 dc]
    vX = _mm_load_ps(workDesc.pTriBuffer);
    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);

    // convert to fixed point
    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
    __m128i vXi = fpToFixedPoint(vX);
    __m128i vYi = fpToFixedPoint(vY);

    // quantize floating point position to fixed point precision
    // to prevent attribute creep around the triangle vertices
    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));

    // triangle setup - A and B edge equation coefs
    __m128 vA, vB;
    triangleSetupAB(vX, vY, vA, vB);

    __m128i vAi, vBi;
    triangleSetupABInt(vXi, vYi, vAi, vBi);

    // determinant
    float det = calcDeterminantInt(vAi, vBi);

    // Verts in Pixel Coordinate Space at this point
    // Det > 0 = CW winding order
    // Convert CW triangles to CCW
    if (det > 0.0)
    {
        // negate both the float and the integer coefficients so they stay in sync
        vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
        vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
        det = -det;
    }

    __m128 vC;
    // Finish triangle setup - C edge coef
    triangleSetupC(vX, vY, vA, vB, vC);

    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we have degenerate edge(s) to rasterize, set I and J coefs
        // to 0 for constant interpolation of attributes
        triDesc.I[0] = 0.0f;
        triDesc.I[1] = 0.0f;
        triDesc.I[2] = 0.0f;
        triDesc.J[0] = 0.0f;
        triDesc.J[1] = 0.0f;
        triDesc.J[2] = 0.0f;

        // Degenerate triangles have no area
        triDesc.recipDet = 0.0f;
    }
    else
    {
        // only extract coefs for 2 of the barycentrics; the 3rd can be
        // determined from the barycentric equation:
        // i + j + k = 1 <=> k = 1 - j - i
        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);

        // compute recipDet, used to calculate barycentric i and j in the backend
        triDesc.recipDet = 1.0f/det;
    }

    // 1/w is stored relative to vertex 2 (v0 - v2, v1 - v2, v2), the same
    // layout used for Z further down
    OSALIGNSIMD(float) oneOverW[4];
    _mm_store_ps(oneOverW, vRecipW);
    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
    triDesc.OneOverW[2] = oneOverW[2];

    // calculate perspective correct coefs per vertex attrib
    float* pPerspAttribs = perspAttribsTLS;
    float* pAttribs = workDesc.pAttribs;
    triDesc.pPerspAttribs = pPerspAttribs;
    triDesc.pAttribs = pAttribs;
    float *pRecipW = workDesc.pTriBuffer + 12;
    triDesc.pRecipW = pRecipW;
    // broadcast the 1/w of vertex 0, 1 and 2 (pRecipW is advanced in place)
    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
    {
        // each attribute occupies 3 consecutive groups of 4 floats, one group per vertex
        __m128 attribA = _mm_load_ps(pAttribs);
        __m128 attribB = _mm_load_ps(pAttribs+=4);
        __m128 attribC = _mm_load_ps(pAttribs+=4);
        pAttribs+=4;

        // scale each vertex's attribute by that vertex's 1/w
        attribA = _mm_mul_ps(attribA, vOneOverWV0);
        attribB = _mm_mul_ps(attribB, vOneOverWV1);
        attribC = _mm_mul_ps(attribC, vOneOverWV2);

        _mm_store_ps(pPerspAttribs, attribA);
        _mm_store_ps(pPerspAttribs+=4, attribB);
        _mm_store_ps(pPerspAttribs+=4, attribC);
        pPerspAttribs+=4;
    }

    // compute bary Z
    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
    OSALIGNSIMD(float) a[4];
    _mm_store_ps(a, vZ);
    triDesc.Z[0] = a[0] - a[2];
    triDesc.Z[1] = a[1] - a[2];
    triDesc.Z[2] = a[2];

    // add depth bias
    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);

    // Calc bounding box of triangle
    OSALIGNSIMD(SWR_RECT) bbox;
    calcBoundingBoxInt(vXi, vYi, bbox);

    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
    {
        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
        bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
                   "Conservative rast degenerate handling requires a valid scissor rect");
    }

    // Intersect with scissor/viewport
    // NOTE(review): the -1 suggests bbox.xmax/ymax are treated as exclusive here - confirm against calcBoundingBoxInt
    OSALIGNSIMD(SWR_RECT) intersect;
    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);

    triDesc.triFlags = workDesc.triFlags;

    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
    uint32_t macroX, macroY;
    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;

    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);

    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);

    AR_END(BETriangleSetup, 0);

    // update triangle desc
    // convert the fixed point intersection rect to raster tile indices (inclusive range)
    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
    uint32_t numTilesX = maxTileX - minTileX + 1;
    uint32_t numTilesY = maxTileY - minTileY + 1;

    // nothing to rasterize; close the outer profiling bucket before leaving
    if (numTilesX == 0 || numTilesY == 0)
    {
        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
        AR_END(BERasterizeTriangle, 1);
        return;
    }

    AR_BEGIN(BEStepSetup, pDC->drawId);

    // Step to pixel center of top-left pixel of the triangle bbox
    // Align intersect bbox (top/left) to raster tile's (top/left).
    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));

    // convenience typedef
    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;

    // single sample rasterization evaluates edges at pixel center,
    // multisample evaluates edges UL pixel corner and steps to each sample position
    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
    {
        // Add 0.5, in fixed point, to offset to pixel center
        x += (FIXED_POINT_SCALE / 2);
        y += (FIXED_POINT_SCALE / 2);
    }

    __m128i vTopLeftX = _mm_set1_epi32(x);
    __m128i vTopLeftY = _mm_set1_epi32(y);

    // evaluate edge equations at top-left pixel using 64bit math
    //
    // line = Ax + By + C
    // solving for C:
    // C = -Ax - By
    // we know x0 and y0 are on the line; plug them in:
    // C = -Ax0 - By0
    // plug C back into line equation:
    // line = Ax + By - Ax0 - By0
    // line = A(x - x0) + B(y - y0)
    // dX = (x-x0), dY = (y-y0)
    // so all this simplifies to
    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within

    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);

    // evaluate A(dx) and B(dY) for all points
    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);

    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);

    // apply any edge adjustments(top-left, crast, etc)
    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);

    // broadcast respective edge results to all lanes
    // (slots 0..2 hold the triangle edges; slots 3..6 are only written by
    // ComputeScissorEdges below when scissor edges are rasterized)
    double* pEdge = (double*)&vEdge;
    __m256d vEdgeFix16[7];
    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);

    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
    _mm_store_si128((__m128i*)aAi, vAi);
    _mm_store_si128((__m128i*)aBi, vBi);
    EDGE rastEdges[RT::NumEdgesT::value];

    // Compute and store triangle edge data
    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);

    // Compute and store scissor edge data if the scissor needs to be rasterized
    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);

    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
    // used for testing if entire raster tile is inside a triangle
    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
    {
        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
    }

    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
    // step sample positions to the raster tile bbox of multisample points
    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
    //                             |      |
    //                             |      |
    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
    // NOTE: vEdgeTileBbox is only filled in for the multisample case
    __m256d vEdgeTileBbox[3];
    if (NumCoverageSamplesT::value > 1)
    {
        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();

        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);

        // step edge equation tests from Tile
        // used for testing if entire raster tile is inside a triangle
        for (uint32_t e = 0; e < 3; ++e)
        {
            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);

            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
        }
    }

    AR_END(BEStepSetup, 0);

    uint32_t tY = minTileY;
    uint32_t tX = minTileX;
    uint32_t maxY = maxTileY;
    uint32_t maxX = maxTileX;

    // renderBuffers tracks the current raster tile; currentRenderBufferRow
    // remembers the start of the current tile row for StepRasterTileY
    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
    currentRenderBufferRow = renderBuffers;

    // rasterize and generate coverage masks per sample
    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
    {
        // remember the edge values at the start of the row so the Y step can restart from them
        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vStartOfRowEdge[e] = vEdgeFix16[e];
        }

        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
        {
            triDesc.anyCoveredSamples = 0;

            // is the corner of the edge outside of the raster tile? (vEdge < 0)
            int mask0, mask1, mask2;
            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);

            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
            {
                // trivial reject, at least one edge has all 4 corners of raster tile outside
                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);

                if (!trivialReject)
                {
                    // trivial accept mask
                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;

                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
                    // NOTE(review): the masks are modified in place inside the sample loop, so any
                    // later sample iteration tests the updated masks - confirm this is intended
                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);

                    // @todo Make this a bit smarter to allow use of trivial accept when:
                    //   1) scissor/vp intersection rect is raster tile aligned
                    //   2) raster tile is entirely within scissor/vp intersection rect
                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
                    {
                        // trivial accept, all 4 corners of all 3 edges are negative
                        // i.e. raster tile completely inside triangle
                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
                        {
                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
                        }
                        RDTSC_EVENT(BETrivialAccept, 1, 0);
                    }
                    else
                    {
                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
                        {
                            // should get optimized out for single sample case (global value numbering or copy propagation)
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                vEdgeAtSample[e] = vEdgeFix16[e];
                            }
                        }
                        else
                        {
                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);

                            // step edge equation tests from UL tile corner to pixel sample position
                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                            {
                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
                            }
                        }

                        // keep only the lowest double of each edge: the mask enables just the
                        // low 64-bit lane, so exactly one double is written per edge
                        double startQuadEdges[RT::NumEdgesT::value];
                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                        {
                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
                        }

                        // not trivial accept or reject, must rasterize full tile
                        AR_BEGIN(BERasterizePartial, pDC->drawId);
                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
                        AR_END(BERasterizePartial, 0);

                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];

                        // Output SV InnerCoverage, if needed
                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                    }
                }
                else
                {
                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
                    if(NumCoverageSamplesT::value > 1)
                    {
                        triDesc.coverageMask[sampleNum] = 0;
                    }
                    RDTSC_EVENT(BETrivialReject, 1, 0);
                }
            }

#if KNOB_ENABLE_TOSS_POINTS
            // toss point: consume the coverage result but skip the backend
            if(KNOB_TOSS_RS)
            {
                gToss = triDesc.coverageMask[0];
            }
            else
#endif
            if(triDesc.anyCoveredSamples)
            {
                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
                // copy conservative coverage result to all samples
                if(RT::IsConservativeT::value)
                {
                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                }

                // hand the tile to the pixel backend; tile indices are converted to pixel coordinates
                AR_BEGIN(BEPixelBackend, pDC->drawId);
                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
                AR_END(BEPixelBackend, 0);
            }

            // step to the next tile in X
            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
            {
                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
            }
            StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
        }

        // step to the next tile in Y
        // (restart from the saved start-of-row edge values, not from the X-stepped ones)
        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
        {
            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
        }
        StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
    }

    AR_END(BERasterizeTriangle, 1);
}
1297 
1298 // Get pointers to hot tile memory for color RT, depth, stencil
1299 template <uint32_t numSamples>
1300 void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
1301 {
1302     const API_STATE& state = GetApiState(pDC);
1303     SWR_CONTEXT *pContext = pDC->pContext;
1304 
1305     uint32_t mx, my;
1306     MacroTileMgr::getTileIndices(macroID, mx, my);
1307     tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1308     tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1309 
1310     // compute tile offset for active hottile buffers
1311     const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1312     uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1313     offset*=numSamples;
1314 
1315     unsigned long rtSlot = 0;
1316     uint32_t colorHottileEnableMask = state.colorHottileEnable;
1317     while(_BitScanForward(&rtSlot, colorHottileEnableMask))
1318     {
1319         HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true,
1320             numSamples, renderTargetArrayIndex);
1321         pColor->state = HOTTILE_DIRTY;
1322         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1323 
1324         colorHottileEnableMask &= ~(1 << rtSlot);
1325     }
1326     if(state.depthHottileEnable)
1327     {
1328         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1329         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1330         offset*=numSamples;
1331         HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true,
1332             numSamples, renderTargetArrayIndex);
1333         pDepth->state = HOTTILE_DIRTY;
1334         SWR_ASSERT(pDepth->pBuffer != nullptr);
1335         renderBuffers.pDepth = pDepth->pBuffer + offset;
1336     }
1337     if(state.stencilHottileEnable)
1338     {
1339         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1340         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
1341         offset*=numSamples;
1342         HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true,
1343             numSamples, renderTargetArrayIndex);
1344         pStencil->state = HOTTILE_DIRTY;
1345         SWR_ASSERT(pStencil->pBuffer != nullptr);
1346         renderBuffers.pStencil = pStencil->pBuffer + offset;
1347     }
1348 }
1349 
1350 template <typename RT>
1351 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
1352 {
1353     DWORD rt = 0;
1354     while (_BitScanForward(&rt, colorHotTileMask))
1355     {
1356         colorHotTileMask &= ~(1 << rt);
1357         buffers.pColor[rt] += RT::colorRasterTileStep;
1358     }
1359 
1360     buffers.pDepth += RT::depthRasterTileStep;
1361     buffers.pStencil += RT::stencilRasterTileStep;
1362 }
1363 
1364 template <typename RT>
1365 INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
1366 {
1367     DWORD rt = 0;
1368     while (_BitScanForward(&rt, colorHotTileMask))
1369     {
1370         colorHotTileMask &= ~(1 << rt);
1371         startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1372         buffers.pColor[rt] = startBufferRow.pColor[rt];
1373     }
1374     startBufferRow.pDepth += RT::depthRasterTileRowStep;
1375     buffers.pDepth = startBufferRow.pDepth;
1376 
1377     startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1378     buffers.pStencil = startBufferRow.pStencil;
1379 }
1380 
1381