1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.h
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 *        operations.
27 *
28 ******************************************************************************/
29 #pragma once
30 
31 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
32 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
33 
34 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
35 
36 
// Identifies the backend flavors; the numeric values are consecutive from 0.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3, // number of backend flavors listed above
};
44 
#if KNOB_SIMD_WIDTH == 8
// Per-lane pixel offsets for an 8-wide SIMD. Lanes 0-3 form the left 2x2 quad
// (x in {0,1}) and lanes 4-7 the right 2x2 quad (x in {2,3}); y is in {0,1}.
// "Center" offsets are the upper-left offsets plus half a pixel.
static const __m256 vCenterOffsetsX = __m256{0.5f, 1.5f, 0.5f, 1.5f, 2.5f, 3.5f, 2.5f, 3.5f};
static const __m256 vCenterOffsetsY = __m256{0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f};
static const __m256 vULOffsetsX     = __m256{0.0f, 1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 2.0f, 3.0f};
static const __m256 vULOffsetsY     = __m256{0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f};
// low eight bits: one coverage bit per SIMD lane
#define MASK 0xff
#endif
52 
ComputeUserClipMask(uint8_t clipMask,float * pUserClipBuffer,simdscalar const & vI,simdscalar const & vJ)53 static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ)
54 {
55     simdscalar vClipMask = _simd_setzero_ps();
56     uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
57 
58     for (uint32_t i = 0; i < numClipDistance; ++i)
59     {
60         // pull triangle clip distance values from clip buffer
61         simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
62         simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
63         simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
64 
65         // interpolate
66         simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
67 
68         // clip if interpolated clip distance is < 0 || NAN
69         simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
70 
71         vClipMask = _simd_or_ps(vClipMask, vCull);
72     }
73 
74     return _simd_movemask_ps(vClipMask);
75 }
76 
RasterTileColorOffset(uint32_t sampleNum)77 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
78 {
79     static const uint32_t RasterTileColorOffsets[16]
80     { 0,
81       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
82       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
83       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
84       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
85       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
86       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
87       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
88       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
89       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
90       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
91       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
92       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
93       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
94       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
95       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
96     };
97     assert(sampleNum < 16);
98     return RasterTileColorOffsets[sampleNum];
99 }
100 
RasterTileDepthOffset(uint32_t sampleNum)101 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
102 {
103     static const uint32_t RasterTileDepthOffsets[16]
104     { 0,
105       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
106       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
107       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
108       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
109       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
110       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
111       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
112       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
113       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
114       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
115       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
116       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
117       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
118       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
119       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
120     };
121     assert(sampleNum < 16);
122     return RasterTileDepthOffsets[sampleNum];
123 }
124 
RasterTileStencilOffset(uint32_t sampleNum)125 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
126 {
127     static const uint32_t RasterTileStencilOffsets[16]
128     { 0,
129       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
130       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
131       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
132       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
133       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
134       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
135       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
136       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
137       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
138       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
139       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
140       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
141       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
142       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
143       (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
144     };
145     assert(sampleNum < 16);
146     return RasterTileStencilOffsets[sampleNum];
147 }
148 
149 template<typename T, uint32_t InputCoverage>
150 struct generateInputCoverage
151 {
generateInputCoveragegenerateInputCoverage152     INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
153     {
154         // will need to update for avx512
155         assert(KNOB_SIMD_WIDTH == 8);
156 
157         simdscalari mask[2];
158         simdscalari sampleCoverage[2];
159 
160         if(T::bIsCenterPattern)
161         {
162             // center coverage is the same for all samples; just broadcast to the sample slots
163             uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
164             if(T::MultisampleT::numSamples == 1)
165             {
166                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
167             }
168             else if(T::MultisampleT::numSamples == 2)
169             {
170                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
171             }
172             else if(T::MultisampleT::numSamples == 4)
173             {
174                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
175             }
176             else if(T::MultisampleT::numSamples == 8)
177             {
178                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
179             }
180             else if(T::MultisampleT::numSamples == 16)
181             {
182                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
183                 sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
184             }
185         }
186         else
187         {
188             simdscalari src = _simd_set1_epi32(0);
189             simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
190 
191             if(T::MultisampleT::numSamples == 1)
192             {
193                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
194             }
195             else if(T::MultisampleT::numSamples == 2)
196             {
197                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
198             }
199             else if(T::MultisampleT::numSamples == 4)
200             {
201                 mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
202             }
203             else if(T::MultisampleT::numSamples == 8)
204             {
205                 mask[0] = _simd_set1_epi32(-1);
206             }
207             else if(T::MultisampleT::numSamples == 16)
208             {
209                 mask[0] = _simd_set1_epi32(-1);
210                 mask[1] = _simd_set1_epi32(-1);
211                 index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
212             }
213 
214             // gather coverage for samples 0-7
215             sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
216             if(T::MultisampleT::numSamples > 8)
217             {
218                 // gather coverage for samples 8-15
219                 sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
220             }
221         }
222 
223         mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
224                                   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
225         // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
226         simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
227 
228         simdscalari packedCoverage1;
229         if(T::MultisampleT::numSamples > 8)
230         {
231             // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
232             packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
233         }
234 
235     #if (KNOB_ARCH == KNOB_ARCH_AVX)
236         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
237         simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
238         simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
239         packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
240 
241         simdscalari packedSampleCoverage;
242         if(T::MultisampleT::numSamples > 8)
243         {
244             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
245             hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
246             shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
247             shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
248             packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
249             packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
250         }
251         else
252         {
253             packedSampleCoverage = packedCoverage0;
254         }
255     #else
256         simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
257         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
258         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
259 
260         simdscalari packedSampleCoverage;
261         if(T::MultisampleT::numSamples > 8)
262         {
263             permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
264             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
265             packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
266 
267             // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
268             packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
269         }
270         else
271         {
272             packedSampleCoverage = packedCoverage0;
273         }
274     #endif
275 
276         for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
277         {
278             // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
279             inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
280 
281             if(!T::bForcedSampleCount)
282             {
283                 // input coverage has to be anded with sample mask if MSAA isn't forced on
284                 inputMask[i] &= sampleMask;
285             }
286 
287             // shift to the next pixel in the 4x2
288             packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
289         }
290     }
291 
generateInputCoveragegenerateInputCoverage292     INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
293     {
294         uint32_t inputMask[KNOB_SIMD_WIDTH];
295         generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
296         inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
297     }
298 
299 };
300 
301 template<typename T>
302 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
303 {
304     INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
305     {
306         // will need to update for avx512
307         assert(KNOB_SIMD_WIDTH == 8);
308         simdscalari vec = _simd_set1_epi32(coverageMask[0]);
309         const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
310         vec = _simd_and_si(vec, bit);
311         vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
312         vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
313         inputCoverage = _simd_castsi_ps(vec);
314     }
315 
316     INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
317     {
318         uint32_t simdCoverage = (coverageMask[0] & MASK);
319         static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
320         for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
321         {
322             // set all samples to covered if conservative coverage mask is set for that pixel
323             inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
324         }
325     }
326 };
327 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Centroid behaves exactly as follows:
// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
//     have a sample location there).
// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is taken after
//     ANDing the coverage with the SampleMask Rasterizer State.
// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
//     evaluated as follows: if the SampleMask Rasterizer State is a subset of the samples in the pixel, then the first sample covered by the
//     SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
338 template<typename T>
339 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
340                             const uint64_t *const coverageMask, const uint32_t sampleMask,
341                             simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL)
342 {
343     uint32_t inputMask[KNOB_SIMD_WIDTH];
344     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
345 
346     // Case (2) - partially covered pixel
347 
348     // scan for first covered sample per pixel in the 4x2 span
349     unsigned long sampleNum[KNOB_SIMD_WIDTH];
350     (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
351     (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
352     (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
353     (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
354     (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
355     (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
356     (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
357     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
358 
359     // look up and set the sample offsets from UL pixel corner for first covered sample
360     simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
361                                     samplePos.X(sampleNum[6]),
362                                     samplePos.X(sampleNum[5]),
363                                     samplePos.X(sampleNum[4]),
364                                     samplePos.X(sampleNum[3]),
365                                     samplePos.X(sampleNum[2]),
366                                     samplePos.X(sampleNum[1]),
367                                     samplePos.X(sampleNum[0]));
368 
369     simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
370                                     samplePos.Y(sampleNum[6]),
371                                     samplePos.Y(sampleNum[5]),
372                                     samplePos.Y(sampleNum[4]),
373                                     samplePos.Y(sampleNum[3]),
374                                     samplePos.Y(sampleNum[2]),
375                                     samplePos.Y(sampleNum[1]),
376                                     samplePos.Y(sampleNum[0]));
377     // add sample offset to UL pixel corner
378     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
379     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
380 
381     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
382     static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
383     simdscalari vInputCoveragei =  _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
384     simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
385 
386     static const simdscalari vZero = _simd_setzero_si();
387     const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
388     simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
389     simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
390     simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
391 
392     simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
393 
394     // set the centroid position based on results from above
395     psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
396     psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
397 
398     // Case (3a) No samples covered and partial sample mask
399     simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
400     // sample mask should never be all 0's for this case, but handle it anyways
401     unsigned long firstCoveredSampleMaskSample = 0;
402     (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
403 
404     simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
405 
406     vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
407     vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
408 
409     // blend in case 3a pixel locations
410     psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
411     psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
412 }
413 
414 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
415                                      const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL)
416 {
417     // evaluate I,J
418     psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
419     psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
420     psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
421     psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
422 
423     // interpolate 1/w
424     psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
425 }
426 
427 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz)
428 {
429     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
430     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
431 
432     return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
433 }
434 
435 template<typename T>
436 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
437 {
438     // RT has to be single sample if we're in forcedMSAA mode
439     if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
440     {
441         return 1;
442     }
443     // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
444     else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
445     {
446         return GetNumSamples(blendSampleCount);
447     }
448     // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
449     else
450     {
451         return T::MultisampleT::numSamples;
452     }
453 }
454 
455 inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
456 {
457     // broadcast scalars
458 
459     coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
460     coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
461     coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
462 
463     coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
464     coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
465     coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
466 
467     coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
468     coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
469     coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
470 
471     coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
472 
473     coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
474     coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
475     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
476 }
477 
478 inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers)
479 {
480 
481     DWORD index;
482     while (_BitScanForward(&index, colorHotTileMask))
483     {
484         assert(index < SWR_NUM_RENDERTARGETS);
485         colorHotTileMask &= ~(1 << index);
486         pColorBuffer[index] = renderBuffers.pColor[index];
487     }
488 
489     if (pDepthBuffer)
490     {
491         *pDepthBuffer = renderBuffers.pDepth;
492     }
493 
494     if (pStencilBuffer)
495     {
496         *pStencilBuffer = renderBuffers.pStencil;;
497     }
498 }
499 
500 template<typename T>
501 void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
502 {
503     psContext->pAttribs = work.pAttribs;
504     psContext->pPerspAttribs = work.pPerspAttribs;
505     psContext->frontFace = work.triFlags.frontFacing;
506     psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
507 
508     // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
509     psContext->I = work.I;
510     psContext->J = work.J;
511 
512     psContext->recipDet = work.recipDet;
513     psContext->pRecipW = work.pRecipW;
514     psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
515     psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
516     psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
517     psContext->sampleIndex = 0;
518 }
519 
520 template<typename T, bool IsSingleSample>
521 void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
522                   const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
523 {
524     if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
525     {
526         // for 1x case, centroid is pixel center
527         psContext->vX.centroid = psContext->vX.center;
528         psContext->vY.centroid = psContext->vY.center;
529         psContext->vI.centroid = psContext->vI.center;
530         psContext->vJ.centroid = psContext->vJ.center;
531         psContext->vOneOverW.centroid = psContext->vOneOverW.center;
532     }
533     else
534     {
535         if (T::bCentroidPos)
536         {
537             ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
538             if (T::bIsCenterPattern)
539             {
540                 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
541                 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
542             }
543             else
544             {
545                 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
546                 CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
547             }
548 
549             CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
550         }
551         else
552         {
553             psContext->vX.centroid = psContext->vX.sample;
554             psContext->vY.centroid = psContext->vY.sample;
555         }
556     }
557 }
558 
//////////////////////////////////////////////////////////////////////////
/// @brief Functor that runs the per-sample coverage, depth-bounds, user-clip
///        and depth/stencil tests for one SIMD tile of a triangle.
///        The per-sample Z values and masks it computes are kept in public
///        members so the caller can read them after the call (BackendPixelRate
///        uses them for the output merger and the deferred depth/stencil write).
/// @tparam T  backend traits; only T::MultisampleT::numCoverageSamples is
///            used inside this struct.
template<typename T>
struct PixelRateZTestLoop
{
    /// @brief Captures the inputs of the loop. Everything except the worker id
    ///        and the clip distance mask is held by pointer or reference, so the
    ///        referenced objects must outlive the functor.
    /// @param DC               draw context; supplies pContext and drawId
    /// @param _workerId        id of the calling worker (stored, not otherwise used by name here)
    /// @param Work             triangle descriptor (coverage masks, triFlags, user clip buffer)
    /// @param Coeffs           barycentric / Z plane coefficients
    /// @param apiState         API state; psState and the sample positions are taken from it
    /// @param depthBuffer      reference to the caller's depth pointer; because it is stored
    ///                         as a reference, every invocation reads the caller's current value
    /// @param stencilBuffer    reference to the caller's stencil pointer, same convention
    /// @param ClipDistanceMask non-zero enables the per-sample user clip test
    PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
                       uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
                       pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
                       // samplePos is bound through the 'state' member, which is declared (and therefore initialized) before it
                       samplePos(state.rastState.samplePositions),
                       clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};

    /// @brief Tests every coverage sample of the current SIMD tile.
    /// @param activeLanes       in: lanes still alive; out: ANDed with the union of the
    ///                          depth pass masks of all samples that reached the accumulation step
    /// @param psContext         vX/vY.sample and (through CalcSampleBarycentrics) the .sample
    ///                          barycentrics are overwritten for each tested sample; vZ is read
    ///                          when psState.writesODepth is set
    /// @param BEDepthBucket     profiling bucket; currently unused because the AR_BEGIN/AR_END
    ///                          pair that consumes it is commented out below
    /// @param currentSimdIn8x8  byte index into each sample's coverage mask
    /// @return sum over all samples of the number of lanes set in that sample's depth pass mask
    INLINE
    uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
                        const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
    {
        // NOTE(review): pContext is not referenced by name below; presumably it is consumed
        // by the AR_* macros - confirm before removing.
        SWR_CONTEXT *pContext = pDC->pContext;

        uint32_t statCount = 0;
        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
        for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
        {
            // coverage for this sample: one byte of the sample's mask (MASK is defined outside
            // this chunk), restricted to the lanes that are still active
            const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
            vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));

            // nothing covered: publish empty masks for this sample and move on (vZ[sample] is left untouched)
            if(!_simd_movemask_ps(vCoverageMask[sample]))
            {
                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
                continue;
            }

            // offset depth/stencil buffers current sample
            uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
            uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

            // depth bounds test: remove lanes rejected by CalcDepthBoundsAcceptMask for the
            // depth values currently stored in the buffer
            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
            {
                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));

                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
            }

            AR_BEGIN(BEBarycentric, pDC->drawId);

            // calculate per sample positions
            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));

            // calc I & J per sample
            CalcSampleBarycentrics(coeffs, psContext);

            // Z used for the test: the value in psContext.vZ when the shader writes depth,
            // otherwise the plane-interpolated and quantized Z at this sample
            if(psState.writesODepth)
            {
                {
                    // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
                    vZ[sample] = psContext.vZ;
                }
            }
            else
            {
                vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
            }

            AR_END(BEBarycentric, 0);

            ///@todo: perspective correct vs non-perspective correct clipping?
            // if clip distances are enabled, we need to interpolate for each sample
            if(clipDistanceMask)
            {
                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);

                // lanes flagged in clipMask are removed from coverage
                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
            }

            // ZTest for this sample
            ///@todo Need to uncomment out this bucket.
            //AR_BEGIN(BEDepthBucket, pDC->drawId);
            // both masks start out as the coverage mask; depthPassMask is then replaced by the
            // value DepthStencilTest returns, stencilPassMask is handed to it by pointer
            depthPassMask[sample] = vCoverageMask[sample];
            stencilPassMask[sample] = vCoverageMask[sample];
            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                     vZ[sample], pDepthSample, vCoverageMask[sample],
                                                     pStencilSample, &stencilPassMask[sample]);
            //AR_END(BEDepthBucket, 0);

            // early-exit if no pixels passed depth or earlyZ is forced on
            // (in both cases the depth/stencil write is issued right here)
            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
            {
                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);

                // no lane passed: this sample contributes neither to activeLanes nor to the count
                if(!_simd_movemask_ps(depthPassMask[sample]))
                {
                    continue;
                }
            }
            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
            statCount += _mm_popcnt_u32(statMask);
        }

        // a lane stays active only if at least one of its samples passed
        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
        // return number of samples that passed depth and coverage
        return statCount;
    }

    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
    // (one entry per coverage sample, filled by operator())
    simdscalar vZ[T::MultisampleT::numCoverageSamples];
    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];

private:
    // functor inputs
    DRAW_CONTEXT* pDC;
    uint32_t workerId;

    const SWR_TRIANGLE_DESC& work;
    const BarycentricCoeffs& coeffs;
    const API_STATE& state;
    const SWR_PS_STATE& psState;
    const SWR_MULTISAMPLE_POS& samplePos;
    const uint8_t clipDistanceMask;
    // references to the caller's buffer pointers, not copies
    uint8_t*& pDepthBuffer;
    uint8_t*& pStencilBuffer;
};
687 
688 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
689 {
690     // evaluate I,J
691     psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
692     psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
693     psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
694     psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
695 
696     // interpolate 1/w
697     psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
698 }
699 
700 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
701 {
702     // evaluate I,J
703     psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
704     psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
705     psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
706     psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
707 
708     // interpolate 1/w
709     psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
710 }
711 
712 // Merge Output to 4x2 SIMD Tile Format
713 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
714     const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask)
715 {
716     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
717     const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
718     simdvector blendOut;
719 
720     DWORD rt = 0;
721     while (_BitScanForward(&rt, renderTargetMask))
722     {
723         renderTargetMask &= ~(1 << rt);
724         uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
725 
726         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
727 
728         {
729             // pfnBlendFunc may not update all channels.  Initialize with PS output.
730             /// TODO: move this into the blend JIT.
731             blendOut = psContext.shaded[rt];
732 
733             // Blend outputs and update coverage mask for alpha test
734             if(pfnBlendFunc[rt] != nullptr)
735             {
736                 pfnBlendFunc[rt](
737                     pBlendState,
738                     psContext.shaded[rt],
739                     psContext.shaded[1],
740                     psContext.shaded[0].w,
741                     sample,
742                     pColorSample,
743                     blendOut,
744                     &psContext.oMask,
745                     (simdscalari*)&coverageMask);
746             }
747         }
748 
749         // final write mask
750         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
751 
752         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
753         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
754 
755         const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
756 
757         // store with color mask
758         if(!pRTBlend->writeDisableRed)
759         {
760             _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
761         }
762         if(!pRTBlend->writeDisableGreen)
763         {
764             _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
765         }
766         if(!pRTBlend->writeDisableBlue)
767         {
768             _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
769         }
770         if(!pRTBlend->writeDisableAlpha)
771         {
772             _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
773         }
774     }
775 }
776 
777 #if USE_8x2_TILE_BACKEND
778 // Merge Output to 8x2 SIMD16 Tile Format
779 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
780     const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
781 {
782     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
783     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
784 
785     if (useAlternateOffset)
786     {
787         rasterTileColorOffset += sizeof(simdscalar);
788     }
789 
790     simdvector blendSrc;
791     simdvector blendOut;
792 
793     DWORD rt;
794     while (_BitScanForward(&rt, renderTargetMask))
795     {
796         renderTargetMask &= ~(1 << rt);
797 
798         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
799 
800         simdscalar* pColorSample;
801         bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
802         if (hotTileEnable)
803         {
804             pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
805             blendSrc[0] = pColorSample[0];
806             blendSrc[1] = pColorSample[2];
807             blendSrc[2] = pColorSample[4];
808             blendSrc[3] = pColorSample[6];
809         }
810         else
811         {
812             pColorSample = nullptr;
813         }
814 
815         {
816             // pfnBlendFunc may not update all channels.  Initialize with PS output.
817             /// TODO: move this into the blend JIT.
818             blendOut = psContext.shaded[rt];
819 
820             // Blend outputs and update coverage mask for alpha test
821             if(pfnBlendFunc[rt] != nullptr)
822             {
823                 pfnBlendFunc[rt](
824                     pBlendState,
825                     psContext.shaded[rt],
826                     psContext.shaded[1],
827                     psContext.shaded[0].w,
828                     sample,
829                     reinterpret_cast<uint8_t *>(&blendSrc),
830                     blendOut,
831                     &psContext.oMask,
832                     reinterpret_cast<simdscalari *>(&coverageMask));
833             }
834         }
835 
836         // final write mask
837         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
838 
839         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
840         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
841 
842         // store with color mask
843         if (!pRTBlend->writeDisableRed)
844         {
845             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
846         }
847         if (!pRTBlend->writeDisableGreen)
848         {
849             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
850         }
851         if (!pRTBlend->writeDisableBlue)
852         {
853             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
854         }
855         if (!pRTBlend->writeDisableAlpha)
856         {
857             _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
858         }
859     }
860 }
861 
862 #endif
863 
//////////////////////////////////////////////////////////////////////////
/// @brief Pixel-rate backend: walks a KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM region
///        in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps. For each SIMD tile it
///        computes barycentrics, runs the depth/stencil tests before or after
///        the pixel shader (depending on T::bCanEarlyZ), invokes the pixel
///        shader once and then runs the output merger for every sample.
/// @tparam T            backend traits (see the T:: members used below)
/// @param pDC           draw context
/// @param workerId      id of the calling worker; forwarded to PixelRateZTestLoop
/// @param x             value added to the X UL/center offsets to form positions
///                      (presumably the tile's left pixel coordinate - confirm against caller)
/// @param y             value added to the Y UL/center offsets, same convention
/// @param work          triangle descriptor. NOTE: modified in place - the coverage masks
///                      are shifted right after every SIMD tile
/// @param renderBuffers source of the color/depth/stencil pointers set up by SetupRenderBuffers
template<typename T>
void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend


    // NOTE(review): pContext is not referenced by name below; presumably it is consumed
    // by the AR_* / UPDATE_STAT_BE macros - confirm before removing.
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
    AR_BEGIN(BESetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);

    AR_END(BESetup, 0);

    // the functor takes pDepthBuffer / pStencilBuffer by reference, so the pointer
    // increments done at Endtile are visible to its later invocations
    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);

    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    // per-row step applied to the Y positions at the bottom of the outer loop
    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        // X positions restart at the left edge for every row of SIMD tiles
        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        // per-column step applied to the X positions at the bottom of the inner loop
        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            // true when the SIMD_TILE_X_DIM bit of xx is set
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            // declared without an initializer so that the 'goto Endtile' statements below
            // do not jump across an initialization
            simdscalar activeLanes;
            // nothing covered in this SIMD tile: skip straight to the per-tile bookkeeping
            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
            activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);

            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
            {
                // inner-conservative input coverage reads the inner mask, every other mode reads sample 0's mask
                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];

                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
            }

            AR_BEGIN(BEBarycentric, pDC->drawId);

            CalcPixelBarycentrics(coeffs, psContext);

            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

            AR_END(BEBarycentric, 0);

            if(T::bForcedSampleCount)
            {
                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
            }

            // Early-Z?
            // (no depth test at all is run from here when T::bForcedSampleCount is set)
            if(T::bCanEarlyZ && !T::bForcedSampleCount)
            {
                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
            }

            // if we have no covered samples that passed depth at this point, go to next tile
            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };

            if(state.psState.usesSourceDepth)
            {
                AR_BEGIN(BEBarycentric, pDC->drawId);
                // interpolate and quantize z
                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                AR_END(BEBarycentric, 0);
            }

            // pixels that are currently active
            psContext.activeMask = _simd_castps_si(activeLanes);
            psContext.oMask = T::MultisampleT::FullSampleMask();

            // execute pixel shader
            AR_BEGIN(BEPixelShader, pDC->drawId);
            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
            // invocations are counted from the lanes that were active going into the shader
            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
            AR_END(BEPixelShader, 0);

            // update active lanes to remove any discarded or oMask'd pixels
            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };

            // late-Z
            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
            {
                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
            }

            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };

            // output merger
            // loop over all samples, broadcasting the results of the PS to all passing pixels
            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
            {
                AR_BEGIN(BEOutputMerger, pDC->drawId);
                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
                uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
                simdscalar coverageMask, depthMask;
                if(T::bForcedSampleCount)
                {
                    // no depth test ran in this mode, so the surviving lanes serve as both masks
                    coverageMask = depthMask = activeLanes;
                }
                else
                {
                    // reuse the masks saved by the early/late Z test for this coverage sample
                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
                    if(!_simd_movemask_ps(depthMask))
                    {
                        // stencil should already have been written in early/lateZ tests
                        AR_END(BEOutputMerger, 0);
                        continue;
                    }
                }

                // broadcast the results of the PS to all passing pixels
#if USE_8x2_TILE_BACKEND
                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset);
#else // USE_8x2_TILE_BACKEND
                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask);
#endif // USE_8x2_TILE_BACKEND

                // with forceEarlyZ the depth/stencil write was already issued inside PixelRateZTest;
                // otherwise it is done here, after the output merger. The buffer offsets use 'sample',
                // while the saved Z and stencil mask are indexed by coverageSampleNum.
                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
                {
                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
                }
                AR_END(BEOutputMerger, 0);
            }
// Every early-out above lands here so that the mask shifts and pointer
// increments below are applied for each SIMD tile, shaded or not.
Endtile:
            AR_BEGIN(BEEndTile, pDC->drawId);

            // drop the bits consumed by this SIMD tile from every coverage mask
            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
            {
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
            {
                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }
            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);

#if USE_8x2_TILE_BACKEND
            // color pointers advance only on tiles where useAlternateOffset is set, by two
            // SIMD tiles' worth of bytes at once
            if (useAlternateOffset)
            {
                DWORD rt;
                uint32_t rtMask = state.colorHottileEnable;
                while (_BitScanForward(&rt, rtMask))
                {
                    rtMask &= ~(1 << rt);
                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                }
            }
#else
            // advance the color pointer of every render target enabled in colorHottileEnable by one SIMD tile
            DWORD rt;
            uint32_t rtMask = state.colorHottileEnable;
            while (_BitScanForward(&rt, rtMask))
            {
                rtMask &= ~(1 << rt);
                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
            }
#endif
            // depth and stencil pointers advance by one SIMD tile on every iteration
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            AR_END(BEEndTile, 0);

            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
        }

        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    }

    AR_END(BEPixelRateBackend, 0);
}
1069 
1070 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
1071          uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
1072     >
1073 struct SwrBackendTraits
1074 {
1075     static const bool bIsCenterPattern = (isCenter == 1);
1076     static const uint32_t InputCoverage = coverage;
1077     static const bool bCentroidPos = (centroid == 1);
1078     static const bool bForcedSampleCount = (forced == 1);
1079     static const bool bCanEarlyZ = (canEarlyZ == 1);
1080     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
1081 };
1082