/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file backend.cpp
 *
 * @brief Backend handles rasterization, pixel shading and output merger
 *        operations.
 *
 ******************************************************************************/
29
30 #include <smmintrin.h>
31
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37
38 #include <algorithm>
39
40 template <typename T>
BackendSampleRate(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)41 void BackendSampleRate(DRAW_CONTEXT* pDC,
42 uint32_t workerId,
43 uint32_t x,
44 uint32_t y,
45 SWR_TRIANGLE_DESC& work,
46 RenderOutputBuffers& renderBuffers)
47 {
48 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESampleRateBackend, pDC->drawId);
49 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
50
51 void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
52 const API_STATE& state = GetApiState(pDC);
53
54 BarycentricCoeffs coeffs;
55 SetupBarycentricCoeffs(&coeffs, work);
56
57 SWR_PS_CONTEXT psContext;
58 const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
59 SetupPixelShaderContext<T>(&psContext, samplePos, work);
60
61 uint8_t *pDepthBuffer, *pStencilBuffer;
62 SetupRenderBuffers(psContext.pColorBuffer,
63 &pDepthBuffer,
64 &pStencilBuffer,
65 state.colorHottileEnable,
66 renderBuffers);
67
68 bool isTileDirty = false;
69
70 RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
71
72 psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
73 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
74
75 const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
76
77 for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
78 {
79 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
80 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
81
82 const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
83
84 for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
85 {
86 const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
87
88
89 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
90 {
91 const uint64_t* pCoverageMask =
92 (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
93 ? &work.innerCoverageMask
94 : &work.coverageMask[0];
95
96 generateInputCoverage<T, T::InputCoverage>(
97 pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
98 }
99
100 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
101
102 CalcPixelBarycentrics(coeffs, psContext);
103
104 CalcCentroid<T, false>(
105 &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
106
107 RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
108
109 for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
110 {
111 simdmask coverageMask = work.coverageMask[sample] & MASK;
112
113 if (coverageMask)
114 {
115 // offset depth/stencil buffers current sample
116 uint8_t* pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
117 uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
118
119 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
120 {
121 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
122 "Unsupported depth hot tile format");
123
124 const simdscalar z =
125 _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
126
127 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
128 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
129
130 coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
131 }
132
133 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
134
135 // calculate per sample positions
136 psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
137 psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
138
139 CalcSampleBarycentrics(coeffs, psContext);
140
141 // interpolate and quantize z
142 psContext.vZ = vplaneps(coeffs.vZa,
143 coeffs.vZb,
144 coeffs.vZc,
145 psContext.vI.sample,
146 psContext.vJ.sample);
147 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
148
149 RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
150
151 // interpolate user clip distance if available
152 if (state.backendState.clipDistanceMask)
153 {
154 coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
155 work.pUserClipBuffer,
156 psContext.vI.sample,
157 psContext.vJ.sample);
158 }
159
160 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
161 simdscalar depthPassMask = vCoverageMask;
162 simdscalar stencilPassMask = vCoverageMask;
163
164 // Early-Z?
165 if (T::bCanEarlyZ)
166 {
167 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
168 depthPassMask = DepthStencilTest(&state,
169 work.triFlags.frontFacing,
170 work.triFlags.viewportIndex,
171 psContext.vZ,
172 pDepthSample,
173 vCoverageMask,
174 pStencilSample,
175 &stencilPassMask);
176 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
177 _simd_movemask_ps(stencilPassMask),
178 _simd_movemask_ps(vCoverageMask)));
179 RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
180
181 // early-exit if no samples passed depth or earlyZ is forced on.
182 if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
183 {
184 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
185 &state.depthStencilState,
186 work.triFlags.frontFacing,
187 psContext.vZ,
188 pDepthSample,
189 depthPassMask,
190 vCoverageMask,
191 pStencilSample,
192 stencilPassMask);
193
194 if (!_simd_movemask_ps(depthPassMask))
195 {
196 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
197 continue;
198 }
199 }
200 }
201
202 psContext.sampleIndex = sample;
203 psContext.activeMask = _simd_castps_si(vCoverageMask);
204
205 // execute pixel shader
206 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
207 state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
208 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
209
210 // update stats
211 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
212 AR_EVENT(PSStats((HANDLE)&psContext.stats));
213
214 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
215
216 if (_simd_movemask_ps(vCoverageMask))
217 {
218 isTileDirty = true;
219 }
220
221 // late-Z
222 if (!T::bCanEarlyZ)
223 {
224 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
225 depthPassMask = DepthStencilTest(&state,
226 work.triFlags.frontFacing,
227 work.triFlags.viewportIndex,
228 psContext.vZ,
229 pDepthSample,
230 vCoverageMask,
231 pStencilSample,
232 &stencilPassMask);
233 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
234 _simd_movemask_ps(stencilPassMask),
235 _simd_movemask_ps(vCoverageMask)));
236 RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
237
238 if (!_simd_movemask_ps(depthPassMask))
239 {
240 // need to call depth/stencil write for stencil write
241 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
242 &state.depthStencilState,
243 work.triFlags.frontFacing,
244 psContext.vZ,
245 pDepthSample,
246 depthPassMask,
247 vCoverageMask,
248 pStencilSample,
249 stencilPassMask);
250
251 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
252 continue;
253 }
254 }
255
256 uint32_t statMask = _simd_movemask_ps(depthPassMask);
257 uint32_t statCount = _mm_popcnt_u32(statMask);
258 UPDATE_STAT_BE(DepthPassCount, statCount);
259
260 // output merger
261 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
262
263 OutputMerger8x2(pDC,
264 psContext,
265 psContext.pColorBuffer,
266 sample,
267 &state.blendState,
268 state.pfnBlendFunc,
269 vCoverageMask,
270 depthPassMask,
271 state.psState.renderTargetMask,
272 useAlternateOffset,
273 workerId);
274
275 // do final depth write after all pixel kills
276 if (!state.psState.forceEarlyZ)
277 {
278 DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
279 &state.depthStencilState,
280 work.triFlags.frontFacing,
281 psContext.vZ,
282 pDepthSample,
283 depthPassMask,
284 vCoverageMask,
285 pStencilSample,
286 stencilPassMask);
287 }
288 RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
289 }
290 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
291 }
292
293 Endtile:
294 ATTR_UNUSED;
295
296 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
297
298 if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
299 {
300 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
301 }
302
303 if (useAlternateOffset)
304 {
305 unsigned long rt;
306 uint32_t rtMask = state.colorHottileEnable;
307 while (_BitScanForward(&rt, rtMask))
308 {
309 rtMask &= ~(1 << rt);
310 psContext.pColorBuffer[rt] +=
311 (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
312 }
313 }
314
315 pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
316 pStencilBuffer +=
317 (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
318
319 RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
320
321 psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
322 psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
323 }
324
325 psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
326 psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
327 }
328
329 if (isTileDirty)
330 {
331 SetRenderHotTilesDirty(pDC, renderBuffers);
332 }
333
334 RDTSC_END(pDC->pContext->pBucketMgr, BESampleRateBackend, 0);
335 }
336
337 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
338 // arguments to static template arguments.
339 template <uint32_t... ArgsT>
340 struct BEChooserSampleRate
341 {
342 // Last Arg Terminator
GetFuncBEChooserSampleRate343 static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
344 {
345 switch (tArg)
346 {
347 case SWR_BACKEND_MSAA_SAMPLE_RATE:
348 return BackendSampleRate<SwrBackendTraits<ArgsT...>>;
349 break;
350 case SWR_BACKEND_SINGLE_SAMPLE:
351 case SWR_BACKEND_MSAA_PIXEL_RATE:
352 SWR_ASSERT(0 && "Invalid backend func\n");
353 return nullptr;
354 break;
355 default:
356 SWR_ASSERT(0 && "Invalid backend func\n");
357 return nullptr;
358 break;
359 }
360 }
361
362 // Recursively parse args
363 template <typename... TArgsT>
GetFuncBEChooserSampleRate364 static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
365 {
366 switch (tArg)
367 {
368 case SWR_INPUT_COVERAGE_NONE:
369 return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
370 remainingArgs...);
371 break;
372 case SWR_INPUT_COVERAGE_NORMAL:
373 return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
374 remainingArgs...);
375 break;
376 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
377 return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
378 remainingArgs...);
379 break;
380 default:
381 SWR_ASSERT(0 && "Invalid sample pattern\n");
382 return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
383 remainingArgs...);
384 break;
385 }
386 }
387
388 // Recursively parse args
389 template <typename... TArgsT>
GetFuncBEChooserSampleRate390 static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
391 {
392 switch (tArg)
393 {
394 case SWR_MULTISAMPLE_1X:
395 return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
396 break;
397 case SWR_MULTISAMPLE_2X:
398 return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
399 break;
400 case SWR_MULTISAMPLE_4X:
401 return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
402 break;
403 case SWR_MULTISAMPLE_8X:
404 return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
405 break;
406 case SWR_MULTISAMPLE_16X:
407 return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
408 break;
409 default:
410 SWR_ASSERT(0 && "Invalid sample count\n");
411 return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
412 break;
413 }
414 }
415
416 // Recursively parse args
417 template <typename... TArgsT>
GetFuncBEChooserSampleRate418 static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
419 {
420 if (tArg == true)
421 {
422 return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
423 }
424
425 return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
426 }
427 };
428
InitBackendSampleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])429 void InitBackendSampleFuncTable(
430 PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
431 {
432 for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT;
433 sampleCount++)
434 {
435 for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
436 {
437 for (uint32_t centroid = 0; centroid < 2; centroid++)
438 {
439 for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
440 {
441 table[sampleCount][inputCoverage][centroid][canEarlyZ] =
442 BEChooserSampleRate<>::GetFunc(
443 (SWR_MULTISAMPLE_COUNT)sampleCount,
444 false,
445 (SWR_INPUT_COVERAGE)inputCoverage,
446 (centroid > 0),
447 false,
448 (canEarlyZ > 0),
449 (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
450 }
451 }
452 }
453 }
454 }
455