1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 *        operations.
27 *
28 ******************************************************************************/
29 
30 #include <smmintrin.h>
31 
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 
38 #include <algorithm>
39 
40 template<typename T>
BackendSingleSample(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)41 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
42 {
43     SWR_CONTEXT *pContext = pDC->pContext;
44 
45     AR_BEGIN(BESingleSampleBackend, pDC->drawId);
46     AR_BEGIN(BESetup, pDC->drawId);
47 
48     const API_STATE &state = GetApiState(pDC);
49 
50     BarycentricCoeffs coeffs;
51     SetupBarycentricCoeffs(&coeffs, work);
52 
53     SWR_PS_CONTEXT psContext;
54     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
55     SetupPixelShaderContext<T>(&psContext, samplePos, work);
56 
57     uint8_t *pDepthBuffer, *pStencilBuffer;
58     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
59 
60     AR_END(BESetup, 1);
61 
62     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
63     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
64 
65     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
66 
67     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
68     {
69         psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
70         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
71 
72         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
73 
74         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
75         {
76 #if USE_8x2_TILE_BACKEND
77             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
78 #endif
79             simdmask coverageMask = work.coverageMask[0] & MASK;
80 
81             if (coverageMask)
82             {
83                 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
84                 {
85                     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
86 
87                     const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
88 
89                     const float minz = state.depthBoundsState.depthBoundsTestMinValue;
90                     const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
91 
92                     coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
93                 }
94 
95                 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
96                 {
97                     const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
98 
99                     generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
100                 }
101 
102                 AR_BEGIN(BEBarycentric, pDC->drawId);
103 
104                 CalcPixelBarycentrics(coeffs, psContext);
105 
106                 CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
107 
108                 // interpolate and quantize z
109                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
110                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
111 
112                 AR_END(BEBarycentric, 1);
113 
114                 // interpolate user clip distance if available
115                 if (state.backendState.clipDistanceMask)
116                 {
117                     coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
118                 }
119 
120                 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
121                 simdscalar depthPassMask = vCoverageMask;
122                 simdscalar stencilPassMask = vCoverageMask;
123 
124                 // Early-Z?
125                 if (T::bCanEarlyZ)
126                 {
127                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
128                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
129                                                      psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
130                     AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
131                     AR_END(BEEarlyDepthTest, 0);
132 
133                     // early-exit if no pixels passed depth or earlyZ is forced on
134                     if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
135                     {
136                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
137                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
138 
139                         if (!_simd_movemask_ps(depthPassMask))
140                         {
141                             goto Endtile;
142                         }
143                     }
144                 }
145 
146                 psContext.sampleIndex = 0;
147                 psContext.activeMask = _simd_castps_si(vCoverageMask);
148 
149                 // execute pixel shader
150                 AR_BEGIN(BEPixelShader, pDC->drawId);
151                 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
152                 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
153                 AR_END(BEPixelShader, 0);
154 
155                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
156 
157                 // late-Z
158                 if (!T::bCanEarlyZ)
159                 {
160                     AR_BEGIN(BELateDepthTest, pDC->drawId);
161                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
162                                                         psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
163                     AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
164                     AR_END(BELateDepthTest, 0);
165 
166                     if (!_simd_movemask_ps(depthPassMask))
167                     {
168                         // need to call depth/stencil write for stencil write
169                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
170                             pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
171                         goto Endtile;
172                     }
173                 } else {
174                     // for early z, consolidate discards from shader
175                     // into depthPassMask
176                     depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
177                 }
178 
179                 uint32_t statMask = _simd_movemask_ps(depthPassMask);
180                 uint32_t statCount = _mm_popcnt_u32(statMask);
181                 UPDATE_STAT_BE(DepthPassCount, statCount);
182 
183                 // output merger
184                 AR_BEGIN(BEOutputMerger, pDC->drawId);
185 #if USE_8x2_TILE_BACKEND
186                 OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
187 #else
188                 OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
189 #endif
190 
191                 // do final depth write after all pixel kills
192                 if (!state.psState.forceEarlyZ)
193                 {
194                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
195                         pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
196                 }
197                 AR_END(BEOutputMerger, 0);
198             }
199 
200 Endtile:
201             AR_BEGIN(BEEndTile, pDC->drawId);
202 
203             work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
204             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
205             {
206                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
207             }
208 
209 #if USE_8x2_TILE_BACKEND
210             if (useAlternateOffset)
211             {
212                 DWORD rt;
213                 uint32_t rtMask = state.colorHottileEnable;
214                 while(_BitScanForward(&rt, rtMask))
215                 {
216                     rtMask &= ~(1 << rt);
217                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
218                 }
219             }
220 #else
221             DWORD rt;
222             uint32_t rtMask = state.colorHottileEnable;
223             while (_BitScanForward(&rt, rtMask))
224             {
225                 rtMask &= ~(1 << rt);
226                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
227             }
228 #endif
229             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
230             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
231 
232             AR_END(BEEndTile, 0);
233 
234             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
235             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
236         }
237 
238         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
239         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
240     }
241 
242     AR_END(BESingleSampleBackend, 0);
243 }
244 
245 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
246 // arguments to static template arguments.
247 template <uint32_t... ArgsT>
248 struct BEChooserSingleSample
249 {
250     // Last Arg Terminator
GetFuncBEChooserSingleSample251     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
252     {
253         switch(tArg)
254         {
255         case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
256         case SWR_BACKEND_MSAA_PIXEL_RATE:
257         case SWR_BACKEND_MSAA_SAMPLE_RATE:
258         default:
259             SWR_ASSERT(0 && "Invalid backend func\n");
260             return nullptr;
261             break;
262         }
263     }
264 
265     // Recursively parse args
266     template <typename... TArgsT>
GetFuncBEChooserSingleSample267     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
268     {
269         switch(tArg)
270         {
271         case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
272         case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
273         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
274         default:
275         SWR_ASSERT(0 && "Invalid sample pattern\n");
276         return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
277         break;
278         }
279     }
280 
281     // Recursively parse args
282     template <typename... TArgsT>
GetFuncBEChooserSingleSample283     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
284     {
285         switch(tArg)
286         {
287         case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
288         case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
289         case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
290         case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
291         case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
292         default:
293         SWR_ASSERT(0 && "Invalid sample count\n");
294         return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
295         break;
296         }
297     }
298 
299     // Recursively parse args
300     template <typename... TArgsT>
GetFuncBEChooserSingleSample301     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
302     {
303         if(tArg == true)
304         {
305             return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
306         }
307 
308         return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
309     }
310 };
311 
InitBackendSingleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_INPUT_COVERAGE_COUNT][2][2])312 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
313 {
314     for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
315     {
316         for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
317         {
318             for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
319             {
320                 table[inputCoverage][isCentroid][canEarlyZ] =
321                     BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
322                                          (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
323             }
324         }
325     }
326 }
327