1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 *        operations.
27 *
28 ******************************************************************************/
29 
30 #include <smmintrin.h>
31 
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 
38 #include <algorithm>
39 
// BackendSampleRate<T>: backend entry point that runs the pixel shader once per
// covered sample (sample-rate shading).
//
// Walks the KNOB_TILE_X_DIM x KNOB_TILE_Y_DIM region whose origin is (x, y) in
// steps of one SIMD tile (SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM).  For every SIMD
// tile it loops over all T::MultisampleT::numSamples samples and, for each sample
// that still has coverage:
//   1. optionally applies the depth bounds test,
//   2. computes per-sample positions, barycentrics and quantized Z,
//   3. optionally removes lanes rejected by user clip distances,
//   4. runs the depth/stencil test before the shader when T::bCanEarlyZ is set,
//      otherwise after it,
//   5. invokes the pixel shader,
//   6. calls the output merger and the depth/stencil write.
//
// T             - traits type; this function reads T::InputCoverage, T::bCanEarlyZ
//                 and T::MultisampleT::numSamples and forwards T to helper templates.
// pDC           - draw context; source of the API state (GetApiState), the private
//                 state handed to the shader (GetPrivateState) and drawId.
// workerId      - not referenced by name in the statements below; presumably
//                 consumed by the AR_* / UPDATE_STAT_BE macros - confirm before
//                 renaming or removing.
// x, y          - origin of the region: used as the loop start values and converted
//                 to float to seed the UL/center position vectors.
// work          - triangle descriptor.  NOTE: work.coverageMask[] and (for inner
//                 conservative input coverage) work.innerCoverageMask are shifted
//                 right as SIMD tiles are consumed, so the caller's descriptor is
//                 modified by this call.
// renderBuffers - passed to SetupRenderBuffers to obtain the color, depth and
//                 stencil buffer pointers.
40 template<typename T>
BackendSampleRate(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)41 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
42 {
       // NOTE(review): pContext is never named again in this function; presumably
       // the AR_* macros expand to code that uses it - confirm before removing.
43     SWR_CONTEXT *pContext = pDC->pContext;
44 
45     AR_BEGIN(BESampleRateBackend, pDC->drawId);
46     AR_BEGIN(BESetup, pDC->drawId);
47 
48     const API_STATE &state = GetApiState(pDC);
49 
50     BarycentricCoeffs coeffs;
51     SetupBarycentricCoeffs(&coeffs, work);
52 
53     SWR_PS_CONTEXT psContext;
54     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
55     SetupPixelShaderContext<T>(&psContext, samplePos, work);
56 
57     uint8_t *pDepthBuffer, *pStencilBuffer;
58     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
59 
60     AR_END(BESetup, 0);
61 
       // Seed the Y positions for the first row of SIMD tiles; they are advanced by
       // dy at the bottom of the outer loop.
62     psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
63     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
64 
65     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
66 
67     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
68     {
           // Restart the X positions at the left edge for every row of SIMD tiles;
           // they are advanced by dx at the bottom of the inner loop.
69         psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
70         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
71 
72         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
73 
74         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
75         {
76 #if USE_8x2_TILE_BACKEND
               // True when the SIMD_TILE_X_DIM bit of xx is set; forwarded to
               // OutputMerger8x2 and used below to decide when the color buffer
               // pointers advance.
77             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
78 #endif
               // Input coverage is only generated when the traits type asks for it.
79             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
80             {
81                 const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
82 
83                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
84             }
85 
86             AR_BEGIN(BEBarycentric, pDC->drawId);
87 
88             CalcPixelBarycentrics(coeffs, psContext);
89 
90             CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
91 
92             AR_END(BEBarycentric, 0);
93 
94             for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
95             {
                   // Each sample has its own coverage word.  The word is shifted right
                   // by one SIMD tile's worth of bits after every tile (see the shifts
                   // below), so the current tile's bits are the low ones; MASK
                   // presumably selects exactly those bits - confirm its definition.
96                 simdmask coverageMask = work.coverageMask[sample] & MASK;
97 
98                 if (coverageMask)
99                 {
100                     // offset depth/stencil buffers current sample
101                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
102                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
103 
                        // Depth bounds test: loads the values currently stored in the
                        // depth buffer for this sample and ANDs the coverage with the
                        // accept mask returned by CalcDepthBoundsAcceptMask.  The load
                        // reinterprets the buffer as floats, which is why the hot tile
                        // format is asserted to be R32_FLOAT.
104                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
105                     {
106                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
107 
108                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
109 
110                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
111                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
112 
113                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
114                     }
115 
116                     AR_BEGIN(BEBarycentric, pDC->drawId);
117 
118                     // calculate per sample positions
119                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
120                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
121 
122                     CalcSampleBarycentrics(coeffs, psContext);
123 
124                     // interpolate and quantize z
125                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
126                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
127 
128                     AR_END(BEBarycentric, 0);
129 
130                     // interpolate user clip distance if available
131                     if (state.backendState.clipDistanceMask)
132                     {
                            // Lanes flagged by ComputeUserClipMask are removed from coverage.
133                         coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
134                     }
135 
                        // Expand the bit mask to a SIMD vector mask.  Depth and stencil
                        // pass masks start out equal to coverage and are replaced by the
                        // result of whichever depth/stencil test runs (early or late).
136                     simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
137                     simdscalar depthPassMask = vCoverageMask;
138                     simdscalar stencilPassMask = vCoverageMask;
139 
140                     // Early-Z?
141                     if (T::bCanEarlyZ)
142                     {
143                         AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
144                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
145                             psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
146                         AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
147                         AR_END(BEEarlyDepthTest, 0);
148 
149                         // early-exit if no samples passed depth or earlyZ is forced on.
150                         if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
151                         {
152                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
153                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
154 
155                             if (!_simd_movemask_ps(depthPassMask))
156                             {
                                    // 'continue' bypasses the shift at the bottom of the
                                    // sample loop, so this tile's bits are consumed here.
157                                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
158                                 continue;
159                             }
160                         }
161                     }
162 
163                     psContext.sampleIndex = sample;
164                     psContext.activeMask = _simd_castps_si(vCoverageMask);
165 
166                     // execute pixel shader
167                     AR_BEGIN(BEPixelShader, pDC->drawId);
168                     UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
169                     state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
170                     AR_END(BEPixelShader, 0);
171 
                        // Re-read the active mask after the shader call; the shader gets
                        // &psContext and presumably clears lanes it kills - confirm.
172                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
173 
174                     // late-Z
175                     if (!T::bCanEarlyZ)
176                     {
177                         AR_BEGIN(BELateDepthTest, pDC->drawId);
178                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
179                             psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
180                         AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
181                         AR_END(BELateDepthTest, 0);
182 
183                         if (!_simd_movemask_ps(depthPassMask))
184                         {
185                             // need to call depth/stencil write for stencil write
186                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
187                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
188 
                                // Same reason as in the early-Z exit: consume this tile's
                                // bits before skipping the rest of the sample iteration.
189                             work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
190                             continue;
191                         }
192                     }
193 
                        // Count the lanes that passed the depth test for the statistics.
194                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
195                     uint32_t statCount = _mm_popcnt_u32(statMask);
196                     UPDATE_STAT_BE(DepthPassCount, statCount);
197 
198                     // output merger
199                     AR_BEGIN(BEOutputMerger, pDC->drawId);
200 #if USE_8x2_TILE_BACKEND
201                     OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
202 #else
203                     OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
204 #endif
205 
206                     // do final depth write after all pixel kills
                        // NOTE(review): skipped when forceEarlyZ is set; in the early-Z
                        // block above the write was already issued in that case.
                        // Presumably forceEarlyZ is only set together with
                        // T::bCanEarlyZ - confirm.
207                     if (!state.psState.forceEarlyZ)
208                     {
209                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
210                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
211                     }
212                     AR_END(BEOutputMerger, 0);
213                 }
                    // Normal path: drop the bits of the SIMD tile just processed so the
                    // next tile's bits become the low bits of this sample's word.
214                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
215             }
216 
            // No goto in this function targets the label below; ATTR_UNUSED
            // presumably silences the resulting unused-label warning - confirm.
217         Endtile:
218             ATTR_UNUSED;
219 
220             AR_BEGIN(BEEndTile, pDC->drawId);
221 
222             if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
223             {
224                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
225             }
226 
               // Advance the color pointer of every render target whose bit is set in
               // colorHottileEnable.  In the 8x2 configuration this happens only after
               // an alternate-offset tile and the step is two SIMD widths; otherwise
               // it happens after every tile with a step of one SIMD width.
227 #if USE_8x2_TILE_BACKEND
228             if (useAlternateOffset)
229             {
230                 DWORD rt;
231                 uint32_t rtMask = state.colorHottileEnable;
232                 while (_BitScanForward(&rt, rtMask))
233                 {
234                     rtMask &= ~(1 << rt);
235                     psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
236                 }
237             }
238 #else
239             DWORD rt;
240             uint32_t rtMask = state.colorHottileEnable;
241             while (_BitScanForward(&rt, rtMask))
242             {
243                 rtMask &= ~(1 << rt);
244                 psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
245             }
246 #endif
               // Depth and stencil pointers advance by one SIMD width of their
               // respective hot tile formats after every SIMD tile.
247             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
248             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
249 
250             AR_END(BEEndTile, 0);
251 
               // Step the X positions to the next SIMD tile in this row.
252             psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
253             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
254         }
255 
           // Step the Y positions to the next row of SIMD tiles.
256         psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
257         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
258     }
259 
260     AR_END(BESampleRateBackend, 0);
261 }
262 
263 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
264 // arguments to static template arguments.
265 template <uint32_t... ArgsT>
266 struct BEChooserSampleRate
267 {
268     // Last Arg Terminator
GetFuncBEChooserSampleRate269     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
270     {
271         switch (tArg)
272         {
273         case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
274         case SWR_BACKEND_SINGLE_SAMPLE:
275         case SWR_BACKEND_MSAA_PIXEL_RATE:
276             SWR_ASSERT(0 && "Invalid backend func\n");
277             return nullptr;
278             break;
279         default:
280             SWR_ASSERT(0 && "Invalid backend func\n");
281             return nullptr;
282             break;
283         }
284     }
285 
286     // Recursively parse args
287     template <typename... TArgsT>
GetFuncBEChooserSampleRate288     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
289     {
290         switch (tArg)
291         {
292         case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
293         case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
294         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
295         default:
296             SWR_ASSERT(0 && "Invalid sample pattern\n");
297             return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
298             break;
299         }
300     }
301 
302     // Recursively parse args
303     template <typename... TArgsT>
GetFuncBEChooserSampleRate304     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
305     {
306         switch (tArg)
307         {
308         case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
309         case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
310         case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
311         case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
312         case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
313         default:
314             SWR_ASSERT(0 && "Invalid sample count\n");
315             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
316             break;
317         }
318     }
319 
320     // Recursively parse args
321     template <typename... TArgsT>
GetFuncBEChooserSampleRate322     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
323     {
324         if (tArg == true)
325         {
326             return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
327         }
328 
329         return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
330     }
331 };
332 
InitBackendSampleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])333 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
334 {
335     for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
336     {
337         for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
338         {
339             for (uint32_t centroid = 0; centroid < 2; centroid++)
340             {
341                 for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
342                 {
343                     table[sampleCount][inputCoverage][centroid][canEarlyZ] =
344                         BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
345                         (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
346                 }
347             }
348         }
349     }
350 }
351