1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file backend.cpp
24 *
25 * @brief Backend handles rasterization, pixel shading and output merger
26 *        operations.
27 *
28 ******************************************************************************/
29 
30 #include <smmintrin.h>
31 
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 #include "backends/gen_BackendPixelRate.hpp"
38 
39 #include <algorithm>
40 
41 
42 //////////////////////////////////////////////////////////////////////////
43 /// @brief Process compute work.
44 /// @param pDC - pointer to draw context (dispatch).
45 /// @param workerId - The unique worker ID that is assigned to this thread.
46 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessComputeBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t threadGroupId,void * & pSpillFillBuffer,void * & pScratchSpace)47 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
48 {
49     SWR_CONTEXT *pContext = pDC->pContext;
50 
51     AR_BEGIN(BEDispatch, pDC->drawId);
52 
53     const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
54     SWR_ASSERT(pTaskData != nullptr);
55 
56     // Ensure spill fill memory has been allocated.
57     size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
58     if (spillFillSize && pSpillFillBuffer == nullptr)
59     {
60         pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
61     }
62 
63     size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
64     if (scratchSpaceSize && pScratchSpace == nullptr)
65     {
66         pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES);
67     }
68 
69     const API_STATE& state = GetApiState(pDC);
70 
71     SWR_CS_CONTEXT csContext{ 0 };
72     csContext.tileCounter = threadGroupId;
73     csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
74     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
75     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
76     csContext.pTGSM = pContext->ppScratch[workerId];
77     csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
78     csContext.pScratchSpace = (uint8_t*)pScratchSpace;
79     csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;
80 
81     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
82 
83     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
84 
85     AR_END(BEDispatch, 1);
86 }
87 
88 //////////////////////////////////////////////////////////////////////////
89 /// @brief Process shutdown.
90 /// @param pDC - pointer to draw context (dispatch).
91 /// @param workerId - The unique worker ID that is assigned to this thread.
92 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessShutdownBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)93 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
94 {
95     // Dummy function
96 }
97 
ProcessSyncBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)98 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
99 {
100     uint32_t x, y;
101     MacroTileMgr::getTileIndices(macroTile, x, y);
102     SWR_ASSERT(x == 0 && y == 0);
103 }
104 
ProcessStoreTileBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,STORE_TILES_DESC * pDesc,SWR_RENDERTARGET_ATTACHMENT attachment)105 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc,
106     SWR_RENDERTARGET_ATTACHMENT attachment)
107 {
108     SWR_CONTEXT *pContext = pDC->pContext;
109 
110     AR_BEGIN(BEStoreTiles, pDC->drawId);
111 
112     SWR_FORMAT srcFormat;
113     switch (attachment)
114     {
115     case SWR_ATTACHMENT_COLOR0:
116     case SWR_ATTACHMENT_COLOR1:
117     case SWR_ATTACHMENT_COLOR2:
118     case SWR_ATTACHMENT_COLOR3:
119     case SWR_ATTACHMENT_COLOR4:
120     case SWR_ATTACHMENT_COLOR5:
121     case SWR_ATTACHMENT_COLOR6:
122     case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
123     case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
124     case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
125     default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
126     }
127 
128     uint32_t x, y;
129     MacroTileMgr::getTileIndices(macroTile, x, y);
130 
131     // Only need to store the hottile if it's been rendered to...
132     HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
133     if (pHotTile)
134     {
135         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
136         if (pHotTile->state == HOTTILE_CLEAR)
137         {
138             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
139             SWR_ASSERT(pfnClearTiles != nullptr);
140 
141             pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
142         }
143 
144         if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
145         {
146             int32_t destX = KNOB_MACROTILE_X_DIM * x;
147             int32_t destY = KNOB_MACROTILE_Y_DIM * y;
148 
149             pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
150                 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
151         }
152 
153 
154         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
155         {
156             if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED))
157             {
158                 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
159             }
160         }
161     }
162     AR_END(BEStoreTiles, 1);
163 }
164 
ProcessStoreTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)165 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
166 {
167     STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
168 
169     unsigned long rt = 0;
170     uint32_t mask = pDesc->attachmentMask;
171     while (_BitScanForward(&rt, mask))
172     {
173         mask &= ~(1 << rt);
174         ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
175     }
176 }
177 
ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)178 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
179 {
180     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
181     SWR_CONTEXT *pContext = pDC->pContext;
182 
183     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
184 
185     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
186     {
187         if (pDesc->attachmentMask & (1 << i))
188         {
189             HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
190                 pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
191             if (pHotTile)
192             {
193                 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
194             }
195         }
196     }
197 }
198 
199 template<uint32_t sampleCountT>
BackendNullPS(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)200 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
201 {
202     SWR_CONTEXT *pContext = pDC->pContext;
203 
204     AR_BEGIN(BENullBackend, pDC->drawId);
205     ///@todo: handle center multisample pattern
206     AR_BEGIN(BESetup, pDC->drawId);
207 
208     const API_STATE &state = GetApiState(pDC);
209 
210     BarycentricCoeffs coeffs;
211     SetupBarycentricCoeffs(&coeffs, work);
212 
213     uint8_t *pDepthBuffer, *pStencilBuffer;
214     SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);
215 
216     SWR_PS_CONTEXT psContext;
217     // skip SetupPixelShaderContext(&psContext, ...); // not needed here
218 
219     AR_END(BESetup, 0);
220 
221     simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
222 
223     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
224     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
225     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
226     {
227         simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
228 
229         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
230 
231         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
232         {
233             // iterate over active samples
234             unsigned long sample = 0;
235             uint32_t sampleMask = state.blendState.sampleMask;
236             while (_BitScanForward(&sample, sampleMask))
237             {
238                 sampleMask &= ~(1 << sample);
239 
240                 simdmask coverageMask = work.coverageMask[sample] & MASK;
241 
242                 if (coverageMask)
243                 {
244                     // offset depth/stencil buffers current sample
245                     uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
246                     uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
247 
248                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
249                     {
250                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
251 
252                         const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
253 
254                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
255                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
256 
257                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
258                     }
259 
260                     AR_BEGIN(BEBarycentric, pDC->drawId);
261 
262                     // calculate per sample positions
263                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
264                     psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));
265 
266                     CalcSampleBarycentrics(coeffs, psContext);
267 
268                     // interpolate and quantize z
269                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
270                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
271 
272                     AR_END(BEBarycentric, 0);
273 
274                     // interpolate user clip distance if available
275                     if (state.backendState.clipDistanceMask)
276                     {
277                         coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
278                     }
279 
280                     simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
281                     simdscalar stencilPassMask = vCoverageMask;
282 
283                     AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
284                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
285                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
286                     AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
287                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
288                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
289                     AR_END(BEEarlyDepthTest, 0);
290 
291                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
292                     uint32_t statCount = _mm_popcnt_u32(statMask);
293                     UPDATE_STAT_BE(DepthPassCount, statCount);
294                 }
295 
296             Endtile:
297                 ATTR_UNUSED;
298                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
299             }
300 
301             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
302             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
303 
304             vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
305         }
306 
307         vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
308     }
309 
310     AR_END(BENullBackend, 0);
311 }
312 
313 PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
314 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
315 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
316                                      [2] // centroid
317                                      [2] // canEarlyZ
318                                      = {};
319 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
320                                        [2] // isCenterPattern
321                                        [SWR_INPUT_COVERAGE_COUNT]
322                                        [2] // centroid
323                                        [2] // forcedSampleCount
324                                        [2] // canEarlyZ
325                                        = {};
326 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
327                                         [SWR_INPUT_COVERAGE_COUNT]
328                                         [2] // centroid
329                                         [2] // canEarlyZ
330                                         = {};
331 
InitBackendFuncTables()332 void InitBackendFuncTables()
333 {
334     InitBackendPixelRate();
335     InitBackendSingleFuncTable(gBackendSingleSample);
336     InitBackendSampleFuncTable(gBackendSampleRateTable);
337 
338     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
339     gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
340     gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
341     gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
342     gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
343 }
344