1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file tilemgr.cpp
24 *
25 * @brief Implementation for Macro Tile Manager which provides the facilities
26 *        for threads to work on an macro tile.
27 *
28 ******************************************************************************/
29 #include <unordered_map>
30 
31 #include "fifo.hpp"
32 #include "core/tilemgr.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
35 
36 #define TILE_ID(x,y) ((x << 16 | y))
37 
MacroTileMgr(CachingArena & arena)38 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
39 {
40 }
41 
enqueue(uint32_t x,uint32_t y,BE_WORK * pWork)42 void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
43 {
44     // Should not enqueue more then what we have backing for in the hot tile manager.
45     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
46     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
47 
48     if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
49     {
50         return;
51     }
52 
53     uint32_t id = TILE_ID(x, y);
54 
55     MacroTileQueue &tile = mTiles[id];
56     tile.mWorkItemsFE++;
57     tile.mId = id;
58 
59     if (tile.mWorkItemsFE == 1)
60     {
61         tile.clear(mArena);
62         mDirtyTiles.push_back(&tile);
63     }
64 
65     mWorkItemsProduced++;
66     tile.enqueue_try_nosync(mArena, pWork);
67 }
68 
markTileComplete(uint32_t id)69 void MacroTileMgr::markTileComplete(uint32_t id)
70 {
71     SWR_ASSERT(mTiles.find(id) != mTiles.end());
72     MacroTileQueue &tile = mTiles[id];
73     uint32_t numTiles = tile.mWorkItemsFE;
74     InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
75 
76     _ReadWriteBarrier();
77     tile.mWorkItemsBE += numTiles;
78     SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
79 
80     // clear out tile, but defer fifo clear until the next DC first queues to it.
81     // this prevents worker threads from constantly locking a completed macro tile
82     tile.mWorkItemsFE = 0;
83     tile.mWorkItemsBE = 0;
84 }
85 
GetHotTile(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t macroID,SWR_RENDERTARGET_ATTACHMENT attachment,bool create,uint32_t numSamples,uint32_t renderTargetArrayIndex)86 HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
87     uint32_t renderTargetArrayIndex)
88 {
89     uint32_t x, y;
90     MacroTileMgr::getTileIndices(macroID, x, y);
91 
92     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
93     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
94 
95     HotTileSet &tile = mHotTiles[x][y];
96     HOTTILE& hotTile = tile.Attachment[attachment];
97     if (hotTile.pBuffer == NULL)
98     {
99         if (create)
100         {
101             uint32_t size = numSamples * mHotTileSize[attachment];
102             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
103             hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
104             hotTile.state = HOTTILE_INVALID;
105             hotTile.numSamples = numSamples;
106             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
107         }
108         else
109         {
110             return NULL;
111         }
112     }
113     else
114     {
115         // free the old tile and create a new one with enough space to hold all samples
116         if (numSamples > hotTile.numSamples)
117         {
118             // tile should be either uninitialized or resolved if we're deleting and switching to a
119             // new sample count
120             SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
121                 (hotTile.state == HOTTILE_RESOLVED) ||
122                 (hotTile.state == HOTTILE_CLEAR));
123             FreeHotTileMem(hotTile.pBuffer);
124 
125             uint32_t size = numSamples * mHotTileSize[attachment];
126             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
127             hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
128             hotTile.state = HOTTILE_INVALID;
129             hotTile.numSamples = numSamples;
130         }
131 
132         // if requested render target array index isn't currently loaded, need to store out the current hottile
133         // and load the requested array slice
134         if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
135         {
136             SWR_FORMAT format;
137             switch (attachment)
138             {
139             case SWR_ATTACHMENT_COLOR0:
140             case SWR_ATTACHMENT_COLOR1:
141             case SWR_ATTACHMENT_COLOR2:
142             case SWR_ATTACHMENT_COLOR3:
143             case SWR_ATTACHMENT_COLOR4:
144             case SWR_ATTACHMENT_COLOR5:
145             case SWR_ATTACHMENT_COLOR6:
146             case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
147             case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
148             case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
149             default: SWR_INVALID("Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
150             }
151 
152             if (hotTile.state == HOTTILE_CLEAR)
153             {
154                 if (attachment == SWR_ATTACHMENT_STENCIL)
155                     ClearStencilHotTile(&hotTile);
156                 else if (attachment == SWR_ATTACHMENT_DEPTH)
157                     ClearDepthHotTile(&hotTile);
158                 else
159                     ClearColorHotTile(&hotTile);
160 
161                 hotTile.state = HOTTILE_DIRTY;
162             }
163 
164             if (hotTile.state == HOTTILE_DIRTY)
165             {
166                 pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
167                     x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
168             }
169 
170             pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
171                 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
172 
173             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
174             hotTile.state = HOTTILE_DIRTY;
175         }
176     }
177     return &tile.Attachment[attachment];
178 }
179 
GetHotTileNoLoad(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t macroID,SWR_RENDERTARGET_ATTACHMENT attachment,bool create,uint32_t numSamples)180 HOTTILE* HotTileMgr::GetHotTileNoLoad(
181     SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
182     SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
183 {
184     uint32_t x, y;
185     MacroTileMgr::getTileIndices(macroID, x, y);
186 
187     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
188     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
189 
190     HotTileSet &tile = mHotTiles[x][y];
191     HOTTILE& hotTile = tile.Attachment[attachment];
192     if (hotTile.pBuffer == NULL)
193     {
194         if (create)
195         {
196             uint32_t size = numSamples * mHotTileSize[attachment];
197             hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, 64);
198             hotTile.state = HOTTILE_INVALID;
199             hotTile.numSamples = numSamples;
200             hotTile.renderTargetArrayIndex = 0;
201         }
202         else
203         {
204             return NULL;
205         }
206     }
207 
208     return &hotTile;
209 }
210 
211 #if USE_8x2_TILE_BACKEND
ClearColorHotTile(const HOTTILE * pHotTile)212 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
213 {
214     // Load clear color into SIMD register...
215     float *pClearData = (float *)(pHotTile->clearData);
216     simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]);
217     simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]);
218     simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]);
219     simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]);
220 
221     float *pfBuf = (float *)pHotTile->pBuffer;
222     uint32_t numSamples = pHotTile->numSamples;
223 
224     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
225     {
226         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
227         {
228             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
229             {
230                 _simd16_store_ps(pfBuf, valR);
231                 pfBuf += KNOB_SIMD16_WIDTH;
232 
233                 _simd16_store_ps(pfBuf, valG);
234                 pfBuf += KNOB_SIMD16_WIDTH;
235 
236                 _simd16_store_ps(pfBuf, valB);
237                 pfBuf += KNOB_SIMD16_WIDTH;
238 
239                 _simd16_store_ps(pfBuf, valA);
240                 pfBuf += KNOB_SIMD16_WIDTH;
241             }
242         }
243     }
244 }
245 
ClearDepthHotTile(const HOTTILE * pHotTile)246 void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
247 {
248     // Load clear color into SIMD register...
249     float *pClearData = (float *)(pHotTile->clearData);
250     simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]);
251 
252     float *pfBuf = (float *)pHotTile->pBuffer;
253     uint32_t numSamples = pHotTile->numSamples;
254 
255     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
256     {
257         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
258         {
259             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
260             {
261                 _simd16_store_ps(pfBuf, valZ);
262                 pfBuf += KNOB_SIMD16_WIDTH;
263             }
264         }
265     }
266 }
267 
ClearStencilHotTile(const HOTTILE * pHotTile)268 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
269 {
270     // convert from F32 to U8.
271     uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
272     //broadcast 32x into __m256i...
273     simd16scalari valS = _simd16_set1_epi8(clearVal);
274 
275     simd16scalari *pBuf = (simd16scalari *)pHotTile->pBuffer;
276     uint32_t numSamples = pHotTile->numSamples;
277 
278     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
279     {
280         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
281         {
282             // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
283             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
284             {
285                 _simd16_store_si(pBuf, valS);
286                 pBuf += 1;
287             }
288         }
289     }
290 }
291 
292 #else
ClearColorHotTile(const HOTTILE * pHotTile)293 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
294 {
295     // Load clear color into SIMD register...
296     float *pClearData = (float*)(pHotTile->clearData);
297     simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
298     simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
299     simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
300     simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
301 
302     float *pfBuf = (float*)pHotTile->pBuffer;
303     uint32_t numSamples = pHotTile->numSamples;
304 
305     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
306     {
307         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
308         {
309             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
310             {
311                 _simd_store_ps(pfBuf, valR);
312                 pfBuf += KNOB_SIMD_WIDTH;
313                 _simd_store_ps(pfBuf, valG);
314                 pfBuf += KNOB_SIMD_WIDTH;
315                 _simd_store_ps(pfBuf, valB);
316                 pfBuf += KNOB_SIMD_WIDTH;
317                 _simd_store_ps(pfBuf, valA);
318                 pfBuf += KNOB_SIMD_WIDTH;
319             }
320         }
321     }
322 }
323 
ClearDepthHotTile(const HOTTILE * pHotTile)324 void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
325 {
326     // Load clear color into SIMD register...
327     float *pClearData = (float*)(pHotTile->clearData);
328     simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
329 
330     float *pfBuf = (float*)pHotTile->pBuffer;
331     uint32_t numSamples = pHotTile->numSamples;
332 
333     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
334     {
335         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
336         {
337             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
338             {
339                 _simd_store_ps(pfBuf, valZ);
340                 pfBuf += KNOB_SIMD_WIDTH;
341             }
342         }
343     }
344 }
345 
ClearStencilHotTile(const HOTTILE * pHotTile)346 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
347 {
348     // convert from F32 to U8.
349     uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
350     //broadcast 32x into __m256i...
351     simdscalari valS = _simd_set1_epi8(clearVal);
352 
353     simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
354     uint32_t numSamples = pHotTile->numSamples;
355 
356     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
357     {
358         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
359         {
360             // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
361             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
362             {
363                 _simd_store_si(pBuf, valS);
364                 pBuf += 1;
365             }
366         }
367     }
368 }
369 
370 #endif
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief InitializeHotTiles
373 /// for draw calls, we initialize the active hot tiles and perform deferred
374 /// load on them if tile is in invalid state. we do this in the outer thread
375 /// loop instead of inside the draw routine itself mainly for performance,
376 /// to avoid unnecessary setup every triangle
377 /// @todo support deferred clear
378 /// @param pCreateInfo - pointer to creation info.
InitializeHotTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroID)379 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID)
380 {
381     const API_STATE& state = GetApiState(pDC);
382 
383     uint32_t x, y;
384     MacroTileMgr::getTileIndices(macroID, x, y);
385     x *= KNOB_MACROTILE_X_DIM;
386     y *= KNOB_MACROTILE_Y_DIM;
387 
388     uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
389 
390     // check RT if enabled
391     unsigned long rtSlot = 0;
392     uint32_t colorHottileEnableMask = state.colorHottileEnable;
393     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
394     {
395         HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
396 
397         if (pHotTile->state == HOTTILE_INVALID)
398         {
399             AR_BEGIN(BELoadTiles, pDC->drawId);
400             // invalid hottile before draw requires a load from surface before we can draw to it
401             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
402             pHotTile->state = HOTTILE_DIRTY;
403             AR_END(BELoadTiles, 0);
404         }
405         else if (pHotTile->state == HOTTILE_CLEAR)
406         {
407             AR_BEGIN(BELoadTiles, pDC->drawId);
408             // Clear the tile.
409             ClearColorHotTile(pHotTile);
410             pHotTile->state = HOTTILE_DIRTY;
411             AR_END(BELoadTiles, 0);
412         }
413         colorHottileEnableMask &= ~(1 << rtSlot);
414     }
415 
416     // check depth if enabled
417     if (state.depthHottileEnable)
418     {
419         HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
420         if (pHotTile->state == HOTTILE_INVALID)
421         {
422             AR_BEGIN(BELoadTiles, pDC->drawId);
423             // invalid hottile before draw requires a load from surface before we can draw to it
424             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
425             pHotTile->state = HOTTILE_DIRTY;
426             AR_END(BELoadTiles, 0);
427         }
428         else if (pHotTile->state == HOTTILE_CLEAR)
429         {
430             AR_BEGIN(BELoadTiles, pDC->drawId);
431             // Clear the tile.
432             ClearDepthHotTile(pHotTile);
433             pHotTile->state = HOTTILE_DIRTY;
434             AR_END(BELoadTiles, 0);
435         }
436     }
437 
438     // check stencil if enabled
439     if (state.stencilHottileEnable)
440     {
441         HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
442         if (pHotTile->state == HOTTILE_INVALID)
443         {
444             AR_BEGIN(BELoadTiles, pDC->drawId);
445             // invalid hottile before draw requires a load from surface before we can draw to it
446             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
447             pHotTile->state = HOTTILE_DIRTY;
448             AR_END(BELoadTiles, 0);
449         }
450         else if (pHotTile->state == HOTTILE_CLEAR)
451         {
452             AR_BEGIN(BELoadTiles, pDC->drawId);
453             // Clear the tile.
454             ClearStencilHotTile(pHotTile);
455             pHotTile->state = HOTTILE_DIRTY;
456             AR_END(BELoadTiles, 0);
457         }
458     }
459 }
460