1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file context.h
24 *
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 *        The SWR_CONTEXT is our global context and contains the DC ring,
27 *        thread state, etc.
28 *
29 *        The DRAW_CONTEXT contains all state associated with a draw operation.
30 *
31 ******************************************************************************/
32 #pragma once
33 
34 #include <condition_variable>
35 #include <algorithm>
36 
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/intrin.h"
43 #include "core/threads.h"
44 #include "ringbuffer.h"
45 #include "archrast/archrast.h"
46 
// x.8 fixed point: 8 fractional bits, scale factor 2^8 = 256.
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE (1 << FIXED_POINT_SHIFT)

// x.16 fixed point: 16 fractional bits, scale factor 2^16 = 65536.
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE (1 << FIXED_POINT16_SHIFT)

// Forward declarations; both types are defined further down in this header.
struct SWR_CONTEXT;
struct DRAW_CONTEXT;
57 
58 struct TRI_FLAGS
59 {
60     uint32_t frontFacing : 1;
61     uint32_t yMajor : 1;
62     uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
63     uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
64     float pointSize;
65     uint32_t renderTargetArrayIndex;
66     uint32_t viewportIndex;
67 };
68 
69 //////////////////////////////////////////////////////////////////////////
70 /// SWR_TRIANGLE_DESC
71 /////////////////////////////////////////////////////////////////////////
72 struct SWR_TRIANGLE_DESC
73 {
74     float I[3];
75     float J[3];
76     float Z[3];
77     float OneOverW[3];
78     float recipDet;
79 
80     float *pRecipW;
81     float *pAttribs;
82     float *pPerspAttribs;
83     float *pSamplePos;
84     float *pUserClipBuffer;
85 
86     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
87     uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
88     uint64_t anyCoveredSamples;
89 
90     TRI_FLAGS triFlags;
91 };
92 
93 struct TRIANGLE_WORK_DESC
94 {
95     float *pTriBuffer;
96     float *pAttribs;
97     float *pUserClipBuffer;
98     uint32_t numAttribs;
99     TRI_FLAGS triFlags;
100 };
101 
102 struct CLEAR_DESC
103 {
104     SWR_RECT rect;
105     uint32_t attachmentMask;
106     uint32_t renderTargetArrayIndex;
107     float clearRTColor[4];  // RGBA_32F
108     float clearDepth;   // [0..1]
109     uint8_t clearStencil;
110 };
111 
112 struct DISCARD_INVALIDATE_TILES_DESC
113 {
114     uint32_t attachmentMask;
115     SWR_RECT rect;
116     SWR_TILE_STATE newTileState;
117     bool createNewTiles;
118     bool fullTilesOnly;
119 };
120 
121 struct SYNC_DESC
122 {
123     PFN_CALLBACK_FUNC pfnCallbackFunc;
124     uint64_t userData;
125     uint64_t userData2;
126     uint64_t userData3;
127 };
128 
129 struct STORE_TILES_DESC
130 {
131     uint32_t attachmentMask;
132     SWR_TILE_STATE postStoreTileState;
133     SWR_RECT rect;
134 };
135 
// Thread group counts along X, Y and Z for a compute dispatch.
struct COMPUTE_DESC
{
    uint32_t threadGroupCountX;
    uint32_t threadGroupCountY;
    uint32_t threadGroupCountZ;
};
142 
143 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
144 
// Tag identifying the kind of work carried by an FE_WORK / BE_WORK item.
// Values are spelled out; they match the implicit numbering starting at 0.
enum WORK_TYPE
{
    SYNC                   = 0,
    DRAW                   = 1,
    CLEAR                  = 2,
    DISCARDINVALIDATETILES = 3,
    STORETILES             = 4,
    SHUTDOWN               = 5,
};
154 
// Backend work item: a type tag, the function that processes it and a
// type-specific payload. Declared through OSALIGNSIMD.
OSALIGNSIMD(struct) BE_WORK
{
    WORK_TYPE     type;
    PFN_WORK_FUNC pfnWork;

    // Payload; which member is meaningful presumably follows `type`.
    union
    {
        SYNC_DESC                     sync;
        TRIANGLE_WORK_DESC            tri;
        CLEAR_DESC                    clear;
        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
        STORE_TILES_DESC              storeTiles;
    } desc;
};
168 
169 struct DRAW_WORK
170 {
171     DRAW_CONTEXT*   pDC;
172     union
173     {
174         uint32_t   numIndices;      // DrawIndexed: Number of indices for draw.
175         uint32_t   numVerts;        // Draw: Number of verts (triangles, lines, etc)
176     };
177     union
178     {
179         const int32_t* pIB;        // DrawIndexed: App supplied indices
180         uint32_t   startVertex;    // Draw: Starting vertex in VB to render from.
181     };
182     int32_t    baseVertex;
183     uint32_t   numInstances;        // Number of instances
184     uint32_t   startInstance;       // Instance offset
185     uint32_t   startPrimID;         // starting primitiveID for this draw batch
186     uint32_t   startVertexID;       // starting VertexID for this draw batch (only needed for non-indexed draws)
187     SWR_FORMAT type;                // index buffer type
188 };
189 
190 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
191 struct FE_WORK
192 {
193     WORK_TYPE type;
194     PFN_FE_WORK_FUNC pfnWork;
195     union
196     {
197         SYNC_DESC sync;
198         DRAW_WORK draw;
199         CLEAR_DESC clear;
200         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
201         STORE_TILES_DESC storeTiles;
202     } desc;
203 };
204 
205 struct GUARDBANDS
206 {
207     float left[KNOB_NUM_VIEWPORTS_SCISSORS];
208     float right[KNOB_NUM_VIEWPORTS_SCISSORS];
209     float top[KNOB_NUM_VIEWPORTS_SCISSORS];
210     float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
211 };
212 
213 struct PA_STATE;
214 
215 // function signature for pipeline stages that execute after primitive assembly
216 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
217     uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
218 
219 #if ENABLE_AVX512_SIMD16
220 // function signature for pipeline stages that execute after primitive assembly
221 typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
222     uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
223 
224 #endif
OSALIGNLINE(struct)225 OSALIGNLINE(struct) API_STATE
226 {
227     // Vertex Buffers
228     SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
229 
230     // Index Buffer
231     SWR_INDEX_BUFFER_STATE  indexBuffer;
232 
233     // FS - Fetch Shader State
234     PFN_FETCH_FUNC          pfnFetchFunc;
235 
236     // VS - Vertex Shader State
237     PFN_VERTEX_FUNC         pfnVertexFunc;
238 
239     // GS - Geometry Shader State
240     PFN_GS_FUNC             pfnGsFunc;
241     SWR_GS_STATE            gsState;
242 
243     // CS - Compute Shader
244     PFN_CS_FUNC             pfnCsFunc;
245     uint32_t                totalThreadsInGroup;
246     uint32_t                totalSpillFillSize;
247     uint32_t                scratchSpaceSize;
248     uint32_t                scratchSpaceNumInstances;
249 
250     // FE - Frontend State
251     SWR_FRONTEND_STATE      frontendState;
252 
253     // SOS - Streamout Shader State
254     PFN_SO_FUNC             pfnSoFunc[MAX_SO_STREAMS];
255 
256     // Streamout state
257     SWR_STREAMOUT_STATE     soState;
258     mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
259 
260     // Tessellation State
261     PFN_HS_FUNC             pfnHsFunc;
262     PFN_DS_FUNC             pfnDsFunc;
263     SWR_TS_STATE            tsState;
264 
265     // Number of attributes used by the frontend (vs, so, gs)
266     uint32_t                feNumAttributes;
267 
268     PRIMITIVE_TOPOLOGY      topology;
269     bool                    forceFront;
270 
271     // RS - Rasterizer State
272     SWR_RASTSTATE           rastState;
273     // floating point multisample offsets
274     float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
275 
276     GUARDBANDS               gbState;
277 
278     SWR_VIEWPORT            vp[KNOB_NUM_VIEWPORTS_SCISSORS];
279     SWR_VIEWPORT_MATRICES   vpMatrices;
280 
281     SWR_RECT                scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
282     SWR_RECT                scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
283     bool                    scissorsTileAligned;
284 
285     // Backend state
286     SWR_BACKEND_STATE       backendState;
287 
288     SWR_DEPTH_BOUNDS_STATE  depthBoundsState;
289 
290     // PS - Pixel shader state
291     SWR_PS_STATE            psState;
292 
293     SWR_DEPTH_STENCIL_STATE depthStencilState;
294 
295     // OM - Output Merger State
296     SWR_BLEND_STATE         blendState;
297     PFN_BLEND_JIT_FUNC      pfnBlendFunc[SWR_NUM_RENDERTARGETS];
298 
299     struct
300     {
301         uint32_t enableStatsFE : 1;             // Enable frontend pipeline stats
302         uint32_t enableStatsBE : 1;             // Enable backend pipeline stats
303         uint32_t colorHottileEnable : 8;        // Bitmask of enabled color hottiles
304         uint32_t depthHottileEnable: 1;         // Enable depth buffer hottile
305         uint32_t stencilHottileEnable : 1;      // Enable stencil buffer hottile
306     };
307 
308     PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
309 };
310 
311 class MacroTileMgr;
312 class DispatchQueue;
313 
314 struct RenderOutputBuffers
315 {
316     uint8_t* pColor[SWR_NUM_RENDERTARGETS];
317     uint8_t* pDepth;
318     uint8_t* pStencil;
319 };
320 
321 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
322 struct BarycentricCoeffs
323 {
324     simdscalar vIa;
325     simdscalar vIb;
326     simdscalar vIc;
327 
328     simdscalar vJa;
329     simdscalar vJb;
330     simdscalar vJc;
331 
332     simdscalar vZa;
333     simdscalar vZb;
334     simdscalar vZc;
335 
336     simdscalar vRecipDet;
337 
338     simdscalar vAOneOverW;
339     simdscalar vBOneOverW;
340     simdscalar vCOneOverW;
341 };
342 
343 // pipeline function pointer types
344 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
345 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
346                                  const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &);
347 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
348 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
349 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
350                                               simdscalar const &, simdscalar const &);
351 
352 struct BACKEND_FUNCS
353 {
354     PFN_BACKEND_FUNC pfnBackend;
355 };
356 
357 // Draw State
358 struct DRAW_STATE
359 {
360     API_STATE state;
361 
362     void* pPrivateState;  // Its required the driver sets this up for each draw.
363 
364     // pipeline function pointers, filled in by API thread when setting up the draw
365     BACKEND_FUNCS backendFuncs;
366     PFN_PROCESS_PRIMS pfnProcessPrims;
367 #if USE_SIMD16_FRONTEND
368     PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
369 #endif
370 
371     CachingArena* pArena;     // This should only be used by API thread.
372 };
373 
374 struct DRAW_DYNAMIC_STATE
375 {
ResetDRAW_DYNAMIC_STATE376     void Reset(uint32_t numThreads)
377     {
378         SWR_STATS* pSavePtr = pStats;
379         memset(this, 0, sizeof(*this));
380         pStats = pSavePtr;
381         memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
382     }
383     ///@todo Currently assumes only a single FE can do stream output for a draw.
384     uint32_t SoWriteOffset[4];
385     bool     SoWriteOffsetDirty[4];
386 
387     SWR_STATS_FE statsFE;   // Only one FE thread per DC.
388     SWR_STATS*   pStats;
389 };
390 
391 // Draw Context
392 //    The api thread sets up a draw context that exists for the life of the draw.
393 //    This draw context maintains all of the state needed for the draw operation.
394 struct DRAW_CONTEXT
395 {
396     SWR_CONTEXT*    pContext;
397     union
398     {
399         MacroTileMgr*   pTileMgr;
400         DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
401     };
402     DRAW_STATE*     pState;             // Read-only state. Core should not update this outside of API thread.
403     DRAW_DYNAMIC_STATE dynState;
404 
405     CachingArena*   pArena;
406 
407     uint32_t        drawId;
408     bool            dependentFE;    // Frontend work is dependent on all previous FE
409     bool            dependent;      // Backend work is dependent on all previous BE
410     bool            isCompute;      // Is this DC a compute context?
411     bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
412 
413     FE_WORK         FeWork;
414 
415     volatile OSALIGNLINE(bool)       doneFE;         // Is FE work done for this draw?
416     volatile OSALIGNLINE(uint32_t)   FeLock;
417     volatile OSALIGNLINE(uint32_t)   threadsDone;
418 
419     SYNC_DESC       retireCallback; // Call this func when this DC is retired.
420 };
421 
422 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
423 
GetApiState(const DRAW_CONTEXT * pDC)424 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
425 {
426     SWR_ASSERT(pDC != nullptr);
427     SWR_ASSERT(pDC->pState != nullptr);
428 
429     return pDC->pState->state;
430 }
431 
GetPrivateState(const DRAW_CONTEXT * pDC)432 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
433 {
434     SWR_ASSERT(pDC != nullptr);
435     SWR_ASSERT(pDC->pState != nullptr);
436 
437     return pDC->pState->pPrivateState;
438 }
439 
440 class HotTileMgr;
441 
442 struct SWR_CONTEXT
443 {
444     // Draw Context Ring
445     //  Each draw needs its own state in order to support mulitple draws in flight across multiple threads.
446     //  We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
447     //  of draws that can be in flight at any given time.
448     //
449     //  Description:
450     //  1. State - When an application first sets state we'll request a new draw context to use.
451     //     a. If there are no available draw contexts then we'll have to wait until one becomes free.
452     //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
453     //     c. All state calls set state on pCurDrawContext.
454     //  2. Draw - Creates submits a work item that is associated with current draw context.
455     //     a. Set pPrevDrawContext = pCurDrawContext
456     //     b. Set pCurDrawContext to NULL.
457     //  3. State - When an applications sets state after draw
458     //     a. Same as step 1.
459     //     b. State is copied from prev draw context to current.
460     RingBuffer<DRAW_CONTEXT> dcRing;
461 
462     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
463     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
464 
465     MacroTileMgr* pMacroTileManagerArray;
466     DispatchQueue* pDispatchQueueArray;
467 
468     // Draw State Ring
469     //  When draw are very large (lots of primitives) then the API thread will break these up.
470     //  These split draws all have identical state. So instead of storing the state directly
471     //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
472     //  to reference a single entry in the DS ring.
473     RingBuffer<DRAW_STATE> dsRing;
474 
475     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
476 
477     uint32_t NumWorkerThreads;
478     uint32_t NumFEThreads;
479     uint32_t NumBEThreads;
480 
481     THREAD_POOL threadPool; // Thread pool associated with this context
482     SWR_THREADING_INFO threadInfo;
483     SWR_API_THREADING_INFO apiThreadInfo;
484 
485     uint32_t MAX_DRAWS_IN_FLIGHT;
486 
487     std::condition_variable FifosNotEmpty;
488     std::mutex WaitLock;
489 
490     uint32_t privateStateSize;
491 
492     HotTileMgr *pHotTileMgr;
493 
494     // Callback functions, passed in at create context time
495     PFN_LOAD_TILE               pfnLoadTile;
496     PFN_STORE_TILE              pfnStoreTile;
497     PFN_CLEAR_TILE              pfnClearTile;
498     PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
499     PFN_UPDATE_STATS            pfnUpdateStats;
500     PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
501 
502 
503     // Global Stats
504     SWR_STATS* pStats;
505 
506     // Scratch space for workers.
507     uint8_t** ppScratch;
508 
509     volatile OSALIGNLINE(uint32_t)  drawsOutstandingFE;
510 
511     OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
512     uint32_t frameCount;
513 
514     uint32_t lastFrameChecked;
515     uint64_t lastDrawChecked;
516     TileSet singleThreadLockedTiles;
517 
518     // ArchRast thread contexts.
519     HANDLE* pArContext;
520 };
521 
// Pipeline statistics helpers.
//   UPDATE_STAT_BE adds `count` to the per-worker backend counter `name`
//   when backend stats are enabled; UPDATE_STAT_FE does the same for the
//   frontend counter held in the draw context.
// Both expect `pDC` in scope at the call site; UPDATE_STAT_BE also expects `workerId`.
// Each expands to a single statement (do/while(0)), so it can be the body of
// an un-braced if/else; call sites must end the invocation with ';'.
#define UPDATE_STAT_BE(name, count) \
    do { if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += (count); } } while (0)
#define UPDATE_STAT_FE(name, count) \
    do { if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += (count); } } while (0)
524 
// ArchRast instrumentation framework
//   Per-thread ArchRast contexts live in pContext->pArContext: workers use
//   their workerId as the index, the api thread uses the slot after the last
//   worker. Both macros expect `pContext` in scope (and `workerId` for workers).
#define AR_WORKER_CTX  pContext->pArContext[workerId]
#define AR_API_CTX     pContext->pArContext[pContext->NumWorkerThreads]

// Back-end selection:
//   KNOB_ENABLE_AR    -> events are dispatched to ArchRast.
//   KNOB_ENABLE_RDTSC -> begin/end map onto RDTSC buckets; events and flushes are no-ops.
//   neither           -> everything is a no-op (begin still evaluates ctx as a discarded expression).
#if defined(KNOB_ENABLE_AR)
    #define _AR_BEGIN(ctx, type, id)    ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id))
    #define _AR_END(ctx, type, count)   ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count))
    #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
    #define _AR_FLUSH(ctx, id)          ArchRast::FlushDraw(ctx, id)
#elif defined(KNOB_ENABLE_RDTSC)
    #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type)
    #define _AR_END(ctx, type, id)   RDTSC_STOP(type, id, 0)
    #define _AR_EVENT(ctx, event)
    #define _AR_FLUSH(ctx, id)
#else
    #define _AR_BEGIN(ctx, type, id) (void)ctx
    #define _AR_END(ctx, type, id)
    #define _AR_EVENT(ctx, event)
    #define _AR_FLUSH(ctx, id)
#endif

// Use these macros for api thread.
#define AR_API_BEGIN(type, id)   _AR_BEGIN(AR_API_CTX, type, id)
#define AR_API_END(type, count)  _AR_END(AR_API_CTX, type, count)
#define AR_API_EVENT(event)      _AR_EVENT(AR_API_CTX, event)

// Use these macros for worker threads.
#define AR_BEGIN(type, id)       _AR_BEGIN(AR_WORKER_CTX, type, id)
#define AR_END(type, count)      _AR_END(AR_WORKER_CTX, type, count)
#define AR_EVENT(event)          _AR_EVENT(AR_WORKER_CTX, event)
#define AR_FLUSH(id)             _AR_FLUSH(AR_WORKER_CTX, id)
556