1 /**************************************************************************
2  *
3  * Copyright 2017 Advanced Micro Devices, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * on the rights to use, copy, modify, merge, publish, distribute, sub
10  * license, and/or sell copies of the Software, and to permit persons to whom
11  * the Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
20  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23  * USE OR OTHER DEALINGS IN THE SOFTWARE.
24  *
25  **************************************************************************/
26 
27 /* This is a wrapper for pipe_context that executes all pipe_context calls
28  * in another thread.
29  *
30  *
31  * Guidelines for adopters and deviations from Gallium
32  * ---------------------------------------------------
33  *
34  * 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
35  *    driver functions that take a context (fence_finish, texture_get_handle)
36  *    should manually unwrap pipe_context by doing:
37  *      pipe = threaded_context_unwrap_sync(pipe);
38  *
39  *    pipe_context::priv is used to unwrap the context, so drivers and state
40  *    trackers shouldn't use it.
41  *
42  *    No other objects are wrapped.
43  *
44  * 2) Drivers must subclass and initialize these structures:
45  *    - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
46  *    - threaded_query for pipe_query (zero memory)
47  *    - threaded_transfer for pipe_transfer (zero memory)
48  *
49  * 3) The threaded context must not be enabled for contexts that can use video
50  *    codecs.
51  *
52  * 4) Changes in driver behavior:
53  *    - begin_query and end_query always return true; return values from
54  *      the driver are ignored.
55  *    - generate_mipmap uses is_format_supported to determine success;
56  *      the return value from the driver is ignored.
57  *    - resource_commit always returns true; failures are ignored.
58  *    - set_debug_callback is skipped if the callback is synchronous.
59  *
60  *
61  * Thread-safety requirements on context functions
62  * -----------------------------------------------
63  *
64  * These pipe_context functions are executed directly, so they shouldn't use
65  * pipe_context in an unsafe way. They are de-facto screen functions now:
66  * - create_query
67  * - create_batch_query
68  * - create_*_state (all CSOs and shaders)
69  *     - Make sure the shader compiler doesn't use any per-context stuff.
70  *       (e.g. LLVM target machine)
71  *     - Only pipe_context's debug callback for shader dumps is guaranteed to
72  *       be up to date, because set_debug_callback synchronizes execution.
73  * - create_surface
74  * - surface_destroy
75  * - create_sampler_view
76  * - sampler_view_destroy
77  * - stream_output_target_destroy
 * - transfer_map (only unsynchronized buffer mappings)
79  * - get_query_result (when threaded_query::flushed == true)
80  *
81  * Create calls causing a sync that can't be async due to driver limitations:
82  * - create_stream_output_target
83  *
84  *
85  * Transfer_map rules for buffer mappings
86  * --------------------------------------
87  *
88  * 1) If transfer_map has PIPE_MAP_UNSYNCHRONIZED, the call is made
89  *    in the non-driver thread without flushing the queue. The driver will
90  *    receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_MAP_-
91  *    UNSYNCHRONIZED to indicate this.
92  *    Note that transfer_unmap is always enqueued and called from the driver
93  *    thread.
94  *
 * 2) The driver isn't allowed to infer unsynchronized mappings by tracking
96  *    the valid buffer range. The threaded context always sends TC_TRANSFER_-
97  *    MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring the flag will lead
98  *    to failures.
99  *    The threaded context does its own detection of unsynchronized mappings.
100  *
101  * 3) The driver isn't allowed to do buffer invalidations by itself under any
 *    circumstances. This is necessary for unsynchronized maps to map the latest
 *    version of the buffer. (because invalidations can be queued, while
 *    unsynchronized maps are not queued and they should return the latest
 *    storage after invalidation). The threaded context always sends
106  *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
107  *    indicate this. Ignoring the flag will lead to failures.
108  *    The threaded context uses its own buffer invalidation mechanism.
109  *
110  *
111  * Rules for fences
112  * ----------------
113  *
114  * Flushes will be executed asynchronously in the driver thread if a
115  * create_fence callback is provided. This affects fence semantics as follows.
116  *
117  * When the threaded context wants to perform an asynchronous flush, it will
118  * use the create_fence callback to pre-create the fence from the calling
119  * thread. This pre-created fence will be passed to pipe_context::flush
120  * together with the TC_FLUSH_ASYNC flag.
121  *
122  * The callback receives the unwrapped context as a parameter, but must use it
123  * in a thread-safe way because it is called from a non-driver thread.
124  *
125  * If the threaded_context does not immediately flush the current batch, the
126  * callback also receives a tc_unflushed_batch_token. If fence_finish is called
127  * on the returned fence in the context that created the fence,
128  * threaded_context_flush must be called.
129  *
130  * The driver must implement pipe_context::fence_server_sync properly, since
131  * the threaded context handles PIPE_FLUSH_ASYNC.
132  *
133  *
134  * Additional requirements
135  * -----------------------
136  *
137  * get_query_result:
138  *    If threaded_query::flushed == true, get_query_result should assume that
139  *    it's called from a non-driver thread, in which case the driver shouldn't
140  *    use the context in an unsafe way.
141  *
142  * replace_buffer_storage:
143  *    The driver has to implement this callback, which will be called when
144  *    the threaded context wants to replace a resource's backing storage with
145  *    another resource's backing storage. The threaded context uses it to
146  *    implement buffer invalidation. This call is always queued.
147  *
148  * pipe_context::multi_draw() must be implemented.
149  *
150  *
151  * Performance gotchas
152  * -------------------
153  *
154  * Buffer invalidations are done unconditionally - they don't check whether
155  * the buffer is busy. This can cause drivers to have more live allocations
156  * and CPU mappings than necessary.
157  *
158  *
159  * How it works (queue architecture)
160  * ---------------------------------
161  *
162  * There is a multithreaded queue consisting of batches, each batch consisting
163  * of call slots. Each call slot consists of an 8-byte header (call ID +
164  * call size + constant 32-bit marker for integrity checking) and an 8-byte
165  * body for per-call data. That is 16 bytes per call slot.
166  *
167  * Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
168  * calls occupy multiple call slots depending on the size needed by call
169  * parameters. That means that calls can have a variable size in the batch.
170  * For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
171  * 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
172  * Even though the first call slot can use only 8 bytes for data, additional
173  * call slots used by the same call can use all 16 bytes for data.
174  * For example, a call using 2 call slots has 24 bytes of space for data.
175  *
176  * Once a batch is full and there is no space for the next call, it's flushed,
177  * meaning that it's added to the queue for execution in the other thread.
178  * The batches are ordered in a ring and reused once they are idle again.
179  * The batching is necessary for low queue/mutex overhead.
180  *
181  */
182 
183 #ifndef U_THREADED_CONTEXT_H
184 #define U_THREADED_CONTEXT_H
185 
186 #include "pipe/p_context.h"
187 #include "pipe/p_state.h"
188 #include "util/u_inlines.h"
189 #include "util/u_queue.h"
190 #include "util/u_range.h"
191 #include "util/slab.h"
192 
193 struct threaded_context;
194 struct tc_unflushed_batch_token;
195 
196 /* These are map flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
198 #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
199 /* Don't invalidate buffers: */
200 #define TC_TRANSFER_MAP_NO_INVALIDATE        (1u << 30)
201 /* transfer_map is called from a non-driver thread: */
202 #define TC_TRANSFER_MAP_THREADED_UNSYNC      (1u << 31)
203 
204 /* Custom flush flags sent to drivers. */
205 /* fence is pre-populated with a fence created by the create_fence callback */
206 #define TC_FLUSH_ASYNC        (1u << 31)
207 
208 /* Size of the queue = number of batch slots in memory.
209  * - 1 batch is always idle and records new commands
210  * - 1 batch is being executed
211  * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
212  *
213  * Use a size as small as possible for low CPU L2 cache usage but large enough
214  * so that the queue isn't stalled too often for not having enough idle batch
215  * slots.
216  */
217 #define TC_MAX_BATCHES        10
218 
219 /* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
220  * can occupy multiple call slots.
221  *
222  * The idea is to have batches as small as possible but large enough so that
223  * the queuing and mutex overhead is negligible.
224  */
225 #define TC_CALLS_PER_BATCH    768
226 
227 /* Threshold for when to use the queue or sync. */
228 #define TC_MAX_STRING_MARKER_BYTES  512
229 
230 /* Threshold for when to enqueue buffer/texture_subdata as-is.
231  * If the upload size is greater than this, it will do instead:
232  * - for buffers: DISCARD_RANGE is done by the threaded context
233  * - for textures: sync and call the driver directly
234  */
235 #define TC_MAX_SUBDATA_BYTES        320
236 
/* Driver callback that replaces "dst"'s backing storage with "src"'s backing
 * storage. The threaded context uses it to implement buffer invalidation;
 * this call is always queued, i.e. executed in the driver thread.
 * (See "replace_buffer_storage" under "Additional requirements" above.) */
typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src);
/* Driver callback that pre-creates a fence for an asynchronous flush.
 * "ctx" is the unwrapped driver context, but the callback is invoked from a
 * non-driver thread, so it must use the context in a thread-safe way.
 * "token" is non-NULL if the current batch isn't flushed immediately.
 * (See "Rules for fences" above.) */
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
                                                          struct tc_unflushed_batch_token *token);
242 
/* Per-resource state required by the threaded context. Drivers must subclass
 * pipe_resource with this and use threaded_resource_init/deinit (rule 2 in
 * the header comment). */
struct threaded_resource {
   struct pipe_resource b;    /* base class */

   /* Resource function table (see u_resource_vtbl in the util headers). */
   const struct u_resource_vtbl *vtbl;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for unsynchro-
    * nized mappings in the non-driver thread. Initially it's set to &b.
    */
   struct pipe_resource *latest;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* If "this" is not the base instance of the buffer, but it's one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &valid_buffer_range.
    */
   struct util_range *base_valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;

   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
    * method of CPU access when map flags allow it. Useful for buffers that
    * are too large for the visible VRAM window.
    */
   int max_forced_staging_uploads;
};
286 
/* Per-transfer state required by the threaded context. Drivers must subclass
 * pipe_transfer with this and zero the memory (rule 2 in the header comment). */
struct threaded_transfer {
   struct pipe_transfer b; /* base class */

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* Offset into the staging buffer, because the backing buffer is
    * sub-allocated. */
   unsigned offset;
};
297 
/* Per-query state required by the threaded context. Drivers must subclass
 * pipe_query with this and zero the memory (rule 2 in the header comment). */
struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called in non-deferred mode after end_query.
    * When true, get_query_result may be called from a non-driver thread
    * (see "Additional requirements" above). */
   bool flushed;
};
305 
/* This is the second half of tc_call containing call data.
 * Most calls will typecast this to the type they need, typically larger
 * than 8 bytes. Which member (or typecast) is valid depends on the call ID;
 * additional call slots directly follow this union when a call needs more
 * space (see the queue-architecture notes in the header comment). */
union tc_payload {
   struct pipe_query *query;
   struct pipe_resource *resource;
   struct pipe_transfer *transfer;
   struct pipe_fence_handle *fence;
   uint64_t handle;
   bool boolean;
};
318 
/* Compiler-portable 16-byte alignment qualifier for struct declarations
 * (MSVC vs. GCC/Clang spelling). Used to align tc_call to its own size. */
#ifdef _MSC_VER
#define ALIGN16 __declspec(align(16))
#else
#define ALIGN16 __attribute__((aligned(16)))
#endif
324 
/* Each call slot should be aligned to its own size for optimal cache usage.
 * A call slot is 16 bytes: an 8-byte header (sentinel + size + ID) and an
 * 8-byte payload; larger calls span multiple consecutive slots (see the
 * queue-architecture notes in the header comment). */
struct ALIGN16 tc_call {
   unsigned sentinel;       /* constant marker for integrity checking */
   ushort num_call_slots;   /* how many 16-byte slots this call occupies */
   ushort call_id;          /* identifies the pipe_context call to execute */
   union tc_payload payload; /* first 8 bytes of per-call data */
};
332 
/**
 * A token representing an unflushed batch.
 *
 * See the general rules for fences for an explanation.
 */
struct tc_unflushed_batch_token {
   /* Reference count, managed by tc_unflushed_batch_token_reference(). */
   struct pipe_reference ref;
   /* The threaded context that owns the unflushed batch.
    * NOTE(review): presumably cleared once the batch is flushed — confirm
    * against the implementation. */
   struct threaded_context *tc;
};
342 
/* One batch of queued calls. Batches are ordered in a ring and reused once
 * idle (see the queue-architecture notes in the header comment). */
struct tc_batch {
   /* NOTE(review): presumably the wrapped driver context the calls execute
    * on — confirm against the implementation. */
   struct pipe_context *pipe;
   unsigned sentinel;               /* constant marker for integrity checking */
   unsigned num_total_call_slots;   /* number of slots currently used in "call" */
   /* Token for fences created while this batch was unflushed, if any. */
   struct tc_unflushed_batch_token *token;
   /* Queue fence; signaled when the batch has finished executing. */
   struct util_queue_fence fence;
   struct tc_call call[TC_CALLS_PER_BATCH];
};
351 
/* The threaded context itself: a pipe_context wrapper that records calls and
 * replays them in a driver thread (see the header comment for the rules). */
struct threaded_context {
   struct pipe_context base;  /* the wrapper context handed to state trackers */
   struct pipe_context *pipe; /* the wrapped driver context */
   /* Child of the slab parent pool passed to threaded_context_create();
    * used for pipe_transfer allocations. */
   struct slab_child_pool pool_transfers;
   tc_replace_buffer_storage_func replace_buffer_storage; /* required; see typedef */
   tc_create_fence_func create_fence; /* optional; enables async flushes */
   /* NOTE(review): presumably the driver's minimum alignment for buffer
    * mappings — confirm against the implementation. */
   unsigned map_buffer_alignment;

   /* Queries that have seen end_query but no non-deferred flush yet
    * (linked through threaded_query::head_unflushed). */
   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots; /* call slots executed asynchronously */
   unsigned num_direct_slots;    /* call slots executed synchronously */
   unsigned num_syncs;           /* number of synchronizations with the driver thread */

   /* Estimation of how much vram/gtt bytes are mmap'd in
    * the current tc_batch.
    */
   uint64_t bytes_mapped_estimate;
   uint64_t bytes_mapped_limit;

   struct util_queue queue; /* executes batches in the driver thread */
   struct util_queue_fence *fence;

   /* Indices into the batch ring below.
    * NOTE(review): "next" appears to be the batch being recorded; confirm
    * the exact meaning of "last" in the .c file. */
   unsigned last, next;
   struct tc_batch batch_slots[TC_MAX_BATCHES];
};
379 
/* Initialize the threaded_resource fields of a driver resource.
 * Drivers must call this for every pipe_resource (rule 2 above). */
void threaded_resource_init(struct pipe_resource *res);
/* Counterpart of threaded_resource_init; call before destroying the resource. */
void threaded_resource_deinit(struct pipe_resource *res);
/* Unwrap a (possibly) threaded context for direct driver use. Intended for
 * pipe_screen functions that receive a context (rule 1 above).
 * NOTE(review): the "_sync" suffix suggests it synchronizes with the driver
 * thread first — confirm in the implementation. */
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);

/* Create a threaded context wrapping "pipe".
 * \param pipe                  the driver context to wrap
 * \param parent_transfer_pool  slab parent pool for pipe_transfer allocations
 * \param replace_buffer        required driver callback for buffer invalidation
 * \param create_fence          optional callback; if provided, flushes can be
 *                              executed asynchronously (see "Rules for fences")
 * \param out                   NOTE(review): presumably receives the created
 *                              threaded_context if non-NULL — confirm
 * \return the wrapping pipe_context to hand to state trackers
 */
struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        tc_create_fence_func create_fence,
                        struct threaded_context **out);

/* Make sure the batch identified by "token" has been flushed and executed.
 * Must be called when fence_finish is called on a fence that was returned
 * together with a tc_unflushed_batch_token, in the context that created the
 * fence (see "Rules for fences" above). */
void
threaded_context_flush(struct pipe_context *_pipe,
                       struct tc_unflushed_batch_token *token,
                       bool prefer_async);
395 
/* Downcast a wrapper pipe_context to its threaded_context.
 * Valid because threaded_context embeds pipe_context as its first member. */
static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   struct threaded_context *tc = (struct threaded_context *)pipe;
   return tc;
}
401 
/* Downcast a pipe_resource to its threaded_resource.
 * Valid because threaded_resource embeds pipe_resource as its first member. */
static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   struct threaded_resource *tres = (struct threaded_resource *)res;
   return tres;
}
407 
/* Downcast a pipe_query to its threaded_query (rule 2: drivers subclass
 * pipe_query with threaded_query). */
static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   struct threaded_query *tq = (struct threaded_query *)q;
   return tq;
}
413 
/* Downcast a pipe_transfer to its threaded_transfer.
 * Valid because threaded_transfer embeds pipe_transfer as its first member. */
static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   struct threaded_transfer *ttrans = (struct threaded_transfer *)transfer;
   return ttrans;
}
419 
/* Standard Gallium-style reference-counted assignment for unflushed-batch
 * tokens: release *dst (freeing it when its refcount drops to zero), then
 * make *dst point at src. Both arguments may be NULL; pipe_reference
 * handles NULL pointers. */
static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                   struct tc_unflushed_batch_token *src)
{
   struct tc_unflushed_batch_token *old = *dst;

   /* pipe_reference returns true when the old object must be destroyed. */
   if (pipe_reference((struct pipe_reference *)old,
                      (struct pipe_reference *)src))
      free(old);

   *dst = src;
}
428 
429 #endif
430