1 /*
2  * Copyright © 2008 Jérôme Glisse
3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4  * Copyright © 2015 Advanced Micro Devices, Inc.
5  * All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sub license, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
19  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * The above copyright notice and this permission notice (including the
25  * next paragraph) shall be included in all copies or substantial portions
26  * of the Software.
27  */
28 
29 #include "amdgpu_cs.h"
30 #include "util/os_time.h"
31 #include <inttypes.h>
32 #include <stdio.h>
33 
34 #include "amd/common/sid.h"
35 
/* Generates the debug_get_option_noop() accessor that
 * amdgpu_cs_get_next_fence() consults. Presumably it reads the RADEON_NOOP
 * environment variable once and defaults to false -- confirm against the
 * DEBUG_GET_ONCE_BOOL_OPTION definition. */
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
37 
38 /* FENCES */
39 
40 static struct pipe_fence_handle *
amdgpu_fence_create(struct amdgpu_ctx * ctx,unsigned ip_type,unsigned ip_instance,unsigned ring)41 amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
42                     unsigned ip_instance, unsigned ring)
43 {
44    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
45 
46    fence->reference.count = 1;
47    fence->ws = ctx->ws;
48    fence->ctx = ctx;
49    fence->fence.context = ctx->ctx;
50    fence->fence.ip_type = ip_type;
51    fence->fence.ip_instance = ip_instance;
52    fence->fence.ring = ring;
53    util_queue_fence_init(&fence->submitted);
54    util_queue_fence_reset(&fence->submitted);
55    p_atomic_inc(&ctx->refcount);
56    return (struct pipe_fence_handle *)fence;
57 }
58 
59 static struct pipe_fence_handle *
amdgpu_fence_import_sync_file(struct radeon_winsys * rws,int fd)60 amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
61 {
62    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
63    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
64 
65    if (!fence)
66       return NULL;
67 
68    pipe_reference_init(&fence->reference, 1);
69    fence->ws = ws;
70    /* fence->ctx == NULL means that the fence is syncobj-based. */
71 
72    /* Convert sync_file into syncobj. */
73    int r = amdgpu_cs_create_syncobj(ws->dev, &fence->syncobj);
74    if (r) {
75       FREE(fence);
76       return NULL;
77    }
78 
79    r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
80    if (r) {
81       amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
82       FREE(fence);
83       return NULL;
84    }
85 
86    util_queue_fence_init(&fence->submitted);
87 
88    return (struct pipe_fence_handle*)fence;
89 }
90 
amdgpu_fence_export_sync_file(struct radeon_winsys * rws,struct pipe_fence_handle * pfence)91 static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
92 					 struct pipe_fence_handle *pfence)
93 {
94    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
95    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
96 
97    if (amdgpu_fence_is_syncobj(fence)) {
98       int fd, r;
99 
100       /* Convert syncobj into sync_file. */
101       r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
102       return r ? -1 : fd;
103    }
104 
105    util_queue_fence_wait(&fence->submitted);
106 
107    /* Convert the amdgpu fence into a fence FD. */
108    int fd;
109    if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence,
110                                  AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD,
111                                  (uint32_t*)&fd))
112       return -1;
113 
114    return fd;
115 }
116 
amdgpu_export_signalled_sync_file(struct radeon_winsys * rws)117 static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
118 {
119    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
120    uint32_t syncobj;
121    int fd = -1;
122 
123    int r = amdgpu_cs_create_syncobj2(ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
124                                      &syncobj);
125    if (r) {
126       return -1;
127    }
128 
129    r = amdgpu_cs_syncobj_export_sync_file(ws->dev, syncobj, &fd);
130    if (r) {
131       fd = -1;
132    }
133 
134    amdgpu_cs_destroy_syncobj(ws->dev, syncobj);
135    return fd;
136 }
137 
amdgpu_fence_submitted(struct pipe_fence_handle * fence,uint64_t seq_no,uint64_t * user_fence_cpu_address)138 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
139                                    uint64_t seq_no,
140                                    uint64_t *user_fence_cpu_address)
141 {
142    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
143 
144    rfence->fence.fence = seq_no;
145    rfence->user_fence_cpu_address = user_fence_cpu_address;
146    util_queue_fence_signal(&rfence->submitted);
147 }
148 
amdgpu_fence_signalled(struct pipe_fence_handle * fence)149 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
150 {
151    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
152 
153    rfence->signalled = true;
154    util_queue_fence_signal(&rfence->submitted);
155 }
156 
/* Wait for a fence to signal.
 *
 * fence:    the fence to wait on (an amdgpu_fence behind the opaque handle).
 * timeout:  an absolute deadline when "absolute" is true; otherwise a
 *           relative timeout that is converted with
 *           os_time_get_absolute_timeout().
 * absolute: selects how "timeout" is interpreted; must be false for
 *           syncobj-based fences (asserted).
 *
 * Returns true if the fence is known to have signalled (the result is cached
 * in rfence->signalled), false on timeout or when a libdrm call fails.
 */
bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                       bool absolute)
{
   struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
   uint32_t expired;
   int64_t abs_timeout;
   uint64_t *user_fence_cpu;
   int r;

   /* Fast path: a previous wait already observed the signal. */
   if (rfence->signalled)
      return true;

   /* Handle syncobjs. */
   if (amdgpu_fence_is_syncobj(rfence)) {
      /* Absolute timeouts are only used by BO fences, which aren't
       * backed by syncobjs.
       */
      assert(!absolute);

      if (amdgpu_cs_syncobj_wait(rfence->ws->dev, &rfence->syncobj, 1,
                                 timeout, 0, NULL))
         return false;

      rfence->signalled = true;
      return true;
   }

   if (absolute)
      abs_timeout = timeout;
   else
      abs_timeout = os_time_get_absolute_timeout(timeout);

   /* The fence might not have a number assigned if its IB is being
    * submitted in the other thread right now. Wait until the submission
    * is done. */
   if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout))
      return false;

   /* Check the CPU-visible fence value first; this avoids the ioctl when
    * the fence has already passed. */
   user_fence_cpu = rfence->user_fence_cpu_address;
   if (user_fence_cpu) {
      if (*user_fence_cpu >= rfence->fence.fence) {
         rfence->signalled = true;
         return true;
      }

      /* No timeout, just query: no need for the ioctl. */
      if (!absolute && !timeout)
         return false;
   }

   /* Now use the libdrm query. */
   r = amdgpu_cs_query_fence_status(&rfence->fence,
				    abs_timeout,
				    AMDGPU_QUERY_FENCE_TIMEOUT_IS_ABSOLUTE,
				    &expired);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_cs_query_fence_status failed.\n");
      return false;
   }

   if (expired) {
      /* This variable can only transition from false to true, so it doesn't
       * matter if threads race for it. */
      rfence->signalled = true;
      return true;
   }
   return false;
}
225 
/* Wait on "fence" with a relative timeout by forwarding to
 * amdgpu_fence_wait() with absolute == false. "rws" is not used.
 * Returns what amdgpu_fence_wait() returns.
 */
static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
                                          struct pipe_fence_handle *fence,
                                          uint64_t timeout)
{
   return amdgpu_fence_wait(fence, timeout, false);
}
232 
233 static struct pipe_fence_handle *
amdgpu_cs_get_next_fence(struct radeon_winsys_cs * rcs)234 amdgpu_cs_get_next_fence(struct radeon_winsys_cs *rcs)
235 {
236    struct amdgpu_cs *cs = amdgpu_cs(rcs);
237    struct pipe_fence_handle *fence = NULL;
238 
239    if (debug_get_option_noop())
240       return NULL;
241 
242    if (cs->next_fence) {
243       amdgpu_fence_reference(&fence, cs->next_fence);
244       return fence;
245    }
246 
247    fence = amdgpu_fence_create(cs->ctx,
248                                cs->csc->ib[IB_MAIN].ip_type,
249                                cs->csc->ib[IB_MAIN].ip_instance,
250                                cs->csc->ib[IB_MAIN].ring);
251    if (!fence)
252       return NULL;
253 
254    amdgpu_fence_reference(&cs->next_fence, fence);
255    return fence;
256 }
257 
258 /* CONTEXTS */
259 
amdgpu_ctx_create(struct radeon_winsys * ws)260 static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *ws)
261 {
262    struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
263    int r;
264    struct amdgpu_bo_alloc_request alloc_buffer = {};
265    amdgpu_bo_handle buf_handle;
266 
267    if (!ctx)
268       return NULL;
269 
270    ctx->ws = amdgpu_winsys(ws);
271    ctx->refcount = 1;
272    ctx->initial_num_total_rejected_cs = ctx->ws->num_total_rejected_cs;
273 
274    r = amdgpu_cs_ctx_create(ctx->ws->dev, &ctx->ctx);
275    if (r) {
276       fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create failed. (%i)\n", r);
277       goto error_create;
278    }
279 
280    alloc_buffer.alloc_size = ctx->ws->info.gart_page_size;
281    alloc_buffer.phys_alignment = ctx->ws->info.gart_page_size;
282    alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
283 
284    r = amdgpu_bo_alloc(ctx->ws->dev, &alloc_buffer, &buf_handle);
285    if (r) {
286       fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
287       goto error_user_fence_alloc;
288    }
289 
290    r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
291    if (r) {
292       fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
293       goto error_user_fence_map;
294    }
295 
296    memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
297    ctx->user_fence_bo = buf_handle;
298 
299    return (struct radeon_winsys_ctx*)ctx;
300 
301 error_user_fence_map:
302    amdgpu_bo_free(buf_handle);
303 error_user_fence_alloc:
304    amdgpu_cs_ctx_free(ctx->ctx);
305 error_create:
306    FREE(ctx);
307    return NULL;
308 }
309 
/* Winsys entry point for destroying a context: passes the context to
 * amdgpu_ctx_unref(). NOTE(review): presumably the context is only freed
 * once fences holding a reference are gone -- confirm in amdgpu_ctx_unref().
 */
static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
{
   amdgpu_ctx_unref((struct amdgpu_ctx*)rwctx);
}
314 
315 static enum pipe_reset_status
amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx * rwctx)316 amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx)
317 {
318    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
319    uint32_t result, hangs;
320    int r;
321 
322    /* Return a failure due to a rejected command submission. */
323    if (ctx->ws->num_total_rejected_cs > ctx->initial_num_total_rejected_cs) {
324       return ctx->num_rejected_cs ? PIPE_GUILTY_CONTEXT_RESET :
325                                     PIPE_INNOCENT_CONTEXT_RESET;
326    }
327 
328    /* Return a failure due to a GPU hang. */
329    r = amdgpu_cs_query_reset_state(ctx->ctx, &result, &hangs);
330    if (r) {
331       fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state failed. (%i)\n", r);
332       return PIPE_NO_RESET;
333    }
334 
335    switch (result) {
336    case AMDGPU_CTX_GUILTY_RESET:
337       return PIPE_GUILTY_CONTEXT_RESET;
338    case AMDGPU_CTX_INNOCENT_RESET:
339       return PIPE_INNOCENT_CONTEXT_RESET;
340    case AMDGPU_CTX_UNKNOWN_RESET:
341       return PIPE_UNKNOWN_CONTEXT_RESET;
342    case AMDGPU_CTX_NO_RESET:
343    default:
344       return PIPE_NO_RESET;
345    }
346 }
347 
348 /* COMMAND SUBMISSION */
349 
amdgpu_cs_has_user_fence(struct amdgpu_cs_context * cs)350 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs_context *cs)
351 {
352    return cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_UVD &&
353           cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCE &&
354           cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_DEC &&
355           cs->ib[IB_MAIN].ip_type != AMDGPU_HW_IP_VCN_ENC;
356 }
357 
amdgpu_cs_has_chaining(struct amdgpu_cs * cs)358 static bool amdgpu_cs_has_chaining(struct amdgpu_cs *cs)
359 {
360    return cs->ctx->ws->info.chip_class >= CIK &&
361           cs->ring_type == RING_GFX;
362 }
363 
amdgpu_cs_epilog_dws(enum ring_type ring_type)364 static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type)
365 {
366    if (ring_type == RING_GFX)
367       return 4; /* for chaining */
368 
369    return 0;
370 }
371 
/* Find "bo" in the buffer list of "cs" that matches its kind.
 *
 * The list is chosen from the BO itself: real_buffers when bo->bo is set,
 * otherwise slab_buffers unless the BO is sparse, in which case
 * sparse_buffers. One hash list (indexed by the low bits of bo->unique_id)
 * caches the most recent index for all three lists, so a cached index is
 * only trusted if it is in range and the entry really holds "bo".
 *
 * Returns the index within the selected list, or -1 if the BO is not there.
 */
int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
{
   /* The mask assumes the hash list size is a power of two. */
   unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
   int i = cs->buffer_indices_hashlist[hash];
   struct amdgpu_cs_buffer *buffers;
   int num_buffers;

   if (bo->bo) {
      buffers = cs->real_buffers;
      num_buffers = cs->num_real_buffers;
   } else if (!bo->sparse) {
      buffers = cs->slab_buffers;
      num_buffers = cs->num_slab_buffers;
   } else {
      buffers = cs->sparse_buffers;
      num_buffers = cs->num_sparse_buffers;
   }

   /* not found or found */
   if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of buffers linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this buffer in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of buffers:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         cs->buffer_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}
412 
413 static int
amdgpu_do_add_real_buffer(struct amdgpu_cs_context * cs,struct amdgpu_winsys_bo * bo)414 amdgpu_do_add_real_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
415 {
416    struct amdgpu_cs_buffer *buffer;
417    int idx;
418 
419    /* New buffer, check if the backing array is large enough. */
420    if (cs->num_real_buffers >= cs->max_real_buffers) {
421       unsigned new_max =
422          MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3));
423       struct amdgpu_cs_buffer *new_buffers;
424 
425       new_buffers = MALLOC(new_max * sizeof(*new_buffers));
426 
427       if (!new_buffers) {
428          fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
429          FREE(new_buffers);
430          return -1;
431       }
432 
433       memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers));
434 
435       FREE(cs->real_buffers);
436 
437       cs->max_real_buffers = new_max;
438       cs->real_buffers = new_buffers;
439    }
440 
441    idx = cs->num_real_buffers;
442    buffer = &cs->real_buffers[idx];
443 
444    memset(buffer, 0, sizeof(*buffer));
445    amdgpu_winsys_bo_reference(&buffer->bo, bo);
446    p_atomic_inc(&bo->num_cs_references);
447    cs->num_real_buffers++;
448 
449    return idx;
450 }
451 
452 static int
amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs * acs,struct amdgpu_winsys_bo * bo)453 amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo)
454 {
455    struct amdgpu_cs_context *cs = acs->csc;
456    unsigned hash;
457    int idx = amdgpu_lookup_buffer(cs, bo);
458 
459    if (idx >= 0)
460       return idx;
461 
462    idx = amdgpu_do_add_real_buffer(cs, bo);
463 
464    hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
465    cs->buffer_indices_hashlist[hash] = idx;
466 
467    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
468       acs->main.base.used_vram += bo->base.size;
469    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
470       acs->main.base.used_gart += bo->base.size;
471 
472    return idx;
473 }
474 
amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs * acs,struct amdgpu_winsys_bo * bo)475 static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs,
476                                             struct amdgpu_winsys_bo *bo)
477 {
478    struct amdgpu_cs_context *cs = acs->csc;
479    struct amdgpu_cs_buffer *buffer;
480    unsigned hash;
481    int idx = amdgpu_lookup_buffer(cs, bo);
482    int real_idx;
483 
484    if (idx >= 0)
485       return idx;
486 
487    real_idx = amdgpu_lookup_or_add_real_buffer(acs, bo->u.slab.real);
488    if (real_idx < 0)
489       return -1;
490 
491    /* New buffer, check if the backing array is large enough. */
492    if (cs->num_slab_buffers >= cs->max_slab_buffers) {
493       unsigned new_max =
494          MAX2(cs->max_slab_buffers + 16, (unsigned)(cs->max_slab_buffers * 1.3));
495       struct amdgpu_cs_buffer *new_buffers;
496 
497       new_buffers = REALLOC(cs->slab_buffers,
498                             cs->max_slab_buffers * sizeof(*new_buffers),
499                             new_max * sizeof(*new_buffers));
500       if (!new_buffers) {
501          fprintf(stderr, "amdgpu_lookup_or_add_slab_buffer: allocation failed\n");
502          return -1;
503       }
504 
505       cs->max_slab_buffers = new_max;
506       cs->slab_buffers = new_buffers;
507    }
508 
509    idx = cs->num_slab_buffers;
510    buffer = &cs->slab_buffers[idx];
511 
512    memset(buffer, 0, sizeof(*buffer));
513    amdgpu_winsys_bo_reference(&buffer->bo, bo);
514    buffer->u.slab.real_idx = real_idx;
515    p_atomic_inc(&bo->num_cs_references);
516    cs->num_slab_buffers++;
517 
518    hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
519    cs->buffer_indices_hashlist[hash] = idx;
520 
521    return idx;
522 }
523 
amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs * acs,struct amdgpu_winsys_bo * bo)524 static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs,
525                                               struct amdgpu_winsys_bo *bo)
526 {
527    struct amdgpu_cs_context *cs = acs->csc;
528    struct amdgpu_cs_buffer *buffer;
529    unsigned hash;
530    int idx = amdgpu_lookup_buffer(cs, bo);
531 
532    if (idx >= 0)
533       return idx;
534 
535    /* New buffer, check if the backing array is large enough. */
536    if (cs->num_sparse_buffers >= cs->max_sparse_buffers) {
537       unsigned new_max =
538          MAX2(cs->max_sparse_buffers + 16, (unsigned)(cs->max_sparse_buffers * 1.3));
539       struct amdgpu_cs_buffer *new_buffers;
540 
541       new_buffers = REALLOC(cs->sparse_buffers,
542                             cs->max_sparse_buffers * sizeof(*new_buffers),
543                             new_max * sizeof(*new_buffers));
544       if (!new_buffers) {
545          fprintf(stderr, "amdgpu_lookup_or_add_sparse_buffer: allocation failed\n");
546          return -1;
547       }
548 
549       cs->max_sparse_buffers = new_max;
550       cs->sparse_buffers = new_buffers;
551    }
552 
553    idx = cs->num_sparse_buffers;
554    buffer = &cs->sparse_buffers[idx];
555 
556    memset(buffer, 0, sizeof(*buffer));
557    amdgpu_winsys_bo_reference(&buffer->bo, bo);
558    p_atomic_inc(&bo->num_cs_references);
559    cs->num_sparse_buffers++;
560 
561    hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
562    cs->buffer_indices_hashlist[hash] = idx;
563 
564    /* We delay adding the backing buffers until we really have to. However,
565     * we cannot delay accounting for memory use.
566     */
567    simple_mtx_lock(&bo->u.sparse.commit_lock);
568 
569    list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
570       if (bo->initial_domain & RADEON_DOMAIN_VRAM)
571          acs->main.base.used_vram += backing->bo->base.size;
572       else if (bo->initial_domain & RADEON_DOMAIN_GTT)
573          acs->main.base.used_gart += backing->bo->base.size;
574    }
575 
576    simple_mtx_unlock(&bo->u.sparse.commit_lock);
577 
578    return idx;
579 }
580 
/* Add a buffer to the current CS context and accumulate how it is used.
 *
 * rcs:      the command stream.
 * buf:      the buffer; treated as an amdgpu_winsys_bo.
 * usage:    usage bits OR-ed into the buffer entry.
 * domains:  ignored (see the comment below).
 * priority: bit index OR-ed into the entry's priority_usage mask.
 *
 * Returns the index of the entry that received the priority: an index into
 * the real-buffer list for real and slab BOs (for a slab BO, the index of
 * its backing real buffer), or into the sparse-buffer list for sparse BOs.
 * Returns 0 if the buffer could not be added, which callers cannot
 * distinguish from a valid index 0.
 */
static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *buf,
                                    enum radeon_bo_usage usage,
                                    enum radeon_bo_domain domains,
                                    enum radeon_bo_priority priority)
{
   /* Don't use the "domains" parameter. Amdgpu doesn't support changing
    * the buffer placement during command submission.
    */
   struct amdgpu_cs *acs = amdgpu_cs(rcs);
   struct amdgpu_cs_context *cs = acs->csc;
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_cs_buffer *buffer;
   int index;

   /* Fast exit for no-op calls.
    * This is very effective with suballocators and linear uploaders that
    * are outside of the winsys.
    */
   if (bo == cs->last_added_bo &&
       (usage & cs->last_added_bo_usage) == usage &&
       (1ull << priority) & cs->last_added_bo_priority_usage)
      return cs->last_added_bo_index;

   if (!bo->sparse) {
      if (!bo->bo) {
         /* Slab BO: no handle of its own, so it is tracked in the slab list
          * and its backing real buffer in the real list. */
         index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
         if (index < 0)
            return 0;

         buffer = &cs->slab_buffers[index];
         buffer->usage |= usage;

         /* The slab entry keeps the full usage; the backing real buffer
          * gets it without RADEON_USAGE_SYNCHRONIZED. From here on "index"
          * refers to the real-buffer list. */
         usage &= ~RADEON_USAGE_SYNCHRONIZED;
         index = buffer->u.slab.real_idx;
      } else {
         index = amdgpu_lookup_or_add_real_buffer(acs, bo);
         if (index < 0)
            return 0;
      }

      buffer = &cs->real_buffers[index];
   } else {
      index = amdgpu_lookup_or_add_sparse_buffer(acs, bo);
      if (index < 0)
         return 0;

      buffer = &cs->sparse_buffers[index];
   }

   /* Note: u.real.priority_usage is also the field used for sparse entries. */
   buffer->u.real.priority_usage |= 1ull << priority;
   buffer->usage |= usage;

   /* Remember this call so that an identical follow-up hits the fast exit. */
   cs->last_added_bo = bo;
   cs->last_added_bo_index = index;
   cs->last_added_bo_usage = buffer->usage;
   cs->last_added_bo_priority_usage = buffer->u.real.priority_usage;
   return index;
}
640 
/* Allocate and map a fresh GTT buffer to hold IBs, and make it the IB's
 * big_ib_buffer (replacing the reference to any previous one).
 *
 * ws:        winsys used to create and map the buffer.
 * ib:        IB whose backing buffer is replaced; on success its mapping is
 *            stored in ib_mapped and used_ib_space is reset to 0.
 * ring_type: GFX, COMPUTE and DMA buffers are created with
 *            RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC.
 *
 * Returns false if the buffer cannot be created or mapped; "ib" is left
 * untouched in that case.
 */
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib,
                                 enum ring_type ring_type)
{
   struct pb_buffer *pb;
   uint8_t *mapped;
   unsigned buffer_size;

   /* Always create a buffer that is at least as large as the maximum seen IB
    * size, aligned to a power of two (and multiplied by 4 to reduce internal
    * fragmentation if chaining is not available). Limit to 512k dwords, which
    * is the largest power of two that fits into the size field of the
    * INDIRECT_BUFFER packet.
    */
   if (amdgpu_cs_has_chaining(amdgpu_cs_from_ib(ib)))
      buffer_size = 4 *util_next_power_of_two(ib->max_ib_size);
   else
      buffer_size = 4 *util_next_power_of_two(4 * ib->max_ib_size);

   buffer_size = MIN2(buffer_size, 4 * 512 * 1024);

   /* Per-IB-type lower bound on the buffer size. */
   switch (ib->ib_type) {
   case IB_MAIN:
      buffer_size = MAX2(buffer_size, 8 * 1024 * 4);
      break;
   default:
      unreachable("unhandled IB type");
   }

   pb = ws->base.buffer_create(&ws->base, buffer_size,
                               ws->info.gart_page_size,
                               RADEON_DOMAIN_GTT,
                               RADEON_FLAG_NO_INTERPROCESS_SHARING |
                               (ring_type == RING_GFX ||
                                ring_type == RING_COMPUTE ||
                                ring_type == RING_DMA ?
                                   RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC : 0));
   if (!pb)
      return false;

   mapped = ws->base.buffer_map(pb, NULL, PIPE_TRANSFER_WRITE);
   if (!mapped) {
      pb_reference(&pb, NULL);
      return false;
   }

   /* Hand the buffer over to the IB, then drop the local reference. */
   pb_reference(&ib->big_ib_buffer, pb);
   pb_reference(&pb, NULL);

   ib->ib_mapped = mapped;
   ib->used_ib_space = 0;

   return true;
}
694 
amdgpu_ib_max_submit_dwords(enum ib_type ib_type)695 static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type)
696 {
697    switch (ib_type) {
698    case IB_MAIN:
699       /* Smaller submits means the GPU gets busy sooner and there is less
700        * waiting for buffers and fences. Proof:
701        *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
702        */
703       return 20 * 1024;
704    default:
705       unreachable("bad ib_type");
706    }
707 }
708 
/* Start a new IB of the given type in "cs".
 *
 * Resets the command-stream chunk state of the IB, makes sure the backing
 * buffer has at least the minimum IB size free (allocating a new backing
 * buffer otherwise), records the IB's GPU start address in the current CS
 * context and points base.current.buf at the free space of the mapping.
 *
 * Returns false only if a new backing buffer was needed and could not be
 * allocated.
 */
static bool amdgpu_get_new_ib(struct radeon_winsys *ws, struct amdgpu_cs *cs,
                              enum ib_type ib_type)
{
   struct amdgpu_winsys *aws = (struct amdgpu_winsys*)ws;
   /* Small IBs are better than big IBs, because the GPU goes idle quicker
    * and there is less waiting for buffers and fences. Proof:
    *   http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1
    */
   struct amdgpu_ib *ib = NULL;
   struct drm_amdgpu_cs_chunk_ib *info = &cs->csc->ib[ib_type];
   unsigned ib_size = 0;

   /* ib_size is the minimum free space, in bytes, required for the new IB. */
   switch (ib_type) {
   case IB_MAIN:
      ib = &cs->main;
      ib_size = 4 * 1024 * 4;
      break;
   default:
      unreachable("unhandled IB type");
   }

   /* Without chaining the IB cannot continue in another buffer, so require
    * room for the largest IB seen so far, capped at the per-submit limit. */
   if (!amdgpu_cs_has_chaining(cs)) {
      ib_size = MAX2(ib_size,
                     4 * MIN2(util_next_power_of_two(ib->max_ib_size),
                              amdgpu_ib_max_submit_dwords(ib_type)));
   }

   /* Shrink the remembered maximum by 1/32 on every new IB; presumably so
    * that it decays when IBs become smaller again (amdgpu_ib_finalize raises
    * it back up). */
   ib->max_ib_size = ib->max_ib_size - ib->max_ib_size / 32;

   ib->base.prev_dw = 0;
   ib->base.num_prev = 0;
   ib->base.current.cdw = 0;
   ib->base.current.buf = NULL;

   /* Allocate a new buffer for IBs if the current buffer is all used. */
   if (!ib->big_ib_buffer ||
       ib->used_ib_space + ib_size > ib->big_ib_buffer->size) {
      if (!amdgpu_ib_new_buffer(aws, ib, cs->ring_type))
         return false;
   }

   info->va_start = amdgpu_winsys_bo(ib->big_ib_buffer)->va + ib->used_ib_space;
   info->ib_bytes = 0;
   /* ib_bytes is in dwords and the conversion to bytes will be done before
    * the CS ioctl. */
   ib->ptr_ib_size = &info->ib_bytes;
   ib->ptr_ib_size_inside_ib = false;

   /* NOTE(review): the result is ignored, and amdgpu_cs_add_buffer() returns
    * 0 on failure anyway, so a failure here goes unnoticed. */
   amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer,
                        RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);

   ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);

   /* Everything that is left in the buffer is available to this IB, minus
    * the dwords reserved for the epilog. */
   ib_size = ib->big_ib_buffer->size - ib->used_ib_space;
   ib->base.current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs->ring_type);
   return true;
}
766 
amdgpu_set_ib_size(struct amdgpu_ib * ib)767 static void amdgpu_set_ib_size(struct amdgpu_ib *ib)
768 {
769    if (ib->ptr_ib_size_inside_ib) {
770       *ib->ptr_ib_size = ib->base.current.cdw |
771                          S_3F2_CHAIN(1) | S_3F2_VALID(1);
772    } else {
773       *ib->ptr_ib_size = ib->base.current.cdw;
774    }
775 }
776 
amdgpu_ib_finalize(struct amdgpu_winsys * ws,struct amdgpu_ib * ib)777 static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
778 {
779    amdgpu_set_ib_size(ib);
780    ib->used_ib_space += ib->base.current.cdw * 4;
781    ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_start_alignment);
782    ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw);
783 }
784 
amdgpu_init_cs_context(struct amdgpu_cs_context * cs,enum ring_type ring_type)785 static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
786                                    enum ring_type ring_type)
787 {
788    switch (ring_type) {
789    case RING_DMA:
790       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_DMA;
791       break;
792 
793    case RING_UVD:
794       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD;
795       break;
796 
797    case RING_VCE:
798       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCE;
799       break;
800 
801    case RING_COMPUTE:
802       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_COMPUTE;
803       break;
804 
805    case RING_VCN_DEC:
806       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_DEC;
807       break;
808 
809   case RING_VCN_ENC:
810       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_ENC;
811       break;
812 
813    default:
814    case RING_GFX:
815       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_GFX;
816       break;
817    }
818 
819    memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
820    cs->last_added_bo = NULL;
821    return true;
822 }
823 
amdgpu_cs_context_cleanup(struct amdgpu_cs_context * cs)824 static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
825 {
826    unsigned i;
827 
828    for (i = 0; i < cs->num_real_buffers; i++) {
829       p_atomic_dec(&cs->real_buffers[i].bo->num_cs_references);
830       amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL);
831    }
832    for (i = 0; i < cs->num_slab_buffers; i++) {
833       p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references);
834       amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL);
835    }
836    for (i = 0; i < cs->num_sparse_buffers; i++) {
837       p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references);
838       amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL);
839    }
840    for (i = 0; i < cs->num_fence_dependencies; i++)
841       amdgpu_fence_reference(&cs->fence_dependencies[i], NULL);
842 
843    cs->num_real_buffers = 0;
844    cs->num_slab_buffers = 0;
845    cs->num_sparse_buffers = 0;
846    cs->num_fence_dependencies = 0;
847    amdgpu_fence_reference(&cs->fence, NULL);
848 
849    memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
850    cs->last_added_bo = NULL;
851 }
852 
amdgpu_destroy_cs_context(struct amdgpu_cs_context * cs)853 static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
854 {
855    amdgpu_cs_context_cleanup(cs);
856    FREE(cs->flags);
857    FREE(cs->real_buffers);
858    FREE(cs->handles);
859    FREE(cs->slab_buffers);
860    FREE(cs->sparse_buffers);
861    FREE(cs->fence_dependencies);
862 }
863 
864 
865 static struct radeon_winsys_cs *
amdgpu_cs_create(struct radeon_winsys_ctx * rwctx,enum ring_type ring_type,void (* flush)(void * ctx,unsigned flags,struct pipe_fence_handle ** fence),void * flush_ctx)866 amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
867                  enum ring_type ring_type,
868                  void (*flush)(void *ctx, unsigned flags,
869                                struct pipe_fence_handle **fence),
870                  void *flush_ctx)
871 {
872    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
873    struct amdgpu_cs *cs;
874 
875    cs = CALLOC_STRUCT(amdgpu_cs);
876    if (!cs) {
877       return NULL;
878    }
879 
880    util_queue_fence_init(&cs->flush_completed);
881 
882    cs->ctx = ctx;
883    cs->flush_cs = flush;
884    cs->flush_data = flush_ctx;
885    cs->ring_type = ring_type;
886 
887    struct amdgpu_cs_fence_info fence_info;
888    fence_info.handle = cs->ctx->user_fence_bo;
889    fence_info.offset = cs->ring_type;
890    amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);
891 
892    cs->main.ib_type = IB_MAIN;
893 
894    if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) {
895       FREE(cs);
896       return NULL;
897    }
898 
899    if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) {
900       amdgpu_destroy_cs_context(&cs->csc1);
901       FREE(cs);
902       return NULL;
903    }
904 
905    /* Set the first submission context as current. */
906    cs->csc = &cs->csc1;
907    cs->cst = &cs->csc2;
908 
909    if (!amdgpu_get_new_ib(&ctx->ws->base, cs, IB_MAIN)) {
910       amdgpu_destroy_cs_context(&cs->csc2);
911       amdgpu_destroy_cs_context(&cs->csc1);
912       FREE(cs);
913       return NULL;
914    }
915 
916    p_atomic_inc(&ctx->ws->num_cs);
917    return &cs->main.base;
918 }
919 
/* cs_validate hook: this winsys performs no validation and always reports
 * success. The command stream argument is not inspected.
 */
static bool amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
{
   (void)rcs;

   return true;
}
924 
/* Make sure at least `dw` more dwords can be emitted into the IB.
 *
 * Returns true if the current chunk already has room, or if a new IB buffer
 * was allocated and chained to the current one. Returns false if the total
 * size would exceed the submit limit, if chaining is not available, or if an
 * allocation fails.
 *
 * When chaining happens, the current chunk is terminated with an
 * INDIRECT_BUFFER packet pointing at the new buffer, recorded in rcs->prev[],
 * and rcs->current is redirected to the new buffer.
 */
static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
   struct amdgpu_ib *ib = amdgpu_ib(rcs);
   struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
   /* Total dwords if the request is granted: finished chunks + current chunk + dw. */
   unsigned requested_size = rcs->prev_dw + rcs->current.cdw + dw;
   uint64_t va;
   uint32_t *new_ptr_ib_size;

   assert(rcs->current.cdw <= rcs->current.max_dw);

   if (requested_size > amdgpu_ib_max_submit_dwords(ib->ib_type))
      return false;

   /* Track the largest total size ever requested for this IB. */
   ib->max_ib_size = MAX2(ib->max_ib_size, requested_size);

   /* Fast path: the current chunk still has enough room. */
   if (rcs->current.max_dw - rcs->current.cdw >= dw)
      return true;

   if (!amdgpu_cs_has_chaining(cs))
      return false;

   /* Allocate a new chunk */
   if (rcs->num_prev >= rcs->max_prev) {
      /* Grow the prev[] array geometrically, starting at one entry. */
      unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
      struct radeon_winsys_cs_chunk *new_prev;

      new_prev = REALLOC(rcs->prev,
                         sizeof(*new_prev) * rcs->max_prev,
                         sizeof(*new_prev) * new_max_prev);
      if (!new_prev)
         return false;

      rcs->prev = new_prev;
      rcs->max_prev = new_max_prev;
   }

   if (!amdgpu_ib_new_buffer(cs->ctx->ws, ib, cs->ring_type))
      return false;

   assert(ib->used_ib_space == 0);
   va = amdgpu_winsys_bo(ib->big_ib_buffer)->va;

   /* This space was originally reserved. */
   /* The 4 dwords hold the chaining packet written below
    * (header, va low, va high, size slot).
    */
   rcs->current.max_dw += 4;
   assert(ib->used_ib_space + 4 * rcs->current.max_dw <= ib->big_ib_buffer->size);

   /* Pad with NOPs and add INDIRECT_BUFFER packet */
   /* Stop at cdw % 8 == 4 so that the 4-dword packet ends on a multiple of 8
    * (checked by the assert below).
    */
   while ((rcs->current.cdw & 7) != 4)
      radeon_emit(rcs, 0xffff1000); /* type3 nop packet */

   radeon_emit(rcs, PKT3(ib->ib_type == IB_MAIN ? PKT3_INDIRECT_BUFFER_CIK
                                           : PKT3_INDIRECT_BUFFER_CONST, 2, 0));
   radeon_emit(rcs, va);
   radeon_emit(rcs, va >> 32);
   /* The 4th dword is skipped, not written; only its address is kept so it
    * can be filled in through ib->ptr_ib_size later.
    */
   new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];

   assert((rcs->current.cdw & 7) == 0);
   assert(rcs->current.cdw <= rcs->current.max_dw);

   /* NOTE(review): called while ib->ptr_ib_size still holds its previous
    * value, presumably to record the size of the chunk being closed;
    * confirm against amdgpu_set_ib_size().
    */
   amdgpu_set_ib_size(ib);
   ib->ptr_ib_size = new_ptr_ib_size;
   ib->ptr_ib_size_inside_ib = true;

   /* Hook up the new chunk */
   /* The chunk that was just closed is appended to prev[]. */
   rcs->prev[rcs->num_prev].buf = rcs->current.buf;
   rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
   rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
   rcs->num_prev++;

   ib->base.prev_dw += ib->base.current.cdw;
   ib->base.current.cdw = 0;

   /* Redirect emission to the new buffer (used_ib_space is 0 here, see the
    * assert above) and keep the epilog dwords in reserve.
    */
   ib->base.current.buf = (uint32_t*)(ib->ib_mapped + ib->used_ib_space);
   ib->base.current.max_dw = ib->big_ib_buffer->size / 4 - amdgpu_cs_epilog_dws(cs->ring_type);

   /* The new IB buffer must be part of the CS buffer list. */
   amdgpu_cs_add_buffer(&cs->main.base, ib->big_ib_buffer,
                        RADEON_USAGE_READ, 0, RADEON_PRIO_IB1);

   return true;
}
1005 
amdgpu_cs_get_buffer_list(struct radeon_winsys_cs * rcs,struct radeon_bo_list_item * list)1006 static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
1007                                           struct radeon_bo_list_item *list)
1008 {
1009     struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1010     int i;
1011 
1012     if (list) {
1013         for (i = 0; i < cs->num_real_buffers; i++) {
1014             list[i].bo_size = cs->real_buffers[i].bo->base.size;
1015             list[i].vm_address = cs->real_buffers[i].bo->va;
1016             list[i].priority_usage = cs->real_buffers[i].u.real.priority_usage;
1017         }
1018     }
1019     return cs->num_real_buffers;
1020 }
1021 
add_fence_dependency_entry(struct amdgpu_cs_context * cs)1022 static unsigned add_fence_dependency_entry(struct amdgpu_cs_context *cs)
1023 {
1024    unsigned idx = cs->num_fence_dependencies++;
1025 
1026    if (idx >= cs->max_fence_dependencies) {
1027       unsigned size;
1028       const unsigned increment = 8;
1029 
1030       cs->max_fence_dependencies = idx + increment;
1031       size = cs->max_fence_dependencies * sizeof(cs->fence_dependencies[0]);
1032       cs->fence_dependencies = realloc(cs->fence_dependencies, size);
1033       /* Clear the newly-allocated elements. */
1034       memset(cs->fence_dependencies + idx, 0,
1035              increment * sizeof(cs->fence_dependencies[0]));
1036    }
1037    return idx;
1038 }
1039 
is_noop_fence_dependency(struct amdgpu_cs * acs,struct amdgpu_fence * fence)1040 static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
1041                                      struct amdgpu_fence *fence)
1042 {
1043    struct amdgpu_cs_context *cs = acs->csc;
1044 
1045    if (!amdgpu_fence_is_syncobj(fence) &&
1046        fence->ctx == acs->ctx &&
1047        fence->fence.ip_type == cs->ib[IB_MAIN].ip_type &&
1048        fence->fence.ip_instance == cs->ib[IB_MAIN].ip_instance &&
1049        fence->fence.ring == cs->ib[IB_MAIN].ring)
1050       return true;
1051 
1052    return amdgpu_fence_wait((void *)fence, 0, false);
1053 }
1054 
amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs * rws,struct pipe_fence_handle * pfence)1055 static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws,
1056                                            struct pipe_fence_handle *pfence)
1057 {
1058    struct amdgpu_cs *acs = amdgpu_cs(rws);
1059    struct amdgpu_cs_context *cs = acs->csc;
1060    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1061 
1062    util_queue_fence_wait(&fence->submitted);
1063 
1064    if (is_noop_fence_dependency(acs, fence))
1065       return;
1066 
1067    unsigned idx = add_fence_dependency_entry(cs);
1068    amdgpu_fence_reference(&cs->fence_dependencies[idx],
1069                           (struct pipe_fence_handle*)fence);
1070 }
1071 
amdgpu_add_bo_fence_dependencies(struct amdgpu_cs * acs,struct amdgpu_cs_buffer * buffer)1072 static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
1073                                              struct amdgpu_cs_buffer *buffer)
1074 {
1075    struct amdgpu_cs_context *cs = acs->csc;
1076    struct amdgpu_winsys_bo *bo = buffer->bo;
1077    unsigned new_num_fences = 0;
1078 
1079    for (unsigned j = 0; j < bo->num_fences; ++j) {
1080       struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
1081 
1082       if (is_noop_fence_dependency(acs, bo_fence))
1083          continue;
1084 
1085       amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
1086       new_num_fences++;
1087 
1088       if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
1089          continue;
1090 
1091       unsigned idx = add_fence_dependency_entry(cs);
1092       amdgpu_fence_reference(&cs->fence_dependencies[idx],
1093                              (struct pipe_fence_handle*)bo_fence);
1094    }
1095 
1096    for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
1097       amdgpu_fence_reference(&bo->fences[j], NULL);
1098 
1099    bo->num_fences = new_num_fences;
1100 }
1101 
1102 /* Add the given list of fences to the buffer's fence list.
1103  *
1104  * Must be called with the winsys bo_fence_lock held.
1105  */
amdgpu_add_fences(struct amdgpu_winsys_bo * bo,unsigned num_fences,struct pipe_fence_handle ** fences)1106 void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
1107                        unsigned num_fences,
1108                        struct pipe_fence_handle **fences)
1109 {
1110    if (bo->num_fences + num_fences > bo->max_fences) {
1111       unsigned new_max_fences = MAX2(bo->num_fences + num_fences, bo->max_fences * 2);
1112       struct pipe_fence_handle **new_fences =
1113          REALLOC(bo->fences,
1114                  bo->num_fences * sizeof(*new_fences),
1115                  new_max_fences * sizeof(*new_fences));
1116       if (likely(new_fences)) {
1117          bo->fences = new_fences;
1118          bo->max_fences = new_max_fences;
1119       } else {
1120          unsigned drop;
1121 
1122          fprintf(stderr, "amdgpu_add_fences: allocation failure, dropping fence(s)\n");
1123          if (!bo->num_fences)
1124             return;
1125 
1126          bo->num_fences--; /* prefer to keep the most recent fence if possible */
1127          amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
1128 
1129          drop = bo->num_fences + num_fences - bo->max_fences;
1130          num_fences -= drop;
1131          fences += drop;
1132       }
1133    }
1134 
1135    for (unsigned i = 0; i < num_fences; ++i) {
1136       bo->fences[bo->num_fences] = NULL;
1137       amdgpu_fence_reference(&bo->fences[bo->num_fences], fences[i]);
1138       bo->num_fences++;
1139    }
1140 }
1141 
amdgpu_add_fence_dependencies_bo_list(struct amdgpu_cs * acs,struct pipe_fence_handle * fence,unsigned num_buffers,struct amdgpu_cs_buffer * buffers)1142 static void amdgpu_add_fence_dependencies_bo_list(struct amdgpu_cs *acs,
1143                                                   struct pipe_fence_handle *fence,
1144                                                   unsigned num_buffers,
1145                                                   struct amdgpu_cs_buffer *buffers)
1146 {
1147    for (unsigned i = 0; i < num_buffers; i++) {
1148       struct amdgpu_cs_buffer *buffer = &buffers[i];
1149       struct amdgpu_winsys_bo *bo = buffer->bo;
1150 
1151       amdgpu_add_bo_fence_dependencies(acs, buffer);
1152       p_atomic_inc(&bo->num_active_ioctls);
1153       amdgpu_add_fences(bo, 1, &fence);
1154    }
1155 }
1156 
1157 /* Since the kernel driver doesn't synchronize execution between different
1158  * rings automatically, we have to add fence dependencies manually.
1159  */
amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs * acs)1160 static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs)
1161 {
1162    struct amdgpu_cs_context *cs = acs->csc;
1163 
1164    cs->num_fence_dependencies = 0;
1165 
1166    amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
1167    amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
1168    amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
1169 }
1170 
1171 /* Add backing of sparse buffers to the buffer list.
1172  *
1173  * This is done late, during submission, to keep the buffer list short before
1174  * submit, and to avoid managing fences for the backing buffers.
1175  */
amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context * cs)1176 static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
1177 {
1178    for (unsigned i = 0; i < cs->num_sparse_buffers; ++i) {
1179       struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i];
1180       struct amdgpu_winsys_bo *bo = buffer->bo;
1181 
1182       simple_mtx_lock(&bo->u.sparse.commit_lock);
1183 
1184       list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
1185          /* We can directly add the buffer here, because we know that each
1186           * backing buffer occurs only once.
1187           */
1188          int idx = amdgpu_do_add_real_buffer(cs, backing->bo);
1189          if (idx < 0) {
1190             fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__);
1191             simple_mtx_unlock(&bo->u.sparse.commit_lock);
1192             return false;
1193          }
1194 
1195          cs->real_buffers[idx].usage = buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
1196          cs->real_buffers[idx].u.real.priority_usage = buffer->u.real.priority_usage;
1197          p_atomic_inc(&backing->bo->num_active_ioctls);
1198       }
1199 
1200       simple_mtx_unlock(&bo->u.sparse.commit_lock);
1201    }
1202 
1203    return true;
1204 }
1205 
amdgpu_cs_submit_ib(void * job,int thread_index)1206 void amdgpu_cs_submit_ib(void *job, int thread_index)
1207 {
1208    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1209    struct amdgpu_winsys *ws = acs->ctx->ws;
1210    struct amdgpu_cs_context *cs = acs->cst;
1211    int i, r;
1212    amdgpu_bo_list_handle bo_list = NULL;
1213    uint64_t seq_no = 0;
1214    bool has_user_fence = amdgpu_cs_has_user_fence(cs);
1215 
1216    /* Create the buffer list.
1217     * Use a buffer list containing all allocated buffers if requested.
1218     */
1219    if (ws->debug_all_bos) {
1220       struct amdgpu_winsys_bo *bo;
1221       amdgpu_bo_handle *handles;
1222       unsigned num = 0;
1223 
1224       simple_mtx_lock(&ws->global_bo_list_lock);
1225 
1226       handles = malloc(sizeof(handles[0]) * ws->num_buffers);
1227       if (!handles) {
1228          simple_mtx_unlock(&ws->global_bo_list_lock);
1229          amdgpu_cs_context_cleanup(cs);
1230          cs->error_code = -ENOMEM;
1231          return;
1232       }
1233 
1234       LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) {
1235          assert(num < ws->num_buffers);
1236          handles[num++] = bo->bo;
1237       }
1238 
1239       r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
1240                                 handles, NULL, &bo_list);
1241       free(handles);
1242       simple_mtx_unlock(&ws->global_bo_list_lock);
1243    } else {
1244       unsigned num_handles;
1245 
1246       if (!amdgpu_add_sparse_backing_buffers(cs)) {
1247          r = -ENOMEM;
1248          goto bo_list_error;
1249       }
1250 
1251       if (cs->max_real_submit < cs->num_real_buffers) {
1252          FREE(cs->handles);
1253          FREE(cs->flags);
1254 
1255          cs->handles = MALLOC(sizeof(*cs->handles) * cs->num_real_buffers);
1256          cs->flags = MALLOC(sizeof(*cs->flags) * cs->num_real_buffers);
1257 
1258          if (!cs->handles || !cs->flags) {
1259             cs->max_real_submit = 0;
1260             r = -ENOMEM;
1261             goto bo_list_error;
1262          }
1263       }
1264 
1265       num_handles = 0;
1266       for (i = 0; i < cs->num_real_buffers; ++i) {
1267          struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i];
1268 
1269 	 if (buffer->bo->is_local)
1270             continue;
1271 
1272          assert(buffer->u.real.priority_usage != 0);
1273 
1274          cs->handles[num_handles] = buffer->bo->bo;
1275          cs->flags[num_handles] = (util_last_bit64(buffer->u.real.priority_usage) - 1) / 4;
1276 	 ++num_handles;
1277       }
1278 
1279       if (acs->ring_type == RING_GFX)
1280          ws->gfx_bo_list_counter += cs->num_real_buffers;
1281 
1282       if (num_handles) {
1283          r = amdgpu_bo_list_create(ws->dev, num_handles,
1284                                    cs->handles, cs->flags, &bo_list);
1285       } else {
1286          r = 0;
1287       }
1288    }
1289 bo_list_error:
1290 
1291    if (r) {
1292       fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
1293       amdgpu_fence_signalled(cs->fence);
1294       cs->error_code = r;
1295       goto cleanup;
1296    }
1297 
1298    if (acs->ctx->num_rejected_cs) {
1299       r = -ECANCELED;
1300    } else {
1301       struct drm_amdgpu_cs_chunk chunks[4];
1302       unsigned num_chunks = 0;
1303 
1304       /* Convert from dwords to bytes. */
1305       cs->ib[IB_MAIN].ib_bytes *= 4;
1306 
1307       /* IB */
1308       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1309       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1310       chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN];
1311       num_chunks++;
1312 
1313       /* Fence */
1314       if (has_user_fence) {
1315          chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1316          chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1317          chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1318          num_chunks++;
1319       }
1320 
1321       /* Dependencies */
1322       unsigned num_dependencies = cs->num_fence_dependencies;
1323       unsigned num_syncobj_dependencies = 0;
1324 
1325       if (num_dependencies) {
1326          struct drm_amdgpu_cs_chunk_dep *dep_chunk =
1327             alloca(num_dependencies * sizeof(*dep_chunk));
1328          unsigned num = 0;
1329 
1330          for (unsigned i = 0; i < num_dependencies; i++) {
1331             struct amdgpu_fence *fence =
1332                (struct amdgpu_fence*)cs->fence_dependencies[i];
1333 
1334             if (amdgpu_fence_is_syncobj(fence)) {
1335                num_syncobj_dependencies++;
1336                continue;
1337             }
1338 
1339             assert(util_queue_fence_is_signalled(&fence->submitted));
1340             amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[num++]);
1341          }
1342 
1343          chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
1344          chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num;
1345          chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
1346          num_chunks++;
1347       }
1348 
1349       /* Syncobj dependencies. */
1350       if (num_syncobj_dependencies) {
1351          struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1352             alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1353          unsigned num = 0;
1354 
1355          for (unsigned i = 0; i < num_dependencies; i++) {
1356             struct amdgpu_fence *fence =
1357                (struct amdgpu_fence*)cs->fence_dependencies[i];
1358 
1359             if (!amdgpu_fence_is_syncobj(fence))
1360                continue;
1361 
1362             assert(util_queue_fence_is_signalled(&fence->submitted));
1363             sem_chunk[num++].handle = fence->syncobj;
1364          }
1365 
1366          chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1367          chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num;
1368          chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1369          num_chunks++;
1370       }
1371 
1372       assert(num_chunks <= ARRAY_SIZE(chunks));
1373 
1374       r = amdgpu_cs_submit_raw(ws->dev, acs->ctx->ctx, bo_list,
1375                                num_chunks, chunks, &seq_no);
1376    }
1377 
1378    cs->error_code = r;
1379    if (r) {
1380       if (r == -ENOMEM)
1381          fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
1382       else if (r == -ECANCELED)
1383          fprintf(stderr, "amdgpu: The CS has been cancelled because the context is lost.\n");
1384       else
1385          fprintf(stderr, "amdgpu: The CS has been rejected, "
1386                  "see dmesg for more information (%i).\n", r);
1387 
1388       amdgpu_fence_signalled(cs->fence);
1389 
1390       acs->ctx->num_rejected_cs++;
1391       ws->num_total_rejected_cs++;
1392    } else {
1393       /* Success. */
1394       uint64_t *user_fence = NULL;
1395 
1396       if (has_user_fence)
1397          user_fence = acs->ctx->user_fence_cpu_address_base + acs->ring_type;
1398       amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
1399    }
1400 
1401    /* Cleanup. */
1402    if (bo_list)
1403       amdgpu_bo_list_destroy(bo_list);
1404 
1405 cleanup:
1406    for (i = 0; i < cs->num_real_buffers; i++)
1407       p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
1408    for (i = 0; i < cs->num_slab_buffers; i++)
1409       p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
1410    for (i = 0; i < cs->num_sparse_buffers; i++)
1411       p_atomic_dec(&cs->sparse_buffers[i].bo->num_active_ioctls);
1412 
1413    amdgpu_cs_context_cleanup(cs);
1414 }
1415 
1416 /* Make sure the previous submission is completed. */
amdgpu_cs_sync_flush(struct radeon_winsys_cs * rcs)1417 void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
1418 {
1419    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1420 
1421    /* Wait for any pending ioctl of this CS to complete. */
1422    util_queue_fence_wait(&cs->flush_completed);
1423 }
1424 
/* cs_flush hook: pad the IB, hand the current CS context to the submission
 * queue and start a fresh IB.
 *
 * rcs    command stream to flush.
 * flags  if PIPE_FLUSH_ASYNC is not set, the call waits for the submission
 *        job and returns its error code.
 * fence  if non-NULL, receives a reference to the fence of this submission
 *        (only when something is actually submitted).
 *
 * Returns 0, or the error code of the submission for synchronous flushes.
 * An empty or overflowed CS (or the RADEON_NOOP debug option) is not
 * submitted; its context is just cleaned up.
 */
static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
                           unsigned flags,
                           struct pipe_fence_handle **fence)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys *ws = cs->ctx->ws;
   int error_code = 0;

   /* Make the dwords reserved for the epilog available for the padding below
    * (amdgpu_cs_check_space subtracts the same amount when it sets max_dw).
    */
   rcs->current.max_dw += amdgpu_cs_epilog_dws(cs->ring_type);

   /* Pad the IB to the alignment required by the ring; the NOP encoding
    * depends on the ring type.
    */
   switch (cs->ring_type) {
   case RING_DMA:
      /* pad DMA ring to 8 DWs */
      if (ws->info.chip_class <= SI) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case RING_GFX:
      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements */
      if (ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      /* Statistics: total size in bytes of all GFX IBs flushed. */
      ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
      break;
   case RING_UVD:
      /* pad to 16 DWs */
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   case RING_VCN_DEC:
      /* pad to 16 DWs */
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x81ff); /* nop packet */
      break;
   default:
      break;
   }

   /* Only reported here; the condition below then skips the submission. */
   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "amdgpu: command stream overflowed\n");
   }

   /* If the CS is not empty or overflowed.... */
   if (likely(radeon_emitted(&cs->main.base, 0) &&
       cs->main.base.current.cdw <= cs->main.base.current.max_dw &&
       !debug_get_option_noop())) {
      struct amdgpu_cs_context *cur = cs->csc;

      /* Set IB sizes. */
      amdgpu_ib_finalize(ws, &cs->main);

      /* Create a fence. */
      amdgpu_fence_reference(&cur->fence, NULL);
      if (cs->next_fence) {
         /* just move the reference */
         cur->fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         cur->fence = amdgpu_fence_create(cs->ctx,
                                          cur->ib[IB_MAIN].ip_type,
                                          cur->ib[IB_MAIN].ip_instance,
                                          cur->ib[IB_MAIN].ring);
      }
      /* Hand a reference to the caller if one was requested. */
      if (fence)
         amdgpu_fence_reference(fence, cur->fence);

      /* Wait for the previous submission: its context (cs->cst) becomes the
       * current one after the swap below.
       */
      amdgpu_cs_sync_flush(rcs);

      /* Prepare buffers.
       *
       * This lock (bo_fence_lock) must be held until the submission is
       * queued to ensure that the order of fence dependency updates matches
       * the order of submissions.
       */
      simple_mtx_lock(&ws->bo_fence_lock);
      amdgpu_add_fence_dependencies_bo_lists(cs);

      /* Swap command streams. "cst" is going to be submitted. */
      cs->csc = cs->cst;
      cs->cst = cur;

      /* Submit. */
      util_queue_add_job(&ws->cs_queue, cs, &cs->flush_completed,
                         amdgpu_cs_submit_ib, NULL);
      /* The submission has been queued, release the lock now. */
      simple_mtx_unlock(&ws->bo_fence_lock);

      /* Synchronous flush: wait for the job and report its result. */
      if (!(flags & PIPE_FLUSH_ASYNC)) {
         amdgpu_cs_sync_flush(rcs);
         error_code = cur->error_code;
      }
   } else {
      /* Nothing is submitted; just drop what the context references. */
      amdgpu_cs_context_cleanup(cs->csc);
   }

   /* Start a new IB for the (new) current context.
    * NOTE(review): the return value is ignored here.
    */
   amdgpu_get_new_ib(&ws->base, cs, IB_MAIN);

   /* Reset the per-IB memory usage accounting. */
   cs->main.base.used_gart = 0;
   cs->main.base.used_vram = 0;

   /* Statistics: number of IBs flushed per ring type. */
   if (cs->ring_type == RING_GFX)
      ws->num_gfx_IBs++;
   else if (cs->ring_type == RING_DMA)
      ws->num_sdma_IBs++;

   return error_code;
}
1538 
amdgpu_cs_destroy(struct radeon_winsys_cs * rcs)1539 static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
1540 {
1541    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1542 
1543    amdgpu_cs_sync_flush(rcs);
1544    util_queue_fence_destroy(&cs->flush_completed);
1545    p_atomic_dec(&cs->ctx->ws->num_cs);
1546    pb_reference(&cs->main.big_ib_buffer, NULL);
1547    FREE(cs->main.base.prev);
1548    amdgpu_destroy_cs_context(&cs->csc1);
1549    amdgpu_destroy_cs_context(&cs->csc2);
1550    amdgpu_fence_reference(&cs->next_fence, NULL);
1551    FREE(cs);
1552 }
1553 
/* cs_is_buffer_referenced hook: converts the winsys-level arguments to
 * their amdgpu types and forwards the query to
 * amdgpu_bo_is_referenced_by_cs_with_usage().
 */
static bool amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
   return amdgpu_bo_is_referenced_by_cs_with_usage(amdgpu_cs(rcs),
                                                   (struct amdgpu_winsys_bo*)_buf,
                                                   usage);
}
1563 
amdgpu_cs_init_functions(struct amdgpu_winsys * ws)1564 void amdgpu_cs_init_functions(struct amdgpu_winsys *ws)
1565 {
1566    ws->base.ctx_create = amdgpu_ctx_create;
1567    ws->base.ctx_destroy = amdgpu_ctx_destroy;
1568    ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
1569    ws->base.cs_create = amdgpu_cs_create;
1570    ws->base.cs_destroy = amdgpu_cs_destroy;
1571    ws->base.cs_add_buffer = amdgpu_cs_add_buffer;
1572    ws->base.cs_validate = amdgpu_cs_validate;
1573    ws->base.cs_check_space = amdgpu_cs_check_space;
1574    ws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
1575    ws->base.cs_flush = amdgpu_cs_flush;
1576    ws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
1577    ws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
1578    ws->base.cs_sync_flush = amdgpu_cs_sync_flush;
1579    ws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
1580    ws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
1581    ws->base.fence_reference = amdgpu_fence_reference;
1582    ws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
1583    ws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
1584    ws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
1585 }
1586