1 /*
2  * Copyright © 2008 Jérôme Glisse
3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  *
23  * The above copyright notice and this permission notice (including the
24  * next paragraph) shall be included in all copies or substantial portions
25  * of the Software.
26  */
27 
28 /*
29     This file replaces libdrm's radeon_cs_gem with our own implementation.
30     It's optimized specifically for Radeon DRM.
31     Adding buffers and space checking are faster and simpler than their
32     counterparts in libdrm (the time complexity of all the functions
33     is O(1) in nearly all scenarios, thanks to hashing).
34 
35     It works like this:
36 
37     cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
38     also adds the size of 'buf' to the used_gart and used_vram winsys variables
39     based on the domains, which are simply or'd for the accounting purposes.
40     The adding is skipped if the reloc is already present in the list, but it
41     accounts any newly-referenced domains.
42 
43     cs_validate is then called, which just checks:
44         used_vram/gart < vram/gart_size * 0.8
45     The 0.8 number allows for some memory fragmentation. If the validation
46     fails, the pipe driver flushes CS and tries to do the validation again,
47     i.e. it validates only that one operation. If it fails again, it drops
48     the operation on the floor and prints some nasty message to stderr.
49     (done in the pipe driver)
50 
51     cs_write_reloc(cs, buf) just writes a reloc that has been added using
52     cs_add_buffer. The read_domain and write_domain parameters have been removed,
53     because we already specify them in cs_add_buffer.
54 */
55 
56 #include "radeon_drm_cs.h"
57 
58 #include "util/u_memory.h"
59 #include "util/os_time.h"
60 
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <stdint.h>
64 #include <xf86drm.h>
65 
66 
/* Size of one kernel relocation entry, counted in 32-bit dwords. */
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

/* Fence helpers defined further down in this file but used before that. */
static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);
72 
radeon_drm_ctx_create(struct radeon_winsys * ws)73 static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
74 {
75    struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
76    if (!ctx)
77       return NULL;
78 
79    ctx->ws = (struct radeon_drm_winsys*)ws;
80    ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
81    return (struct radeon_winsys_ctx*)ctx;
82 }
83 
/* Release a context handle obtained from radeon_drm_ctx_create(). */
static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}
88 
89 static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx * rctx)90 radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx)
91 {
92    struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;
93 
94    unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);
95 
96    if (ctx->gpu_reset_counter == latest)
97       return PIPE_NO_RESET;
98 
99    ctx->gpu_reset_counter = latest;
100    return PIPE_UNKNOWN_CONTEXT_RESET;
101 }
102 
radeon_init_cs_context(struct radeon_cs_context * csc,struct radeon_drm_winsys * ws)103 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
104                                    struct radeon_drm_winsys *ws)
105 {
106    int i;
107 
108    csc->fd = ws->fd;
109 
110    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
111    csc->chunks[0].length_dw = 0;
112    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
113    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
114    csc->chunks[1].length_dw = 0;
115    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
116    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
117    csc->chunks[2].length_dw = 2;
118    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
119 
120    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
121    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
122    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
123 
124    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
125 
126    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
127       csc->reloc_indices_hashlist[i] = -1;
128    }
129    return true;
130 }
131 
radeon_cs_context_cleanup(struct radeon_cs_context * csc)132 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
133 {
134    unsigned i;
135 
136    for (i = 0; i < csc->num_relocs; i++) {
137       p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
138       radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
139    }
140    for (i = 0; i < csc->num_slab_buffers; ++i) {
141       p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
142       radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
143    }
144 
145    csc->num_relocs = 0;
146    csc->num_validated_relocs = 0;
147    csc->num_slab_buffers = 0;
148    csc->chunks[0].length_dw = 0;
149    csc->chunks[1].length_dw = 0;
150 
151    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
152       csc->reloc_indices_hashlist[i] = -1;
153    }
154 }
155 
radeon_destroy_cs_context(struct radeon_cs_context * csc)156 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
157 {
158    radeon_cs_context_cleanup(csc);
159    FREE(csc->slab_buffers);
160    FREE(csc->relocs_bo);
161    FREE(csc->relocs);
162 }
163 
164 
165 static struct radeon_cmdbuf *
radeon_drm_cs_create(struct radeon_winsys_ctx * ctx,enum ring_type ring_type,void (* flush)(void * ctx,unsigned flags,struct pipe_fence_handle ** fence),void * flush_ctx,bool stop_exec_on_failure)166 radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
167                      enum ring_type ring_type,
168                      void (*flush)(void *ctx, unsigned flags,
169                                    struct pipe_fence_handle **fence),
170                      void *flush_ctx,
171                      bool stop_exec_on_failure)
172 {
173    struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
174    struct radeon_drm_cs *cs;
175 
176    cs = CALLOC_STRUCT(radeon_drm_cs);
177    if (!cs) {
178       return NULL;
179    }
180    util_queue_fence_init(&cs->flush_completed);
181 
182    cs->ws = ws;
183    cs->flush_cs = flush;
184    cs->flush_data = flush_ctx;
185 
186    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
187       FREE(cs);
188       return NULL;
189    }
190    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
191       radeon_destroy_cs_context(&cs->csc1);
192       FREE(cs);
193       return NULL;
194    }
195 
196    /* Set the first command buffer as current. */
197    cs->csc = &cs->csc1;
198    cs->cst = &cs->csc2;
199    cs->base.current.buf = cs->csc->buf;
200    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
201    cs->ring_type = ring_type;
202 
203    p_atomic_inc(&ws->num_cs);
204    return &cs->base;
205 }
206 
radeon_lookup_buffer(struct radeon_cs_context * csc,struct radeon_bo * bo)207 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
208 {
209    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
210    struct radeon_bo_item *buffers;
211    unsigned num_buffers;
212    int i = csc->reloc_indices_hashlist[hash];
213 
214    if (bo->handle) {
215       buffers = csc->relocs_bo;
216       num_buffers = csc->num_relocs;
217    } else {
218       buffers = csc->slab_buffers;
219       num_buffers = csc->num_slab_buffers;
220    }
221 
222    /* not found or found */
223    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
224       return i;
225 
226    /* Hash collision, look for the BO in the list of relocs linearly. */
227    for (i = num_buffers - 1; i >= 0; i--) {
228       if (buffers[i].bo == bo) {
229          /* Put this reloc in the hash list.
230           * This will prevent additional hash collisions if there are
231           * several consecutive lookup_buffer calls for the same buffer.
232           *
233           * Example: Assuming buffers A,B,C collide in the hash list,
234           * the following sequence of relocs:
235           *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
236           * will collide here: ^ and here:   ^,
237           * meaning that we should get very few collisions in the end. */
238          csc->reloc_indices_hashlist[hash] = i;
239          return i;
240       }
241    }
242    return -1;
243 }
244 
radeon_lookup_or_add_real_buffer(struct radeon_drm_cs * cs,struct radeon_bo * bo)245 static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
246                                                  struct radeon_bo *bo)
247 {
248    struct radeon_cs_context *csc = cs->csc;
249    struct drm_radeon_cs_reloc *reloc;
250    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
251    int i = -1;
252 
253    i = radeon_lookup_buffer(csc, bo);
254 
255    if (i >= 0) {
256       /* For async DMA, every add_buffer call must add a buffer to the list
257        * no matter how many duplicates there are. This is due to the fact
258        * the DMA CS checker doesn't use NOP packets for offset patching,
259        * but always uses the i-th buffer from the list to patch the i-th
260        * offset. If there are N offsets in a DMA CS, there must also be N
261        * buffers in the relocation list.
262        *
263        * This doesn't have to be done if virtual memory is enabled,
264        * because there is no offset patching with virtual memory.
265        */
266       if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
267          return i;
268       }
269    }
270 
271    /* New relocation, check if the backing array is large enough. */
272    if (csc->num_relocs >= csc->max_relocs) {
273       uint32_t size;
274       csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));
275 
276       size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
277       csc->relocs_bo = realloc(csc->relocs_bo, size);
278 
279       size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
280       csc->relocs = realloc(csc->relocs, size);
281 
282       csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
283    }
284 
285    /* Initialize the new relocation. */
286    csc->relocs_bo[csc->num_relocs].bo = NULL;
287    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
288    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
289    p_atomic_inc(&bo->num_cs_references);
290    reloc = &csc->relocs[csc->num_relocs];
291    reloc->handle = bo->handle;
292    reloc->read_domains = 0;
293    reloc->write_domain = 0;
294    reloc->flags = 0;
295 
296    csc->reloc_indices_hashlist[hash] = csc->num_relocs;
297 
298    csc->chunks[1].length_dw += RELOC_DWORDS;
299 
300    return csc->num_relocs++;
301 }
302 
radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs * cs,struct radeon_bo * bo)303 static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
304                                             struct radeon_bo *bo)
305 {
306    struct radeon_cs_context *csc = cs->csc;
307    unsigned hash;
308    struct radeon_bo_item *item;
309    int idx;
310    int real_idx;
311 
312    idx = radeon_lookup_buffer(csc, bo);
313    if (idx >= 0)
314       return idx;
315 
316    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
317 
318    /* Check if the backing array is large enough. */
319    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
320       unsigned new_max = MAX2(csc->max_slab_buffers + 16,
321                               (unsigned)(csc->max_slab_buffers * 1.3));
322       struct radeon_bo_item *new_buffers =
323             REALLOC(csc->slab_buffers,
324                     csc->max_slab_buffers * sizeof(*new_buffers),
325                     new_max * sizeof(*new_buffers));
326       if (!new_buffers) {
327          fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
328          return -1;
329       }
330 
331       csc->max_slab_buffers = new_max;
332       csc->slab_buffers = new_buffers;
333    }
334 
335    /* Initialize the new relocation. */
336    idx = csc->num_slab_buffers++;
337    item = &csc->slab_buffers[idx];
338 
339    item->bo = NULL;
340    item->u.slab.real_idx = real_idx;
341    radeon_bo_reference(&item->bo, bo);
342    p_atomic_inc(&bo->num_cs_references);
343 
344    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
345    csc->reloc_indices_hashlist[hash] = idx;
346 
347    return idx;
348 }
349 
radeon_drm_cs_add_buffer(struct radeon_cmdbuf * rcs,struct pb_buffer * buf,enum radeon_bo_usage usage,enum radeon_bo_domain domains,enum radeon_bo_priority priority)350 static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
351                                          struct pb_buffer *buf,
352                                          enum radeon_bo_usage usage,
353                                          enum radeon_bo_domain domains,
354                                          enum radeon_bo_priority priority)
355 {
356    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
357    struct radeon_bo *bo = (struct radeon_bo*)buf;
358    enum radeon_bo_domain added_domains;
359 
360    /* If VRAM is just stolen system memory, allow both VRAM and
361     * GTT, whichever has free space. If a buffer is evicted from
362     * VRAM to GTT, it will stay there.
363     */
364    if (!cs->ws->info.has_dedicated_vram)
365       domains |= RADEON_DOMAIN_GTT;
366 
367    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
368    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
369    struct drm_radeon_cs_reloc *reloc;
370    int index;
371 
372    if (!bo->handle) {
373       index = radeon_lookup_or_add_slab_buffer(cs, bo);
374       if (index < 0)
375          return 0;
376 
377       index = cs->csc->slab_buffers[index].u.slab.real_idx;
378    } else {
379       index = radeon_lookup_or_add_real_buffer(cs, bo);
380    }
381 
382    reloc = &cs->csc->relocs[index];
383    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
384    reloc->read_domains |= rd;
385    reloc->write_domain |= wd;
386    reloc->flags = MAX2(reloc->flags, priority);
387    cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;
388 
389    if (added_domains & RADEON_DOMAIN_VRAM)
390       cs->base.used_vram += bo->base.size;
391    else if (added_domains & RADEON_DOMAIN_GTT)
392       cs->base.used_gart += bo->base.size;
393 
394    return index;
395 }
396 
radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf * rcs,struct pb_buffer * buf)397 static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
398                                        struct pb_buffer *buf)
399 {
400    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
401 
402    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
403 }
404 
radeon_drm_cs_validate(struct radeon_cmdbuf * rcs)405 static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
406 {
407    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
408    bool status =
409          cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
410          cs->base.used_vram < cs->ws->info.vram_size * 0.8;
411 
412    if (status) {
413       cs->csc->num_validated_relocs = cs->csc->num_relocs;
414    } else {
415       /* Remove lately-added buffers. The validation failed with them
416        * and the CS is about to be flushed because of that. Keep only
417        * the already-validated buffers. */
418       unsigned i;
419 
420       for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
421          p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
422          radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
423       }
424       cs->csc->num_relocs = cs->csc->num_validated_relocs;
425 
426       /* Flush if there are any relocs. Clean up otherwise. */
427       if (cs->csc->num_relocs) {
428          cs->flush_cs(cs->flush_data,
429                       RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
430       } else {
431          radeon_cs_context_cleanup(cs->csc);
432          cs->base.used_vram = 0;
433          cs->base.used_gart = 0;
434 
435          assert(cs->base.current.cdw == 0);
436          if (cs->base.current.cdw != 0) {
437             fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
438          }
439       }
440    }
441    return status;
442 }
443 
radeon_drm_cs_check_space(struct radeon_cmdbuf * rcs,unsigned dw,bool force_chaining)444 static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw,
445                                       bool force_chaining)
446 {
447    assert(rcs->current.cdw <= rcs->current.max_dw);
448    return rcs->current.max_dw - rcs->current.cdw >= dw;
449 }
450 
radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf * rcs,struct radeon_bo_list_item * list)451 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
452                                               struct radeon_bo_list_item *list)
453 {
454    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
455    int i;
456 
457    if (list) {
458       for (i = 0; i < cs->csc->num_relocs; i++) {
459          list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
460          list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
461          list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
462       }
463    }
464    return cs->csc->num_relocs;
465 }
466 
radeon_drm_cs_emit_ioctl_oneshot(void * job,int thread_index)467 void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
468 {
469    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
470    unsigned i;
471    int r;
472 
473    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
474                            &csc->cs, sizeof(struct drm_radeon_cs));
475    if (r) {
476       if (r == -ENOMEM)
477          fprintf(stderr, "radeon: Not enough memory for command submission.\n");
478       else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
479          unsigned i;
480 
481          fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
482          for (i = 0; i < csc->chunks[0].length_dw; i++) {
483             fprintf(stderr, "0x%08X\n", csc->buf[i]);
484          }
485       } else {
486          fprintf(stderr, "radeon: The kernel rejected CS, "
487                          "see dmesg for more information (%i).\n", r);
488       }
489    }
490 
491    for (i = 0; i < csc->num_relocs; i++)
492       p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
493    for (i = 0; i < csc->num_slab_buffers; i++)
494       p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
495 
496    radeon_cs_context_cleanup(csc);
497 }
498 
499 /*
500  * Make sure previous submission of this cs are completed
501  */
radeon_drm_cs_sync_flush(struct radeon_cmdbuf * rcs)502 void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
503 {
504    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
505 
506    /* Wait for any pending ioctl of this CS to complete. */
507    if (util_queue_is_initialized(&cs->ws->cs_queue))
508       util_queue_fence_wait(&cs->flush_completed);
509 }
510 
511 /* Add the given fence to a slab buffer fence list.
512  *
513  * There is a potential race condition when bo participates in submissions on
514  * two or more threads simultaneously. Since we do not know which of the
515  * submissions will be sent to the GPU first, we have to keep the fences
516  * of all submissions.
517  *
518  * However, fences that belong to submissions that have already returned from
519  * their respective ioctl do not have to be kept, because we know that they
520  * will signal earlier.
521  */
radeon_bo_slab_fence(struct radeon_bo * bo,struct radeon_bo * fence)522 static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
523 {
524    unsigned dst;
525 
526    assert(fence->num_cs_references);
527 
528    /* Cleanup older fences */
529    dst = 0;
530    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
531       if (bo->u.slab.fences[src]->num_cs_references) {
532          bo->u.slab.fences[dst] = bo->u.slab.fences[src];
533          dst++;
534       } else {
535          radeon_bo_reference(&bo->u.slab.fences[src], NULL);
536       }
537    }
538    bo->u.slab.num_fences = dst;
539 
540    /* Check available space for the new fence */
541    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
542       unsigned new_max_fences = bo->u.slab.max_fences + 1;
543       struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
544                                               bo->u.slab.max_fences * sizeof(*new_fences),
545                                               new_max_fences * sizeof(*new_fences));
546       if (!new_fences) {
547          fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
548          return;
549       }
550 
551       bo->u.slab.fences = new_fences;
552       bo->u.slab.max_fences = new_max_fences;
553    }
554 
555    /* Add the new fence */
556    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
557    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
558    bo->u.slab.num_fences++;
559 }
560 
561 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
562 
radeon_drm_cs_flush(struct radeon_cmdbuf * rcs,unsigned flags,struct pipe_fence_handle ** pfence)563 static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
564                                unsigned flags,
565                                struct pipe_fence_handle **pfence)
566 {
567    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
568    struct radeon_cs_context *tmp;
569 
570    switch (cs->ring_type) {
571    case RING_DMA:
572       /* pad DMA ring to 8 DWs */
573       if (cs->ws->info.chip_class <= GFX6) {
574          while (rcs->current.cdw & 7)
575             radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
576       } else {
577          while (rcs->current.cdw & 7)
578             radeon_emit(&cs->base, 0x00000000); /* NOP packet */
579       }
580       break;
581    case RING_GFX:
582       /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
583        * r6xx, requires at least 4 dw alignment to avoid a hw bug.
584        */
585       if (cs->ws->info.gfx_ib_pad_with_type2) {
586          while (rcs->current.cdw & 7)
587             radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
588       } else {
589          while (rcs->current.cdw & 7)
590             radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
591       }
592       break;
593    case RING_UVD:
594       while (rcs->current.cdw & 15)
595          radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
596       break;
597    default:
598       break;
599    }
600 
601    if (rcs->current.cdw > rcs->current.max_dw) {
602       fprintf(stderr, "radeon: command stream overflowed\n");
603    }
604 
605    if (pfence || cs->csc->num_slab_buffers) {
606       struct pipe_fence_handle *fence;
607 
608       if (cs->next_fence) {
609          fence = cs->next_fence;
610          cs->next_fence = NULL;
611       } else {
612          fence = radeon_cs_create_fence(rcs);
613       }
614 
615       if (fence) {
616          if (pfence)
617             radeon_fence_reference(pfence, fence);
618 
619          mtx_lock(&cs->ws->bo_fence_lock);
620          for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
621             struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
622             p_atomic_inc(&bo->num_active_ioctls);
623             radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
624          }
625          mtx_unlock(&cs->ws->bo_fence_lock);
626 
627          radeon_fence_reference(&fence, NULL);
628       }
629    } else {
630       radeon_fence_reference(&cs->next_fence, NULL);
631    }
632 
633    radeon_drm_cs_sync_flush(rcs);
634 
635    /* Swap command streams. */
636    tmp = cs->csc;
637    cs->csc = cs->cst;
638    cs->cst = tmp;
639 
640    /* If the CS is not empty or overflowed, emit it in a separate thread. */
641    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw &&
642        !debug_get_option_noop() && !(flags & RADEON_FLUSH_NOOP)) {
643       unsigned i, num_relocs;
644 
645       num_relocs = cs->cst->num_relocs;
646 
647       cs->cst->chunks[0].length_dw = cs->base.current.cdw;
648 
649       for (i = 0; i < num_relocs; i++) {
650          /* Update the number of active asynchronous CS ioctls for the buffer. */
651          p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
652       }
653 
654       switch (cs->ring_type) {
655       case RING_DMA:
656          cs->cst->flags[0] = 0;
657          cs->cst->flags[1] = RADEON_CS_RING_DMA;
658          cs->cst->cs.num_chunks = 3;
659          if (cs->ws->info.r600_has_virtual_memory) {
660             cs->cst->flags[0] |= RADEON_CS_USE_VM;
661          }
662          break;
663 
664       case RING_UVD:
665          cs->cst->flags[0] = 0;
666          cs->cst->flags[1] = RADEON_CS_RING_UVD;
667          cs->cst->cs.num_chunks = 3;
668          break;
669 
670       case RING_VCE:
671          cs->cst->flags[0] = 0;
672          cs->cst->flags[1] = RADEON_CS_RING_VCE;
673          cs->cst->cs.num_chunks = 3;
674          break;
675 
676       default:
677       case RING_GFX:
678       case RING_COMPUTE:
679          cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
680          cs->cst->flags[1] = RADEON_CS_RING_GFX;
681          cs->cst->cs.num_chunks = 3;
682 
683          if (cs->ws->info.r600_has_virtual_memory) {
684             cs->cst->flags[0] |= RADEON_CS_USE_VM;
685             cs->cst->cs.num_chunks = 3;
686          }
687          if (flags & PIPE_FLUSH_END_OF_FRAME) {
688             cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
689             cs->cst->cs.num_chunks = 3;
690          }
691          if (cs->ring_type == RING_COMPUTE) {
692             cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
693             cs->cst->cs.num_chunks = 3;
694          }
695          break;
696       }
697 
698       if (util_queue_is_initialized(&cs->ws->cs_queue)) {
699          util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
700                             radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
701          if (!(flags & PIPE_FLUSH_ASYNC))
702             radeon_drm_cs_sync_flush(rcs);
703       } else {
704          radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
705       }
706    } else {
707       radeon_cs_context_cleanup(cs->cst);
708    }
709 
710    /* Prepare a new CS. */
711    cs->base.current.buf = cs->csc->buf;
712    cs->base.current.cdw = 0;
713    cs->base.used_vram = 0;
714    cs->base.used_gart = 0;
715 
716    if (cs->ring_type == RING_GFX)
717       cs->ws->num_gfx_IBs++;
718    else if (cs->ring_type == RING_DMA)
719       cs->ws->num_sdma_IBs++;
720    return 0;
721 }
722 
radeon_drm_cs_destroy(struct radeon_cmdbuf * rcs)723 static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
724 {
725    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
726 
727    radeon_drm_cs_sync_flush(rcs);
728    util_queue_fence_destroy(&cs->flush_completed);
729    radeon_cs_context_cleanup(&cs->csc1);
730    radeon_cs_context_cleanup(&cs->csc2);
731    p_atomic_dec(&cs->ws->num_cs);
732    radeon_destroy_cs_context(&cs->csc1);
733    radeon_destroy_cs_context(&cs->csc2);
734    radeon_fence_reference(&cs->next_fence, NULL);
735    FREE(cs);
736 }
737 
radeon_bo_is_referenced(struct radeon_cmdbuf * rcs,struct pb_buffer * _buf,enum radeon_bo_usage usage)738 static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
739                                     struct pb_buffer *_buf,
740                                     enum radeon_bo_usage usage)
741 {
742    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
743    struct radeon_bo *bo = (struct radeon_bo*)_buf;
744    int index;
745 
746    if (!bo->num_cs_references)
747       return false;
748 
749    index = radeon_lookup_buffer(cs->csc, bo);
750    if (index == -1)
751       return false;
752 
753    if (!bo->handle)
754       index = cs->csc->slab_buffers[index].u.slab.real_idx;
755 
756    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
757       return true;
758    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
759       return true;
760 
761    return false;
762 }
763 
764 /* FENCES */
765 
radeon_cs_create_fence(struct radeon_cmdbuf * rcs)766 static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
767 {
768    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
769    struct pb_buffer *fence;
770 
771    /* Create a fence, which is a dummy BO. */
772    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
773                                       RADEON_DOMAIN_GTT,
774                                       RADEON_FLAG_NO_SUBALLOC
775                                       | RADEON_FLAG_NO_INTERPROCESS_SHARING);
776    if (!fence)
777       return NULL;
778 
779    /* Add the fence as a dummy relocation. */
780    cs->ws->base.cs_add_buffer(rcs, fence,
781                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
782                               RADEON_PRIO_FENCE);
783    return (struct pipe_fence_handle*)fence;
784 }
785 
radeon_fence_wait(struct radeon_winsys * ws,struct pipe_fence_handle * fence,uint64_t timeout)786 static bool radeon_fence_wait(struct radeon_winsys *ws,
787                               struct pipe_fence_handle *fence,
788                               uint64_t timeout)
789 {
790    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
791                           RADEON_USAGE_READWRITE);
792 }
793 
/* Make '*dst' refer to 'src'. Fences are buffers, so this simply
 * forwards to pb_reference() on the underlying buffer pointers.
 */
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   struct pb_buffer **dst_bo = (struct pb_buffer**)dst;
   struct pb_buffer *src_bo = (struct pb_buffer*)src;

   pb_reference(dst_bo, src_bo);
}
799 
radeon_drm_cs_get_next_fence(struct radeon_cmdbuf * rcs)800 static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
801 {
802    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
803    struct pipe_fence_handle *fence = NULL;
804 
805    if (cs->next_fence) {
806       radeon_fence_reference(&fence, cs->next_fence);
807       return fence;
808    }
809 
810    fence = radeon_cs_create_fence(rcs);
811    if (!fence)
812       return NULL;
813 
814    radeon_fence_reference(&cs->next_fence, fence);
815    return fence;
816 }
817 
/* Fence dependencies are not tracked by this winsys; the function is
 * intentionally empty.
 */
static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence,
                                   unsigned dependency_flags)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1                   Thread 2 / Context 2
    *  --------------------                   --------------------
    *  f = cs_get_next_fence()
    *                                         cs_add_fence_dependency(f)
    *                                         cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
   (void)cs;
   (void)fence;
   (void)dependency_flags;
}
836 
radeon_drm_cs_init_functions(struct radeon_drm_winsys * ws)837 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
838 {
839    ws->base.ctx_create = radeon_drm_ctx_create;
840    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
841    ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
842    ws->base.cs_create = radeon_drm_cs_create;
843    ws->base.cs_destroy = radeon_drm_cs_destroy;
844    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
845    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
846    ws->base.cs_validate = radeon_drm_cs_validate;
847    ws->base.cs_check_space = radeon_drm_cs_check_space;
848    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
849    ws->base.cs_flush = radeon_drm_cs_flush;
850    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
851    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
852    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
853    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
854    ws->base.fence_wait = radeon_fence_wait;
855    ws->base.fence_reference = radeon_fence_reference;
856 }
857