1 /*
2  * Copyright © 2008 Jérôme Glisse
3  * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
22  *
23  * The above copyright notice and this permission notice (including the
24  * next paragraph) shall be included in all copies or substantial portions
25  * of the Software.
26  */
27 
28 /*
29     This file replaces libdrm's radeon_cs_gem with our own implementation.
30     It's optimized specifically for Radeon DRM.
31     Adding buffers and space checking are faster and simpler than their
32     counterparts in libdrm (the time complexity of all the functions
33     is O(1) in nearly all scenarios, thanks to hashing).
34 
35     It works like this:
36 
37     cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
38     also adds the size of 'buf' to the used_gart and used_vram winsys variables
39     based on the domains, which are simply or'd for the accounting purposes.
40     The adding is skipped if the reloc is already present in the list, but it
41     accounts any newly-referenced domains.
42 
43     cs_validate is then called, which just checks:
44         used_vram/gart < vram/gart_size * 0.8
45     The 0.8 number allows for some memory fragmentation. If the validation
46     fails, the pipe driver flushes CS and tries to do the validation again,
47     i.e. it validates only that one operation. If it fails again, it drops
48     the operation on the floor and prints some nasty message to stderr.
49     (done in the pipe driver)
50 
51     cs_write_reloc(cs, buf) just writes a reloc that has been added using
52     cs_add_buffer. The read_domain and write_domain parameters have been removed,
53     because we already specify them in cs_add_buffer.
54 */
55 
56 #include "radeon_drm_cs.h"
57 
58 #include "util/u_memory.h"
59 #include "util/os_time.h"
60 
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <stdint.h>
64 #include <xf86drm.h>
65 
66 
67 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
68 
69 static struct pipe_fence_handle *
70 radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
71 static void radeon_fence_reference(struct pipe_fence_handle **dst,
72                                    struct pipe_fence_handle *src);
73 
/* Create a winsys "context".
 *
 * There is no context support here, so the winsys pointer itself is handed
 * back, reinterpreted as the opaque context handle. */
static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    struct radeon_winsys_ctx *ctx = (struct radeon_winsys_ctx*)ws;

    return ctx;
}
80 
/* Destroy a winsys "context".
 *
 * Nothing to release: there is no context support here. */
static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    (void)ctx;
}
85 
radeon_init_cs_context(struct radeon_cs_context * csc,struct radeon_drm_winsys * ws)86 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
87                                    struct radeon_drm_winsys *ws)
88 {
89     int i;
90 
91     csc->fd = ws->fd;
92 
93     csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
94     csc->chunks[0].length_dw = 0;
95     csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
96     csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
97     csc->chunks[1].length_dw = 0;
98     csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
99     csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
100     csc->chunks[2].length_dw = 2;
101     csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
102 
103     csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
104     csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
105     csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
106 
107     csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
108 
109     for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
110         csc->reloc_indices_hashlist[i] = -1;
111     }
112     return true;
113 }
114 
radeon_cs_context_cleanup(struct radeon_cs_context * csc)115 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
116 {
117     unsigned i;
118 
119     for (i = 0; i < csc->num_relocs; i++) {
120         p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
121         radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
122     }
123     for (i = 0; i < csc->num_slab_buffers; ++i) {
124         p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
125         radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
126     }
127 
128     csc->num_relocs = 0;
129     csc->num_validated_relocs = 0;
130     csc->num_slab_buffers = 0;
131     csc->chunks[0].length_dw = 0;
132     csc->chunks[1].length_dw = 0;
133 
134     for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
135         csc->reloc_indices_hashlist[i] = -1;
136     }
137 }
138 
radeon_destroy_cs_context(struct radeon_cs_context * csc)139 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
140 {
141     radeon_cs_context_cleanup(csc);
142     FREE(csc->slab_buffers);
143     FREE(csc->relocs_bo);
144     FREE(csc->relocs);
145 }
146 
147 
148 static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx * ctx,enum ring_type ring_type,void (* flush)(void * ctx,unsigned flags,struct pipe_fence_handle ** fence),void * flush_ctx)149 radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
150                      enum ring_type ring_type,
151                      void (*flush)(void *ctx, unsigned flags,
152                                    struct pipe_fence_handle **fence),
153                      void *flush_ctx)
154 {
155     struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
156     struct radeon_drm_cs *cs;
157 
158     cs = CALLOC_STRUCT(radeon_drm_cs);
159     if (!cs) {
160         return NULL;
161     }
162     util_queue_fence_init(&cs->flush_completed);
163 
164     cs->ws = ws;
165     cs->flush_cs = flush;
166     cs->flush_data = flush_ctx;
167 
168     if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
169         FREE(cs);
170         return NULL;
171     }
172     if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
173         radeon_destroy_cs_context(&cs->csc1);
174         FREE(cs);
175         return NULL;
176     }
177 
178     /* Set the first command buffer as current. */
179     cs->csc = &cs->csc1;
180     cs->cst = &cs->csc2;
181     cs->base.current.buf = cs->csc->buf;
182     cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
183     cs->ring_type = ring_type;
184 
185     p_atomic_inc(&ws->num_cs);
186     return &cs->base;
187 }
188 
radeon_lookup_buffer(struct radeon_cs_context * csc,struct radeon_bo * bo)189 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
190 {
191     unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
192     struct radeon_bo_item *buffers;
193     unsigned num_buffers;
194     int i = csc->reloc_indices_hashlist[hash];
195 
196     if (bo->handle) {
197         buffers = csc->relocs_bo;
198         num_buffers = csc->num_relocs;
199     } else {
200         buffers = csc->slab_buffers;
201         num_buffers = csc->num_slab_buffers;
202     }
203 
204     /* not found or found */
205     if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
206         return i;
207 
208     /* Hash collision, look for the BO in the list of relocs linearly. */
209     for (i = num_buffers - 1; i >= 0; i--) {
210         if (buffers[i].bo == bo) {
211             /* Put this reloc in the hash list.
212              * This will prevent additional hash collisions if there are
213              * several consecutive lookup_buffer calls for the same buffer.
214              *
215              * Example: Assuming buffers A,B,C collide in the hash list,
216              * the following sequence of relocs:
217              *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
218              * will collide here: ^ and here:   ^,
219              * meaning that we should get very few collisions in the end. */
220             csc->reloc_indices_hashlist[hash] = i;
221             return i;
222         }
223     }
224     return -1;
225 }
226 
radeon_lookup_or_add_real_buffer(struct radeon_drm_cs * cs,struct radeon_bo * bo)227 static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
228                                                  struct radeon_bo *bo)
229 {
230     struct radeon_cs_context *csc = cs->csc;
231     struct drm_radeon_cs_reloc *reloc;
232     unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
233     int i = -1;
234 
235     i = radeon_lookup_buffer(csc, bo);
236 
237     if (i >= 0) {
238         /* For async DMA, every add_buffer call must add a buffer to the list
239          * no matter how many duplicates there are. This is due to the fact
240          * the DMA CS checker doesn't use NOP packets for offset patching,
241          * but always uses the i-th buffer from the list to patch the i-th
242          * offset. If there are N offsets in a DMA CS, there must also be N
243          * buffers in the relocation list.
244          *
245          * This doesn't have to be done if virtual memory is enabled,
246          * because there is no offset patching with virtual memory.
247          */
248         if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
249             return i;
250         }
251     }
252 
253     /* New relocation, check if the backing array is large enough. */
254     if (csc->num_relocs >= csc->max_relocs) {
255         uint32_t size;
256         csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));
257 
258         size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
259         csc->relocs_bo = realloc(csc->relocs_bo, size);
260 
261         size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
262         csc->relocs = realloc(csc->relocs, size);
263 
264         csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
265     }
266 
267     /* Initialize the new relocation. */
268     csc->relocs_bo[csc->num_relocs].bo = NULL;
269     csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
270     radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
271     p_atomic_inc(&bo->num_cs_references);
272     reloc = &csc->relocs[csc->num_relocs];
273     reloc->handle = bo->handle;
274     reloc->read_domains = 0;
275     reloc->write_domain = 0;
276     reloc->flags = 0;
277 
278     csc->reloc_indices_hashlist[hash] = csc->num_relocs;
279 
280     csc->chunks[1].length_dw += RELOC_DWORDS;
281 
282     return csc->num_relocs++;
283 }
284 
radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs * cs,struct radeon_bo * bo)285 static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
286                                             struct radeon_bo *bo)
287 {
288     struct radeon_cs_context *csc = cs->csc;
289     unsigned hash;
290     struct radeon_bo_item *item;
291     int idx;
292     int real_idx;
293 
294     idx = radeon_lookup_buffer(csc, bo);
295     if (idx >= 0)
296         return idx;
297 
298     real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
299 
300     /* Check if the backing array is large enough. */
301     if (csc->num_slab_buffers >= csc->max_slab_buffers) {
302         unsigned new_max = MAX2(csc->max_slab_buffers + 16,
303                                 (unsigned)(csc->max_slab_buffers * 1.3));
304         struct radeon_bo_item *new_buffers =
305             REALLOC(csc->slab_buffers,
306                     csc->max_slab_buffers * sizeof(*new_buffers),
307                     new_max * sizeof(*new_buffers));
308         if (!new_buffers) {
309             fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
310             return -1;
311         }
312 
313         csc->max_slab_buffers = new_max;
314         csc->slab_buffers = new_buffers;
315     }
316 
317     /* Initialize the new relocation. */
318     idx = csc->num_slab_buffers++;
319     item = &csc->slab_buffers[idx];
320 
321     item->bo = NULL;
322     item->u.slab.real_idx = real_idx;
323     radeon_bo_reference(&item->bo, bo);
324     p_atomic_inc(&bo->num_cs_references);
325 
326     hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
327     csc->reloc_indices_hashlist[hash] = idx;
328 
329     return idx;
330 }
331 
radeon_drm_cs_add_buffer(struct radeon_winsys_cs * rcs,struct pb_buffer * buf,enum radeon_bo_usage usage,enum radeon_bo_domain domains,enum radeon_bo_priority priority)332 static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
333                                         struct pb_buffer *buf,
334                                         enum radeon_bo_usage usage,
335                                         enum radeon_bo_domain domains,
336                                         enum radeon_bo_priority priority)
337 {
338     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
339     struct radeon_bo *bo = (struct radeon_bo*)buf;
340     enum radeon_bo_domain added_domains;
341 
342     /* If VRAM is just stolen system memory, allow both VRAM and
343      * GTT, whichever has free space. If a buffer is evicted from
344      * VRAM to GTT, it will stay there.
345      */
346     if (!cs->ws->info.has_dedicated_vram)
347         domains |= RADEON_DOMAIN_GTT;
348 
349     enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
350     enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
351     struct drm_radeon_cs_reloc *reloc;
352     int index;
353 
354     if (!bo->handle) {
355         index = radeon_lookup_or_add_slab_buffer(cs, bo);
356         if (index < 0)
357             return 0;
358 
359         index = cs->csc->slab_buffers[index].u.slab.real_idx;
360     } else {
361         index = radeon_lookup_or_add_real_buffer(cs, bo);
362     }
363 
364     reloc = &cs->csc->relocs[index];
365     added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
366     reloc->read_domains |= rd;
367     reloc->write_domain |= wd;
368     reloc->flags = MAX2(reloc->flags, priority);
369     cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;
370 
371     if (added_domains & RADEON_DOMAIN_VRAM)
372         cs->base.used_vram += bo->base.size;
373     else if (added_domains & RADEON_DOMAIN_GTT)
374         cs->base.used_gart += bo->base.size;
375 
376     return index;
377 }
378 
radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs * rcs,struct pb_buffer * buf)379 static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
380                                    struct pb_buffer *buf)
381 {
382     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
383 
384     return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
385 }
386 
radeon_drm_cs_validate(struct radeon_winsys_cs * rcs)387 static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
388 {
389     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
390     bool status =
391         cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
392         cs->base.used_vram < cs->ws->info.vram_size * 0.8;
393 
394     if (status) {
395         cs->csc->num_validated_relocs = cs->csc->num_relocs;
396     } else {
397         /* Remove lately-added buffers. The validation failed with them
398          * and the CS is about to be flushed because of that. Keep only
399          * the already-validated buffers. */
400         unsigned i;
401 
402         for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
403             p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
404             radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
405         }
406         cs->csc->num_relocs = cs->csc->num_validated_relocs;
407 
408         /* Flush if there are any relocs. Clean up otherwise. */
409         if (cs->csc->num_relocs) {
410             cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
411         } else {
412             radeon_cs_context_cleanup(cs->csc);
413             cs->base.used_vram = 0;
414             cs->base.used_gart = 0;
415 
416             assert(cs->base.current.cdw == 0);
417             if (cs->base.current.cdw != 0) {
418                 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
419             }
420         }
421     }
422     return status;
423 }
424 
radeon_drm_cs_check_space(struct radeon_winsys_cs * rcs,unsigned dw)425 static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
426 {
427    assert(rcs->current.cdw <= rcs->current.max_dw);
428    return rcs->current.max_dw - rcs->current.cdw >= dw;
429 }
430 
radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs * rcs,struct radeon_bo_list_item * list)431 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
432                                               struct radeon_bo_list_item *list)
433 {
434     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
435     int i;
436 
437     if (list) {
438         for (i = 0; i < cs->csc->num_relocs; i++) {
439             list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
440             list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
441             list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
442         }
443     }
444     return cs->csc->num_relocs;
445 }
446 
radeon_drm_cs_emit_ioctl_oneshot(void * job,int thread_index)447 void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
448 {
449     struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
450     unsigned i;
451     int r;
452 
453     r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
454                             &csc->cs, sizeof(struct drm_radeon_cs));
455     if (r) {
456 	if (r == -ENOMEM)
457 	    fprintf(stderr, "radeon: Not enough memory for command submission.\n");
458 	else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
459             unsigned i;
460 
461             fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
462             for (i = 0; i < csc->chunks[0].length_dw; i++) {
463                 fprintf(stderr, "0x%08X\n", csc->buf[i]);
464             }
465         } else {
466             fprintf(stderr, "radeon: The kernel rejected CS, "
467                     "see dmesg for more information (%i).\n", r);
468         }
469     }
470 
471     for (i = 0; i < csc->num_relocs; i++)
472         p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
473     for (i = 0; i < csc->num_slab_buffers; i++)
474         p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
475 
476     radeon_cs_context_cleanup(csc);
477 }
478 
479 /*
480  * Make sure previous submission of this cs are completed
481  */
radeon_drm_cs_sync_flush(struct radeon_winsys_cs * rcs)482 void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
483 {
484     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
485 
486     /* Wait for any pending ioctl of this CS to complete. */
487     if (util_queue_is_initialized(&cs->ws->cs_queue))
488         util_queue_fence_wait(&cs->flush_completed);
489 }
490 
491 /* Add the given fence to a slab buffer fence list.
492  *
493  * There is a potential race condition when bo participates in submissions on
494  * two or more threads simultaneously. Since we do not know which of the
495  * submissions will be sent to the GPU first, we have to keep the fences
496  * of all submissions.
497  *
498  * However, fences that belong to submissions that have already returned from
499  * their respective ioctl do not have to be kept, because we know that they
500  * will signal earlier.
501  */
radeon_bo_slab_fence(struct radeon_bo * bo,struct radeon_bo * fence)502 static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
503 {
504     unsigned dst;
505 
506     assert(fence->num_cs_references);
507 
508     /* Cleanup older fences */
509     dst = 0;
510     for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
511         if (bo->u.slab.fences[src]->num_cs_references) {
512             bo->u.slab.fences[dst] = bo->u.slab.fences[src];
513             dst++;
514         } else {
515             radeon_bo_reference(&bo->u.slab.fences[src], NULL);
516         }
517     }
518     bo->u.slab.num_fences = dst;
519 
520     /* Check available space for the new fence */
521     if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
522         unsigned new_max_fences = bo->u.slab.max_fences + 1;
523         struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
524                                                 bo->u.slab.max_fences * sizeof(*new_fences),
525                                                 new_max_fences * sizeof(*new_fences));
526         if (!new_fences) {
527             fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
528             return;
529         }
530 
531         bo->u.slab.fences = new_fences;
532         bo->u.slab.max_fences = new_max_fences;
533     }
534 
535     /* Add the new fence */
536     bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
537     radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
538     bo->u.slab.num_fences++;
539 }
540 
541 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
542 
radeon_drm_cs_flush(struct radeon_winsys_cs * rcs,unsigned flags,struct pipe_fence_handle ** pfence)543 static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
544                                unsigned flags,
545                                struct pipe_fence_handle **pfence)
546 {
547     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
548     struct radeon_cs_context *tmp;
549 
550     switch (cs->ring_type) {
551     case RING_DMA:
552         /* pad DMA ring to 8 DWs */
553         if (cs->ws->info.chip_class <= SI) {
554             while (rcs->current.cdw & 7)
555                 radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
556         } else {
557             while (rcs->current.cdw & 7)
558                 radeon_emit(&cs->base, 0x00000000); /* NOP packet */
559         }
560         break;
561     case RING_GFX:
562         /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements
563          * r6xx, requires at least 4 dw alignment to avoid a hw bug.
564          */
565         if (cs->ws->info.gfx_ib_pad_with_type2) {
566             while (rcs->current.cdw & 7)
567                 radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
568         } else {
569             while (rcs->current.cdw & 7)
570                 radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
571         }
572         break;
573     case RING_UVD:
574         while (rcs->current.cdw & 15)
575             radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
576         break;
577     default:
578         break;
579     }
580 
581     if (rcs->current.cdw > rcs->current.max_dw) {
582        fprintf(stderr, "radeon: command stream overflowed\n");
583     }
584 
585     if (pfence || cs->csc->num_slab_buffers) {
586         struct pipe_fence_handle *fence;
587 
588         if (cs->next_fence) {
589             fence = cs->next_fence;
590             cs->next_fence = NULL;
591         } else {
592             fence = radeon_cs_create_fence(rcs);
593         }
594 
595         if (fence) {
596             if (pfence)
597                 radeon_fence_reference(pfence, fence);
598 
599             mtx_lock(&cs->ws->bo_fence_lock);
600             for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
601                 struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
602                 p_atomic_inc(&bo->num_active_ioctls);
603                 radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
604             }
605             mtx_unlock(&cs->ws->bo_fence_lock);
606 
607             radeon_fence_reference(&fence, NULL);
608         }
609     } else {
610         radeon_fence_reference(&cs->next_fence, NULL);
611     }
612 
613     radeon_drm_cs_sync_flush(rcs);
614 
615     /* Swap command streams. */
616     tmp = cs->csc;
617     cs->csc = cs->cst;
618     cs->cst = tmp;
619 
620     /* If the CS is not empty or overflowed, emit it in a separate thread. */
621     if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
622         unsigned i, num_relocs;
623 
624         num_relocs = cs->cst->num_relocs;
625 
626         cs->cst->chunks[0].length_dw = cs->base.current.cdw;
627 
628         for (i = 0; i < num_relocs; i++) {
629             /* Update the number of active asynchronous CS ioctls for the buffer. */
630             p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
631         }
632 
633         switch (cs->ring_type) {
634         case RING_DMA:
635             cs->cst->flags[0] = 0;
636             cs->cst->flags[1] = RADEON_CS_RING_DMA;
637             cs->cst->cs.num_chunks = 3;
638             if (cs->ws->info.has_virtual_memory) {
639                 cs->cst->flags[0] |= RADEON_CS_USE_VM;
640             }
641             break;
642 
643         case RING_UVD:
644             cs->cst->flags[0] = 0;
645             cs->cst->flags[1] = RADEON_CS_RING_UVD;
646             cs->cst->cs.num_chunks = 3;
647             break;
648 
649         case RING_VCE:
650             cs->cst->flags[0] = 0;
651             cs->cst->flags[1] = RADEON_CS_RING_VCE;
652             cs->cst->cs.num_chunks = 3;
653             break;
654 
655         default:
656         case RING_GFX:
657         case RING_COMPUTE:
658             cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
659             cs->cst->flags[1] = RADEON_CS_RING_GFX;
660             cs->cst->cs.num_chunks = 3;
661 
662             if (cs->ws->info.has_virtual_memory) {
663                 cs->cst->flags[0] |= RADEON_CS_USE_VM;
664                 cs->cst->cs.num_chunks = 3;
665             }
666             if (flags & PIPE_FLUSH_END_OF_FRAME) {
667                 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
668                 cs->cst->cs.num_chunks = 3;
669             }
670             if (cs->ring_type == RING_COMPUTE) {
671                 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
672                 cs->cst->cs.num_chunks = 3;
673             }
674             break;
675         }
676 
677         if (util_queue_is_initialized(&cs->ws->cs_queue)) {
678             util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
679                                radeon_drm_cs_emit_ioctl_oneshot, NULL);
680             if (!(flags & PIPE_FLUSH_ASYNC))
681                 radeon_drm_cs_sync_flush(rcs);
682         } else {
683             radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
684         }
685     } else {
686         radeon_cs_context_cleanup(cs->cst);
687     }
688 
689     /* Prepare a new CS. */
690     cs->base.current.buf = cs->csc->buf;
691     cs->base.current.cdw = 0;
692     cs->base.used_vram = 0;
693     cs->base.used_gart = 0;
694 
695     if (cs->ring_type == RING_GFX)
696         cs->ws->num_gfx_IBs++;
697     else if (cs->ring_type == RING_DMA)
698         cs->ws->num_sdma_IBs++;
699     return 0;
700 }
701 
radeon_drm_cs_destroy(struct radeon_winsys_cs * rcs)702 static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
703 {
704     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
705 
706     radeon_drm_cs_sync_flush(rcs);
707     util_queue_fence_destroy(&cs->flush_completed);
708     radeon_cs_context_cleanup(&cs->csc1);
709     radeon_cs_context_cleanup(&cs->csc2);
710     p_atomic_dec(&cs->ws->num_cs);
711     radeon_destroy_cs_context(&cs->csc1);
712     radeon_destroy_cs_context(&cs->csc2);
713     radeon_fence_reference(&cs->next_fence, NULL);
714     FREE(cs);
715 }
716 
radeon_bo_is_referenced(struct radeon_winsys_cs * rcs,struct pb_buffer * _buf,enum radeon_bo_usage usage)717 static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
718                                     struct pb_buffer *_buf,
719                                     enum radeon_bo_usage usage)
720 {
721     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
722     struct radeon_bo *bo = (struct radeon_bo*)_buf;
723     int index;
724 
725     if (!bo->num_cs_references)
726         return false;
727 
728     index = radeon_lookup_buffer(cs->csc, bo);
729     if (index == -1)
730         return false;
731 
732     if (!bo->handle)
733         index = cs->csc->slab_buffers[index].u.slab.real_idx;
734 
735     if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
736         return true;
737     if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
738         return true;
739 
740     return false;
741 }
742 
743 /* FENCES */
744 
745 static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs * rcs)746 radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
747 {
748     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
749     struct pb_buffer *fence;
750 
751     /* Create a fence, which is a dummy BO. */
752     fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
753                                        RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
754     if (!fence)
755        return NULL;
756 
757     /* Add the fence as a dummy relocation. */
758     cs->ws->base.cs_add_buffer(rcs, fence,
759                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
760                               RADEON_PRIO_FENCE);
761     return (struct pipe_fence_handle*)fence;
762 }
763 
radeon_fence_wait(struct radeon_winsys * ws,struct pipe_fence_handle * fence,uint64_t timeout)764 static bool radeon_fence_wait(struct radeon_winsys *ws,
765                               struct pipe_fence_handle *fence,
766                               uint64_t timeout)
767 {
768     return ws->buffer_wait((struct pb_buffer*)fence, timeout,
769                            RADEON_USAGE_READWRITE);
770 }
771 
/* Make *dst refer to src. Fences are buffers, so this simply forwards to
 * pb_reference. */
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    struct pb_buffer **dst_buf = (struct pb_buffer**)dst;
    struct pb_buffer *src_buf = (struct pb_buffer*)src;

    pb_reference(dst_buf, src_buf);
}
777 
778 static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs * rcs)779 radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
780 {
781    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
782    struct pipe_fence_handle *fence = NULL;
783 
784    if (cs->next_fence) {
785       radeon_fence_reference(&fence, cs->next_fence);
786       return fence;
787    }
788 
789    fence = radeon_cs_create_fence(rcs);
790    if (!fence)
791       return NULL;
792 
793    radeon_fence_reference(&cs->next_fence, fence);
794    return fence;
795 }
796 
/* Make a CS depend on a fence. Currently a no-op. */
static void
radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
                                   struct pipe_fence_handle *fence)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1                   Thread 2 / Context 2
    *  --------------------                   --------------------
    *  f = cs_get_next_fence()
    *                                         cs_add_fence_dependency(f)
    *                                         cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
   (void)cs;
   (void)fence;
}
814 
radeon_drm_cs_init_functions(struct radeon_drm_winsys * ws)815 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
816 {
817     ws->base.ctx_create = radeon_drm_ctx_create;
818     ws->base.ctx_destroy = radeon_drm_ctx_destroy;
819     ws->base.cs_create = radeon_drm_cs_create;
820     ws->base.cs_destroy = radeon_drm_cs_destroy;
821     ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
822     ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
823     ws->base.cs_validate = radeon_drm_cs_validate;
824     ws->base.cs_check_space = radeon_drm_cs_check_space;
825     ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
826     ws->base.cs_flush = radeon_drm_cs_flush;
827     ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
828     ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
829     ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
830     ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
831     ws->base.fence_wait = radeon_fence_wait;
832     ws->base.fence_reference = radeon_fence_reference;
833 }
834