/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;

   /* Number of active queries that reference entries in this buffer. */
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};
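
/* The layout above is 4 * 4 * 8 + 4 + 31 * 4 = 256 bytes per entry: a whole
 * number of 64-byte cachelines and exactly 32 uint64_t slots, which the
 * initialization loop in gfx10_alloc_query_buffer relies on. A minimal
 * compile-time check of that assumption (assuming C11 static_assert is
 * available through the existing includes):
 */
static_assert(sizeof(struct gfx10_sh_query_buffer_mem) == 256,
              "gfx10_sh_query_buffer_mem entries must remain 256 bytes");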

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   /* First and last ring buffers holding this query's entries. */
   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;

   /* Byte offsets of the query's range within the first and last buffers. */
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

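/* Emit callback of the shader_query atom: it runs on the first draw after the
 * atom was marked dirty. Advance the write head past the entry that
 * gfx10_alloc_query_buffer bound as GFX10_GS_QUERY_BUF; all draws until the
 * next (re)bind accumulate into that entry. */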
static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
   assert(results);

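   /* Each 256-byte entry spans 32 uint64_t slots: slots 0-15 are the four
    * streams' counter pairs and get the SET_PREDICATION high bit, and slot 16
    * zeroes the 32-bit fence (plus the first four bytes of padding). */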
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk: the last entry this query's
    * draws wrote ends at last_end. If last_end == 0, no entry of this buffer
    * has been consumed yet and there is nothing to fence. */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
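   /* The counters were initialized with bit 63 set for SET_PREDICATION
    * compatibility; mask it off before accumulating. */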
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               bool wait, enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

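   /* Config values as used below: 0 accumulates the counter at "offset",
    * 1 checks result availability only, 2 computes the overflow predicate
    * for a single stream, 3 computes it across all streams, and |= 8
    * requests a 64-bit result. */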
   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

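   /* ssbo[0] is the input chunk, ssbo[1] a 16-byte scratch slot carrying
    * partial results between chained buffers, and ssbo[2] the destination;
    * ssbo[2] aliases the scratch slot until the last chunk is processed. */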
   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      /* end == 0 is only possible for the last buffer, when the query ended
       * right after a buffer switch; there are no entries to process, and
       * "continue" here would spin forever because qbuf never advances. */
      if (!end)
         break;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}