1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4  * Copyright 2018 Advanced Micro Devices, Inc.
5  * All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * on the rights to use, copy, modify, merge, publish, distribute, sub
11  * license, and/or sell copies of the Software, and to permit persons to whom
12  * the Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the next
15  * paragraph) shall be included in all copies or substantial portions of the
16  * Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24  * USE OR OTHER DEALINGS IN THE SOFTWARE.
25  */
26 
27 #include "si_query.h"
28 
29 #include "amd/common/sid.h"
30 #include "si_pipe.h"
31 #include "util/os_time.h"
32 #include "util/u_memory.h"
33 #include "util/u_suballoc.h"
34 #include "util/u_upload_mgr.h"
35 
36 static const struct si_query_ops query_hw_ops;
37 
38 struct si_hw_query_params {
39    unsigned start_offset;
40    unsigned end_offset;
41    unsigned fence_offset;
42    unsigned pair_stride;
43    unsigned pair_count;
44 };
45 
46 /* Queries without buffer handling or suspend/resume. */
47 struct si_query_sw {
48    struct si_query b;
49 
50    uint64_t begin_result;
51    uint64_t end_result;
52 
53    uint64_t begin_time;
54    uint64_t end_time;
55 
56    /* Fence for GPU_FINISHED. */
57    struct pipe_fence_handle *fence;
58 };
59 
si_query_sw_destroy(struct si_context * sctx,struct si_query * squery)60 static void si_query_sw_destroy(struct si_context *sctx, struct si_query *squery)
61 {
62    struct si_query_sw *query = (struct si_query_sw *)squery;
63 
64    sctx->b.screen->fence_reference(sctx->b.screen, &query->fence, NULL);
65    FREE(query);
66 }
67 
winsys_id_from_type(unsigned type)68 static enum radeon_value_id winsys_id_from_type(unsigned type)
69 {
70    switch (type) {
71    case SI_QUERY_REQUESTED_VRAM:
72       return RADEON_REQUESTED_VRAM_MEMORY;
73    case SI_QUERY_REQUESTED_GTT:
74       return RADEON_REQUESTED_GTT_MEMORY;
75    case SI_QUERY_MAPPED_VRAM:
76       return RADEON_MAPPED_VRAM;
77    case SI_QUERY_MAPPED_GTT:
78       return RADEON_MAPPED_GTT;
79    case SI_QUERY_BUFFER_WAIT_TIME:
80       return RADEON_BUFFER_WAIT_TIME_NS;
81    case SI_QUERY_NUM_MAPPED_BUFFERS:
82       return RADEON_NUM_MAPPED_BUFFERS;
83    case SI_QUERY_NUM_GFX_IBS:
84       return RADEON_NUM_GFX_IBS;
85    case SI_QUERY_NUM_SDMA_IBS:
86       return RADEON_NUM_SDMA_IBS;
87    case SI_QUERY_GFX_BO_LIST_SIZE:
88       return RADEON_GFX_BO_LIST_COUNTER;
89    case SI_QUERY_GFX_IB_SIZE:
90       return RADEON_GFX_IB_SIZE_COUNTER;
91    case SI_QUERY_NUM_BYTES_MOVED:
92       return RADEON_NUM_BYTES_MOVED;
93    case SI_QUERY_NUM_EVICTIONS:
94       return RADEON_NUM_EVICTIONS;
95    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS:
96       return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
97    case SI_QUERY_VRAM_USAGE:
98       return RADEON_VRAM_USAGE;
99    case SI_QUERY_VRAM_VIS_USAGE:
100       return RADEON_VRAM_VIS_USAGE;
101    case SI_QUERY_GTT_USAGE:
102       return RADEON_GTT_USAGE;
103    case SI_QUERY_GPU_TEMPERATURE:
104       return RADEON_GPU_TEMPERATURE;
105    case SI_QUERY_CURRENT_GPU_SCLK:
106       return RADEON_CURRENT_SCLK;
107    case SI_QUERY_CURRENT_GPU_MCLK:
108       return RADEON_CURRENT_MCLK;
109    case SI_QUERY_CS_THREAD_BUSY:
110       return RADEON_CS_THREAD_TIME;
111    default:
112       unreachable("query type does not correspond to winsys id");
113    }
114 }
115 
si_finish_dma_get_cpu_time(struct si_context * sctx)116 static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx)
117 {
118    struct pipe_fence_handle *fence = NULL;
119 
120    si_flush_dma_cs(sctx, 0, &fence);
121    if (fence) {
122       sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
123       sctx->ws->fence_reference(&fence, NULL);
124    }
125 
126    return os_time_get_nano();
127 }
128 
si_query_sw_begin(struct si_context * sctx,struct si_query * squery)129 static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
130 {
131    struct si_query_sw *query = (struct si_query_sw *)squery;
132    enum radeon_value_id ws_id;
133 
134    switch (query->b.type) {
135    case PIPE_QUERY_TIMESTAMP_DISJOINT:
136    case PIPE_QUERY_GPU_FINISHED:
137       break;
138    case SI_QUERY_TIME_ELAPSED_SDMA_SI:
139       query->begin_result = si_finish_dma_get_cpu_time(sctx);
140       break;
141    case SI_QUERY_DRAW_CALLS:
142       query->begin_result = sctx->num_draw_calls;
143       break;
144    case SI_QUERY_DECOMPRESS_CALLS:
145       query->begin_result = sctx->num_decompress_calls;
146       break;
147    case SI_QUERY_MRT_DRAW_CALLS:
148       query->begin_result = sctx->num_mrt_draw_calls;
149       break;
150    case SI_QUERY_PRIM_RESTART_CALLS:
151       query->begin_result = sctx->num_prim_restart_calls;
152       break;
153    case SI_QUERY_SPILL_DRAW_CALLS:
154       query->begin_result = sctx->num_spill_draw_calls;
155       break;
156    case SI_QUERY_COMPUTE_CALLS:
157       query->begin_result = sctx->num_compute_calls;
158       break;
159    case SI_QUERY_SPILL_COMPUTE_CALLS:
160       query->begin_result = sctx->num_spill_compute_calls;
161       break;
162    case SI_QUERY_DMA_CALLS:
163       query->begin_result = sctx->num_dma_calls;
164       break;
165    case SI_QUERY_CP_DMA_CALLS:
166       query->begin_result = sctx->num_cp_dma_calls;
167       break;
168    case SI_QUERY_NUM_VS_FLUSHES:
169       query->begin_result = sctx->num_vs_flushes;
170       break;
171    case SI_QUERY_NUM_PS_FLUSHES:
172       query->begin_result = sctx->num_ps_flushes;
173       break;
174    case SI_QUERY_NUM_CS_FLUSHES:
175       query->begin_result = sctx->num_cs_flushes;
176       break;
177    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
178       query->begin_result = sctx->num_cb_cache_flushes;
179       break;
180    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
181       query->begin_result = sctx->num_db_cache_flushes;
182       break;
183    case SI_QUERY_NUM_L2_INVALIDATES:
184       query->begin_result = sctx->num_L2_invalidates;
185       break;
186    case SI_QUERY_NUM_L2_WRITEBACKS:
187       query->begin_result = sctx->num_L2_writebacks;
188       break;
189    case SI_QUERY_NUM_RESIDENT_HANDLES:
190       query->begin_result = sctx->num_resident_handles;
191       break;
192    case SI_QUERY_TC_OFFLOADED_SLOTS:
193       query->begin_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
194       break;
195    case SI_QUERY_TC_DIRECT_SLOTS:
196       query->begin_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
197       break;
198    case SI_QUERY_TC_NUM_SYNCS:
199       query->begin_result = sctx->tc ? sctx->tc->num_syncs : 0;
200       break;
201    case SI_QUERY_REQUESTED_VRAM:
202    case SI_QUERY_REQUESTED_GTT:
203    case SI_QUERY_MAPPED_VRAM:
204    case SI_QUERY_MAPPED_GTT:
205    case SI_QUERY_VRAM_USAGE:
206    case SI_QUERY_VRAM_VIS_USAGE:
207    case SI_QUERY_GTT_USAGE:
208    case SI_QUERY_GPU_TEMPERATURE:
209    case SI_QUERY_CURRENT_GPU_SCLK:
210    case SI_QUERY_CURRENT_GPU_MCLK:
211    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
212    case SI_QUERY_NUM_MAPPED_BUFFERS:
213       query->begin_result = 0;
214       break;
215    case SI_QUERY_BUFFER_WAIT_TIME:
216    case SI_QUERY_GFX_IB_SIZE:
217    case SI_QUERY_NUM_GFX_IBS:
218    case SI_QUERY_NUM_SDMA_IBS:
219    case SI_QUERY_NUM_BYTES_MOVED:
220    case SI_QUERY_NUM_EVICTIONS:
221    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
222       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
223       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
224       break;
225    }
226    case SI_QUERY_GFX_BO_LIST_SIZE:
227       ws_id = winsys_id_from_type(query->b.type);
228       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
229       query->begin_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
230       break;
231    case SI_QUERY_CS_THREAD_BUSY:
232       ws_id = winsys_id_from_type(query->b.type);
233       query->begin_result = sctx->ws->query_value(sctx->ws, ws_id);
234       query->begin_time = os_time_get_nano();
235       break;
236    case SI_QUERY_GALLIUM_THREAD_BUSY:
237       query->begin_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
238       query->begin_time = os_time_get_nano();
239       break;
240    case SI_QUERY_GPU_LOAD:
241    case SI_QUERY_GPU_SHADERS_BUSY:
242    case SI_QUERY_GPU_TA_BUSY:
243    case SI_QUERY_GPU_GDS_BUSY:
244    case SI_QUERY_GPU_VGT_BUSY:
245    case SI_QUERY_GPU_IA_BUSY:
246    case SI_QUERY_GPU_SX_BUSY:
247    case SI_QUERY_GPU_WD_BUSY:
248    case SI_QUERY_GPU_BCI_BUSY:
249    case SI_QUERY_GPU_SC_BUSY:
250    case SI_QUERY_GPU_PA_BUSY:
251    case SI_QUERY_GPU_DB_BUSY:
252    case SI_QUERY_GPU_CP_BUSY:
253    case SI_QUERY_GPU_CB_BUSY:
254    case SI_QUERY_GPU_SDMA_BUSY:
255    case SI_QUERY_GPU_PFP_BUSY:
256    case SI_QUERY_GPU_MEQ_BUSY:
257    case SI_QUERY_GPU_ME_BUSY:
258    case SI_QUERY_GPU_SURF_SYNC_BUSY:
259    case SI_QUERY_GPU_CP_DMA_BUSY:
260    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
261       query->begin_result = si_begin_counter(sctx->screen, query->b.type);
262       break;
263    case SI_QUERY_NUM_COMPILATIONS:
264       query->begin_result = p_atomic_read(&sctx->screen->num_compilations);
265       break;
266    case SI_QUERY_NUM_SHADERS_CREATED:
267       query->begin_result = p_atomic_read(&sctx->screen->num_shaders_created);
268       break;
269    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
270       query->begin_result = sctx->screen->live_shader_cache.hits;
271       break;
272    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
273       query->begin_result = sctx->screen->live_shader_cache.misses;
274       break;
275    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
276       query->begin_result = sctx->screen->num_memory_shader_cache_hits;
277       break;
278    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
279       query->begin_result = sctx->screen->num_memory_shader_cache_misses;
280       break;
281    case SI_QUERY_DISK_SHADER_CACHE_HITS:
282       query->begin_result = sctx->screen->num_disk_shader_cache_hits;
283       break;
284    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
285       query->begin_result = sctx->screen->num_disk_shader_cache_misses;
286       break;
287    case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
288       query->begin_result = sctx->compute_num_verts_accepted;
289       break;
290    case SI_QUERY_PD_NUM_PRIMS_REJECTED:
291       query->begin_result = sctx->compute_num_verts_rejected;
292       break;
293    case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
294       query->begin_result = sctx->compute_num_verts_ineligible;
295       break;
296    case SI_QUERY_GPIN_ASIC_ID:
297    case SI_QUERY_GPIN_NUM_SIMD:
298    case SI_QUERY_GPIN_NUM_RB:
299    case SI_QUERY_GPIN_NUM_SPI:
300    case SI_QUERY_GPIN_NUM_SE:
301       break;
302    default:
303       unreachable("si_query_sw_begin: bad query type");
304    }
305 
306    return true;
307 }
308 
si_query_sw_end(struct si_context * sctx,struct si_query * squery)309 static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery)
310 {
311    struct si_query_sw *query = (struct si_query_sw *)squery;
312    enum radeon_value_id ws_id;
313 
314    switch (query->b.type) {
315    case PIPE_QUERY_TIMESTAMP_DISJOINT:
316       break;
317    case PIPE_QUERY_GPU_FINISHED:
318       sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
319       break;
320    case SI_QUERY_TIME_ELAPSED_SDMA_SI:
321       query->end_result = si_finish_dma_get_cpu_time(sctx);
322       break;
323    case SI_QUERY_DRAW_CALLS:
324       query->end_result = sctx->num_draw_calls;
325       break;
326    case SI_QUERY_DECOMPRESS_CALLS:
327       query->end_result = sctx->num_decompress_calls;
328       break;
329    case SI_QUERY_MRT_DRAW_CALLS:
330       query->end_result = sctx->num_mrt_draw_calls;
331       break;
332    case SI_QUERY_PRIM_RESTART_CALLS:
333       query->end_result = sctx->num_prim_restart_calls;
334       break;
335    case SI_QUERY_SPILL_DRAW_CALLS:
336       query->end_result = sctx->num_spill_draw_calls;
337       break;
338    case SI_QUERY_COMPUTE_CALLS:
339       query->end_result = sctx->num_compute_calls;
340       break;
341    case SI_QUERY_SPILL_COMPUTE_CALLS:
342       query->end_result = sctx->num_spill_compute_calls;
343       break;
344    case SI_QUERY_DMA_CALLS:
345       query->end_result = sctx->num_dma_calls;
346       break;
347    case SI_QUERY_CP_DMA_CALLS:
348       query->end_result = sctx->num_cp_dma_calls;
349       break;
350    case SI_QUERY_NUM_VS_FLUSHES:
351       query->end_result = sctx->num_vs_flushes;
352       break;
353    case SI_QUERY_NUM_PS_FLUSHES:
354       query->end_result = sctx->num_ps_flushes;
355       break;
356    case SI_QUERY_NUM_CS_FLUSHES:
357       query->end_result = sctx->num_cs_flushes;
358       break;
359    case SI_QUERY_NUM_CB_CACHE_FLUSHES:
360       query->end_result = sctx->num_cb_cache_flushes;
361       break;
362    case SI_QUERY_NUM_DB_CACHE_FLUSHES:
363       query->end_result = sctx->num_db_cache_flushes;
364       break;
365    case SI_QUERY_NUM_L2_INVALIDATES:
366       query->end_result = sctx->num_L2_invalidates;
367       break;
368    case SI_QUERY_NUM_L2_WRITEBACKS:
369       query->end_result = sctx->num_L2_writebacks;
370       break;
371    case SI_QUERY_NUM_RESIDENT_HANDLES:
372       query->end_result = sctx->num_resident_handles;
373       break;
374    case SI_QUERY_TC_OFFLOADED_SLOTS:
375       query->end_result = sctx->tc ? sctx->tc->num_offloaded_slots : 0;
376       break;
377    case SI_QUERY_TC_DIRECT_SLOTS:
378       query->end_result = sctx->tc ? sctx->tc->num_direct_slots : 0;
379       break;
380    case SI_QUERY_TC_NUM_SYNCS:
381       query->end_result = sctx->tc ? sctx->tc->num_syncs : 0;
382       break;
383    case SI_QUERY_REQUESTED_VRAM:
384    case SI_QUERY_REQUESTED_GTT:
385    case SI_QUERY_MAPPED_VRAM:
386    case SI_QUERY_MAPPED_GTT:
387    case SI_QUERY_VRAM_USAGE:
388    case SI_QUERY_VRAM_VIS_USAGE:
389    case SI_QUERY_GTT_USAGE:
390    case SI_QUERY_GPU_TEMPERATURE:
391    case SI_QUERY_CURRENT_GPU_SCLK:
392    case SI_QUERY_CURRENT_GPU_MCLK:
393    case SI_QUERY_BUFFER_WAIT_TIME:
394    case SI_QUERY_GFX_IB_SIZE:
395    case SI_QUERY_NUM_MAPPED_BUFFERS:
396    case SI_QUERY_NUM_GFX_IBS:
397    case SI_QUERY_NUM_SDMA_IBS:
398    case SI_QUERY_NUM_BYTES_MOVED:
399    case SI_QUERY_NUM_EVICTIONS:
400    case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
401       enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
402       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
403       break;
404    }
405    case SI_QUERY_GFX_BO_LIST_SIZE:
406       ws_id = winsys_id_from_type(query->b.type);
407       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
408       query->end_time = sctx->ws->query_value(sctx->ws, RADEON_NUM_GFX_IBS);
409       break;
410    case SI_QUERY_CS_THREAD_BUSY:
411       ws_id = winsys_id_from_type(query->b.type);
412       query->end_result = sctx->ws->query_value(sctx->ws, ws_id);
413       query->end_time = os_time_get_nano();
414       break;
415    case SI_QUERY_GALLIUM_THREAD_BUSY:
416       query->end_result = sctx->tc ? util_queue_get_thread_time_nano(&sctx->tc->queue, 0) : 0;
417       query->end_time = os_time_get_nano();
418       break;
419    case SI_QUERY_GPU_LOAD:
420    case SI_QUERY_GPU_SHADERS_BUSY:
421    case SI_QUERY_GPU_TA_BUSY:
422    case SI_QUERY_GPU_GDS_BUSY:
423    case SI_QUERY_GPU_VGT_BUSY:
424    case SI_QUERY_GPU_IA_BUSY:
425    case SI_QUERY_GPU_SX_BUSY:
426    case SI_QUERY_GPU_WD_BUSY:
427    case SI_QUERY_GPU_BCI_BUSY:
428    case SI_QUERY_GPU_SC_BUSY:
429    case SI_QUERY_GPU_PA_BUSY:
430    case SI_QUERY_GPU_DB_BUSY:
431    case SI_QUERY_GPU_CP_BUSY:
432    case SI_QUERY_GPU_CB_BUSY:
433    case SI_QUERY_GPU_SDMA_BUSY:
434    case SI_QUERY_GPU_PFP_BUSY:
435    case SI_QUERY_GPU_MEQ_BUSY:
436    case SI_QUERY_GPU_ME_BUSY:
437    case SI_QUERY_GPU_SURF_SYNC_BUSY:
438    case SI_QUERY_GPU_CP_DMA_BUSY:
439    case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
440       query->end_result = si_end_counter(sctx->screen, query->b.type, query->begin_result);
441       query->begin_result = 0;
442       break;
443    case SI_QUERY_NUM_COMPILATIONS:
444       query->end_result = p_atomic_read(&sctx->screen->num_compilations);
445       break;
446    case SI_QUERY_NUM_SHADERS_CREATED:
447       query->end_result = p_atomic_read(&sctx->screen->num_shaders_created);
448       break;
449    case SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
450       query->end_result = sctx->last_tex_ps_draw_ratio;
451       break;
452    case SI_QUERY_LIVE_SHADER_CACHE_HITS:
453       query->end_result = sctx->screen->live_shader_cache.hits;
454       break;
455    case SI_QUERY_LIVE_SHADER_CACHE_MISSES:
456       query->end_result = sctx->screen->live_shader_cache.misses;
457       break;
458    case SI_QUERY_MEMORY_SHADER_CACHE_HITS:
459       query->end_result = sctx->screen->num_memory_shader_cache_hits;
460       break;
461    case SI_QUERY_MEMORY_SHADER_CACHE_MISSES:
462       query->end_result = sctx->screen->num_memory_shader_cache_misses;
463       break;
464    case SI_QUERY_DISK_SHADER_CACHE_HITS:
465       query->end_result = sctx->screen->num_disk_shader_cache_hits;
466       break;
467    case SI_QUERY_DISK_SHADER_CACHE_MISSES:
468       query->end_result = sctx->screen->num_disk_shader_cache_misses;
469       break;
470    case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
471       query->end_result = sctx->compute_num_verts_accepted;
472       break;
473    case SI_QUERY_PD_NUM_PRIMS_REJECTED:
474       query->end_result = sctx->compute_num_verts_rejected;
475       break;
476    case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
477       query->end_result = sctx->compute_num_verts_ineligible;
478       break;
479    case SI_QUERY_GPIN_ASIC_ID:
480    case SI_QUERY_GPIN_NUM_SIMD:
481    case SI_QUERY_GPIN_NUM_RB:
482    case SI_QUERY_GPIN_NUM_SPI:
483    case SI_QUERY_GPIN_NUM_SE:
484       break;
485    default:
486       unreachable("si_query_sw_end: bad query type");
487    }
488 
489    return true;
490 }
491 
si_query_sw_get_result(struct si_context * sctx,struct si_query * squery,bool wait,union pipe_query_result * result)492 static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
493                                    union pipe_query_result *result)
494 {
495    struct si_query_sw *query = (struct si_query_sw *)squery;
496 
497    switch (query->b.type) {
498    case PIPE_QUERY_TIMESTAMP_DISJOINT:
499       /* Convert from cycles per millisecond to cycles per second (Hz). */
500       result->timestamp_disjoint.frequency = (uint64_t)sctx->screen->info.clock_crystal_freq * 1000;
501       result->timestamp_disjoint.disjoint = false;
502       return true;
503    case PIPE_QUERY_GPU_FINISHED: {
504       struct pipe_screen *screen = sctx->b.screen;
505       struct pipe_context *ctx = squery->b.flushed ? NULL : &sctx->b;
506 
507       result->b = screen->fence_finish(screen, ctx, query->fence, wait ? PIPE_TIMEOUT_INFINITE : 0);
508       return result->b;
509    }
510 
511    case SI_QUERY_GFX_BO_LIST_SIZE:
512       result->u64 =
513          (query->end_result - query->begin_result) / (query->end_time - query->begin_time);
514       return true;
515    case SI_QUERY_CS_THREAD_BUSY:
516    case SI_QUERY_GALLIUM_THREAD_BUSY:
517       result->u64 =
518          (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time);
519       return true;
520    case SI_QUERY_PD_NUM_PRIMS_ACCEPTED:
521    case SI_QUERY_PD_NUM_PRIMS_REJECTED:
522    case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE:
523       result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3;
524       return true;
525    case SI_QUERY_GPIN_ASIC_ID:
526       result->u32 = 0;
527       return true;
528    case SI_QUERY_GPIN_NUM_SIMD:
529       result->u32 = sctx->screen->info.num_good_compute_units;
530       return true;
531    case SI_QUERY_GPIN_NUM_RB:
532       result->u32 = sctx->screen->info.num_render_backends;
533       return true;
534    case SI_QUERY_GPIN_NUM_SPI:
535       result->u32 = 1; /* all supported chips have one SPI per SE */
536       return true;
537    case SI_QUERY_GPIN_NUM_SE:
538       result->u32 = sctx->screen->info.max_se;
539       return true;
540    }
541 
542    result->u64 = query->end_result - query->begin_result;
543 
544    switch (query->b.type) {
545    case SI_QUERY_BUFFER_WAIT_TIME:
546    case SI_QUERY_GPU_TEMPERATURE:
547       result->u64 /= 1000;
548       break;
549    case SI_QUERY_CURRENT_GPU_SCLK:
550    case SI_QUERY_CURRENT_GPU_MCLK:
551       result->u64 *= 1000000;
552       break;
553    }
554 
555    return true;
556 }
557 
558 static const struct si_query_ops sw_query_ops = {.destroy = si_query_sw_destroy,
559                                                  .begin = si_query_sw_begin,
560                                                  .end = si_query_sw_end,
561                                                  .get_result = si_query_sw_get_result,
562                                                  .get_result_resource = NULL};
563 
si_query_sw_create(unsigned query_type)564 static struct pipe_query *si_query_sw_create(unsigned query_type)
565 {
566    struct si_query_sw *query;
567 
568    query = CALLOC_STRUCT(si_query_sw);
569    if (!query)
570       return NULL;
571 
572    query->b.type = query_type;
573    query->b.ops = &sw_query_ops;
574 
575    return (struct pipe_query *)query;
576 }
577 
si_query_buffer_destroy(struct si_screen * sscreen,struct si_query_buffer * buffer)578 void si_query_buffer_destroy(struct si_screen *sscreen, struct si_query_buffer *buffer)
579 {
580    struct si_query_buffer *prev = buffer->previous;
581 
582    /* Release all query buffers. */
583    while (prev) {
584       struct si_query_buffer *qbuf = prev;
585       prev = prev->previous;
586       si_resource_reference(&qbuf->buf, NULL);
587       FREE(qbuf);
588    }
589 
590    si_resource_reference(&buffer->buf, NULL);
591 }
592 
si_query_buffer_reset(struct si_context * sctx,struct si_query_buffer * buffer)593 void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer)
594 {
595    /* Discard all query buffers except for the oldest. */
596    while (buffer->previous) {
597       struct si_query_buffer *qbuf = buffer->previous;
598       buffer->previous = qbuf->previous;
599 
600       si_resource_reference(&buffer->buf, NULL);
601       buffer->buf = qbuf->buf; /* move ownership */
602       FREE(qbuf);
603    }
604    buffer->results_end = 0;
605 
606    if (!buffer->buf)
607       return;
608 
609    /* Discard even the oldest buffer if it can't be mapped without a stall. */
610    if (si_rings_is_buffer_referenced(sctx, buffer->buf->buf, RADEON_USAGE_READWRITE) ||
611        !sctx->ws->buffer_wait(buffer->buf->buf, 0, RADEON_USAGE_READWRITE)) {
612       si_resource_reference(&buffer->buf, NULL);
613    } else {
614       buffer->unprepared = true;
615    }
616 }
617 
si_query_buffer_alloc(struct si_context * sctx,struct si_query_buffer * buffer,bool (* prepare_buffer)(struct si_context *,struct si_query_buffer *),unsigned size)618 bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
619                            bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
620                            unsigned size)
621 {
622    bool unprepared = buffer->unprepared;
623    buffer->unprepared = false;
624 
625    if (!buffer->buf || buffer->results_end + size > buffer->buf->b.b.width0) {
626       if (buffer->buf) {
627          struct si_query_buffer *qbuf = MALLOC_STRUCT(si_query_buffer);
628          memcpy(qbuf, buffer, sizeof(*qbuf));
629          buffer->previous = qbuf;
630       }
631       buffer->results_end = 0;
632 
633       /* Queries are normally read by the CPU after
634        * being written by the gpu, hence staging is probably a good
635        * usage pattern.
636        */
637       struct si_screen *screen = sctx->screen;
638       unsigned buf_size = MAX2(size, screen->info.min_alloc_size);
639       buffer->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
640       if (unlikely(!buffer->buf))
641          return false;
642       unprepared = true;
643    }
644 
645    if (unprepared && prepare_buffer) {
646       if (unlikely(!prepare_buffer(sctx, buffer))) {
647          si_resource_reference(&buffer->buf, NULL);
648          return false;
649       }
650    }
651 
652    return true;
653 }
654 
si_query_hw_destroy(struct si_context * sctx,struct si_query * squery)655 void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery)
656 {
657    struct si_query_hw *query = (struct si_query_hw *)squery;
658 
659    si_query_buffer_destroy(sctx->screen, &query->buffer);
660    si_resource_reference(&query->workaround_buf, NULL);
661    FREE(squery);
662 }
663 
si_query_hw_prepare_buffer(struct si_context * sctx,struct si_query_buffer * qbuf)664 static bool si_query_hw_prepare_buffer(struct si_context *sctx, struct si_query_buffer *qbuf)
665 {
666    static const struct si_query_hw si_query_hw_s;
667    struct si_query_hw *query = container_of(qbuf, &si_query_hw_s, buffer);
668    struct si_screen *screen = sctx->screen;
669 
670    /* The caller ensures that the buffer is currently unused by the GPU. */
671    uint32_t *results = screen->ws->buffer_map(qbuf->buf->buf, NULL,
672                                               PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED);
673    if (!results)
674       return false;
675 
676    memset(results, 0, qbuf->buf->b.b.width0);
677 
678    if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
679        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
680        query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
681       unsigned max_rbs = screen->info.num_render_backends;
682       unsigned enabled_rb_mask = screen->info.enabled_rb_mask;
683       unsigned num_results;
684       unsigned i, j;
685 
686       /* Set top bits for unused backends. */
687       num_results = qbuf->buf->b.b.width0 / query->result_size;
688       for (j = 0; j < num_results; j++) {
689          for (i = 0; i < max_rbs; i++) {
690             if (!(enabled_rb_mask & (1 << i))) {
691                results[(i * 4) + 1] = 0x80000000;
692                results[(i * 4) + 3] = 0x80000000;
693             }
694          }
695          results += 4 * max_rbs;
696       }
697    }
698 
699    return true;
700 }
701 
/* Prototypes of hardware-query callbacks that are referenced before their
 * definitions (the definitions are not in this part of the file).
 */
static void si_query_hw_get_result_resource(struct si_context *sctx,
                                            struct si_query *squery,
                                            bool wait,
                                            enum pipe_query_value_type result_type,
                                            int index,
                                            struct pipe_resource *resource,
                                            unsigned offset);

static void si_query_hw_do_emit_start(struct si_context *sctx,
                                      struct si_query_hw *query,
                                      struct si_resource *buffer,
                                      uint64_t va);

static void si_query_hw_do_emit_stop(struct si_context *sctx,
                                     struct si_query_hw *query,
                                     struct si_resource *buffer,
                                     uint64_t va);

static void si_query_hw_add_result(struct si_screen *sscreen,
                                   struct si_query_hw *query,
                                   void *buffer,
                                   union pipe_query_result *result);

static void si_query_hw_clear_result(struct si_query_hw *query,
                                     union pipe_query_result *result);
714 
715 static struct si_query_hw_ops query_hw_default_hw_ops = {
716    .prepare_buffer = si_query_hw_prepare_buffer,
717    .emit_start = si_query_hw_do_emit_start,
718    .emit_stop = si_query_hw_do_emit_stop,
719    .clear_result = si_query_hw_clear_result,
720    .add_result = si_query_hw_add_result,
721 };
722 
si_query_hw_create(struct si_screen * sscreen,unsigned query_type,unsigned index)723 static struct pipe_query *si_query_hw_create(struct si_screen *sscreen, unsigned query_type,
724                                              unsigned index)
725 {
726    struct si_query_hw *query = CALLOC_STRUCT(si_query_hw);
727    if (!query)
728       return NULL;
729 
730    query->b.type = query_type;
731    query->b.ops = &query_hw_ops;
732    query->ops = &query_hw_default_hw_ops;
733 
734    switch (query_type) {
735    case PIPE_QUERY_OCCLUSION_COUNTER:
736    case PIPE_QUERY_OCCLUSION_PREDICATE:
737    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
738       query->result_size = 16 * sscreen->info.num_render_backends;
739       query->result_size += 16; /* for the fence + alignment */
740       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
741       break;
742    case SI_QUERY_TIME_ELAPSED_SDMA:
743       /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
744       query->result_size = 64;
745       break;
746    case PIPE_QUERY_TIME_ELAPSED:
747       query->result_size = 24;
748       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
749       break;
750    case PIPE_QUERY_TIMESTAMP:
751       query->result_size = 16;
752       query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
753       query->flags = SI_QUERY_HW_FLAG_NO_START;
754       break;
755    case PIPE_QUERY_PRIMITIVES_EMITTED:
756    case PIPE_QUERY_PRIMITIVES_GENERATED:
757    case PIPE_QUERY_SO_STATISTICS:
758    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
759       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
760       query->result_size = 32;
761       query->b.num_cs_dw_suspend = 6;
762       query->stream = index;
763       break;
764    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
765       /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
766       query->result_size = 32 * SI_MAX_STREAMS;
767       query->b.num_cs_dw_suspend = 6 * SI_MAX_STREAMS;
768       break;
769    case PIPE_QUERY_PIPELINE_STATISTICS:
770       /* 11 values on GCN. */
771       query->result_size = 11 * 16;
772       query->result_size += 8; /* for the fence + alignment */
773       query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
774       break;
775    default:
776       assert(0);
777       FREE(query);
778       return NULL;
779    }
780 
781    return (struct pipe_query *)query;
782 }
783 
si_update_occlusion_query_state(struct si_context * sctx,unsigned type,int diff)784 static void si_update_occlusion_query_state(struct si_context *sctx, unsigned type, int diff)
785 {
786    if (type == PIPE_QUERY_OCCLUSION_COUNTER || type == PIPE_QUERY_OCCLUSION_PREDICATE ||
787        type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
788       bool old_enable = sctx->num_occlusion_queries != 0;
789       bool old_perfect_enable = sctx->num_perfect_occlusion_queries != 0;
790       bool enable, perfect_enable;
791 
792       sctx->num_occlusion_queries += diff;
793       assert(sctx->num_occlusion_queries >= 0);
794 
795       if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
796          sctx->num_perfect_occlusion_queries += diff;
797          assert(sctx->num_perfect_occlusion_queries >= 0);
798       }
799 
800       enable = sctx->num_occlusion_queries != 0;
801       perfect_enable = sctx->num_perfect_occlusion_queries != 0;
802 
803       if (enable != old_enable || perfect_enable != old_perfect_enable) {
804          si_set_occlusion_query_state(sctx, old_perfect_enable);
805       }
806    }
807 }
808 
event_type_for_stream(unsigned stream)809 static unsigned event_type_for_stream(unsigned stream)
810 {
811    switch (stream) {
812    default:
813    case 0:
814       return V_028A90_SAMPLE_STREAMOUTSTATS;
815    case 1:
816       return V_028A90_SAMPLE_STREAMOUTSTATS1;
817    case 2:
818       return V_028A90_SAMPLE_STREAMOUTSTATS2;
819    case 3:
820       return V_028A90_SAMPLE_STREAMOUTSTATS3;
821    }
822 }
823 
emit_sample_streamout(struct radeon_cmdbuf * cs,uint64_t va,unsigned stream)824 static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
825 {
826    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
827    radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
828    radeon_emit(cs, va);
829    radeon_emit(cs, va >> 32);
830 }
831 
si_query_hw_do_emit_start(struct si_context * sctx,struct si_query_hw * query,struct si_resource * buffer,uint64_t va)832 static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
833                                       struct si_resource *buffer, uint64_t va)
834 {
835    struct radeon_cmdbuf *cs = sctx->gfx_cs;
836 
837    switch (query->b.type) {
838    case SI_QUERY_TIME_ELAPSED_SDMA:
839       si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
840       return;
841    case PIPE_QUERY_OCCLUSION_COUNTER:
842    case PIPE_QUERY_OCCLUSION_PREDICATE:
843    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
844       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
845       radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
846       radeon_emit(cs, va);
847       radeon_emit(cs, va >> 32);
848       break;
849    case PIPE_QUERY_PRIMITIVES_EMITTED:
850    case PIPE_QUERY_PRIMITIVES_GENERATED:
851    case PIPE_QUERY_SO_STATISTICS:
852    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
853       emit_sample_streamout(cs, va, query->stream);
854       break;
855    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
856       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
857          emit_sample_streamout(cs, va + 32 * stream, stream);
858       break;
859    case PIPE_QUERY_TIME_ELAPSED:
860       si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
861                         EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
862       break;
863    case PIPE_QUERY_PIPELINE_STATISTICS:
864       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
865       radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
866       radeon_emit(cs, va);
867       radeon_emit(cs, va >> 32);
868       break;
869    default:
870       assert(0);
871    }
872    radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
873                              RADEON_PRIO_QUERY);
874 }
875 
si_query_hw_emit_start(struct si_context * sctx,struct si_query_hw * query)876 static void si_query_hw_emit_start(struct si_context *sctx, struct si_query_hw *query)
877 {
878    uint64_t va;
879 
880    if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer, query->result_size))
881       return;
882 
883    si_update_occlusion_query_state(sctx, query->b.type, 1);
884    si_update_prims_generated_query_state(sctx, query->b.type, 1);
885 
886    if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
887       sctx->num_pipeline_stat_queries++;
888 
889    if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
890       si_need_gfx_cs_space(sctx, 0);
891 
892    va = query->buffer.buf->gpu_address + query->buffer.results_end;
893    query->ops->emit_start(sctx, query, query->buffer.buf, va);
894 }
895 
/* Emit the "end" sample of a hardware query and, for the types that use a
 * separate fence dword, the fence write that marks the result as complete.
 *
 * sctx:   context whose gfx command stream receives the packets
 * query:  query being stopped; query->b.type selects what gets emitted
 * buffer: result buffer, used to turn `va` into a buffer offset on the SDMA path
 * va:     GPU address of the start of the current result slot
 */
static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw *query,
                                     struct si_resource *buffer, uint64_t va)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   /* Stays 0 for the types that do not get a separate fence write below. */
   uint64_t fence_va = 0;

   switch (query->b.type) {
   case SI_QUERY_TIME_ELAPSED_SDMA:
      /* End timestamp goes 32 bytes after the begin one; returns before the
       * gfx buffer list and fence handling below. */
      si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
      return;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
      /* The end value sits 8 bytes after the begin value. */
      va += 8;
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      /* va was already advanced by 8, so this is slot start +
       * num_render_backends * 16. */
      fence_va = va + sctx->screen->info.num_render_backends * 16 - 8;
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* End sample goes 16 bytes after the begin sample; no separate fence. */
      va += 16;
      emit_sample_streamout(cs, va, query->stream);
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      /* Same as above, repeated for every stream at a 32-byte stride. */
      va += 16;
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream)
         emit_sample_streamout(cs, va + 32 * stream, stream);
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      /* End timestamp follows the 8-byte begin timestamp. */
      va += 8;
      /* fall through */
   case PIPE_QUERY_TIMESTAMP:
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
      /* Fence goes 8 bytes after the timestamp written above. */
      fence_va = va + 8;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS: {
      /* result_size covers a begin sample, an end sample and 8 trailing bytes. */
      unsigned sample_size = (query->result_size - 8) / 2;

      va += sample_size;
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      /* Fence directly follows the end sample. */
      fence_va = va + sample_size;
      break;
   }
   default:
      assert(0);
   }
   radeon_add_to_buffer_list(sctx, sctx->gfx_cs, query->buffer.buf, RADEON_USAGE_WRITE,
                             RADEON_PRIO_QUERY);

   /* Write the 32-bit value 0x80000000 to the fence location. */
   if (fence_va) {
      si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_VALUE_32BIT, query->buffer.buf, fence_va, 0x80000000,
                        query->b.type);
   }
}
961 
si_query_hw_emit_stop(struct si_context * sctx,struct si_query_hw * query)962 static void si_query_hw_emit_stop(struct si_context *sctx, struct si_query_hw *query)
963 {
964    uint64_t va;
965 
966    /* The queries which need begin already called this in begin_query. */
967    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
968       si_need_gfx_cs_space(sctx, 0);
969       if (!si_query_buffer_alloc(sctx, &query->buffer, query->ops->prepare_buffer,
970                                  query->result_size))
971          return;
972    }
973 
974    if (!query->buffer.buf)
975       return; // previous buffer allocation failure
976 
977    /* emit end query */
978    va = query->buffer.buf->gpu_address + query->buffer.results_end;
979 
980    query->ops->emit_stop(sctx, query, query->buffer.buf, va);
981 
982    query->buffer.results_end += query->result_size;
983 
984    si_update_occlusion_query_state(sctx, query->b.type, -1);
985    si_update_prims_generated_query_state(sctx, query->b.type, -1);
986 
987    if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
988       sctx->num_pipeline_stat_queries--;
989 }
990 
emit_set_predicate(struct si_context * ctx,struct si_resource * buf,uint64_t va,uint32_t op)991 static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf, uint64_t va,
992                                uint32_t op)
993 {
994    struct radeon_cmdbuf *cs = ctx->gfx_cs;
995 
996    if (ctx->chip_class >= GFX9) {
997       radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
998       radeon_emit(cs, op);
999       radeon_emit(cs, va);
1000       radeon_emit(cs, va >> 32);
1001    } else {
1002       radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
1003       radeon_emit(cs, va);
1004       radeon_emit(cs, op | ((va >> 32) & 0xFF));
1005    }
1006    radeon_add_to_buffer_list(ctx, ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY);
1007 }
1008 
/* Emit the SET_PREDICATION packets for the context's current render
 * condition (ctx->render_cond). Does nothing when no render condition is set.
 *
 * When the query has a workaround buffer, a single packet pointing at that
 * buffer is emitted. Otherwise one packet is emitted per result slot in every
 * buffer of the query's chain (one per stream and slot for
 * SO_OVERFLOW_ANY_PREDICATE).
 */
static void si_emit_query_predication(struct si_context *ctx)
{
   struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
   struct si_query_buffer *qbuf;
   uint32_t op;
   bool flag_wait, invert;

   if (!query)
      return;

   /* Streamout-overflow predication is not supported together with NGG
    * streamout; this only trips an assertion, execution continues. */
   if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
                                          query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) {
      assert(!"not implemented");
   }

   invert = ctx->render_cond_invert;
   flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
               ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;

   /* Pick the predication operation: BOOL64 for the workaround buffer,
    * otherwise one that matches the query type. */
   if (query->workaround_buf) {
      op = PRED_OP(PREDICATION_OP_BOOL64);
   } else {
      switch (query->b.type) {
      case PIPE_QUERY_OCCLUSION_COUNTER:
      case PIPE_QUERY_OCCLUSION_PREDICATE:
      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
         op = PRED_OP(PREDICATION_OP_ZPASS);
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
         /* The sense of the condition is flipped for streamout overflow. */
         invert = !invert;
         break;
      default:
         /* Other query types cannot be used for predication. */
         assert(0);
         return;
      }
   }

   /* if true then invert, see GL_ARB_conditional_render_inverted */
   if (invert)
      op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
   else
      op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */

   /* Use the value written by compute shader as a workaround. Note that
    * the wait flag does not apply in this predication mode.
    *
    * The shader outputs the result value to L2. Workarounds only affect GFX8
    * and later, where the CP reads data from L2, so we don't need an
    * additional flush.
    */
   if (query->workaround_buf) {
      uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
      emit_set_predicate(ctx, query->workaround_buf, va, op);
      return;
   }

   op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;

   /* emit predicate packets for all data blocks */
   for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
      unsigned results_base = 0;
      uint64_t va_base = qbuf->buf->gpu_address;

      /* Walk every result slot written into this buffer. */
      while (results_base < qbuf->results_end) {
         uint64_t va = va_base + results_base;

         if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
            /* One packet per stream; the per-stream data is 32 bytes apart. */
            for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
               emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);

               /* set CONTINUE bit for all packets except the first */
               op |= PREDICATION_CONTINUE;
            }
         } else {
            emit_set_predicate(ctx, qbuf->buf, va, op);
            /* As above: every packet after the first carries CONTINUE. */
            op |= PREDICATION_CONTINUE;
         }

         results_base += query->result_size;
      }
   }
}
1093 
si_create_query(struct pipe_context * ctx,unsigned query_type,unsigned index)1094 static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned query_type,
1095                                           unsigned index)
1096 {
1097    struct si_screen *sscreen = (struct si_screen *)ctx->screen;
1098 
1099    if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
1100        (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA))
1101       return si_query_sw_create(query_type);
1102 
1103    if (sscreen->use_ngg_streamout &&
1104        (query_type == PIPE_QUERY_PRIMITIVES_EMITTED ||
1105         query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS ||
1106         query_type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1107         query_type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE))
1108       return gfx10_sh_query_create(sscreen, query_type, index);
1109 
1110    return si_query_hw_create(sscreen, query_type, index);
1111 }
1112 
si_destroy_query(struct pipe_context * ctx,struct pipe_query * query)1113 static void si_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1114 {
1115    struct si_context *sctx = (struct si_context *)ctx;
1116    struct si_query *squery = (struct si_query *)query;
1117 
1118    squery->ops->destroy(sctx, squery);
1119 }
1120 
si_begin_query(struct pipe_context * ctx,struct pipe_query * query)1121 static bool si_begin_query(struct pipe_context *ctx, struct pipe_query *query)
1122 {
1123    struct si_context *sctx = (struct si_context *)ctx;
1124    struct si_query *squery = (struct si_query *)query;
1125 
1126    return squery->ops->begin(sctx, squery);
1127 }
1128 
si_query_hw_begin(struct si_context * sctx,struct si_query * squery)1129 bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery)
1130 {
1131    struct si_query_hw *query = (struct si_query_hw *)squery;
1132 
1133    if (query->flags & SI_QUERY_HW_FLAG_NO_START) {
1134       assert(0);
1135       return false;
1136    }
1137 
1138    if (!(query->flags & SI_QUERY_HW_FLAG_BEGIN_RESUMES))
1139       si_query_buffer_reset(sctx, &query->buffer);
1140 
1141    si_resource_reference(&query->workaround_buf, NULL);
1142 
1143    si_query_hw_emit_start(sctx, query);
1144    if (!query->buffer.buf)
1145       return false;
1146 
1147    list_addtail(&query->b.active_list, &sctx->active_queries);
1148    sctx->num_cs_dw_queries_suspend += query->b.num_cs_dw_suspend;
1149    return true;
1150 }
1151 
si_end_query(struct pipe_context * ctx,struct pipe_query * query)1152 static bool si_end_query(struct pipe_context *ctx, struct pipe_query *query)
1153 {
1154    struct si_context *sctx = (struct si_context *)ctx;
1155    struct si_query *squery = (struct si_query *)query;
1156 
1157    return squery->ops->end(sctx, squery);
1158 }
1159 
si_query_hw_end(struct si_context * sctx,struct si_query * squery)1160 bool si_query_hw_end(struct si_context *sctx, struct si_query *squery)
1161 {
1162    struct si_query_hw *query = (struct si_query_hw *)squery;
1163 
1164    if (query->flags & SI_QUERY_HW_FLAG_NO_START)
1165       si_query_buffer_reset(sctx, &query->buffer);
1166 
1167    si_query_hw_emit_stop(sctx, query);
1168 
1169    if (!(query->flags & SI_QUERY_HW_FLAG_NO_START)) {
1170       list_delinit(&query->b.active_list);
1171       sctx->num_cs_dw_queries_suspend -= query->b.num_cs_dw_suspend;
1172    }
1173 
1174    if (!query->buffer.buf)
1175       return false;
1176 
1177    return true;
1178 }
1179 
si_get_hw_query_params(struct si_context * sctx,struct si_query_hw * squery,int index,struct si_hw_query_params * params)1180 static void si_get_hw_query_params(struct si_context *sctx, struct si_query_hw *squery, int index,
1181                                    struct si_hw_query_params *params)
1182 {
1183    unsigned max_rbs = sctx->screen->info.num_render_backends;
1184 
1185    params->pair_stride = 0;
1186    params->pair_count = 1;
1187 
1188    switch (squery->b.type) {
1189    case PIPE_QUERY_OCCLUSION_COUNTER:
1190    case PIPE_QUERY_OCCLUSION_PREDICATE:
1191    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1192       params->start_offset = 0;
1193       params->end_offset = 8;
1194       params->fence_offset = max_rbs * 16;
1195       params->pair_stride = 16;
1196       params->pair_count = max_rbs;
1197       break;
1198    case PIPE_QUERY_TIME_ELAPSED:
1199       params->start_offset = 0;
1200       params->end_offset = 8;
1201       params->fence_offset = 16;
1202       break;
1203    case PIPE_QUERY_TIMESTAMP:
1204       params->start_offset = 0;
1205       params->end_offset = 0;
1206       params->fence_offset = 8;
1207       break;
1208    case PIPE_QUERY_PRIMITIVES_EMITTED:
1209       params->start_offset = 8;
1210       params->end_offset = 24;
1211       params->fence_offset = params->end_offset + 4;
1212       break;
1213    case PIPE_QUERY_PRIMITIVES_GENERATED:
1214       params->start_offset = 0;
1215       params->end_offset = 16;
1216       params->fence_offset = params->end_offset + 4;
1217       break;
1218    case PIPE_QUERY_SO_STATISTICS:
1219       params->start_offset = 8 - index * 8;
1220       params->end_offset = 24 - index * 8;
1221       params->fence_offset = params->end_offset + 4;
1222       break;
1223    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1224       params->pair_count = SI_MAX_STREAMS;
1225       params->pair_stride = 32;
1226       /* fallthrough */
1227    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1228       params->start_offset = 0;
1229       params->end_offset = 16;
1230 
1231       /* We can re-use the high dword of the last 64-bit value as a
1232        * fence: it is initialized as 0, and the high bit is set by
1233        * the write of the streamout stats event.
1234        */
1235       params->fence_offset = squery->result_size - 4;
1236       break;
1237    case PIPE_QUERY_PIPELINE_STATISTICS: {
1238       static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
1239       params->start_offset = offsets[index];
1240       params->end_offset = 88 + offsets[index];
1241       params->fence_offset = 2 * 88;
1242       break;
1243    }
1244    default:
1245       unreachable("si_get_hw_query_params unsupported");
1246    }
1247 }
1248 
/* Read one begin/end pair of 64-bit values from a mapped query buffer and
 * return their difference.
 *
 * map:             start of the mapped result data, addressed as 32-bit dwords
 * start_index:     dword index of the low half of the begin value
 * end_index:       dword index of the low half of the end value
 * test_status_bit: when true, the pair only counts if bit 63 is set in both
 *                  the begin and the end value
 *
 * Returns end - start as a full 64-bit value, or 0 when test_status_bit is
 * set and either value lacks bit 63.
 *
 * The return type is 64-bit on purpose: callers accumulate the difference
 * into 64-bit counters, and returning `unsigned` would silently drop the
 * upper 32 bits of large differences.
 */
static uint64_t si_query_read_result(void *map, unsigned start_index, unsigned end_index,
                                     bool test_status_bit)
{
   const uint32_t *dwords = (const uint32_t *)map;
   const uint64_t status_bit = UINT64_C(0x8000000000000000);

   /* Assemble each 64-bit value from its low and high dword. */
   const uint64_t start = (uint64_t)dwords[start_index] | (uint64_t)dwords[start_index + 1] << 32;
   const uint64_t end = (uint64_t)dwords[end_index] | (uint64_t)dwords[end_index + 1] << 32;

   if (test_status_bit && !((start & status_bit) && (end & status_bit)))
      return 0;

   return end - start;
}
1263 
si_query_hw_add_result(struct si_screen * sscreen,struct si_query_hw * query,void * buffer,union pipe_query_result * result)1264 static void si_query_hw_add_result(struct si_screen *sscreen, struct si_query_hw *query,
1265                                    void *buffer, union pipe_query_result *result)
1266 {
1267    unsigned max_rbs = sscreen->info.num_render_backends;
1268 
1269    switch (query->b.type) {
1270    case PIPE_QUERY_OCCLUSION_COUNTER: {
1271       for (unsigned i = 0; i < max_rbs; ++i) {
1272          unsigned results_base = i * 16;
1273          result->u64 += si_query_read_result(buffer + results_base, 0, 2, true);
1274       }
1275       break;
1276    }
1277    case PIPE_QUERY_OCCLUSION_PREDICATE:
1278    case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1279       for (unsigned i = 0; i < max_rbs; ++i) {
1280          unsigned results_base = i * 16;
1281          result->b = result->b || si_query_read_result(buffer + results_base, 0, 2, true) != 0;
1282       }
1283       break;
1284    }
1285    case PIPE_QUERY_TIME_ELAPSED:
1286       result->u64 += si_query_read_result(buffer, 0, 2, false);
1287       break;
1288    case SI_QUERY_TIME_ELAPSED_SDMA:
1289       result->u64 += si_query_read_result(buffer, 0, 32 / 4, false);
1290       break;
1291    case PIPE_QUERY_TIMESTAMP:
1292       result->u64 = *(uint64_t *)buffer;
1293       break;
1294    case PIPE_QUERY_PRIMITIVES_EMITTED:
1295       /* SAMPLE_STREAMOUTSTATS stores this structure:
1296        * {
1297        *    u64 NumPrimitivesWritten;
1298        *    u64 PrimitiveStorageNeeded;
1299        * }
1300        * We only need NumPrimitivesWritten here. */
1301       result->u64 += si_query_read_result(buffer, 2, 6, true);
1302       break;
1303    case PIPE_QUERY_PRIMITIVES_GENERATED:
1304       /* Here we read PrimitiveStorageNeeded. */
1305       result->u64 += si_query_read_result(buffer, 0, 4, true);
1306       break;
1307    case PIPE_QUERY_SO_STATISTICS:
1308       result->so_statistics.num_primitives_written += si_query_read_result(buffer, 2, 6, true);
1309       result->so_statistics.primitives_storage_needed += si_query_read_result(buffer, 0, 4, true);
1310       break;
1311    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1312       result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1313                                   si_query_read_result(buffer, 0, 4, true);
1314       break;
1315    case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1316       for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
1317          result->b = result->b || si_query_read_result(buffer, 2, 6, true) !=
1318                                      si_query_read_result(buffer, 0, 4, true);
1319          buffer = (char *)buffer + 32;
1320       }
1321       break;
1322    case PIPE_QUERY_PIPELINE_STATISTICS:
1323       result->pipeline_statistics.ps_invocations += si_query_read_result(buffer, 0, 22, false);
1324       result->pipeline_statistics.c_primitives += si_query_read_result(buffer, 2, 24, false);
1325       result->pipeline_statistics.c_invocations += si_query_read_result(buffer, 4, 26, false);
1326       result->pipeline_statistics.vs_invocations += si_query_read_result(buffer, 6, 28, false);
1327       result->pipeline_statistics.gs_invocations += si_query_read_result(buffer, 8, 30, false);
1328       result->pipeline_statistics.gs_primitives += si_query_read_result(buffer, 10, 32, false);
1329       result->pipeline_statistics.ia_primitives += si_query_read_result(buffer, 12, 34, false);
1330       result->pipeline_statistics.ia_vertices += si_query_read_result(buffer, 14, 36, false);
1331       result->pipeline_statistics.hs_invocations += si_query_read_result(buffer, 16, 38, false);
1332       result->pipeline_statistics.ds_invocations += si_query_read_result(buffer, 18, 40, false);
1333       result->pipeline_statistics.cs_invocations += si_query_read_result(buffer, 20, 42, false);
1334 #if 0 /* for testing */
1335       printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1336              "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1337              "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1338              result->pipeline_statistics.ia_vertices,
1339              result->pipeline_statistics.ia_primitives,
1340              result->pipeline_statistics.vs_invocations,
1341              result->pipeline_statistics.hs_invocations,
1342              result->pipeline_statistics.ds_invocations,
1343              result->pipeline_statistics.gs_invocations,
1344              result->pipeline_statistics.gs_primitives,
1345              result->pipeline_statistics.c_invocations,
1346              result->pipeline_statistics.c_primitives,
1347              result->pipeline_statistics.ps_invocations,
1348              result->pipeline_statistics.cs_invocations);
1349 #endif
1350       break;
1351    default:
1352       assert(0);
1353    }
1354 }
1355 
/* Suspend a hardware query by emitting its end sample. */
void si_query_hw_suspend(struct si_context *sctx, struct si_query *query)
{
   struct si_query_hw *hwq = (struct si_query_hw *)query;

   si_query_hw_emit_stop(sctx, hwq);
}
1360 
/* Resume a suspended hardware query by emitting a new begin sample. */
void si_query_hw_resume(struct si_context *sctx, struct si_query *query)
{
   struct si_query_hw *hwq = (struct si_query_hw *)query;

   si_query_hw_emit_start(sctx, hwq);
}
1365 
1366 static const struct si_query_ops query_hw_ops = {
1367    .destroy = si_query_hw_destroy,
1368    .begin = si_query_hw_begin,
1369    .end = si_query_hw_end,
1370    .get_result = si_query_hw_get_result,
1371    .get_result_resource = si_query_hw_get_result_resource,
1372 
1373    .suspend = si_query_hw_suspend,
1374    .resume = si_query_hw_resume,
1375 };
1376 
si_get_query_result(struct pipe_context * ctx,struct pipe_query * query,bool wait,union pipe_query_result * result)1377 static bool si_get_query_result(struct pipe_context *ctx, struct pipe_query *query, bool wait,
1378                                 union pipe_query_result *result)
1379 {
1380    struct si_context *sctx = (struct si_context *)ctx;
1381    struct si_query *squery = (struct si_query *)query;
1382 
1383    return squery->ops->get_result(sctx, squery, wait, result);
1384 }
1385 
si_get_query_result_resource(struct pipe_context * ctx,struct pipe_query * query,bool wait,enum pipe_query_value_type result_type,int index,struct pipe_resource * resource,unsigned offset)1386 static void si_get_query_result_resource(struct pipe_context *ctx, struct pipe_query *query,
1387                                          bool wait, enum pipe_query_value_type result_type,
1388                                          int index, struct pipe_resource *resource, unsigned offset)
1389 {
1390    struct si_context *sctx = (struct si_context *)ctx;
1391    struct si_query *squery = (struct si_query *)query;
1392 
1393    squery->ops->get_result_resource(sctx, squery, wait, result_type, index, resource, offset);
1394 }
1395 
si_query_hw_clear_result(struct si_query_hw * query,union pipe_query_result * result)1396 static void si_query_hw_clear_result(struct si_query_hw *query, union pipe_query_result *result)
1397 {
1398    util_query_clear_result(result, query->b.type);
1399 }
1400 
si_query_hw_get_result(struct si_context * sctx,struct si_query * squery,bool wait,union pipe_query_result * result)1401 bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
1402                             union pipe_query_result *result)
1403 {
1404    struct si_screen *sscreen = sctx->screen;
1405    struct si_query_hw *query = (struct si_query_hw *)squery;
1406    struct si_query_buffer *qbuf;
1407 
1408    query->ops->clear_result(query, result);
1409 
1410    for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1411       unsigned usage = PIPE_MAP_READ | (wait ? 0 : PIPE_MAP_DONTBLOCK);
1412       unsigned results_base = 0;
1413       void *map;
1414 
1415       if (squery->b.flushed)
1416          map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
1417       else
1418          map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);
1419 
1420       if (!map)
1421          return false;
1422 
1423       while (results_base != qbuf->results_end) {
1424          query->ops->add_result(sscreen, query, map + results_base, result);
1425          results_base += query->result_size;
1426       }
1427    }
1428 
1429    /* Convert the time to expected units. */
1430    if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
1431        squery->type == PIPE_QUERY_TIMESTAMP) {
1432       result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1433    }
1434    return true;
1435 }
1436 
/* Write a hardware query result into a buffer resource on the GPU, using the
 * context's query-result compute shader.
 *
 * sctx:        context used to dispatch the shader
 * squery:      query to read; must be a hardware query
 * wait:        when true, a CP wait on the fence of the most recent result
 *              slot is emitted before the first dispatch
 * result_type: requested integer width/signedness of the stored value
 * index:       sub-result selector passed to si_get_hw_query_params();
 *              a negative value sets config bit 4 instead
 *              (presumably "write availability" - confirm against the shader)
 * resource:    destination buffer
 * offset:      byte offset into `resource`
 *
 * One single-thread grid is launched per buffer in the query's chain (only
 * one for TIMESTAMP queries). Returns early, without writing anything, if
 * the shader or the temporary buffer cannot be created.
 */
static void si_query_hw_get_result_resource(struct si_context *sctx, struct si_query *squery,
                                            bool wait, enum pipe_query_value_type result_type,
                                            int index, struct pipe_resource *resource,
                                            unsigned offset)
{
   struct si_query_hw *query = (struct si_query_hw *)squery;
   struct si_query_buffer *qbuf;
   struct si_query_buffer *qbuf_prev;
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;
   struct si_qbo_state saved_state = {};
   struct pipe_grid_info grid = {};
   struct pipe_constant_buffer constant_buffer = {};
   struct pipe_shader_buffer ssbo[3];
   struct si_hw_query_params params;
   /* Constants handed to the shader through constant buffer 0. */
   struct {
      uint32_t end_offset;
      uint32_t result_stride;
      uint32_t result_count;
      uint32_t config;
      uint32_t fence_offset;
      uint32_t pair_stride;
      uint32_t pair_count;
   } consts;

   /* Create the result shader lazily on first use. */
   if (!sctx->query_result_shader) {
      sctx->query_result_shader = si_create_query_result_cs(sctx);
      if (!sctx->query_result_shader)
         return;
   }

   /* A 16-byte zeroed scratch area is only needed when the query spans more
    * than one buffer. */
   if (query->buffer.previous) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   /* The bindings changed below are restored at the end of the function. */
   si_save_qbo_state(sctx, &saved_state);

   /* Offsets passed to the shader are relative to start_offset, because
    * ssbo[0] is bound starting at start_offset. */
   si_get_hw_query_params(sctx, query, index >= 0 ? index : 0, &params);
   consts.end_offset = params.end_offset - params.start_offset;
   consts.fence_offset = params.fence_offset - params.start_offset;
   consts.result_stride = query->result_size;
   consts.pair_stride = params.pair_stride;
   consts.pair_count = params.pair_count;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* ssbo[1] and ssbo[2] both start out pointing at the scratch area;
    * ssbo[2] is redirected to the destination for the last buffer. */
   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->query_result_shader);

   /* A single thread in a single workgroup. */
   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   /* Build the shader's config bitmask. The meaning of each bit is defined
    * by the shader, which is not part of this function; only the conditions
    * under which the bits are set are visible here. */
   consts.config = 0;
   if (index < 0)
      consts.config |= 4;
   if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
       query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
      consts.config |= 8;
   else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
            query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
      consts.config |= 8 | 256;
   else if (query->b.type == PIPE_QUERY_TIMESTAMP || query->b.type == PIPE_QUERY_TIME_ELAPSED)
      consts.config |= 32;

   /* Bit 64 for 64-bit results, bit 128 for signed 32-bit, none for
    * unsigned 32-bit. */
   switch (result_type) {
   case PIPE_QUERY_TYPE_U64:
   case PIPE_QUERY_TYPE_I64:
      consts.config |= 64;
      break;
   case PIPE_QUERY_TYPE_I32:
      consts.config |= 128;
      break;
   case PIPE_QUERY_TYPE_U32:
      break;
   }

   sctx->flags |= sctx->screen->barrier_flags.cp_to_L2;

   /* Walk the buffer chain starting at the query's embedded buffer and
    * following `previous` links. */
   for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
      if (query->b.type != PIPE_QUERY_TIMESTAMP) {
         qbuf_prev = qbuf->previous;
         consts.result_count = qbuf->results_end / query->result_size;
         /* Bits 1 and 2 describe the position in the chain: bit 1 is set for
          * every buffer but the first one visited, bit 2 while another
          * buffer follows. */
         consts.config &= ~3;
         if (qbuf != &query->buffer)
            consts.config |= 1;
         if (qbuf->previous)
            consts.config |= 2;
      } else {
         /* Only read the last timestamp. */
         qbuf_prev = NULL;
         consts.result_count = 0;
         consts.config |= 16;
         params.start_offset += qbuf->results_end - query->result_size;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);

      /* ssbo[0]: this buffer's result data, starting at start_offset. */
      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = params.start_offset;
      ssbo[0].buffer_size = qbuf->results_end - params.start_offset;

      /* For the last buffer in the chain, ssbo[2] is the caller's
       * destination instead of the scratch area. */
      if (!qbuf->previous) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;

         si_resource(resource)->TC_L2_dirty = true;
      }

      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 1 << 2);

      if (wait && qbuf == &query->buffer) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
         va += params.fence_offset;

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x80000000, 0x80000000, WAIT_REG_MEM_EQUAL);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      /* Request a compute partial flush after each dispatch. */
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   si_restore_qbo_state(sctx, &saved_state);
   /* Drop the reference on the scratch buffer (a no-op when it is NULL). */
   pipe_resource_reference(&tmp_buffer, NULL);
}
1580 
/* Gallium render_condition hook: select the query used for conditional
 * rendering.
 *
 * ctx:       the context; cast to struct si_context.
 * query:     the predicate query, or NULL. The render_cond atom is marked
 *            dirty only when query is non-NULL.
 * condition: stored in sctx->render_cond_invert.
 * mode:      stored in sctx->render_cond_mode.
 *
 * For stream-overflow predicates on GFX8/GFX9 with old PFP firmware, the
 * query result is first written into a small zeroed buffer
 * (squery->workaround_buf) via get_query_result_resource.
 */
static void si_render_condition(struct pipe_context *ctx, struct pipe_query *query, bool condition,
                                enum pipe_render_cond_flag mode)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_query_hw *squery = (struct si_query_hw *)query;
   struct si_atom *atom = &sctx->atoms.s.render_cond;

   if (query) {
      bool needs_workaround = false;

      /* There was a firmware regression in GFX8 which causes successive
       * SET_PREDICATION packets to give the wrong answer for
       * non-inverted stream overflow predication. The check below also
       * covers GFX9 with PFP firmware older than feature version 38.
       */
      if (((sctx->chip_class == GFX8 && sctx->screen->info.pfp_fw_feature < 49) ||
           (sctx->chip_class == GFX9 && sctx->screen->info.pfp_fw_feature < 38)) &&
          !condition &&
          (squery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
           (squery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
            (squery->buffer.previous || squery->buffer.results_end > squery->result_size)))) {
         needs_workaround = true;
      }

      if (needs_workaround && !squery->workaround_buf) {
         bool old_force_off = sctx->render_cond_force_off;
         sctx->render_cond_force_off = true;

         u_suballocator_alloc(sctx->allocator_zeroed_memory, 8, 8, &squery->workaround_offset,
                              (struct pipe_resource **)&squery->workaround_buf);

         /* If no buffer was obtained, workaround_buf is still NULL. Skip
          * the result copy in that case instead of handing a pointer
          * derived from NULL to get_query_result_resource.
          */
         if (squery->workaround_buf) {
            /* Reset to NULL to avoid a redundant SET_PREDICATION
             * from launching the compute grid.
             */
            sctx->render_cond = NULL;

            ctx->get_query_result_resource(ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
                                           &squery->workaround_buf->b.b,
                                           squery->workaround_offset);

            /* Setting this in the render cond atom is too late,
             * so set it here. */
            sctx->flags |= sctx->screen->barrier_flags.L2_to_cp | SI_CONTEXT_FLUSH_FOR_RENDER_COND;
         }

         /* Always restore the caller's force-off state. */
         sctx->render_cond_force_off = old_force_off;
      }
   }

   sctx->render_cond = query;
   sctx->render_cond_invert = condition;
   sctx->render_cond_mode = mode;

   si_set_atom_dirty(sctx, atom, query != NULL);
}
1633 
si_suspend_queries(struct si_context * sctx)1634 void si_suspend_queries(struct si_context *sctx)
1635 {
1636    struct si_query *query;
1637 
1638    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1639       query->ops->suspend(sctx, query);
1640 }
1641 
si_resume_queries(struct si_context * sctx)1642 void si_resume_queries(struct si_context *sctx)
1643 {
1644    struct si_query *query;
1645 
1646    /* Check CS space here. Resuming must not be interrupted by flushes. */
1647    si_need_gfx_cs_space(sctx, 0);
1648 
1649    LIST_FOR_EACH_ENTRY (query, &sctx->active_queries, active_list)
1650       query->ops->resume(sctx, query);
1651 }
1652 
1653 #define XFULL(name_, query_type_, type_, result_type_, group_id_)                                  \
1654    {                                                                                               \
1655       .name = name_, .query_type = SI_QUERY_##query_type_, .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1656       .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, .group_id = group_id_           \
1657    }
1658 
1659 #define X(name_, query_type_, type_, result_type_)                                                 \
1660    XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1661 
1662 #define XG(group_, name_, query_type_, type_, result_type_)                                        \
1663    XFULL(name_, query_type_, type_, result_type_, SI_QUERY_GROUP_##group_)
1664 
1665 static struct pipe_driver_query_info si_driver_query_list[] = {
1666    X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1667    X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1668    X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1669    X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1670    X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
1671    X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1672    X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
1673    X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1674    X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
1675    X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
1676    X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1677    X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1678    X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1679    X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1680    X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1681    X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1682    X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1683    X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1684    X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1685    X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1686    X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1687    X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1688    X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1689    X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1690    X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1691    X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1692    X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1693    X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1694    X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1695    X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1696    X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1697    X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
1698    X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1699    X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
1700    X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1701    X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1702    X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1703    X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1704    X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1705    X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1706    X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1707    X("live-shader-cache-hits", LIVE_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1708    X("live-shader-cache-misses", LIVE_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1709    X("memory-shader-cache-hits", MEMORY_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1710    X("memory-shader-cache-misses", MEMORY_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1711    X("disk-shader-cache-hits", DISK_SHADER_CACHE_HITS, UINT, CUMULATIVE),
1712    X("disk-shader-cache-misses", DISK_SHADER_CACHE_MISSES, UINT, CUMULATIVE),
1713 
1714    /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1715     * which use it as a fallback path to detect the GPU type.
1716     *
1717     * Note: The names of these queries are significant for GPUPerfStudio
1718     * (and possibly their order as well). */
1719    XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1720    XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1721    XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1722    XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1723    XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1724 
1725    X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1726    X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1727    X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1728 
1729    /* The following queries must be at the end of the list because their
1730     * availability is adjusted dynamically based on the DRM version. */
1731    X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1732    X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1733    X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1734    X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1735    X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1736    X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1737    X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1738    X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1739    X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1740    X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1741    X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1742    X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1743    X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1744    X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1745 
1746    /* SRBM_STATUS2 */
1747    X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1748 
1749    /* CP_STAT */
1750    X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1751    X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1752    X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1753    X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1754    X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1755    X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1756 
1757    X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE),
1758    X("pd-num-prims-rejected", PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE),
1759    X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE),
1760 };
1761 
1762 #undef X
1763 #undef XG
1764 #undef XFULL
1765 
si_get_num_queries(struct si_screen * sscreen)1766 static unsigned si_get_num_queries(struct si_screen *sscreen)
1767 {
1768    /* amdgpu */
1769    if (sscreen->info.is_amdgpu) {
1770       if (sscreen->info.chip_class >= GFX8)
1771          return ARRAY_SIZE(si_driver_query_list);
1772       else
1773          return ARRAY_SIZE(si_driver_query_list) - 7;
1774    }
1775 
1776    /* radeon */
1777    if (sscreen->info.has_read_registers_query) {
1778       if (sscreen->info.chip_class == GFX7)
1779          return ARRAY_SIZE(si_driver_query_list) - 6;
1780       else
1781          return ARRAY_SIZE(si_driver_query_list) - 7;
1782    }
1783 
1784    return ARRAY_SIZE(si_driver_query_list) - 21;
1785 }
1786 
si_get_driver_query_info(struct pipe_screen * screen,unsigned index,struct pipe_driver_query_info * info)1787 static int si_get_driver_query_info(struct pipe_screen *screen, unsigned index,
1788                                     struct pipe_driver_query_info *info)
1789 {
1790    struct si_screen *sscreen = (struct si_screen *)screen;
1791    unsigned num_queries = si_get_num_queries(sscreen);
1792 
1793    if (!info) {
1794       unsigned num_perfcounters = si_get_perfcounter_info(sscreen, 0, NULL);
1795 
1796       return num_queries + num_perfcounters;
1797    }
1798 
1799    if (index >= num_queries)
1800       return si_get_perfcounter_info(sscreen, index - num_queries, info);
1801 
1802    *info = si_driver_query_list[index];
1803 
1804    switch (info->query_type) {
1805    case SI_QUERY_REQUESTED_VRAM:
1806    case SI_QUERY_VRAM_USAGE:
1807    case SI_QUERY_MAPPED_VRAM:
1808       info->max_value.u64 = sscreen->info.vram_size;
1809       break;
1810    case SI_QUERY_REQUESTED_GTT:
1811    case SI_QUERY_GTT_USAGE:
1812    case SI_QUERY_MAPPED_GTT:
1813       info->max_value.u64 = sscreen->info.gart_size;
1814       break;
1815    case SI_QUERY_GPU_TEMPERATURE:
1816       info->max_value.u64 = 125;
1817       break;
1818    case SI_QUERY_VRAM_VIS_USAGE:
1819       info->max_value.u64 = sscreen->info.vram_vis_size;
1820       break;
1821    }
1822 
1823    if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
1824       info->group_id += sscreen->perfcounters->num_groups;
1825 
1826    return 1;
1827 }
1828 
1829 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1830  * performance counter groups, so be careful when changing this and related
1831  * functions.
1832  */
si_get_driver_query_group_info(struct pipe_screen * screen,unsigned index,struct pipe_driver_query_group_info * info)1833 static int si_get_driver_query_group_info(struct pipe_screen *screen, unsigned index,
1834                                           struct pipe_driver_query_group_info *info)
1835 {
1836    struct si_screen *sscreen = (struct si_screen *)screen;
1837    unsigned num_pc_groups = 0;
1838 
1839    if (sscreen->perfcounters)
1840       num_pc_groups = sscreen->perfcounters->num_groups;
1841 
1842    if (!info)
1843       return num_pc_groups + SI_NUM_SW_QUERY_GROUPS;
1844 
1845    if (index < num_pc_groups)
1846       return si_get_perfcounter_group_info(sscreen, index, info);
1847 
1848    index -= num_pc_groups;
1849    if (index >= SI_NUM_SW_QUERY_GROUPS)
1850       return 0;
1851 
1852    info->name = "GPIN";
1853    info->max_active_queries = 5;
1854    info->num_queries = 5;
1855    return 1;
1856 }
1857 
si_init_query_functions(struct si_context * sctx)1858 void si_init_query_functions(struct si_context *sctx)
1859 {
1860    sctx->b.create_query = si_create_query;
1861    sctx->b.create_batch_query = si_create_batch_query;
1862    sctx->b.destroy_query = si_destroy_query;
1863    sctx->b.begin_query = si_begin_query;
1864    sctx->b.end_query = si_end_query;
1865    sctx->b.get_query_result = si_get_query_result;
1866    sctx->b.get_query_result_resource = si_get_query_result_resource;
1867 
1868    if (sctx->has_graphics) {
1869       sctx->atoms.s.render_cond.emit = si_emit_query_predication;
1870       sctx->b.render_condition = si_render_condition;
1871    }
1872 
1873    list_inithead(&sctx->active_queries);
1874 }
1875 
si_init_screen_query_functions(struct si_screen * sscreen)1876 void si_init_screen_query_functions(struct si_screen *sscreen)
1877 {
1878    sscreen->b.get_driver_query_info = si_get_driver_query_info;
1879    sscreen->b.get_driver_query_group_info = si_get_driver_query_group_info;
1880 }
1881