1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "radeonsi/si_pipe.h"
26 #include "r600_query.h"
27 #include "r600_cs.h"
28 #include "util/u_memory.h"
29 #include "util/u_upload_mgr.h"
30 #include "util/os_time.h"
31 #include "tgsi/tgsi_text.h"
32 #include "amd/common/sid.h"
33 
34 #define R600_MAX_STREAMS 4
35 
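/* Describes the memory layout of one hardware query result slot, as filled in
 * by r600_get_hw_query_params(): byte offsets of the begin/end values and of
 * the fence dword, plus the stride and count of begin/end pairs (one pair per
 * render backend, or per streamout stream for SO_OVERFLOW_ANY_PREDICATE). */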
36 struct r600_hw_query_params {
37 	unsigned start_offset;
38 	unsigned end_offset;
39 	unsigned fence_offset;
40 	unsigned pair_stride;
41 	unsigned pair_count;
42 };
43 
44 /* Queries without buffer handling or suspend/resume. */
45 struct r600_query_sw {
46 	struct r600_query b;
47 
48 	uint64_t begin_result;
49 	uint64_t end_result;
50 
51 	uint64_t begin_time;
52 	uint64_t end_time;
53 
54 	/* Fence for GPU_FINISHED. */
55 	struct pipe_fence_handle *fence;
56 };
57 
58 static void r600_query_sw_destroy(struct si_screen *sscreen,
59 				  struct r600_query *rquery)
60 {
61 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
62 
63 	sscreen->b.fence_reference(&sscreen->b, &query->fence, NULL);
64 	FREE(query);
65 }
66 
67 static enum radeon_value_id winsys_id_from_type(unsigned type)
68 {
69 	switch (type) {
70 	case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
71 	case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
72 	case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
73 	case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
74 	case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
75 	case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
76 	case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
77 	case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
78 	case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
79 	case R600_QUERY_GFX_IB_SIZE: return RADEON_GFX_IB_SIZE_COUNTER;
80 	case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
81 	case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
82 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
83 	case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
84 	case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
85 	case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
86 	case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
87 	case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
88 	case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
89 	case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
90 	default: unreachable("query type does not correspond to winsys id");
91 	}
92 }
93 
94 static bool r600_query_sw_begin(struct r600_common_context *rctx,
95 				struct r600_query *rquery)
96 {
97 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
98 	enum radeon_value_id ws_id;
99 
100 	switch(query->b.type) {
101 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
102 	case PIPE_QUERY_GPU_FINISHED:
103 		break;
104 	case R600_QUERY_DRAW_CALLS:
105 		query->begin_result = rctx->num_draw_calls;
106 		break;
107 	case R600_QUERY_DECOMPRESS_CALLS:
108 		query->begin_result = rctx->num_decompress_calls;
109 		break;
110 	case R600_QUERY_MRT_DRAW_CALLS:
111 		query->begin_result = rctx->num_mrt_draw_calls;
112 		break;
113 	case R600_QUERY_PRIM_RESTART_CALLS:
114 		query->begin_result = rctx->num_prim_restart_calls;
115 		break;
116 	case R600_QUERY_SPILL_DRAW_CALLS:
117 		query->begin_result = rctx->num_spill_draw_calls;
118 		break;
119 	case R600_QUERY_COMPUTE_CALLS:
120 		query->begin_result = rctx->num_compute_calls;
121 		break;
122 	case R600_QUERY_SPILL_COMPUTE_CALLS:
123 		query->begin_result = rctx->num_spill_compute_calls;
124 		break;
125 	case R600_QUERY_DMA_CALLS:
126 		query->begin_result = rctx->num_dma_calls;
127 		break;
128 	case R600_QUERY_CP_DMA_CALLS:
129 		query->begin_result = rctx->num_cp_dma_calls;
130 		break;
131 	case R600_QUERY_NUM_VS_FLUSHES:
132 		query->begin_result = rctx->num_vs_flushes;
133 		break;
134 	case R600_QUERY_NUM_PS_FLUSHES:
135 		query->begin_result = rctx->num_ps_flushes;
136 		break;
137 	case R600_QUERY_NUM_CS_FLUSHES:
138 		query->begin_result = rctx->num_cs_flushes;
139 		break;
140 	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
141 		query->begin_result = rctx->num_cb_cache_flushes;
142 		break;
143 	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
144 		query->begin_result = rctx->num_db_cache_flushes;
145 		break;
146 	case R600_QUERY_NUM_L2_INVALIDATES:
147 		query->begin_result = rctx->num_L2_invalidates;
148 		break;
149 	case R600_QUERY_NUM_L2_WRITEBACKS:
150 		query->begin_result = rctx->num_L2_writebacks;
151 		break;
152 	case R600_QUERY_NUM_RESIDENT_HANDLES:
153 		query->begin_result = rctx->num_resident_handles;
154 		break;
155 	case R600_QUERY_TC_OFFLOADED_SLOTS:
156 		query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
157 		break;
158 	case R600_QUERY_TC_DIRECT_SLOTS:
159 		query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
160 		break;
161 	case R600_QUERY_TC_NUM_SYNCS:
162 		query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
163 		break;
164 	case R600_QUERY_REQUESTED_VRAM:
165 	case R600_QUERY_REQUESTED_GTT:
166 	case R600_QUERY_MAPPED_VRAM:
167 	case R600_QUERY_MAPPED_GTT:
168 	case R600_QUERY_VRAM_USAGE:
169 	case R600_QUERY_VRAM_VIS_USAGE:
170 	case R600_QUERY_GTT_USAGE:
171 	case R600_QUERY_GPU_TEMPERATURE:
172 	case R600_QUERY_CURRENT_GPU_SCLK:
173 	case R600_QUERY_CURRENT_GPU_MCLK:
174 	case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
175 	case R600_QUERY_NUM_MAPPED_BUFFERS:
176 		query->begin_result = 0;
177 		break;
178 	case R600_QUERY_BUFFER_WAIT_TIME:
179 	case R600_QUERY_GFX_IB_SIZE:
180 	case R600_QUERY_NUM_GFX_IBS:
181 	case R600_QUERY_NUM_SDMA_IBS:
182 	case R600_QUERY_NUM_BYTES_MOVED:
183 	case R600_QUERY_NUM_EVICTIONS:
184 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
185 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
186 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
187 		break;
188 	}
189 	case R600_QUERY_GFX_BO_LIST_SIZE:
190 		ws_id = winsys_id_from_type(query->b.type);
191 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
192 		query->begin_time = rctx->ws->query_value(rctx->ws,
193 							  RADEON_NUM_GFX_IBS);
194 		break;
195 	case R600_QUERY_CS_THREAD_BUSY:
196 		ws_id = winsys_id_from_type(query->b.type);
197 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
198 		query->begin_time = os_time_get_nano();
199 		break;
200 	case R600_QUERY_GALLIUM_THREAD_BUSY:
201 		query->begin_result =
202 			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
203 		query->begin_time = os_time_get_nano();
204 		break;
205 	case R600_QUERY_GPU_LOAD:
206 	case R600_QUERY_GPU_SHADERS_BUSY:
207 	case R600_QUERY_GPU_TA_BUSY:
208 	case R600_QUERY_GPU_GDS_BUSY:
209 	case R600_QUERY_GPU_VGT_BUSY:
210 	case R600_QUERY_GPU_IA_BUSY:
211 	case R600_QUERY_GPU_SX_BUSY:
212 	case R600_QUERY_GPU_WD_BUSY:
213 	case R600_QUERY_GPU_BCI_BUSY:
214 	case R600_QUERY_GPU_SC_BUSY:
215 	case R600_QUERY_GPU_PA_BUSY:
216 	case R600_QUERY_GPU_DB_BUSY:
217 	case R600_QUERY_GPU_CP_BUSY:
218 	case R600_QUERY_GPU_CB_BUSY:
219 	case R600_QUERY_GPU_SDMA_BUSY:
220 	case R600_QUERY_GPU_PFP_BUSY:
221 	case R600_QUERY_GPU_MEQ_BUSY:
222 	case R600_QUERY_GPU_ME_BUSY:
223 	case R600_QUERY_GPU_SURF_SYNC_BUSY:
224 	case R600_QUERY_GPU_CP_DMA_BUSY:
225 	case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
226 		query->begin_result = si_begin_counter(rctx->screen,
227 							 query->b.type);
228 		break;
229 	case R600_QUERY_NUM_COMPILATIONS:
230 		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
231 		break;
232 	case R600_QUERY_NUM_SHADERS_CREATED:
233 		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
234 		break;
235 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
236 		query->begin_result =
237 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
238 		break;
239 	case R600_QUERY_GPIN_ASIC_ID:
240 	case R600_QUERY_GPIN_NUM_SIMD:
241 	case R600_QUERY_GPIN_NUM_RB:
242 	case R600_QUERY_GPIN_NUM_SPI:
243 	case R600_QUERY_GPIN_NUM_SE:
244 		break;
245 	default:
246 		unreachable("r600_query_sw_begin: bad query type");
247 	}
248 
249 	return true;
250 }
251 
252 static bool r600_query_sw_end(struct r600_common_context *rctx,
253 			      struct r600_query *rquery)
254 {
255 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
256 	enum radeon_value_id ws_id;
257 
258 	switch(query->b.type) {
259 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
260 		break;
261 	case PIPE_QUERY_GPU_FINISHED:
262 		rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
263 		break;
264 	case R600_QUERY_DRAW_CALLS:
265 		query->end_result = rctx->num_draw_calls;
266 		break;
267 	case R600_QUERY_DECOMPRESS_CALLS:
268 		query->end_result = rctx->num_decompress_calls;
269 		break;
270 	case R600_QUERY_MRT_DRAW_CALLS:
271 		query->end_result = rctx->num_mrt_draw_calls;
272 		break;
273 	case R600_QUERY_PRIM_RESTART_CALLS:
274 		query->end_result = rctx->num_prim_restart_calls;
275 		break;
276 	case R600_QUERY_SPILL_DRAW_CALLS:
277 		query->end_result = rctx->num_spill_draw_calls;
278 		break;
279 	case R600_QUERY_COMPUTE_CALLS:
280 		query->end_result = rctx->num_compute_calls;
281 		break;
282 	case R600_QUERY_SPILL_COMPUTE_CALLS:
283 		query->end_result = rctx->num_spill_compute_calls;
284 		break;
285 	case R600_QUERY_DMA_CALLS:
286 		query->end_result = rctx->num_dma_calls;
287 		break;
288 	case R600_QUERY_CP_DMA_CALLS:
289 		query->end_result = rctx->num_cp_dma_calls;
290 		break;
291 	case R600_QUERY_NUM_VS_FLUSHES:
292 		query->end_result = rctx->num_vs_flushes;
293 		break;
294 	case R600_QUERY_NUM_PS_FLUSHES:
295 		query->end_result = rctx->num_ps_flushes;
296 		break;
297 	case R600_QUERY_NUM_CS_FLUSHES:
298 		query->end_result = rctx->num_cs_flushes;
299 		break;
300 	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
301 		query->end_result = rctx->num_cb_cache_flushes;
302 		break;
303 	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
304 		query->end_result = rctx->num_db_cache_flushes;
305 		break;
306 	case R600_QUERY_NUM_L2_INVALIDATES:
307 		query->end_result = rctx->num_L2_invalidates;
308 		break;
309 	case R600_QUERY_NUM_L2_WRITEBACKS:
310 		query->end_result = rctx->num_L2_writebacks;
311 		break;
312 	case R600_QUERY_NUM_RESIDENT_HANDLES:
313 		query->end_result = rctx->num_resident_handles;
314 		break;
315 	case R600_QUERY_TC_OFFLOADED_SLOTS:
316 		query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
317 		break;
318 	case R600_QUERY_TC_DIRECT_SLOTS:
319 		query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
320 		break;
321 	case R600_QUERY_TC_NUM_SYNCS:
322 		query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;
323 		break;
324 	case R600_QUERY_REQUESTED_VRAM:
325 	case R600_QUERY_REQUESTED_GTT:
326 	case R600_QUERY_MAPPED_VRAM:
327 	case R600_QUERY_MAPPED_GTT:
328 	case R600_QUERY_VRAM_USAGE:
329 	case R600_QUERY_VRAM_VIS_USAGE:
330 	case R600_QUERY_GTT_USAGE:
331 	case R600_QUERY_GPU_TEMPERATURE:
332 	case R600_QUERY_CURRENT_GPU_SCLK:
333 	case R600_QUERY_CURRENT_GPU_MCLK:
334 	case R600_QUERY_BUFFER_WAIT_TIME:
335 	case R600_QUERY_GFX_IB_SIZE:
336 	case R600_QUERY_NUM_MAPPED_BUFFERS:
337 	case R600_QUERY_NUM_GFX_IBS:
338 	case R600_QUERY_NUM_SDMA_IBS:
339 	case R600_QUERY_NUM_BYTES_MOVED:
340 	case R600_QUERY_NUM_EVICTIONS:
341 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
342 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
343 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
344 		break;
345 	}
346 	case R600_QUERY_GFX_BO_LIST_SIZE:
347 		ws_id = winsys_id_from_type(query->b.type);
348 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
349 		query->end_time = rctx->ws->query_value(rctx->ws,
350 							RADEON_NUM_GFX_IBS);
351 		break;
352 	case R600_QUERY_CS_THREAD_BUSY:
353 		ws_id = winsys_id_from_type(query->b.type);
354 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
355 		query->end_time = os_time_get_nano();
356 		break;
357 	case R600_QUERY_GALLIUM_THREAD_BUSY:
358 		query->end_result =
359 			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
360 		query->end_time = os_time_get_nano();
361 		break;
362 	case R600_QUERY_GPU_LOAD:
363 	case R600_QUERY_GPU_SHADERS_BUSY:
364 	case R600_QUERY_GPU_TA_BUSY:
365 	case R600_QUERY_GPU_GDS_BUSY:
366 	case R600_QUERY_GPU_VGT_BUSY:
367 	case R600_QUERY_GPU_IA_BUSY:
368 	case R600_QUERY_GPU_SX_BUSY:
369 	case R600_QUERY_GPU_WD_BUSY:
370 	case R600_QUERY_GPU_BCI_BUSY:
371 	case R600_QUERY_GPU_SC_BUSY:
372 	case R600_QUERY_GPU_PA_BUSY:
373 	case R600_QUERY_GPU_DB_BUSY:
374 	case R600_QUERY_GPU_CP_BUSY:
375 	case R600_QUERY_GPU_CB_BUSY:
376 	case R600_QUERY_GPU_SDMA_BUSY:
377 	case R600_QUERY_GPU_PFP_BUSY:
378 	case R600_QUERY_GPU_MEQ_BUSY:
379 	case R600_QUERY_GPU_ME_BUSY:
380 	case R600_QUERY_GPU_SURF_SYNC_BUSY:
381 	case R600_QUERY_GPU_CP_DMA_BUSY:
382 	case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
383 		query->end_result = si_end_counter(rctx->screen,
384 						     query->b.type,
385 						     query->begin_result);
386 		query->begin_result = 0;
387 		break;
388 	case R600_QUERY_NUM_COMPILATIONS:
389 		query->end_result = p_atomic_read(&rctx->screen->num_compilations);
390 		break;
391 	case R600_QUERY_NUM_SHADERS_CREATED:
392 		query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
393 		break;
394 	case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
395 		query->end_result = rctx->last_tex_ps_draw_ratio;
396 		break;
397 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
398 		query->end_result =
399 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
400 		break;
401 	case R600_QUERY_GPIN_ASIC_ID:
402 	case R600_QUERY_GPIN_NUM_SIMD:
403 	case R600_QUERY_GPIN_NUM_RB:
404 	case R600_QUERY_GPIN_NUM_SPI:
405 	case R600_QUERY_GPIN_NUM_SE:
406 		break;
407 	default:
408 		unreachable("r600_query_sw_end: bad query type");
409 	}
410 
411 	return true;
412 }
413 
414 static bool r600_query_sw_get_result(struct r600_common_context *rctx,
415 				     struct r600_query *rquery,
416 				     bool wait,
417 				     union pipe_query_result *result)
418 {
419 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
420 
421 	switch (query->b.type) {
422 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
423 		/* Convert from cycles per millisecond to cycles per second (Hz). */
424 		result->timestamp_disjoint.frequency =
425 			(uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
426 		result->timestamp_disjoint.disjoint = false;
427 		return true;
428 	case PIPE_QUERY_GPU_FINISHED: {
429 		struct pipe_screen *screen = rctx->b.screen;
430 		struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;
431 
432 		result->b = screen->fence_finish(screen, ctx, query->fence,
433 						 wait ? PIPE_TIMEOUT_INFINITE : 0);
434 		return result->b;
435 	}
436 
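	/* For GFX_BO_LIST_SIZE, begin/end_time hold the gfx IB count (set in the
	 * sw begin/end callbacks), so this is the average BO list size per IB. */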
437 	case R600_QUERY_GFX_BO_LIST_SIZE:
438 		result->u64 = (query->end_result - query->begin_result) /
439 			      (query->end_time - query->begin_time);
440 		return true;
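	/* Thread busy time (ns) divided by elapsed wall-clock time (ns), as a percentage. */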
441 	case R600_QUERY_CS_THREAD_BUSY:
442 	case R600_QUERY_GALLIUM_THREAD_BUSY:
443 		result->u64 = (query->end_result - query->begin_result) * 100 /
444 			      (query->end_time - query->begin_time);
445 		return true;
446 	case R600_QUERY_GPIN_ASIC_ID:
447 		result->u32 = 0;
448 		return true;
449 	case R600_QUERY_GPIN_NUM_SIMD:
450 		result->u32 = rctx->screen->info.num_good_compute_units;
451 		return true;
452 	case R600_QUERY_GPIN_NUM_RB:
453 		result->u32 = rctx->screen->info.num_render_backends;
454 		return true;
455 	case R600_QUERY_GPIN_NUM_SPI:
456 		result->u32 = 1; /* all supported chips have one SPI per SE */
457 		return true;
458 	case R600_QUERY_GPIN_NUM_SE:
459 		result->u32 = rctx->screen->info.max_se;
460 		return true;
461 	}
462 
463 	result->u64 = query->end_result - query->begin_result;
464 
465 	switch (query->b.type) {
466 	case R600_QUERY_BUFFER_WAIT_TIME:
467 	case R600_QUERY_GPU_TEMPERATURE:
468 		result->u64 /= 1000; /* buffer-wait-time: ns -> us; temperature: millidegrees -> degrees C */
469 		break;
470 	case R600_QUERY_CURRENT_GPU_SCLK:
471 	case R600_QUERY_CURRENT_GPU_MCLK:
472 		result->u64 *= 1000000; /* clocks are reported in MHz; convert to Hz */
473 		break;
474 	}
475 
476 	return true;
477 }
478 
479 
480 static struct r600_query_ops sw_query_ops = {
481 	.destroy = r600_query_sw_destroy,
482 	.begin = r600_query_sw_begin,
483 	.end = r600_query_sw_end,
484 	.get_result = r600_query_sw_get_result,
485 	.get_result_resource = NULL
486 };
487 
488 static struct pipe_query *r600_query_sw_create(unsigned query_type)
489 {
490 	struct r600_query_sw *query;
491 
492 	query = CALLOC_STRUCT(r600_query_sw);
493 	if (!query)
494 		return NULL;
495 
496 	query->b.type = query_type;
497 	query->b.ops = &sw_query_ops;
498 
499 	return (struct pipe_query *)query;
500 }
501 
502 void si_query_hw_destroy(struct si_screen *sscreen,
503 			 struct r600_query *rquery)
504 {
505 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
506 	struct r600_query_buffer *prev = query->buffer.previous;
507 
508 	/* Release all query buffers. */
509 	while (prev) {
510 		struct r600_query_buffer *qbuf = prev;
511 		prev = prev->previous;
512 		r600_resource_reference(&qbuf->buf, NULL);
513 		FREE(qbuf);
514 	}
515 
516 	r600_resource_reference(&query->buffer.buf, NULL);
517 	r600_resource_reference(&query->workaround_buf, NULL);
518 	FREE(rquery);
519 }
520 
521 static struct r600_resource *r600_new_query_buffer(struct si_screen *sscreen,
522 						   struct r600_query_hw *query)
523 {
524 	unsigned buf_size = MAX2(query->result_size,
525 				 sscreen->info.min_alloc_size);
526 
527 	/* Queries are normally read by the CPU after
528 	 * being written by the GPU, hence staging is probably a good
529 	 * usage pattern.
530 	 */
531 	struct r600_resource *buf = (struct r600_resource*)
532 		pipe_buffer_create(&sscreen->b, 0,
533 				   PIPE_USAGE_STAGING, buf_size);
534 	if (!buf)
535 		return NULL;
536 
537 	if (!query->ops->prepare_buffer(sscreen, query, buf)) {
538 		r600_resource_reference(&buf, NULL);
539 		return NULL;
540 	}
541 
542 	return buf;
543 }
544 
545 static bool r600_query_hw_prepare_buffer(struct si_screen *sscreen,
546 					 struct r600_query_hw *query,
547 					 struct r600_resource *buffer)
548 {
549 	/* Callers ensure that the buffer is currently unused by the GPU. */
550 	uint32_t *results = sscreen->ws->buffer_map(buffer->buf, NULL,
551 						   PIPE_TRANSFER_WRITE |
552 						   PIPE_TRANSFER_UNSYNCHRONIZED);
553 	if (!results)
554 		return false;
555 
556 	memset(results, 0, buffer->b.b.width0);
557 
558 	if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
559 	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
560 	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
561 		unsigned max_rbs = sscreen->info.num_render_backends;
562 		unsigned enabled_rb_mask = sscreen->info.enabled_rb_mask;
563 		unsigned num_results;
564 		unsigned i, j;
565 
566 		/* Set top bits for unused backends. */
567 		num_results = buffer->b.b.width0 / query->result_size;
568 		for (j = 0; j < num_results; j++) {
569 			for (i = 0; i < max_rbs; i++) {
570 				if (!(enabled_rb_mask & (1<<i))) {
571 					results[(i * 4)+1] = 0x80000000;
572 					results[(i * 4)+3] = 0x80000000;
573 				}
574 			}
575 			results += 4 * max_rbs;
576 		}
577 	}
578 
579 	return true;
580 }
581 
582 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
583                                               struct r600_query *rquery,
584                                               bool wait,
585                                               enum pipe_query_value_type result_type,
586                                               int index,
587                                               struct pipe_resource *resource,
588                                               unsigned offset);
589 
590 static struct r600_query_ops query_hw_ops = {
591 	.destroy = si_query_hw_destroy,
592 	.begin = si_query_hw_begin,
593 	.end = si_query_hw_end,
594 	.get_result = si_query_hw_get_result,
595 	.get_result_resource = r600_query_hw_get_result_resource,
596 };
597 
598 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
599 					struct r600_query_hw *query,
600 					struct r600_resource *buffer,
601 					uint64_t va);
602 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
603 				       struct r600_query_hw *query,
604 				       struct r600_resource *buffer,
605 				       uint64_t va);
606 static void r600_query_hw_add_result(struct si_screen *sscreen,
607 				     struct r600_query_hw *, void *buffer,
608 				     union pipe_query_result *result);
609 static void r600_query_hw_clear_result(struct r600_query_hw *,
610 				       union pipe_query_result *);
611 
612 static struct r600_query_hw_ops query_hw_default_hw_ops = {
613 	.prepare_buffer = r600_query_hw_prepare_buffer,
614 	.emit_start = r600_query_hw_do_emit_start,
615 	.emit_stop = r600_query_hw_do_emit_stop,
616 	.clear_result = r600_query_hw_clear_result,
617 	.add_result = r600_query_hw_add_result,
618 };
619 
620 bool si_query_hw_init(struct si_screen *sscreen,
621 		      struct r600_query_hw *query)
622 {
623 	query->buffer.buf = r600_new_query_buffer(sscreen, query);
624 	if (!query->buffer.buf)
625 		return false;
626 
627 	return true;
628 }
629 
630 static struct pipe_query *r600_query_hw_create(struct si_screen *sscreen,
631 					       unsigned query_type,
632 					       unsigned index)
633 {
634 	struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
635 	if (!query)
636 		return NULL;
637 
638 	query->b.type = query_type;
639 	query->b.ops = &query_hw_ops;
640 	query->ops = &query_hw_default_hw_ops;
641 
642 	switch (query_type) {
643 	case PIPE_QUERY_OCCLUSION_COUNTER:
644 	case PIPE_QUERY_OCCLUSION_PREDICATE:
645 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
646 		query->result_size = 16 * sscreen->info.num_render_backends;
647 		query->result_size += 16; /* for the fence + alignment */
648 		query->num_cs_dw_begin = 6;
649 		query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(sscreen);
650 		break;
651 	case PIPE_QUERY_TIME_ELAPSED:
652 		query->result_size = 24;
653 		query->num_cs_dw_begin = 8;
654 		query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen);
655 		break;
656 	case PIPE_QUERY_TIMESTAMP:
657 		query->result_size = 16;
658 		query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(sscreen);
659 		query->flags = R600_QUERY_HW_FLAG_NO_START;
660 		break;
661 	case PIPE_QUERY_PRIMITIVES_EMITTED:
662 	case PIPE_QUERY_PRIMITIVES_GENERATED:
663 	case PIPE_QUERY_SO_STATISTICS:
664 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
665 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
666 		query->result_size = 32;
667 		query->num_cs_dw_begin = 6;
668 		query->num_cs_dw_end = 6;
669 		query->stream = index;
670 		break;
671 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
672 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
673 		query->result_size = 32 * R600_MAX_STREAMS;
674 		query->num_cs_dw_begin = 6 * R600_MAX_STREAMS;
675 		query->num_cs_dw_end = 6 * R600_MAX_STREAMS;
676 		break;
677 	case PIPE_QUERY_PIPELINE_STATISTICS:
678 		/* 11 values on GCN. */
679 		query->result_size = 11 * 16;
680 		query->result_size += 8; /* for the fence + alignment */
681 		query->num_cs_dw_begin = 6;
682 		query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(sscreen);
683 		break;
684 	default:
685 		assert(0);
686 		FREE(query);
687 		return NULL;
688 	}
689 
690 	if (!si_query_hw_init(sscreen, query)) {
691 		FREE(query);
692 		return NULL;
693 	}
694 
695 	return (struct pipe_query *)query;
696 }
697 
698 static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
699 					      unsigned type, int diff)
700 {
701 	if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
702 	    type == PIPE_QUERY_OCCLUSION_PREDICATE ||
703 	    type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
704 		bool old_enable = rctx->num_occlusion_queries != 0;
705 		bool old_perfect_enable =
706 			rctx->num_perfect_occlusion_queries != 0;
707 		bool enable, perfect_enable;
708 
709 		rctx->num_occlusion_queries += diff;
710 		assert(rctx->num_occlusion_queries >= 0);
711 
712 		if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
713 			rctx->num_perfect_occlusion_queries += diff;
714 			assert(rctx->num_perfect_occlusion_queries >= 0);
715 		}
716 
717 		enable = rctx->num_occlusion_queries != 0;
718 		perfect_enable = rctx->num_perfect_occlusion_queries != 0;
719 
720 		if (enable != old_enable || perfect_enable != old_perfect_enable) {
721 			rctx->set_occlusion_query_state(&rctx->b, old_enable,
722 							old_perfect_enable);
723 		}
724 	}
725 }
726 
727 static unsigned event_type_for_stream(unsigned stream)
728 {
729 	switch (stream) {
730 	default:
731 	case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
732 	case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
733 	case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
734 	case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
735 	}
736 }
737 
738 static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va,
739 				  unsigned stream)
740 {
741 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
742 	radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
743 	radeon_emit(cs, va);
744 	radeon_emit(cs, va >> 32);
745 }
746 
747 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
748 					struct r600_query_hw *query,
749 					struct r600_resource *buffer,
750 					uint64_t va)
751 {
752 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
753 
754 	switch (query->b.type) {
755 	case PIPE_QUERY_OCCLUSION_COUNTER:
756 	case PIPE_QUERY_OCCLUSION_PREDICATE:
757 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
758 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
759 		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
760 		radeon_emit(cs, va);
761 		radeon_emit(cs, va >> 32);
762 		break;
763 	case PIPE_QUERY_PRIMITIVES_EMITTED:
764 	case PIPE_QUERY_PRIMITIVES_GENERATED:
765 	case PIPE_QUERY_SO_STATISTICS:
766 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
767 		emit_sample_streamout(cs, va, query->stream);
768 		break;
769 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
770 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
771 			emit_sample_streamout(cs, va + 32 * stream, stream);
772 		break;
773 	case PIPE_QUERY_TIME_ELAPSED:
774 		/* Write the timestamp from the CP without waiting for
775 		 * outstanding draws (top-of-pipe).
776 		 */
777 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
778 		radeon_emit(cs, COPY_DATA_COUNT_SEL |
779 				COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
780 				COPY_DATA_DST_SEL(COPY_DATA_MEM_ASYNC));
781 		radeon_emit(cs, 0);
782 		radeon_emit(cs, 0);
783 		radeon_emit(cs, va);
784 		radeon_emit(cs, va >> 32);
785 		break;
786 	case PIPE_QUERY_PIPELINE_STATISTICS:
787 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
788 		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
789 		radeon_emit(cs, va);
790 		radeon_emit(cs, va >> 32);
791 		break;
792 	default:
793 		assert(0);
794 	}
795 	radeon_add_to_buffer_list(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
796 				  RADEON_PRIO_QUERY);
797 }
798 
799 static void r600_query_hw_emit_start(struct r600_common_context *ctx,
800 				     struct r600_query_hw *query)
801 {
802 	uint64_t va;
803 
804 	if (!query->buffer.buf)
805 		return; // previous buffer allocation failure
806 
807 	r600_update_occlusion_query_state(ctx, query->b.type, 1);
808 	si_update_prims_generated_query_state((void*)ctx, query->b.type, 1);
809 
810 	ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
811 			       true);
812 
813 	/* Get a new query buffer if needed. */
814 	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
815 		struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
816 		*qbuf = query->buffer;
817 		query->buffer.results_end = 0;
818 		query->buffer.previous = qbuf;
819 		query->buffer.buf = r600_new_query_buffer(ctx->screen, query);
820 		if (!query->buffer.buf)
821 			return;
822 	}
823 
824 	/* emit begin query */
825 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
826 
827 	query->ops->emit_start(ctx, query, query->buffer.buf, va);
828 
829 	ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
830 }
831 
832 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
833 				       struct r600_query_hw *query,
834 				       struct r600_resource *buffer,
835 				       uint64_t va)
836 {
837 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
838 	uint64_t fence_va = 0;
839 
840 	switch (query->b.type) {
841 	case PIPE_QUERY_OCCLUSION_COUNTER:
842 	case PIPE_QUERY_OCCLUSION_PREDICATE:
843 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
844 		va += 8;
845 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
846 		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
847 		radeon_emit(cs, va);
848 		radeon_emit(cs, va >> 32);
849 
850 		fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
851 		break;
852 	case PIPE_QUERY_PRIMITIVES_EMITTED:
853 	case PIPE_QUERY_PRIMITIVES_GENERATED:
854 	case PIPE_QUERY_SO_STATISTICS:
855 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
856 		va += 16;
857 		emit_sample_streamout(cs, va, query->stream);
858 		break;
859 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
860 		va += 16;
861 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
862 			emit_sample_streamout(cs, va + 32 * stream, stream);
863 		break;
864 	case PIPE_QUERY_TIME_ELAPSED:
865 		va += 8;
866 		/* fall through */
867 	case PIPE_QUERY_TIMESTAMP:
868 		si_gfx_write_event_eop(ctx, V_028A90_BOTTOM_OF_PIPE_TS,
869 					 0, EOP_DATA_SEL_TIMESTAMP, NULL, va,
870 					 0, query->b.type);
871 		fence_va = va + 8;
872 		break;
873 	case PIPE_QUERY_PIPELINE_STATISTICS: {
874 		unsigned sample_size = (query->result_size - 8) / 2;
875 
876 		va += sample_size;
877 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
878 		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
879 		radeon_emit(cs, va);
880 		radeon_emit(cs, va >> 32);
881 
882 		fence_va = va + sample_size;
883 		break;
884 	}
885 	default:
886 		assert(0);
887 	}
888 	radeon_add_to_buffer_list(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
889 				  RADEON_PRIO_QUERY);
890 
891 	if (fence_va)
892 		si_gfx_write_event_eop(ctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
893 					 EOP_DATA_SEL_VALUE_32BIT,
894 					 query->buffer.buf, fence_va, 0x80000000,
895 					 query->b.type);
896 }
897 
898 static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
899 				    struct r600_query_hw *query)
900 {
901 	uint64_t va;
902 
903 	if (!query->buffer.buf)
904 		return; // previous buffer allocation failure
905 
906 	/* Queries that emit a begin have already reserved CS space in begin_query. */
907 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
908 		ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
909 	}
910 
911 	/* emit end query */
912 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
913 
914 	query->ops->emit_stop(ctx, query, query->buffer.buf, va);
915 
916 	query->buffer.results_end += query->result_size;
917 
918 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
919 		ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
920 
921 	r600_update_occlusion_query_state(ctx, query->b.type, -1);
922 	si_update_prims_generated_query_state((void*)ctx, query->b.type, -1);
923 }
924 
925 static void emit_set_predicate(struct r600_common_context *ctx,
926 			       struct r600_resource *buf, uint64_t va,
927 			       uint32_t op)
928 {
929 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
930 
931 	if (ctx->chip_class >= GFX9) {
932 		radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
933 		radeon_emit(cs, op);
934 		radeon_emit(cs, va);
935 		radeon_emit(cs, va >> 32);
936 	} else {
937 		radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
938 		radeon_emit(cs, va);
939 		radeon_emit(cs, op | ((va >> 32) & 0xFF));
940 	}
941 	radeon_add_to_buffer_list(ctx, &ctx->gfx, buf, RADEON_USAGE_READ,
942 				  RADEON_PRIO_QUERY);
943 }
944 
945 static void r600_emit_query_predication(struct r600_common_context *ctx,
946 					struct r600_atom *atom)
947 {
948 	struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
949 	struct r600_query_buffer *qbuf;
950 	uint32_t op;
951 	bool flag_wait, invert;
952 
953 	if (!query)
954 		return;
955 
956 	invert = ctx->render_cond_invert;
957 	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
958 		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
959 
960 	if (query->workaround_buf) {
961 		op = PRED_OP(PREDICATION_OP_BOOL64);
962 	} else {
963 		switch (query->b.type) {
964 		case PIPE_QUERY_OCCLUSION_COUNTER:
965 		case PIPE_QUERY_OCCLUSION_PREDICATE:
966 		case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
967 			op = PRED_OP(PREDICATION_OP_ZPASS);
968 			break;
969 		case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
970 		case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
971 			op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
972 			invert = !invert;
973 			break;
974 		default:
975 			assert(0);
976 			return;
977 		}
978 	}
979 
980 	/* if true then invert, see GL_ARB_conditional_render_inverted */
981 	if (invert)
982 		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
983 	else
984 		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
985 
986 	/* Use the value written by compute shader as a workaround. Note that
987 	 * the wait flag does not apply in this predication mode.
988 	 *
989 	 * The shader outputs the result value to L2. Workarounds only affect VI
990 	 * and later, where the CP reads data from L2, so we don't need an
991 	 * additional flush.
992 	 */
993 	if (query->workaround_buf) {
994 		uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
995 		emit_set_predicate(ctx, query->workaround_buf, va, op);
996 		return;
997 	}
998 
999 	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
1000 
1001 	/* emit predicate packets for all data blocks */
1002 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1003 		unsigned results_base = 0;
1004 		uint64_t va_base = qbuf->buf->gpu_address;
1005 
1006 		while (results_base < qbuf->results_end) {
1007 			uint64_t va = va_base + results_base;
1008 
1009 			if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1010 				for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
1011 					emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1012 
1013 					/* set CONTINUE bit for all packets except the first */
1014 					op |= PREDICATION_CONTINUE;
1015 				}
1016 			} else {
1017 				emit_set_predicate(ctx, qbuf->buf, va, op);
1018 				op |= PREDICATION_CONTINUE;
1019 			}
1020 
1021 			results_base += query->result_size;
1022 		}
1023 	}
1024 }
1025 
1026 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
1027 {
1028 	struct si_screen *sscreen =
1029 		(struct si_screen *)ctx->screen;
1030 
1031 	if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
1032 	    query_type == PIPE_QUERY_GPU_FINISHED ||
1033 	    query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
1034 		return r600_query_sw_create(query_type);
1035 
1036 	return r600_query_hw_create(sscreen, query_type, index);
1037 }
1038 
1039 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1040 {
1041 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1042 	struct r600_query *rquery = (struct r600_query *)query;
1043 
1044 	rquery->ops->destroy(rctx->screen, rquery);
1045 }
1046 
1047 static boolean r600_begin_query(struct pipe_context *ctx,
1048                                 struct pipe_query *query)
1049 {
1050 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1051 	struct r600_query *rquery = (struct r600_query *)query;
1052 
1053 	return rquery->ops->begin(rctx, rquery);
1054 }
1055 
1056 void si_query_hw_reset_buffers(struct r600_common_context *rctx,
1057 			       struct r600_query_hw *query)
1058 {
1059 	struct r600_query_buffer *prev = query->buffer.previous;
1060 
1061 	/* Discard the old query buffers. */
1062 	while (prev) {
1063 		struct r600_query_buffer *qbuf = prev;
1064 		prev = prev->previous;
1065 		r600_resource_reference(&qbuf->buf, NULL);
1066 		FREE(qbuf);
1067 	}
1068 
1069 	query->buffer.results_end = 0;
1070 	query->buffer.previous = NULL;
1071 
1072 	/* Obtain a new buffer if the current one can't be mapped without a stall. */
1073 	if (si_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
1074 	    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
1075 		r600_resource_reference(&query->buffer.buf, NULL);
1076 		query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
1077 	} else {
1078 		if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
1079 			r600_resource_reference(&query->buffer.buf, NULL);
1080 	}
1081 }
1082 
1083 bool si_query_hw_begin(struct r600_common_context *rctx,
1084 		       struct r600_query *rquery)
1085 {
1086 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1087 
1088 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
1089 		assert(0);
1090 		return false;
1091 	}
1092 
1093 	if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
1094 		si_query_hw_reset_buffers(rctx, query);
1095 
1096 	r600_resource_reference(&query->workaround_buf, NULL);
1097 
1098 	r600_query_hw_emit_start(rctx, query);
1099 	if (!query->buffer.buf)
1100 		return false;
1101 
1102 	LIST_ADDTAIL(&query->list, &rctx->active_queries);
1103 	return true;
1104 }
1105 
1106 static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
1107 {
1108 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1109 	struct r600_query *rquery = (struct r600_query *)query;
1110 
1111 	return rquery->ops->end(rctx, rquery);
1112 }
1113 
1114 bool si_query_hw_end(struct r600_common_context *rctx,
1115 		     struct r600_query *rquery)
1116 {
1117 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1118 
1119 	if (query->flags & R600_QUERY_HW_FLAG_NO_START)
1120 		si_query_hw_reset_buffers(rctx, query);
1121 
1122 	r600_query_hw_emit_stop(rctx, query);
1123 
1124 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
1125 		LIST_DELINIT(&query->list);
1126 
1127 	if (!query->buffer.buf)
1128 		return false;
1129 
1130 	return true;
1131 }
1132 
1133 static void r600_get_hw_query_params(struct r600_common_context *rctx,
1134 				     struct r600_query_hw *rquery, int index,
1135 				     struct r600_hw_query_params *params)
1136 {
1137 	unsigned max_rbs = rctx->screen->info.num_render_backends;
1138 
1139 	params->pair_stride = 0;
1140 	params->pair_count = 1;
1141 
1142 	switch (rquery->b.type) {
1143 	case PIPE_QUERY_OCCLUSION_COUNTER:
1144 	case PIPE_QUERY_OCCLUSION_PREDICATE:
1145 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1146 		params->start_offset = 0;
1147 		params->end_offset = 8;
1148 		params->fence_offset = max_rbs * 16;
1149 		params->pair_stride = 16;
1150 		params->pair_count = max_rbs;
1151 		break;
1152 	case PIPE_QUERY_TIME_ELAPSED:
1153 		params->start_offset = 0;
1154 		params->end_offset = 8;
1155 		params->fence_offset = 16;
1156 		break;
1157 	case PIPE_QUERY_TIMESTAMP:
1158 		params->start_offset = 0;
1159 		params->end_offset = 0;
1160 		params->fence_offset = 8;
1161 		break;
1162 	case PIPE_QUERY_PRIMITIVES_EMITTED:
1163 		params->start_offset = 8;
1164 		params->end_offset = 24;
1165 		params->fence_offset = params->end_offset + 4;
1166 		break;
1167 	case PIPE_QUERY_PRIMITIVES_GENERATED:
1168 		params->start_offset = 0;
1169 		params->end_offset = 16;
1170 		params->fence_offset = params->end_offset + 4;
1171 		break;
1172 	case PIPE_QUERY_SO_STATISTICS:
1173 		params->start_offset = 8 - index * 8;
1174 		params->end_offset = 24 - index * 8;
1175 		params->fence_offset = params->end_offset + 4;
1176 		break;
1177 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1178 		params->pair_count = R600_MAX_STREAMS;
1179 		params->pair_stride = 32;
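		/* fall through */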
1180 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1181 		params->start_offset = 0;
1182 		params->end_offset = 16;
1183 
1184 		/* We can re-use the high dword of the last 64-bit value as a
1185 		 * fence: it is initialized as 0, and the high bit is set by
1186 		 * the write of the streamout stats event.
1187 		 */
1188 		params->fence_offset = rquery->result_size - 4;
1189 		break;
1190 	case PIPE_QUERY_PIPELINE_STATISTICS:
1191 	{
1192 		/* Offsets apply to EG+ */
1193 		static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
1194 		params->start_offset = offsets[index];
1195 		params->end_offset = 88 + offsets[index];
1196 		params->fence_offset = 2 * 88;
1197 		break;
1198 	}
1199 	default:
1200 		unreachable("r600_get_hw_query_params unsupported");
1201 	}
1202 }
1203 
1204 static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
1205 				       bool test_status_bit)
1206 {
1207 	uint32_t *current_result = (uint32_t*)map;
1208 	uint64_t start, end;
1209 
1210 	start = (uint64_t)current_result[start_index] |
1211 		(uint64_t)current_result[start_index+1] << 32;
1212 	end = (uint64_t)current_result[end_index] |
1213 	      (uint64_t)current_result[end_index+1] << 32;
1214 
1215 	if (!test_status_bit ||
1216 	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1217 		return end - start;
1218 	}
1219 	return 0;
1220 }
1221 
1222 static void r600_query_hw_add_result(struct si_screen *sscreen,
1223 				     struct r600_query_hw *query,
1224 				     void *buffer,
1225 				     union pipe_query_result *result)
1226 {
1227 	unsigned max_rbs = sscreen->info.num_render_backends;
1228 
1229 	switch (query->b.type) {
1230 	case PIPE_QUERY_OCCLUSION_COUNTER: {
1231 		for (unsigned i = 0; i < max_rbs; ++i) {
1232 			unsigned results_base = i * 16;
1233 			result->u64 +=
1234 				r600_query_read_result(buffer + results_base, 0, 2, true);
1235 		}
1236 		break;
1237 	}
1238 	case PIPE_QUERY_OCCLUSION_PREDICATE:
1239 	case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1240 		for (unsigned i = 0; i < max_rbs; ++i) {
1241 			unsigned results_base = i * 16;
1242 			result->b = result->b ||
1243 				r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
1244 		}
1245 		break;
1246 	}
1247 	case PIPE_QUERY_TIME_ELAPSED:
1248 		result->u64 += r600_query_read_result(buffer, 0, 2, false);
1249 		break;
1250 	case PIPE_QUERY_TIMESTAMP:
1251 		result->u64 = *(uint64_t*)buffer;
1252 		break;
1253 	case PIPE_QUERY_PRIMITIVES_EMITTED:
1254 		/* SAMPLE_STREAMOUTSTATS stores this structure:
1255 		 * {
1256 		 *    u64 NumPrimitivesWritten;
1257 		 *    u64 PrimitiveStorageNeeded;
1258 		 * }
1259 		 * We only need NumPrimitivesWritten here. */
1260 		result->u64 += r600_query_read_result(buffer, 2, 6, true);
1261 		break;
1262 	case PIPE_QUERY_PRIMITIVES_GENERATED:
1263 		/* Here we read PrimitiveStorageNeeded. */
1264 		result->u64 += r600_query_read_result(buffer, 0, 4, true);
1265 		break;
1266 	case PIPE_QUERY_SO_STATISTICS:
1267 		result->so_statistics.num_primitives_written +=
1268 			r600_query_read_result(buffer, 2, 6, true);
1269 		result->so_statistics.primitives_storage_needed +=
1270 			r600_query_read_result(buffer, 0, 4, true);
1271 		break;
1272 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1273 		result->b = result->b ||
1274 			r600_query_read_result(buffer, 2, 6, true) !=
1275 			r600_query_read_result(buffer, 0, 4, true);
1276 		break;
1277 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1278 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
1279 			result->b = result->b ||
1280 				r600_query_read_result(buffer, 2, 6, true) !=
1281 				r600_query_read_result(buffer, 0, 4, true);
1282 			buffer = (char *)buffer + 32;
1283 		}
1284 		break;
1285 	case PIPE_QUERY_PIPELINE_STATISTICS:
1286 		result->pipeline_statistics.ps_invocations +=
1287 			r600_query_read_result(buffer, 0, 22, false);
1288 		result->pipeline_statistics.c_primitives +=
1289 			r600_query_read_result(buffer, 2, 24, false);
1290 		result->pipeline_statistics.c_invocations +=
1291 			r600_query_read_result(buffer, 4, 26, false);
1292 		result->pipeline_statistics.vs_invocations +=
1293 			r600_query_read_result(buffer, 6, 28, false);
1294 		result->pipeline_statistics.gs_invocations +=
1295 			r600_query_read_result(buffer, 8, 30, false);
1296 		result->pipeline_statistics.gs_primitives +=
1297 			r600_query_read_result(buffer, 10, 32, false);
1298 		result->pipeline_statistics.ia_primitives +=
1299 			r600_query_read_result(buffer, 12, 34, false);
1300 		result->pipeline_statistics.ia_vertices +=
1301 			r600_query_read_result(buffer, 14, 36, false);
1302 		result->pipeline_statistics.hs_invocations +=
1303 			r600_query_read_result(buffer, 16, 38, false);
1304 		result->pipeline_statistics.ds_invocations +=
1305 			r600_query_read_result(buffer, 18, 40, false);
1306 		result->pipeline_statistics.cs_invocations +=
1307 			r600_query_read_result(buffer, 20, 42, false);
1308 #if 0 /* for testing */
1309 		printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1310 		       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1311 		       "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1312 		       result->pipeline_statistics.ia_vertices,
1313 		       result->pipeline_statistics.ia_primitives,
1314 		       result->pipeline_statistics.vs_invocations,
1315 		       result->pipeline_statistics.hs_invocations,
1316 		       result->pipeline_statistics.ds_invocations,
1317 		       result->pipeline_statistics.gs_invocations,
1318 		       result->pipeline_statistics.gs_primitives,
1319 		       result->pipeline_statistics.c_invocations,
1320 		       result->pipeline_statistics.c_primitives,
1321 		       result->pipeline_statistics.ps_invocations,
1322 		       result->pipeline_statistics.cs_invocations);
1323 #endif
1324 		break;
1325 	default:
1326 		assert(0);
1327 	}
1328 }
1329 
1330 static boolean r600_get_query_result(struct pipe_context *ctx,
1331 				     struct pipe_query *query, boolean wait,
1332 				     union pipe_query_result *result)
1333 {
1334 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1335 	struct r600_query *rquery = (struct r600_query *)query;
1336 
1337 	return rquery->ops->get_result(rctx, rquery, wait, result);
1338 }
1339 
1340 static void r600_get_query_result_resource(struct pipe_context *ctx,
1341                                            struct pipe_query *query,
1342                                            boolean wait,
1343                                            enum pipe_query_value_type result_type,
1344                                            int index,
1345                                            struct pipe_resource *resource,
1346                                            unsigned offset)
1347 {
1348 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1349 	struct r600_query *rquery = (struct r600_query *)query;
1350 
1351 	rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
1352 	                                 resource, offset);
1353 }
1354 
1355 static void r600_query_hw_clear_result(struct r600_query_hw *query,
1356 				       union pipe_query_result *result)
1357 {
1358 	util_query_clear_result(result, query->b.type);
1359 }
1360 
1361 bool si_query_hw_get_result(struct r600_common_context *rctx,
1362 			    struct r600_query *rquery,
1363 			    bool wait, union pipe_query_result *result)
1364 {
1365 	struct si_screen *sscreen = rctx->screen;
1366 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1367 	struct r600_query_buffer *qbuf;
1368 
1369 	query->ops->clear_result(query, result);
1370 
1371 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1372 		unsigned usage = PIPE_TRANSFER_READ |
1373 				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
1374 		unsigned results_base = 0;
1375 		void *map;
1376 
1377 		if (rquery->b.flushed)
1378 			map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
1379 		else
1380 			map = si_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);
1381 
1382 		if (!map)
1383 			return false;
1384 
1385 		while (results_base != qbuf->results_end) {
1386 			query->ops->add_result(sscreen, query, map + results_base,
1387 					       result);
1388 			results_base += query->result_size;
1389 		}
1390 	}
1391 
1392 	/* Convert the time to expected units. */
1393 	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
1394 	    rquery->type == PIPE_QUERY_TIMESTAMP) {
1395 		result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
1396 	}
1397 	return true;
1398 }
1399 
1400 /* Create the compute shader that is used to collect the results.
1401  *
1402  * One compute grid with a single thread is launched for every query result
1403  * buffer. The thread (optionally) reads a previous summary buffer, then
1404  * accumulates data from the query result buffer, and writes the result either
1405  * to a summary buffer to be consumed by the next grid invocation or to the
1406  * user-supplied buffer.
1407  *
1408  * Data layout:
1409  *
1410  * CONST
1411  *  0.x = end_offset
1412  *  0.y = result_stride
1413  *  0.z = result_count
1414  *  0.w = bit field:
1415  *          1: read previously accumulated values
1416  *          2: write accumulated values for chaining
1417  *          4: write result available
1418  *          8: convert result to boolean (0/1)
1419  *         16: only read one dword and use that as result
1420  *         32: apply timestamp conversion
1421  *         64: store full 64 bits result
1422  *        128: store signed 32 bits result
1423  *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
1424  *  1.x = fence_offset
1425  *  1.y = pair_stride
1426  *  1.z = pair_count
1427  *
1428  * BUFFER[0] = query result buffer
1429  * BUFFER[1] = previous summary buffer
1430  * BUFFER[2] = next summary buffer or user-supplied buffer
1431  */
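/* Illustrative sketch only (hypothetical numbers, not emitted verbatim by this
 * code): reading back a PIPE_QUERY_OCCLUSION_COUNTER as a 64-bit value on a
 * chip with 4 render backends would roughly use
 *   0.x = 8                          (end value follows the begin value)
 *   0.y = 4*16 + 16                  (result_size: 4 RB pairs plus fence/alignment)
 *   0.z = results_end / result_size  (number of snapshots to accumulate)
 *   0.w = 64                         (store the full 64-bit result)
 *   1.x = 4*16                       (fence dword offset)
 *   1.y = 16, 1.z = 4                (one 16-byte begin/end pair per RB)
 * matching what r600_get_hw_query_params() computes for occlusion queries.
 */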
1432 static void r600_create_query_result_shader(struct r600_common_context *rctx)
1433 {
1434 	/* TEMP[0].xy = accumulated result so far
1435 	 * TEMP[0].z = result not available
1436 	 *
1437 	 * TEMP[1].x = current result index
1438 	 * TEMP[1].y = current pair index
1439 	 */
1440 	static const char text_tmpl[] =
1441 		"COMP\n"
1442 		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
1443 		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
1444 		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
1445 		"DCL BUFFER[0]\n"
1446 		"DCL BUFFER[1]\n"
1447 		"DCL BUFFER[2]\n"
1448 		"DCL CONST[0][0..1]\n"
1449 		"DCL TEMP[0..5]\n"
1450 		"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
1451 		"IMM[1] UINT32 {1, 2, 4, 8}\n"
1452 		"IMM[2] UINT32 {16, 32, 64, 128}\n"
1453 		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
1454 		"IMM[4] UINT32 {256, 0, 0, 0}\n"
1455 
		"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
		"UIF TEMP[5]\n"
			/* Check result availability. */
			"LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
			"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
			"MOV TEMP[1], TEMP[0].zzzz\n"
			"NOT TEMP[0].z, TEMP[0].zzzz\n"

			/* Load result if available. */
			"UIF TEMP[1]\n"
				"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
			"ENDIF\n"
		"ELSE\n"
			/* Load previously accumulated result if requested. */
			"MOV TEMP[0], IMM[0].xxxx\n"
			"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
			"UIF TEMP[4]\n"
				"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
			"ENDIF\n"

			"MOV TEMP[1].x, IMM[0].xxxx\n"
			"BGNLOOP\n"
				/* Break if accumulated result so far is not available. */
				"UIF TEMP[0].zzzz\n"
					"BRK\n"
				"ENDIF\n"

				/* Break if result_index >= result_count. */
				"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
				"UIF TEMP[5]\n"
					"BRK\n"
				"ENDIF\n"

				/* Load fence and check result availability */
				"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
				"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
				"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
				"NOT TEMP[0].z, TEMP[0].zzzz\n"
				"UIF TEMP[0].zzzz\n"
					"BRK\n"
				"ENDIF\n"

				"MOV TEMP[1].y, IMM[0].xxxx\n"
				"BGNLOOP\n"
					/* Load start and end. */
					"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
					"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
					"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"

					"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
					"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

					"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"

					"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
					"UIF TEMP[5].zzzz\n"
						/* Load second start/end half-pair and
						 * take the difference
						 */
						"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
						"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
						"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

						"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
						"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
					"ENDIF\n"

					"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"

					/* Increment pair index */
					"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
					"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
					"UIF TEMP[5]\n"
						"BRK\n"
					"ENDIF\n"
				"ENDLOOP\n"

				/* Increment result index */
				"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
			"ENDLOOP\n"
		"ENDIF\n"

		"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
		"UIF TEMP[4]\n"
			/* Store accumulated data for chaining. */
			"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
		"ELSE\n"
			"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
			"UIF TEMP[4]\n"
				/* Store result availability. */
				"NOT TEMP[0].z, TEMP[0]\n"
				"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"

				"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
				"UIF TEMP[4]\n"
					"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
				"ENDIF\n"
			"ELSE\n"
				/* Store result if it is available. */
				"NOT TEMP[4], TEMP[0].zzzz\n"
				"UIF TEMP[4]\n"
					/* Apply timestamp conversion */
					"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
					"UIF TEMP[4]\n"
						"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
						"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
					"ENDIF\n"

					/* Convert to boolean */
					"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
					"UIF TEMP[4]\n"
						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
						"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
						"MOV TEMP[0].y, IMM[0].xxxx\n"
					"ENDIF\n"

					"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
					"UIF TEMP[4]\n"
						"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
					"ELSE\n"
						/* Clamping */
						"UIF TEMP[0].yyyy\n"
							"MOV TEMP[0].x, IMM[0].wwww\n"
						"ENDIF\n"

						"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
						"UIF TEMP[4]\n"
							"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
						"ENDIF\n"

						"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
					"ENDIF\n"
				"ENDIF\n"
			"ENDIF\n"
		"ENDIF\n"

		"END\n";

	char text[sizeof(text_tmpl) + 32];
	struct tgsi_token tokens[1024];
	struct pipe_compute_state state = {};

	/* Hard code the frequency into the shader so that the backend can
	 * use the full range of optimizations for divide-by-constant.
	 */
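	/* Illustrative note (added, not in the original source): with
	 * IMM[3] = {1000000, 0, clock_crystal_freq, 0}, the shader converts a
	 * raw tick count with
	 *
	 *   result = ticks * 1000000 / clock_crystal_freq;
	 *
	 * Assuming the winsys reports clock_crystal_freq in kHz, this yields
	 * nanoseconds, matching what the CPU-side result path produces for
	 * timestamp queries.
	 */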
	snprintf(text, sizeof(text), text_tmpl,
		 rctx->screen->info.clock_crystal_freq);

	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
		assert(false);
		return;
	}

	state.ir_type = PIPE_SHADER_IR_TGSI;
	state.prog = tokens;

	rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
}

static void r600_restore_qbo_state(struct r600_common_context *rctx,
				   struct r600_qbo_state *st)
{
	rctx->b.bind_compute_state(&rctx->b, st->saved_compute);

	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
	pipe_resource_reference(&st->saved_const0.buffer, NULL);

	rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
	for (unsigned i = 0; i < 3; ++i)
		pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
}

static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
                                              struct r600_query *rquery,
                                              bool wait,
                                              enum pipe_query_value_type result_type,
                                              int index,
                                              struct pipe_resource *resource,
                                              unsigned offset)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
	struct r600_query_buffer *qbuf;
	struct r600_query_buffer *qbuf_prev;
	struct pipe_resource *tmp_buffer = NULL;
	unsigned tmp_buffer_offset = 0;
	struct r600_qbo_state saved_state = {};
	struct pipe_grid_info grid = {};
	struct pipe_constant_buffer constant_buffer = {};
	struct pipe_shader_buffer ssbo[3];
	struct r600_hw_query_params params;
	struct {
		uint32_t end_offset;
		uint32_t result_stride;
		uint32_t result_count;
		uint32_t config;
		uint32_t fence_offset;
		uint32_t pair_stride;
		uint32_t pair_count;
	} consts;
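	/* Note (added for clarity): this struct mirrors the CONST[0][0..1]
	 * layout documented above r600_create_query_result_shader():
	 * end_offset/result_stride/result_count/config map to 0.xyzw and
	 * fence_offset/pair_stride/pair_count map to 1.xyz.
	 */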

	if (!rctx->query_result_shader) {
		r600_create_query_result_shader(rctx);
		if (!rctx->query_result_shader)
			return;
	}

	if (query->buffer.previous) {
		u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
				     &tmp_buffer_offset, &tmp_buffer);
		if (!tmp_buffer)
			return;
	}
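	/* Note (added for clarity): the 16-byte zeroed suballocation above is
	 * only needed when the query spans more than one result buffer; it
	 * holds the chained summary (the accumulated 64-bit value plus the
	 * availability flag) that one grid invocation writes and the next one
	 * reads back.
	 */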

	rctx->save_qbo_state(&rctx->b, &saved_state);

	r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
	consts.end_offset = params.end_offset - params.start_offset;
	consts.fence_offset = params.fence_offset - params.start_offset;
	consts.result_stride = query->result_size;
	consts.pair_stride = params.pair_stride;
	consts.pair_count = params.pair_count;

	constant_buffer.buffer_size = sizeof(consts);
	constant_buffer.user_buffer = &consts;

	ssbo[1].buffer = tmp_buffer;
	ssbo[1].buffer_offset = tmp_buffer_offset;
	ssbo[1].buffer_size = 16;

	ssbo[2] = ssbo[1];
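	/* Note (added for clarity): BUFFER[1] (previous summary) and BUFFER[2]
	 * (next summary) both point at the temporary buffer by default; for
	 * the oldest buffer in the chain, ssbo[2] is redirected below to the
	 * user-supplied resource so the final value lands at the requested
	 * offset.
	 */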

	rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);

	grid.block[0] = 1;
	grid.block[1] = 1;
	grid.block[2] = 1;
	grid.grid[0] = 1;
	grid.grid[1] = 1;
	grid.grid[2] = 1;

	consts.config = 0;
	if (index < 0)
		consts.config |= 4;
	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
		consts.config |= 8;
	else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
		 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
		consts.config |= 8 | 256;
	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
		consts.config |= 32;

	switch (result_type) {
	case PIPE_QUERY_TYPE_U64:
	case PIPE_QUERY_TYPE_I64:
		consts.config |= 64;
		break;
	case PIPE_QUERY_TYPE_I32:
		consts.config |= 128;
		break;
	case PIPE_QUERY_TYPE_U32:
		break;
	}

	rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;

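	/* Note (added for clarity): walk the buffer chain starting from the
	 * most recent buffer and following the ->previous links; each
	 * iteration launches one single-thread grid that accumulates that
	 * buffer and passes its partial sum on through the summary buffer.
	 */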
	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
			qbuf_prev = qbuf->previous;
			consts.result_count = qbuf->results_end / query->result_size;
			consts.config &= ~3;
			if (qbuf != &query->buffer)
				consts.config |= 1;
			if (qbuf->previous)
				consts.config |= 2;
		} else {
			/* Only read the last timestamp. */
			qbuf_prev = NULL;
			consts.result_count = 0;
			consts.config |= 16;
			params.start_offset += qbuf->results_end - query->result_size;
		}

		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);

		ssbo[0].buffer = &qbuf->buf->b.b;
		ssbo[0].buffer_offset = params.start_offset;
		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;

		if (!qbuf->previous) {
			ssbo[2].buffer = resource;
			ssbo[2].buffer_offset = offset;
			ssbo[2].buffer_size = 8;

			((struct r600_resource *)resource)->TC_L2_dirty = true;
		}

		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);

		if (wait && qbuf == &query->buffer) {
			uint64_t va;

			/* Wait for result availability. Wait only for readiness
			 * of the last entry, since the fence writes should be
			 * serialized in the CP.
			 */
			va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
			va += params.fence_offset;

			si_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
		}

		rctx->b.launch_grid(&rctx->b, &grid);
		rctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
	}

	r600_restore_qbo_state(rctx, &saved_state);
	pipe_resource_reference(&tmp_buffer, NULL);
}

static void r600_render_condition(struct pipe_context *ctx,
				  struct pipe_query *query,
				  boolean condition,
				  enum pipe_render_cond_flag mode)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query_hw *rquery = (struct r600_query_hw *)query;
	struct r600_atom *atom = &rctx->render_cond_atom;

	if (query) {
		bool needs_workaround = false;

		/* There was a firmware regression in VI which causes successive
		 * SET_PREDICATION packets to give the wrong answer for
		 * non-inverted stream overflow predication.
		 */
		if (((rctx->chip_class == VI && rctx->screen->info.pfp_fw_feature < 49) ||
		     (rctx->chip_class == GFX9 && rctx->screen->info.pfp_fw_feature < 38)) &&
		    !condition &&
		    (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
		     (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
		      (rquery->buffer.previous ||
		       rquery->buffer.results_end > rquery->result_size)))) {
			needs_workaround = true;
		}

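		/* Note (added for clarity): the workaround below avoids the
		 * broken SET_PREDICATION path by resolving the overflow query
		 * on the GPU into a small zeroed buffer (via
		 * get_query_result_resource) and predicating on that single
		 * 64-bit value instead.
		 */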
		if (needs_workaround && !rquery->workaround_buf) {
			bool old_force_off = rctx->render_cond_force_off;
			rctx->render_cond_force_off = true;

			u_suballocator_alloc(
				rctx->allocator_zeroed_memory, 8, 8,
				&rquery->workaround_offset,
				(struct pipe_resource **)&rquery->workaround_buf);

			/* Reset to NULL to avoid a redundant SET_PREDICATION
			 * from launching the compute grid.
			 */
			rctx->render_cond = NULL;

			ctx->get_query_result_resource(
				ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
				&rquery->workaround_buf->b.b, rquery->workaround_offset);

			/* Setting this in the render cond atom is too late,
			 * so set it here. */
			rctx->flags |= rctx->screen->barrier_flags.L2_to_cp |
				       SI_CONTEXT_FLUSH_FOR_RENDER_COND;

			rctx->render_cond_force_off = old_force_off;
		}
	}

	rctx->render_cond = query;
	rctx->render_cond_invert = condition;
	rctx->render_cond_mode = mode;

	rctx->set_atom_dirty(rctx, atom, query != NULL);
}

void si_suspend_queries(struct r600_common_context *ctx)
{
	struct r600_query_hw *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
		r600_query_hw_emit_stop(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
						    struct list_head *query_list)
{
	struct r600_query_hw *query;
	unsigned num_dw = 0;

	LIST_FOR_EACH_ENTRY(query, query_list, list) {
		/* begin + end */
		num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;

		/* Workaround for the fact that
		 * num_cs_dw_nontimer_queries_suspend is incremented for every
		 * resumed query, which raises the bar in need_cs_space for
		 * queries about to be resumed.
		 */
		num_dw += query->num_cs_dw_end;
	}
	/* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
	num_dw += 13;

	return num_dw;
}

void si_resume_queries(struct r600_common_context *ctx)
{
	struct r600_query_hw *query;
	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);

	assert(ctx->num_cs_dw_queries_suspend == 0);

	/* Check CS space here. Resuming must not be interrupted by flushes. */
	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
		r600_query_hw_emit_start(ctx, query);
	}
}

#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
	{ \
		.name = name_, \
		.query_type = R600_QUERY_##query_type_, \
		.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
		.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
		.group_id = group_id_ \
	}

#define X(name_, query_type_, type_, result_type_) \
	XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)

#define XG(group_, name_, query_type_, type_, result_type_) \
	XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
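
/* Illustrative example (added, not part of the original source): the entry
 *
 *   X("draw-calls", DRAW_CALLS, UINT64, AVERAGE)
 *
 * expands to
 *
 *   { .name = "draw-calls",
 *     .query_type = R600_QUERY_DRAW_CALLS,
 *     .type = PIPE_DRIVER_QUERY_TYPE_UINT64,
 *     .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE,
 *     .group_id = ~(unsigned)0 }
 *
 * i.e. an ungrouped driver query, while XG() assigns the query to one of the
 * R600_QUERY_GROUP_* groups.
 */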

static struct pipe_driver_query_info r600_driver_query_list[] = {
	X("num-compilations",		NUM_COMPILATIONS,	UINT64, CUMULATIVE),
	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
	X("num-shader-cache-hits",	NUM_SHADER_CACHE_HITS,	UINT64, CUMULATIVE),
	X("draw-calls",			DRAW_CALLS,		UINT64, AVERAGE),
	X("decompress-calls",		DECOMPRESS_CALLS,	UINT64, AVERAGE),
	X("MRT-draw-calls",		MRT_DRAW_CALLS,		UINT64, AVERAGE),
	X("prim-restart-calls",		PRIM_RESTART_CALLS,	UINT64, AVERAGE),
	X("spill-draw-calls",		SPILL_DRAW_CALLS,	UINT64, AVERAGE),
	X("compute-calls",		COMPUTE_CALLS,		UINT64, AVERAGE),
	X("spill-compute-calls",	SPILL_COMPUTE_CALLS,	UINT64, AVERAGE),
	X("dma-calls",			DMA_CALLS,		UINT64, AVERAGE),
	X("cp-dma-calls",		CP_DMA_CALLS,		UINT64, AVERAGE),
	X("num-vs-flushes",		NUM_VS_FLUSHES,		UINT64, AVERAGE),
	X("num-ps-flushes",		NUM_PS_FLUSHES,		UINT64, AVERAGE),
	X("num-cs-flushes",		NUM_CS_FLUSHES,		UINT64, AVERAGE),
	X("num-CB-cache-flushes",	NUM_CB_CACHE_FLUSHES,	UINT64, AVERAGE),
	X("num-DB-cache-flushes",	NUM_DB_CACHE_FLUSHES,	UINT64, AVERAGE),
	X("num-L2-invalidates",		NUM_L2_INVALIDATES,	UINT64, AVERAGE),
	X("num-L2-writebacks",		NUM_L2_WRITEBACKS,	UINT64, AVERAGE),
	X("num-resident-handles",	NUM_RESIDENT_HANDLES,	UINT64, AVERAGE),
	X("tc-offloaded-slots",		TC_OFFLOADED_SLOTS,	UINT64, AVERAGE),
	X("tc-direct-slots",		TC_DIRECT_SLOTS,	UINT64, AVERAGE),
	X("tc-num-syncs",		TC_NUM_SYNCS,		UINT64, AVERAGE),
	X("CS-thread-busy",		CS_THREAD_BUSY,		UINT64, AVERAGE),
	X("gallium-thread-busy",	GALLIUM_THREAD_BUSY,	UINT64, AVERAGE),
	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
	X("mapped-VRAM",		MAPPED_VRAM,		BYTES, AVERAGE),
	X("mapped-GTT",			MAPPED_GTT,		BYTES, AVERAGE),
	X("buffer-wait-time",		BUFFER_WAIT_TIME,	MICROSECONDS, CUMULATIVE),
	X("num-mapped-buffers",		NUM_MAPPED_BUFFERS,	UINT64, AVERAGE),
	X("num-GFX-IBs",		NUM_GFX_IBS,		UINT64, AVERAGE),
	X("num-SDMA-IBs",		NUM_SDMA_IBS,		UINT64, AVERAGE),
	X("GFX-BO-list-size",		GFX_BO_LIST_SIZE,	UINT64, AVERAGE),
	X("GFX-IB-size",		GFX_IB_SIZE,		UINT64, AVERAGE),
	X("num-bytes-moved",		NUM_BYTES_MOVED,	BYTES, CUMULATIVE),
	X("num-evictions",		NUM_EVICTIONS,		UINT64, CUMULATIVE),
	X("VRAM-CPU-page-faults",	NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
	X("VRAM-usage",			VRAM_USAGE,		BYTES, AVERAGE),
	X("VRAM-vis-usage",		VRAM_VIS_USAGE,		BYTES, AVERAGE),
	X("GTT-usage",			GTT_USAGE,		BYTES, AVERAGE),
	X("back-buffer-ps-draw-ratio",	BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),

	/* GPIN queries are for the benefit of old versions of GPUPerfStudio,
	 * which use them as a fallback path to detect the GPU type.
	 *
	 * Note: The names of these queries are significant for GPUPerfStudio
	 * (and possibly their order as well). */
	XG(GPIN, "GPIN_000",		GPIN_ASIC_ID,		UINT, AVERAGE),
	XG(GPIN, "GPIN_001",		GPIN_NUM_SIMD,		UINT, AVERAGE),
	XG(GPIN, "GPIN_002",		GPIN_NUM_RB,		UINT, AVERAGE),
	XG(GPIN, "GPIN_003",		GPIN_NUM_SPI,		UINT, AVERAGE),
	XG(GPIN, "GPIN_004",		GPIN_NUM_SE,		UINT, AVERAGE),

	X("temperature",		GPU_TEMPERATURE,	UINT64, AVERAGE),
	X("shader-clock",		CURRENT_GPU_SCLK,	HZ, AVERAGE),
	X("memory-clock",		CURRENT_GPU_MCLK,	HZ, AVERAGE),

	/* The following queries must be at the end of the list because their
	 * availability is adjusted dynamically based on the DRM version. */
	X("GPU-load",			GPU_LOAD,		UINT64, AVERAGE),
	X("GPU-shaders-busy",		GPU_SHADERS_BUSY,	UINT64, AVERAGE),
	X("GPU-ta-busy",		GPU_TA_BUSY,		UINT64, AVERAGE),
	X("GPU-gds-busy",		GPU_GDS_BUSY,		UINT64, AVERAGE),
	X("GPU-vgt-busy",		GPU_VGT_BUSY,		UINT64, AVERAGE),
	X("GPU-ia-busy",		GPU_IA_BUSY,		UINT64, AVERAGE),
	X("GPU-sx-busy",		GPU_SX_BUSY,		UINT64, AVERAGE),
	X("GPU-wd-busy",		GPU_WD_BUSY,		UINT64, AVERAGE),
	X("GPU-bci-busy",		GPU_BCI_BUSY,		UINT64, AVERAGE),
	X("GPU-sc-busy",		GPU_SC_BUSY,		UINT64, AVERAGE),
	X("GPU-pa-busy",		GPU_PA_BUSY,		UINT64, AVERAGE),
	X("GPU-db-busy",		GPU_DB_BUSY,		UINT64, AVERAGE),
	X("GPU-cp-busy",		GPU_CP_BUSY,		UINT64, AVERAGE),
	X("GPU-cb-busy",		GPU_CB_BUSY,		UINT64, AVERAGE),
	X("GPU-sdma-busy",		GPU_SDMA_BUSY,		UINT64, AVERAGE),
	X("GPU-pfp-busy",		GPU_PFP_BUSY,		UINT64, AVERAGE),
	X("GPU-meq-busy",		GPU_MEQ_BUSY,		UINT64, AVERAGE),
	X("GPU-me-busy",		GPU_ME_BUSY,		UINT64, AVERAGE),
	X("GPU-surf-sync-busy",		GPU_SURF_SYNC_BUSY,	UINT64, AVERAGE),
	X("GPU-cp-dma-busy",		GPU_CP_DMA_BUSY,	UINT64, AVERAGE),
	X("GPU-scratch-ram-busy",	GPU_SCRATCH_RAM_BUSY,	UINT64, AVERAGE),
};

#undef X
#undef XG
#undef XFULL

static unsigned r600_get_num_queries(struct si_screen *sscreen)
{
	if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 42)
		return ARRAY_SIZE(r600_driver_query_list);
	else if (sscreen->info.drm_major == 3) {
		if (sscreen->info.chip_class >= VI)
			return ARRAY_SIZE(r600_driver_query_list);
		else
			return ARRAY_SIZE(r600_driver_query_list) - 7;
	}
	else
		return ARRAY_SIZE(r600_driver_query_list) - 25;
}

static int r600_get_driver_query_info(struct pipe_screen *screen,
				      unsigned index,
				      struct pipe_driver_query_info *info)
{
	struct si_screen *sscreen = (struct si_screen*)screen;
	unsigned num_queries = r600_get_num_queries(sscreen);

	if (!info) {
		unsigned num_perfcounters =
			si_get_perfcounter_info(sscreen, 0, NULL);

		return num_queries + num_perfcounters;
	}

	if (index >= num_queries)
		return si_get_perfcounter_info(sscreen, index - num_queries, info);

	*info = r600_driver_query_list[index];

	switch (info->query_type) {
	case R600_QUERY_REQUESTED_VRAM:
	case R600_QUERY_VRAM_USAGE:
	case R600_QUERY_MAPPED_VRAM:
		info->max_value.u64 = sscreen->info.vram_size;
		break;
	case R600_QUERY_REQUESTED_GTT:
	case R600_QUERY_GTT_USAGE:
	case R600_QUERY_MAPPED_GTT:
		info->max_value.u64 = sscreen->info.gart_size;
		break;
	case R600_QUERY_GPU_TEMPERATURE:
		info->max_value.u64 = 125;
		break;
	case R600_QUERY_VRAM_VIS_USAGE:
		info->max_value.u64 = sscreen->info.vram_vis_size;
		break;
	}

	if (info->group_id != ~(unsigned)0 && sscreen->perfcounters)
		info->group_id += sscreen->perfcounters->num_groups;

	return 1;
}

/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
 * performance counter groups, so be careful when changing this and related
 * functions.
 */
static int r600_get_driver_query_group_info(struct pipe_screen *screen,
					    unsigned index,
					    struct pipe_driver_query_group_info *info)
{
	struct si_screen *sscreen = (struct si_screen *)screen;
	unsigned num_pc_groups = 0;

	if (sscreen->perfcounters)
		num_pc_groups = sscreen->perfcounters->num_groups;

	if (!info)
		return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;

	if (index < num_pc_groups)
		return si_get_perfcounter_group_info(sscreen, index, info);

	index -= num_pc_groups;
	if (index >= R600_NUM_SW_QUERY_GROUPS)
		return 0;

	info->name = "GPIN";
	info->max_active_queries = 5;
	info->num_queries = 5;
	return 1;
}

void si_init_query_functions(struct r600_common_context *rctx)
{
	rctx->b.create_query = r600_create_query;
	rctx->b.create_batch_query = si_create_batch_query;
	rctx->b.destroy_query = r600_destroy_query;
	rctx->b.begin_query = r600_begin_query;
	rctx->b.end_query = r600_end_query;
	rctx->b.get_query_result = r600_get_query_result;
	rctx->b.get_query_result_resource = r600_get_query_result_resource;
	rctx->render_cond_atom.emit = r600_emit_query_predication;

	if (((struct si_screen*)rctx->b.screen)->info.num_render_backends > 0)
		rctx->b.render_condition = r600_render_condition;

	LIST_INITHEAD(&rctx->active_queries);
}

void si_init_screen_query_functions(struct si_screen *sscreen)
{
	sscreen->b.get_driver_query_info = r600_get_driver_query_info;
	sscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
}