1 /*
2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3  * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "r600_query.h"
26 #include "r600_pipe.h"
27 #include "r600_cs.h"
28 #include "util/u_memory.h"
29 #include "util/u_upload_mgr.h"
30 #include "util/os_time.h"
31 #include "tgsi/tgsi_text.h"
32 
33 #define R600_MAX_STREAMS 4
34 
/* Offsets, stride and count describing how values are laid out for a
 * hardware query.
 * NOTE(review): the meaning of each field is inferred from its name only;
 * no user of this struct is visible in this part of the file - confirm
 * against the code that fills it in.
 */
struct r600_hw_query_params {
	unsigned start_offset;
	unsigned end_offset;
	unsigned fence_offset;
	unsigned pair_stride;
	unsigned pair_count;
};
42 
43 /* Queries without buffer handling or suspend/resume. */
44 struct r600_query_sw {
45 	struct r600_query b;
46 
47 	uint64_t begin_result;
48 	uint64_t end_result;
49 
50 	uint64_t begin_time;
51 	uint64_t end_time;
52 
53 	/* Fence for GPU_FINISHED. */
54 	struct pipe_fence_handle *fence;
55 };
56 
r600_query_sw_destroy(struct r600_common_screen * rscreen,struct r600_query * rquery)57 static void r600_query_sw_destroy(struct r600_common_screen *rscreen,
58 				  struct r600_query *rquery)
59 {
60 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
61 
62 	rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);
63 	FREE(query);
64 }
65 
winsys_id_from_type(unsigned type)66 static enum radeon_value_id winsys_id_from_type(unsigned type)
67 {
68 	switch (type) {
69 	case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
70 	case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
71 	case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
72 	case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
73 	case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
74 	case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
75 	case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
76 	case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
77 	case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
78 	case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
79 	case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
80 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
81 	case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
82 	case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
83 	case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
84 	case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
85 	case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
86 	case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
87 	case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
88 	default: unreachable("query type does not correspond to winsys id");
89 	}
90 }
91 
r600_query_sw_begin(struct r600_common_context * rctx,struct r600_query * rquery)92 static bool r600_query_sw_begin(struct r600_common_context *rctx,
93 				struct r600_query *rquery)
94 {
95 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
96 	enum radeon_value_id ws_id;
97 
98 	switch(query->b.type) {
99 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
100 	case PIPE_QUERY_GPU_FINISHED:
101 		break;
102 	case R600_QUERY_DRAW_CALLS:
103 		query->begin_result = rctx->num_draw_calls;
104 		break;
105 	case R600_QUERY_DECOMPRESS_CALLS:
106 		query->begin_result = rctx->num_decompress_calls;
107 		break;
108 	case R600_QUERY_MRT_DRAW_CALLS:
109 		query->begin_result = rctx->num_mrt_draw_calls;
110 		break;
111 	case R600_QUERY_PRIM_RESTART_CALLS:
112 		query->begin_result = rctx->num_prim_restart_calls;
113 		break;
114 	case R600_QUERY_SPILL_DRAW_CALLS:
115 		query->begin_result = rctx->num_spill_draw_calls;
116 		break;
117 	case R600_QUERY_COMPUTE_CALLS:
118 		query->begin_result = rctx->num_compute_calls;
119 		break;
120 	case R600_QUERY_SPILL_COMPUTE_CALLS:
121 		query->begin_result = rctx->num_spill_compute_calls;
122 		break;
123 	case R600_QUERY_DMA_CALLS:
124 		query->begin_result = rctx->num_dma_calls;
125 		break;
126 	case R600_QUERY_CP_DMA_CALLS:
127 		query->begin_result = rctx->num_cp_dma_calls;
128 		break;
129 	case R600_QUERY_NUM_VS_FLUSHES:
130 		query->begin_result = rctx->num_vs_flushes;
131 		break;
132 	case R600_QUERY_NUM_PS_FLUSHES:
133 		query->begin_result = rctx->num_ps_flushes;
134 		break;
135 	case R600_QUERY_NUM_CS_FLUSHES:
136 		query->begin_result = rctx->num_cs_flushes;
137 		break;
138 	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
139 		query->begin_result = rctx->num_cb_cache_flushes;
140 		break;
141 	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
142 		query->begin_result = rctx->num_db_cache_flushes;
143 		break;
144 	case R600_QUERY_NUM_RESIDENT_HANDLES:
145 		query->begin_result = rctx->num_resident_handles;
146 		break;
147 	case R600_QUERY_TC_OFFLOADED_SLOTS:
148 		query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
149 		break;
150 	case R600_QUERY_TC_DIRECT_SLOTS:
151 		query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
152 		break;
153 	case R600_QUERY_TC_NUM_SYNCS:
154 		query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
155 		break;
156 	case R600_QUERY_REQUESTED_VRAM:
157 	case R600_QUERY_REQUESTED_GTT:
158 	case R600_QUERY_MAPPED_VRAM:
159 	case R600_QUERY_MAPPED_GTT:
160 	case R600_QUERY_VRAM_USAGE:
161 	case R600_QUERY_VRAM_VIS_USAGE:
162 	case R600_QUERY_GTT_USAGE:
163 	case R600_QUERY_GPU_TEMPERATURE:
164 	case R600_QUERY_CURRENT_GPU_SCLK:
165 	case R600_QUERY_CURRENT_GPU_MCLK:
166 	case R600_QUERY_NUM_MAPPED_BUFFERS:
167 		query->begin_result = 0;
168 		break;
169 	case R600_QUERY_BUFFER_WAIT_TIME:
170 	case R600_QUERY_NUM_GFX_IBS:
171 	case R600_QUERY_NUM_SDMA_IBS:
172 	case R600_QUERY_NUM_BYTES_MOVED:
173 	case R600_QUERY_NUM_EVICTIONS:
174 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
175 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
176 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
177 		break;
178 	}
179 	case R600_QUERY_GFX_BO_LIST_SIZE:
180 		ws_id = winsys_id_from_type(query->b.type);
181 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
182 		query->begin_time = rctx->ws->query_value(rctx->ws,
183 							  RADEON_NUM_GFX_IBS);
184 		break;
185 	case R600_QUERY_CS_THREAD_BUSY:
186 		ws_id = winsys_id_from_type(query->b.type);
187 		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
188 		query->begin_time = os_time_get_nano();
189 		break;
190 	case R600_QUERY_GALLIUM_THREAD_BUSY:
191 		query->begin_result =
192 			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
193 		query->begin_time = os_time_get_nano();
194 		break;
195 	case R600_QUERY_GPU_LOAD:
196 	case R600_QUERY_GPU_SHADERS_BUSY:
197 	case R600_QUERY_GPU_TA_BUSY:
198 	case R600_QUERY_GPU_GDS_BUSY:
199 	case R600_QUERY_GPU_VGT_BUSY:
200 	case R600_QUERY_GPU_IA_BUSY:
201 	case R600_QUERY_GPU_SX_BUSY:
202 	case R600_QUERY_GPU_WD_BUSY:
203 	case R600_QUERY_GPU_BCI_BUSY:
204 	case R600_QUERY_GPU_SC_BUSY:
205 	case R600_QUERY_GPU_PA_BUSY:
206 	case R600_QUERY_GPU_DB_BUSY:
207 	case R600_QUERY_GPU_CP_BUSY:
208 	case R600_QUERY_GPU_CB_BUSY:
209 	case R600_QUERY_GPU_SDMA_BUSY:
210 	case R600_QUERY_GPU_PFP_BUSY:
211 	case R600_QUERY_GPU_MEQ_BUSY:
212 	case R600_QUERY_GPU_ME_BUSY:
213 	case R600_QUERY_GPU_SURF_SYNC_BUSY:
214 	case R600_QUERY_GPU_CP_DMA_BUSY:
215 	case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
216 		query->begin_result = r600_begin_counter(rctx->screen,
217 							 query->b.type);
218 		break;
219 	case R600_QUERY_NUM_COMPILATIONS:
220 		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
221 		break;
222 	case R600_QUERY_NUM_SHADERS_CREATED:
223 		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
224 		break;
225 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
226 		query->begin_result =
227 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
228 		break;
229 	case R600_QUERY_GPIN_ASIC_ID:
230 	case R600_QUERY_GPIN_NUM_SIMD:
231 	case R600_QUERY_GPIN_NUM_RB:
232 	case R600_QUERY_GPIN_NUM_SPI:
233 	case R600_QUERY_GPIN_NUM_SE:
234 		break;
235 	default:
236 		unreachable("r600_query_sw_begin: bad query type");
237 	}
238 
239 	return true;
240 }
241 
r600_query_sw_end(struct r600_common_context * rctx,struct r600_query * rquery)242 static bool r600_query_sw_end(struct r600_common_context *rctx,
243 			      struct r600_query *rquery)
244 {
245 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
246 	enum radeon_value_id ws_id;
247 
248 	switch(query->b.type) {
249 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
250 		break;
251 	case PIPE_QUERY_GPU_FINISHED:
252 		rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
253 		break;
254 	case R600_QUERY_DRAW_CALLS:
255 		query->end_result = rctx->num_draw_calls;
256 		break;
257 	case R600_QUERY_DECOMPRESS_CALLS:
258 		query->end_result = rctx->num_decompress_calls;
259 		break;
260 	case R600_QUERY_MRT_DRAW_CALLS:
261 		query->end_result = rctx->num_mrt_draw_calls;
262 		break;
263 	case R600_QUERY_PRIM_RESTART_CALLS:
264 		query->end_result = rctx->num_prim_restart_calls;
265 		break;
266 	case R600_QUERY_SPILL_DRAW_CALLS:
267 		query->end_result = rctx->num_spill_draw_calls;
268 		break;
269 	case R600_QUERY_COMPUTE_CALLS:
270 		query->end_result = rctx->num_compute_calls;
271 		break;
272 	case R600_QUERY_SPILL_COMPUTE_CALLS:
273 		query->end_result = rctx->num_spill_compute_calls;
274 		break;
275 	case R600_QUERY_DMA_CALLS:
276 		query->end_result = rctx->num_dma_calls;
277 		break;
278 	case R600_QUERY_CP_DMA_CALLS:
279 		query->end_result = rctx->num_cp_dma_calls;
280 		break;
281 	case R600_QUERY_NUM_VS_FLUSHES:
282 		query->end_result = rctx->num_vs_flushes;
283 		break;
284 	case R600_QUERY_NUM_PS_FLUSHES:
285 		query->end_result = rctx->num_ps_flushes;
286 		break;
287 	case R600_QUERY_NUM_CS_FLUSHES:
288 		query->end_result = rctx->num_cs_flushes;
289 		break;
290 	case R600_QUERY_NUM_CB_CACHE_FLUSHES:
291 		query->end_result = rctx->num_cb_cache_flushes;
292 		break;
293 	case R600_QUERY_NUM_DB_CACHE_FLUSHES:
294 		query->end_result = rctx->num_db_cache_flushes;
295 		break;
296 	case R600_QUERY_NUM_RESIDENT_HANDLES:
297 		query->end_result = rctx->num_resident_handles;
298 		break;
299 	case R600_QUERY_TC_OFFLOADED_SLOTS:
300 		query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
301 		break;
302 	case R600_QUERY_TC_DIRECT_SLOTS:
303 		query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
304 		break;
305 	case R600_QUERY_TC_NUM_SYNCS:
306 		query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;
307 		break;
308 	case R600_QUERY_REQUESTED_VRAM:
309 	case R600_QUERY_REQUESTED_GTT:
310 	case R600_QUERY_MAPPED_VRAM:
311 	case R600_QUERY_MAPPED_GTT:
312 	case R600_QUERY_VRAM_USAGE:
313 	case R600_QUERY_VRAM_VIS_USAGE:
314 	case R600_QUERY_GTT_USAGE:
315 	case R600_QUERY_GPU_TEMPERATURE:
316 	case R600_QUERY_CURRENT_GPU_SCLK:
317 	case R600_QUERY_CURRENT_GPU_MCLK:
318 	case R600_QUERY_BUFFER_WAIT_TIME:
319 	case R600_QUERY_NUM_MAPPED_BUFFERS:
320 	case R600_QUERY_NUM_GFX_IBS:
321 	case R600_QUERY_NUM_SDMA_IBS:
322 	case R600_QUERY_NUM_BYTES_MOVED:
323 	case R600_QUERY_NUM_EVICTIONS:
324 	case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
325 		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
326 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
327 		break;
328 	}
329 	case R600_QUERY_GFX_BO_LIST_SIZE:
330 		ws_id = winsys_id_from_type(query->b.type);
331 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
332 		query->end_time = rctx->ws->query_value(rctx->ws,
333 							RADEON_NUM_GFX_IBS);
334 		break;
335 	case R600_QUERY_CS_THREAD_BUSY:
336 		ws_id = winsys_id_from_type(query->b.type);
337 		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
338 		query->end_time = os_time_get_nano();
339 		break;
340 	case R600_QUERY_GALLIUM_THREAD_BUSY:
341 		query->end_result =
342 			rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
343 		query->end_time = os_time_get_nano();
344 		break;
345 	case R600_QUERY_GPU_LOAD:
346 	case R600_QUERY_GPU_SHADERS_BUSY:
347 	case R600_QUERY_GPU_TA_BUSY:
348 	case R600_QUERY_GPU_GDS_BUSY:
349 	case R600_QUERY_GPU_VGT_BUSY:
350 	case R600_QUERY_GPU_IA_BUSY:
351 	case R600_QUERY_GPU_SX_BUSY:
352 	case R600_QUERY_GPU_WD_BUSY:
353 	case R600_QUERY_GPU_BCI_BUSY:
354 	case R600_QUERY_GPU_SC_BUSY:
355 	case R600_QUERY_GPU_PA_BUSY:
356 	case R600_QUERY_GPU_DB_BUSY:
357 	case R600_QUERY_GPU_CP_BUSY:
358 	case R600_QUERY_GPU_CB_BUSY:
359 	case R600_QUERY_GPU_SDMA_BUSY:
360 	case R600_QUERY_GPU_PFP_BUSY:
361 	case R600_QUERY_GPU_MEQ_BUSY:
362 	case R600_QUERY_GPU_ME_BUSY:
363 	case R600_QUERY_GPU_SURF_SYNC_BUSY:
364 	case R600_QUERY_GPU_CP_DMA_BUSY:
365 	case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
366 		query->end_result = r600_end_counter(rctx->screen,
367 						     query->b.type,
368 						     query->begin_result);
369 		query->begin_result = 0;
370 		break;
371 	case R600_QUERY_NUM_COMPILATIONS:
372 		query->end_result = p_atomic_read(&rctx->screen->num_compilations);
373 		break;
374 	case R600_QUERY_NUM_SHADERS_CREATED:
375 		query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
376 		break;
377 	case R600_QUERY_NUM_SHADER_CACHE_HITS:
378 		query->end_result =
379 			p_atomic_read(&rctx->screen->num_shader_cache_hits);
380 		break;
381 	case R600_QUERY_GPIN_ASIC_ID:
382 	case R600_QUERY_GPIN_NUM_SIMD:
383 	case R600_QUERY_GPIN_NUM_RB:
384 	case R600_QUERY_GPIN_NUM_SPI:
385 	case R600_QUERY_GPIN_NUM_SE:
386 		break;
387 	default:
388 		unreachable("r600_query_sw_end: bad query type");
389 	}
390 
391 	return true;
392 }
393 
r600_query_sw_get_result(struct r600_common_context * rctx,struct r600_query * rquery,bool wait,union pipe_query_result * result)394 static bool r600_query_sw_get_result(struct r600_common_context *rctx,
395 				     struct r600_query *rquery,
396 				     bool wait,
397 				     union pipe_query_result *result)
398 {
399 	struct r600_query_sw *query = (struct r600_query_sw *)rquery;
400 
401 	switch (query->b.type) {
402 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
403 		/* Convert from cycles per millisecond to cycles per second (Hz). */
404 		result->timestamp_disjoint.frequency =
405 			(uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
406 		result->timestamp_disjoint.disjoint = false;
407 		return true;
408 	case PIPE_QUERY_GPU_FINISHED: {
409 		struct pipe_screen *screen = rctx->b.screen;
410 		struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;
411 
412 		result->b = screen->fence_finish(screen, ctx, query->fence,
413 						 wait ? PIPE_TIMEOUT_INFINITE : 0);
414 		return result->b;
415 	}
416 
417 	case R600_QUERY_GFX_BO_LIST_SIZE:
418 		result->u64 = (query->end_result - query->begin_result) /
419 			      (query->end_time - query->begin_time);
420 		return true;
421 	case R600_QUERY_CS_THREAD_BUSY:
422 	case R600_QUERY_GALLIUM_THREAD_BUSY:
423 		result->u64 = (query->end_result - query->begin_result) * 100 /
424 			      (query->end_time - query->begin_time);
425 		return true;
426 	case R600_QUERY_GPIN_ASIC_ID:
427 		result->u32 = 0;
428 		return true;
429 	case R600_QUERY_GPIN_NUM_SIMD:
430 		result->u32 = rctx->screen->info.num_good_compute_units;
431 		return true;
432 	case R600_QUERY_GPIN_NUM_RB:
433 		result->u32 = rctx->screen->info.num_render_backends;
434 		return true;
435 	case R600_QUERY_GPIN_NUM_SPI:
436 		result->u32 = 1; /* all supported chips have one SPI per SE */
437 		return true;
438 	case R600_QUERY_GPIN_NUM_SE:
439 		result->u32 = rctx->screen->info.max_se;
440 		return true;
441 	}
442 
443 	result->u64 = query->end_result - query->begin_result;
444 
445 	switch (query->b.type) {
446 	case R600_QUERY_BUFFER_WAIT_TIME:
447 	case R600_QUERY_GPU_TEMPERATURE:
448 		result->u64 /= 1000;
449 		break;
450 	case R600_QUERY_CURRENT_GPU_SCLK:
451 	case R600_QUERY_CURRENT_GPU_MCLK:
452 		result->u64 *= 1000000;
453 		break;
454 	}
455 
456 	return true;
457 }
458 
459 
460 static struct r600_query_ops sw_query_ops = {
461 	.destroy = r600_query_sw_destroy,
462 	.begin = r600_query_sw_begin,
463 	.end = r600_query_sw_end,
464 	.get_result = r600_query_sw_get_result,
465 	.get_result_resource = NULL
466 };
467 
r600_query_sw_create(unsigned query_type)468 static struct pipe_query *r600_query_sw_create(unsigned query_type)
469 {
470 	struct r600_query_sw *query;
471 
472 	query = CALLOC_STRUCT(r600_query_sw);
473 	if (!query)
474 		return NULL;
475 
476 	query->b.type = query_type;
477 	query->b.ops = &sw_query_ops;
478 
479 	return (struct pipe_query *)query;
480 }
481 
r600_query_hw_destroy(struct r600_common_screen * rscreen,struct r600_query * rquery)482 void r600_query_hw_destroy(struct r600_common_screen *rscreen,
483 			   struct r600_query *rquery)
484 {
485 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
486 	struct r600_query_buffer *prev = query->buffer.previous;
487 
488 	/* Release all query buffers. */
489 	while (prev) {
490 		struct r600_query_buffer *qbuf = prev;
491 		prev = prev->previous;
492 		r600_resource_reference(&qbuf->buf, NULL);
493 		FREE(qbuf);
494 	}
495 
496 	r600_resource_reference(&query->buffer.buf, NULL);
497 	FREE(rquery);
498 }
499 
r600_new_query_buffer(struct r600_common_screen * rscreen,struct r600_query_hw * query)500 static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen,
501 						   struct r600_query_hw *query)
502 {
503 	unsigned buf_size = MAX2(query->result_size,
504 				 rscreen->info.min_alloc_size);
505 
506 	/* Queries are normally read by the CPU after
507 	 * being written by the gpu, hence staging is probably a good
508 	 * usage pattern.
509 	 */
510 	struct r600_resource *buf = (struct r600_resource*)
511 		pipe_buffer_create(&rscreen->b, 0,
512 				   PIPE_USAGE_STAGING, buf_size);
513 	if (!buf)
514 		return NULL;
515 
516 	if (!query->ops->prepare_buffer(rscreen, query, buf)) {
517 		r600_resource_reference(&buf, NULL);
518 		return NULL;
519 	}
520 
521 	return buf;
522 }
523 
r600_query_hw_prepare_buffer(struct r600_common_screen * rscreen,struct r600_query_hw * query,struct r600_resource * buffer)524 static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
525 					 struct r600_query_hw *query,
526 					 struct r600_resource *buffer)
527 {
528 	/* Callers ensure that the buffer is currently unused by the GPU. */
529 	uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL,
530 						   PIPE_TRANSFER_WRITE |
531 						   PIPE_TRANSFER_UNSYNCHRONIZED);
532 	if (!results)
533 		return false;
534 
535 	memset(results, 0, buffer->b.b.width0);
536 
537 	if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
538 	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
539 		unsigned max_rbs = rscreen->info.num_render_backends;
540 		unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
541 		unsigned num_results;
542 		unsigned i, j;
543 
544 		/* Set top bits for unused backends. */
545 		num_results = buffer->b.b.width0 / query->result_size;
546 		for (j = 0; j < num_results; j++) {
547 			for (i = 0; i < max_rbs; i++) {
548 				if (!(enabled_rb_mask & (1<<i))) {
549 					results[(i * 4)+1] = 0x80000000;
550 					results[(i * 4)+3] = 0x80000000;
551 				}
552 			}
553 			results += 4 * max_rbs;
554 		}
555 	}
556 
557 	return true;
558 }
559 
560 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
561                                               struct r600_query *rquery,
562                                               bool wait,
563                                               enum pipe_query_value_type result_type,
564                                               int index,
565                                               struct pipe_resource *resource,
566                                               unsigned offset);
567 
568 static struct r600_query_ops query_hw_ops = {
569 	.destroy = r600_query_hw_destroy,
570 	.begin = r600_query_hw_begin,
571 	.end = r600_query_hw_end,
572 	.get_result = r600_query_hw_get_result,
573 	.get_result_resource = r600_query_hw_get_result_resource,
574 };
575 
576 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
577 					struct r600_query_hw *query,
578 					struct r600_resource *buffer,
579 					uint64_t va);
580 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
581 				       struct r600_query_hw *query,
582 				       struct r600_resource *buffer,
583 				       uint64_t va);
584 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
585 				     struct r600_query_hw *, void *buffer,
586 				     union pipe_query_result *result);
587 static void r600_query_hw_clear_result(struct r600_query_hw *,
588 				       union pipe_query_result *);
589 
590 static struct r600_query_hw_ops query_hw_default_hw_ops = {
591 	.prepare_buffer = r600_query_hw_prepare_buffer,
592 	.emit_start = r600_query_hw_do_emit_start,
593 	.emit_stop = r600_query_hw_do_emit_stop,
594 	.clear_result = r600_query_hw_clear_result,
595 	.add_result = r600_query_hw_add_result,
596 };
597 
r600_query_hw_init(struct r600_common_screen * rscreen,struct r600_query_hw * query)598 bool r600_query_hw_init(struct r600_common_screen *rscreen,
599 			struct r600_query_hw *query)
600 {
601 	query->buffer.buf = r600_new_query_buffer(rscreen, query);
602 	if (!query->buffer.buf)
603 		return false;
604 
605 	return true;
606 }
607 
r600_query_hw_create(struct r600_common_screen * rscreen,unsigned query_type,unsigned index)608 static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen,
609 					       unsigned query_type,
610 					       unsigned index)
611 {
612 	struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
613 	if (!query)
614 		return NULL;
615 
616 	query->b.type = query_type;
617 	query->b.ops = &query_hw_ops;
618 	query->ops = &query_hw_default_hw_ops;
619 
620 	switch (query_type) {
621 	case PIPE_QUERY_OCCLUSION_COUNTER:
622 	case PIPE_QUERY_OCCLUSION_PREDICATE:
623 		query->result_size = 16 * rscreen->info.num_render_backends;
624 		query->result_size += 16; /* for the fence + alignment */
625 		query->num_cs_dw_begin = 6;
626 		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
627 		break;
628 	case PIPE_QUERY_TIME_ELAPSED:
629 		query->result_size = 24;
630 		query->num_cs_dw_begin = 8;
631 		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
632 		break;
633 	case PIPE_QUERY_TIMESTAMP:
634 		query->result_size = 16;
635 		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
636 		query->flags = R600_QUERY_HW_FLAG_NO_START;
637 		break;
638 	case PIPE_QUERY_PRIMITIVES_EMITTED:
639 	case PIPE_QUERY_PRIMITIVES_GENERATED:
640 	case PIPE_QUERY_SO_STATISTICS:
641 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
642 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
643 		query->result_size = 32;
644 		query->num_cs_dw_begin = 6;
645 		query->num_cs_dw_end = 6;
646 		query->stream = index;
647 		break;
648 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
649 		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
650 		query->result_size = 32 * R600_MAX_STREAMS;
651 		query->num_cs_dw_begin = 6 * R600_MAX_STREAMS;
652 		query->num_cs_dw_end = 6 * R600_MAX_STREAMS;
653 		break;
654 	case PIPE_QUERY_PIPELINE_STATISTICS:
655 		/* 11 values on EG, 8 on R600. */
656 		query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16;
657 		query->result_size += 8; /* for the fence + alignment */
658 		query->num_cs_dw_begin = 6;
659 		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
660 		break;
661 	default:
662 		assert(0);
663 		FREE(query);
664 		return NULL;
665 	}
666 
667 	if (!r600_query_hw_init(rscreen, query)) {
668 		FREE(query);
669 		return NULL;
670 	}
671 
672 	return (struct pipe_query *)query;
673 }
674 
r600_update_occlusion_query_state(struct r600_common_context * rctx,unsigned type,int diff)675 static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
676 					      unsigned type, int diff)
677 {
678 	if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
679 	    type == PIPE_QUERY_OCCLUSION_PREDICATE) {
680 		bool old_enable = rctx->num_occlusion_queries != 0;
681 		bool old_perfect_enable =
682 			rctx->num_perfect_occlusion_queries != 0;
683 		bool enable, perfect_enable;
684 
685 		rctx->num_occlusion_queries += diff;
686 		assert(rctx->num_occlusion_queries >= 0);
687 
688 		if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
689 			rctx->num_perfect_occlusion_queries += diff;
690 			assert(rctx->num_perfect_occlusion_queries >= 0);
691 		}
692 
693 		enable = rctx->num_occlusion_queries != 0;
694 		perfect_enable = rctx->num_perfect_occlusion_queries != 0;
695 
696 		if (enable != old_enable || perfect_enable != old_perfect_enable) {
697 			struct r600_context *ctx = (struct r600_context*)rctx;
698 			r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom);
699 		}
700 	}
701 }
702 
event_type_for_stream(unsigned stream)703 static unsigned event_type_for_stream(unsigned stream)
704 {
705 	switch (stream) {
706 	default:
707 	case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
708 	case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
709 	case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
710 	case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
711 	}
712 }
713 
emit_sample_streamout(struct radeon_winsys_cs * cs,uint64_t va,unsigned stream)714 static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va,
715 				  unsigned stream)
716 {
717 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
718 	radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
719 	radeon_emit(cs, va);
720 	radeon_emit(cs, va >> 32);
721 }
722 
r600_query_hw_do_emit_start(struct r600_common_context * ctx,struct r600_query_hw * query,struct r600_resource * buffer,uint64_t va)723 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
724 					struct r600_query_hw *query,
725 					struct r600_resource *buffer,
726 					uint64_t va)
727 {
728 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
729 
730 	switch (query->b.type) {
731 	case PIPE_QUERY_OCCLUSION_COUNTER:
732 	case PIPE_QUERY_OCCLUSION_PREDICATE:
733 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
734 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
735 		radeon_emit(cs, va);
736 		radeon_emit(cs, va >> 32);
737 		break;
738 	case PIPE_QUERY_PRIMITIVES_EMITTED:
739 	case PIPE_QUERY_PRIMITIVES_GENERATED:
740 	case PIPE_QUERY_SO_STATISTICS:
741 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
742 		emit_sample_streamout(cs, va, query->stream);
743 		break;
744 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
745 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
746 			emit_sample_streamout(cs, va + 32 * stream, stream);
747 		break;
748 	case PIPE_QUERY_TIME_ELAPSED:
749 		/* Write the timestamp after the last draw is done.
750 		 * (bottom-of-pipe)
751 		 */
752 		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
753 					 0, EOP_DATA_SEL_TIMESTAMP,
754 					 NULL, va, 0, query->b.type);
755 		break;
756 	case PIPE_QUERY_PIPELINE_STATISTICS:
757 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
758 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
759 		radeon_emit(cs, va);
760 		radeon_emit(cs, va >> 32);
761 		break;
762 	default:
763 		assert(0);
764 	}
765 	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
766 			RADEON_PRIO_QUERY);
767 }
768 
r600_query_hw_emit_start(struct r600_common_context * ctx,struct r600_query_hw * query)769 static void r600_query_hw_emit_start(struct r600_common_context *ctx,
770 				     struct r600_query_hw *query)
771 {
772 	uint64_t va;
773 
774 	if (!query->buffer.buf)
775 		return; // previous buffer allocation failure
776 
777 	r600_update_occlusion_query_state(ctx, query->b.type, 1);
778 	r600_update_prims_generated_query_state(ctx, query->b.type, 1);
779 
780 	ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
781 			       true);
782 
783 	/* Get a new query buffer if needed. */
784 	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
785 		struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
786 		*qbuf = query->buffer;
787 		query->buffer.results_end = 0;
788 		query->buffer.previous = qbuf;
789 		query->buffer.buf = r600_new_query_buffer(ctx->screen, query);
790 		if (!query->buffer.buf)
791 			return;
792 	}
793 
794 	/* emit begin query */
795 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
796 
797 	query->ops->emit_start(ctx, query, query->buffer.buf, va);
798 
799 	ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
800 }
801 
/* Emit the command-stream packets that record the "end" sample of a
 * hardware query.
 *
 * ctx:    context whose gfx command stream receives the packets
 * query:  the query being stopped; its type selects the packets
 * buffer: not referenced in this function -- the relocation and the fence
 *         write below use query->buffer.buf instead
 * va:     GPU address of the start of the result slot; each case advances
 *         it to where that query type stores its end value
 *
 * Query types that set fence_va additionally get a bottom-of-pipe event
 * that writes 0x80000000 to that address.
 */
static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
				       struct r600_query_hw *query,
				       struct r600_resource *buffer,
				       uint64_t va)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	/* Stays 0 for the streamout types, which emit no separate fence. */
	uint64_t fence_va = 0;

	switch (query->b.type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		/* The end value sits 8 bytes into the slot. */
		va += 8;
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);

		/* va was already advanced by 8, hence the "- 8": the fence
		 * ends up at slot start + num_render_backends * 16.
		 */
		fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* The end sample sits 16 bytes into the slot. */
		va += 16;
		emit_sample_streamout(cs, va, query->stream);
		break;
	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
		/* One end sample per stream, 32 bytes apart. */
		va += 16;
		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
			emit_sample_streamout(cs, va + 32 * stream, stream);
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		/* The end timestamp follows the 8-byte begin timestamp. */
		va += 8;
		/* fall through */
	case PIPE_QUERY_TIMESTAMP:
		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
					 0, EOP_DATA_SEL_TIMESTAMP, NULL, va,
					 0, query->b.type);
		fence_va = va + 8;
		break;
	case PIPE_QUERY_PIPELINE_STATISTICS: {
		/* result_size covers a begin sample, an end sample and 8 more
		 * bytes; the fence address computed below lands on those 8.
		 */
		unsigned sample_size = (query->result_size - 8) / 2;

		va += sample_size;
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);

		fence_va = va + sample_size;
		break;
	}
	default:
		assert(0);
	}
	/* Add the query buffer to the command stream as a write target. */
	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
			RADEON_PRIO_QUERY);

	if (fence_va)
		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
					 EOP_DATA_SEL_VALUE_32BIT,
					 query->buffer.buf, fence_va, 0x80000000,
					 query->b.type);
}
866 
r600_query_hw_emit_stop(struct r600_common_context * ctx,struct r600_query_hw * query)867 static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
868 				    struct r600_query_hw *query)
869 {
870 	uint64_t va;
871 
872 	if (!query->buffer.buf)
873 		return; // previous buffer allocation failure
874 
875 	/* The queries which need begin already called this in begin_query. */
876 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
877 		ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
878 	}
879 
880 	/* emit end query */
881 	va = query->buffer.buf->gpu_address + query->buffer.results_end;
882 
883 	query->ops->emit_stop(ctx, query, query->buffer.buf, va);
884 
885 	query->buffer.results_end += query->result_size;
886 
887 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
888 		ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
889 
890 	r600_update_occlusion_query_state(ctx, query->b.type, -1);
891 	r600_update_prims_generated_query_state(ctx, query->b.type, -1);
892 }
893 
/* Emit one SET_PREDICATION packet that points at the result stored at va
 * inside buf, and add buf to the command stream as a read target.
 *
 * op carries the predication operation bits; bits 32..39 of va are merged
 * into the same dword.
 */
static void emit_set_predicate(struct r600_common_context *ctx,
			       struct r600_resource *buf, uint64_t va,
			       uint32_t op)
{
	struct radeon_winsys_cs *gfx_cs = ctx->gfx.cs;
	const uint64_t va_high_bits = (va >> 32) & 0xFF;

	radeon_emit(gfx_cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
	radeon_emit(gfx_cs, va);
	radeon_emit(gfx_cs, op | va_high_bits);

	r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ,
			RADEON_PRIO_QUERY);
}
906 
r600_emit_query_predication(struct r600_common_context * ctx,struct r600_atom * atom)907 static void r600_emit_query_predication(struct r600_common_context *ctx,
908 					struct r600_atom *atom)
909 {
910 	struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
911 	struct r600_query_buffer *qbuf;
912 	uint32_t op;
913 	bool flag_wait, invert;
914 
915 	if (!query)
916 		return;
917 
918 	invert = ctx->render_cond_invert;
919 	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
920 		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
921 
922 	switch (query->b.type) {
923 	case PIPE_QUERY_OCCLUSION_COUNTER:
924 	case PIPE_QUERY_OCCLUSION_PREDICATE:
925 		op = PRED_OP(PREDICATION_OP_ZPASS);
926 		break;
927 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
928 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
929 		op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
930 		invert = !invert;
931 		break;
932 	default:
933 		assert(0);
934 		return;
935 	}
936 
937 	/* if true then invert, see GL_ARB_conditional_render_inverted */
938 	if (invert)
939 		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
940 	else
941 		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
942 
943 	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
944 
945 	/* emit predicate packets for all data blocks */
946 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
947 		unsigned results_base = 0;
948 		uint64_t va_base = qbuf->buf->gpu_address;
949 
950 		while (results_base < qbuf->results_end) {
951 			uint64_t va = va_base + results_base;
952 
953 			if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
954 				for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
955 					emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
956 
957 					/* set CONTINUE bit for all packets except the first */
958 					op |= PREDICATION_CONTINUE;
959 				}
960 			} else {
961 				emit_set_predicate(ctx, qbuf->buf, va, op);
962 				op |= PREDICATION_CONTINUE;
963 			}
964 
965 			results_base += query->result_size;
966 		}
967 	}
968 }
969 
r600_create_query(struct pipe_context * ctx,unsigned query_type,unsigned index)970 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
971 {
972 	struct r600_common_screen *rscreen =
973 		(struct r600_common_screen *)ctx->screen;
974 
975 	if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
976 	    query_type == PIPE_QUERY_GPU_FINISHED ||
977 	    query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
978 		return r600_query_sw_create(query_type);
979 
980 	return r600_query_hw_create(rscreen, query_type, index);
981 }
982 
r600_destroy_query(struct pipe_context * ctx,struct pipe_query * query)983 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
984 {
985 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
986 	struct r600_query *rquery = (struct r600_query *)query;
987 
988 	rquery->ops->destroy(rctx->screen, rquery);
989 }
990 
r600_begin_query(struct pipe_context * ctx,struct pipe_query * query)991 static boolean r600_begin_query(struct pipe_context *ctx,
992                                 struct pipe_query *query)
993 {
994 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
995 	struct r600_query *rquery = (struct r600_query *)query;
996 
997 	return rquery->ops->begin(rctx, rquery);
998 }
999 
r600_query_hw_reset_buffers(struct r600_common_context * rctx,struct r600_query_hw * query)1000 void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
1001 				 struct r600_query_hw *query)
1002 {
1003 	struct r600_query_buffer *prev = query->buffer.previous;
1004 
1005 	/* Discard the old query buffers. */
1006 	while (prev) {
1007 		struct r600_query_buffer *qbuf = prev;
1008 		prev = prev->previous;
1009 		r600_resource_reference(&qbuf->buf, NULL);
1010 		FREE(qbuf);
1011 	}
1012 
1013 	query->buffer.results_end = 0;
1014 	query->buffer.previous = NULL;
1015 
1016 	/* Obtain a new buffer if the current one can't be mapped without a stall. */
1017 	if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
1018 	    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
1019 		r600_resource_reference(&query->buffer.buf, NULL);
1020 		query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
1021 	} else {
1022 		if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
1023 			r600_resource_reference(&query->buffer.buf, NULL);
1024 	}
1025 }
1026 
r600_query_hw_begin(struct r600_common_context * rctx,struct r600_query * rquery)1027 bool r600_query_hw_begin(struct r600_common_context *rctx,
1028 			 struct r600_query *rquery)
1029 {
1030 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1031 
1032 	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
1033 		assert(0);
1034 		return false;
1035 	}
1036 
1037 	if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
1038 		r600_query_hw_reset_buffers(rctx, query);
1039 
1040 	r600_query_hw_emit_start(rctx, query);
1041 	if (!query->buffer.buf)
1042 		return false;
1043 
1044 	LIST_ADDTAIL(&query->list, &rctx->active_queries);
1045 	return true;
1046 }
1047 
r600_end_query(struct pipe_context * ctx,struct pipe_query * query)1048 static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
1049 {
1050 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1051 	struct r600_query *rquery = (struct r600_query *)query;
1052 
1053 	return rquery->ops->end(rctx, rquery);
1054 }
1055 
r600_query_hw_end(struct r600_common_context * rctx,struct r600_query * rquery)1056 bool r600_query_hw_end(struct r600_common_context *rctx,
1057 		       struct r600_query *rquery)
1058 {
1059 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1060 
1061 	if (query->flags & R600_QUERY_HW_FLAG_NO_START)
1062 		r600_query_hw_reset_buffers(rctx, query);
1063 
1064 	r600_query_hw_emit_stop(rctx, query);
1065 
1066 	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
1067 		LIST_DELINIT(&query->list);
1068 
1069 	if (!query->buffer.buf)
1070 		return false;
1071 
1072 	return true;
1073 }
1074 
r600_get_hw_query_params(struct r600_common_context * rctx,struct r600_query_hw * rquery,int index,struct r600_hw_query_params * params)1075 static void r600_get_hw_query_params(struct r600_common_context *rctx,
1076 				     struct r600_query_hw *rquery, int index,
1077 				     struct r600_hw_query_params *params)
1078 {
1079 	unsigned max_rbs = rctx->screen->info.num_render_backends;
1080 
1081 	params->pair_stride = 0;
1082 	params->pair_count = 1;
1083 
1084 	switch (rquery->b.type) {
1085 	case PIPE_QUERY_OCCLUSION_COUNTER:
1086 	case PIPE_QUERY_OCCLUSION_PREDICATE:
1087 		params->start_offset = 0;
1088 		params->end_offset = 8;
1089 		params->fence_offset = max_rbs * 16;
1090 		params->pair_stride = 16;
1091 		params->pair_count = max_rbs;
1092 		break;
1093 	case PIPE_QUERY_TIME_ELAPSED:
1094 		params->start_offset = 0;
1095 		params->end_offset = 8;
1096 		params->fence_offset = 16;
1097 		break;
1098 	case PIPE_QUERY_TIMESTAMP:
1099 		params->start_offset = 0;
1100 		params->end_offset = 0;
1101 		params->fence_offset = 8;
1102 		break;
1103 	case PIPE_QUERY_PRIMITIVES_EMITTED:
1104 		params->start_offset = 8;
1105 		params->end_offset = 24;
1106 		params->fence_offset = params->end_offset + 4;
1107 		break;
1108 	case PIPE_QUERY_PRIMITIVES_GENERATED:
1109 		params->start_offset = 0;
1110 		params->end_offset = 16;
1111 		params->fence_offset = params->end_offset + 4;
1112 		break;
1113 	case PIPE_QUERY_SO_STATISTICS:
1114 		params->start_offset = 8 - index * 8;
1115 		params->end_offset = 24 - index * 8;
1116 		params->fence_offset = params->end_offset + 4;
1117 		break;
1118 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1119 		params->pair_count = R600_MAX_STREAMS;
1120 		params->pair_stride = 32;
1121 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1122 		params->start_offset = 0;
1123 		params->end_offset = 16;
1124 
1125 		/* We can re-use the high dword of the last 64-bit value as a
1126 		 * fence: it is initialized as 0, and the high bit is set by
1127 		 * the write of the streamout stats event.
1128 		 */
1129 		params->fence_offset = rquery->result_size - 4;
1130 		break;
1131 	case PIPE_QUERY_PIPELINE_STATISTICS:
1132 	{
1133 		/* Offsets apply to EG+ */
1134 		static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
1135 		params->start_offset = offsets[index];
1136 		params->end_offset = 88 + offsets[index];
1137 		params->fence_offset = 2 * 88;
1138 		break;
1139 	}
1140 	default:
1141 		unreachable("r600_get_hw_query_params unsupported");
1142 	}
1143 }
1144 
/* Read a begin/end pair of 64-bit values from a mapped result slot and
 * return their difference (end - start).
 *
 * map:             start of the slot, read as an array of 32-bit dwords
 * start_index:     dword index of the low half of the begin value
 * end_index:       dword index of the low half of the end value
 * test_status_bit: when true, bit 63 of both values must be set for the
 *                  pair to count; otherwise 0 is returned
 *
 * The full 64-bit difference is returned. Returning it as a 32-bit value
 * would drop the upper half before callers add it to their 64-bit totals.
 */
static uint64_t r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	const uint32_t *dwords = (const uint32_t *)map;
	const uint64_t status_bit = (uint64_t)1 << 63;
	uint64_t start, end;

	/* Each value is stored as two consecutive dwords, low half first. */
	start = (uint64_t)dwords[start_index] |
		(uint64_t)dwords[start_index + 1] << 32;
	end = (uint64_t)dwords[end_index] |
	      (uint64_t)dwords[end_index + 1] << 32;

	if (test_status_bit &&
	    !((start & status_bit) && (end & status_bit)))
		return 0;

	/* When both status bits are set they cancel out in the subtraction. */
	return end - start;
}
1162 
r600_query_hw_add_result(struct r600_common_screen * rscreen,struct r600_query_hw * query,void * buffer,union pipe_query_result * result)1163 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
1164 				     struct r600_query_hw *query,
1165 				     void *buffer,
1166 				     union pipe_query_result *result)
1167 {
1168 	unsigned max_rbs = rscreen->info.num_render_backends;
1169 
1170 	switch (query->b.type) {
1171 	case PIPE_QUERY_OCCLUSION_COUNTER: {
1172 		for (unsigned i = 0; i < max_rbs; ++i) {
1173 			unsigned results_base = i * 16;
1174 			result->u64 +=
1175 				r600_query_read_result(buffer + results_base, 0, 2, true);
1176 		}
1177 		break;
1178 	}
1179 	case PIPE_QUERY_OCCLUSION_PREDICATE: {
1180 		for (unsigned i = 0; i < max_rbs; ++i) {
1181 			unsigned results_base = i * 16;
1182 			result->b = result->b ||
1183 				r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
1184 		}
1185 		break;
1186 	}
1187 	case PIPE_QUERY_TIME_ELAPSED:
1188 		result->u64 += r600_query_read_result(buffer, 0, 2, false);
1189 		break;
1190 	case PIPE_QUERY_TIMESTAMP:
1191 		result->u64 = *(uint64_t*)buffer;
1192 		break;
1193 	case PIPE_QUERY_PRIMITIVES_EMITTED:
1194 		/* SAMPLE_STREAMOUTSTATS stores this structure:
1195 		 * {
1196 		 *    u64 NumPrimitivesWritten;
1197 		 *    u64 PrimitiveStorageNeeded;
1198 		 * }
1199 		 * We only need NumPrimitivesWritten here. */
1200 		result->u64 += r600_query_read_result(buffer, 2, 6, true);
1201 		break;
1202 	case PIPE_QUERY_PRIMITIVES_GENERATED:
1203 		/* Here we read PrimitiveStorageNeeded. */
1204 		result->u64 += r600_query_read_result(buffer, 0, 4, true);
1205 		break;
1206 	case PIPE_QUERY_SO_STATISTICS:
1207 		result->so_statistics.num_primitives_written +=
1208 			r600_query_read_result(buffer, 2, 6, true);
1209 		result->so_statistics.primitives_storage_needed +=
1210 			r600_query_read_result(buffer, 0, 4, true);
1211 		break;
1212 	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1213 		result->b = result->b ||
1214 			r600_query_read_result(buffer, 2, 6, true) !=
1215 			r600_query_read_result(buffer, 0, 4, true);
1216 		break;
1217 	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1218 		for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
1219 			result->b = result->b ||
1220 				r600_query_read_result(buffer, 2, 6, true) !=
1221 				r600_query_read_result(buffer, 0, 4, true);
1222 			buffer = (char *)buffer + 32;
1223 		}
1224 		break;
1225 	case PIPE_QUERY_PIPELINE_STATISTICS:
1226 		if (rscreen->chip_class >= EVERGREEN) {
1227 			result->pipeline_statistics.ps_invocations +=
1228 				r600_query_read_result(buffer, 0, 22, false);
1229 			result->pipeline_statistics.c_primitives +=
1230 				r600_query_read_result(buffer, 2, 24, false);
1231 			result->pipeline_statistics.c_invocations +=
1232 				r600_query_read_result(buffer, 4, 26, false);
1233 			result->pipeline_statistics.vs_invocations +=
1234 				r600_query_read_result(buffer, 6, 28, false);
1235 			result->pipeline_statistics.gs_invocations +=
1236 				r600_query_read_result(buffer, 8, 30, false);
1237 			result->pipeline_statistics.gs_primitives +=
1238 				r600_query_read_result(buffer, 10, 32, false);
1239 			result->pipeline_statistics.ia_primitives +=
1240 				r600_query_read_result(buffer, 12, 34, false);
1241 			result->pipeline_statistics.ia_vertices +=
1242 				r600_query_read_result(buffer, 14, 36, false);
1243 			result->pipeline_statistics.hs_invocations +=
1244 				r600_query_read_result(buffer, 16, 38, false);
1245 			result->pipeline_statistics.ds_invocations +=
1246 				r600_query_read_result(buffer, 18, 40, false);
1247 			result->pipeline_statistics.cs_invocations +=
1248 				r600_query_read_result(buffer, 20, 42, false);
1249 		} else {
1250 			result->pipeline_statistics.ps_invocations +=
1251 				r600_query_read_result(buffer, 0, 16, false);
1252 			result->pipeline_statistics.c_primitives +=
1253 				r600_query_read_result(buffer, 2, 18, false);
1254 			result->pipeline_statistics.c_invocations +=
1255 				r600_query_read_result(buffer, 4, 20, false);
1256 			result->pipeline_statistics.vs_invocations +=
1257 				r600_query_read_result(buffer, 6, 22, false);
1258 			result->pipeline_statistics.gs_invocations +=
1259 				r600_query_read_result(buffer, 8, 24, false);
1260 			result->pipeline_statistics.gs_primitives +=
1261 				r600_query_read_result(buffer, 10, 26, false);
1262 			result->pipeline_statistics.ia_primitives +=
1263 				r600_query_read_result(buffer, 12, 28, false);
1264 			result->pipeline_statistics.ia_vertices +=
1265 				r600_query_read_result(buffer, 14, 30, false);
1266 		}
1267 #if 0 /* for testing */
1268 		printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1269 		       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1270 		       "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1271 		       result->pipeline_statistics.ia_vertices,
1272 		       result->pipeline_statistics.ia_primitives,
1273 		       result->pipeline_statistics.vs_invocations,
1274 		       result->pipeline_statistics.hs_invocations,
1275 		       result->pipeline_statistics.ds_invocations,
1276 		       result->pipeline_statistics.gs_invocations,
1277 		       result->pipeline_statistics.gs_primitives,
1278 		       result->pipeline_statistics.c_invocations,
1279 		       result->pipeline_statistics.c_primitives,
1280 		       result->pipeline_statistics.ps_invocations,
1281 		       result->pipeline_statistics.cs_invocations);
1282 #endif
1283 		break;
1284 	default:
1285 		assert(0);
1286 	}
1287 }
1288 
/* pipe_context::get_query_result entry point: forwards to the query's own
 * get_result callback and returns its result.
 */
static boolean r600_get_query_result(struct pipe_context *ctx,
				     struct pipe_query *query, boolean wait,
				     union pipe_query_result *result)
{
	struct r600_query *q = (struct r600_query *)query;

	return q->ops->get_result((struct r600_common_context *)ctx, q,
				  wait, result);
}
1298 
/* pipe_context::get_query_result_resource entry point: forwards all
 * arguments unchanged to the query's own get_result_resource callback.
 */
static void r600_get_query_result_resource(struct pipe_context *ctx,
                                           struct pipe_query *query,
                                           boolean wait,
                                           enum pipe_query_value_type result_type,
                                           int index,
                                           struct pipe_resource *resource,
                                           unsigned offset)
{
	struct r600_query *q = (struct r600_query *)query;

	q->ops->get_result_resource((struct r600_common_context *)ctx, q,
	                            wait, result_type, index,
	                            resource, offset);
}
1313 
r600_query_hw_clear_result(struct r600_query_hw * query,union pipe_query_result * result)1314 static void r600_query_hw_clear_result(struct r600_query_hw *query,
1315 				       union pipe_query_result *result)
1316 {
1317 	util_query_clear_result(result, query->b.type);
1318 }
1319 
r600_query_hw_get_result(struct r600_common_context * rctx,struct r600_query * rquery,bool wait,union pipe_query_result * result)1320 bool r600_query_hw_get_result(struct r600_common_context *rctx,
1321 			      struct r600_query *rquery,
1322 			      bool wait, union pipe_query_result *result)
1323 {
1324 	struct r600_common_screen *rscreen = rctx->screen;
1325 	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1326 	struct r600_query_buffer *qbuf;
1327 
1328 	query->ops->clear_result(query, result);
1329 
1330 	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1331 		unsigned usage = PIPE_TRANSFER_READ |
1332 				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
1333 		unsigned results_base = 0;
1334 		void *map;
1335 
1336 		if (rquery->b.flushed)
1337 			map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
1338 		else
1339 			map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);
1340 
1341 		if (!map)
1342 			return false;
1343 
1344 		while (results_base != qbuf->results_end) {
1345 			query->ops->add_result(rscreen, query, map + results_base,
1346 					       result);
1347 			results_base += query->result_size;
1348 		}
1349 	}
1350 
1351 	/* Convert the time to expected units. */
1352 	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
1353 	    rquery->type == PIPE_QUERY_TIMESTAMP) {
1354 		result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;
1355 	}
1356 	return true;
1357 }
1358 
1359 /* Create the compute shader that is used to collect the results.
1360  *
1361  * One compute grid with a single thread is launched for every query result
1362  * buffer. The thread (optionally) reads a previous summary buffer, then
1363  * accumulates data from the query result buffer, and writes the result either
1364  * to a summary buffer to be consumed by the next grid invocation or to the
1365  * user-supplied buffer.
1366  *
1367  * Data layout:
1368  *
1369  * CONST
1370  *  0.x = end_offset
1371  *  0.y = result_stride
1372  *  0.z = result_count
1373  *  0.w = bit field:
1374  *          1: read previously accumulated values
1375  *          2: write accumulated values for chaining
1376  *          4: write result available
1377  *          8: convert result to boolean (0/1)
1378  *         16: only read one dword and use that as result
1379  *         32: apply timestamp conversion
1380  *         64: store full 64 bits result
1381  *        128: store signed 32 bits result
1382  *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
1383  *  1.x = fence_offset
1384  *  1.y = pair_stride
1385  *  1.z = pair_count
1386  *
1387  * BUFFER[0] = query result buffer
1388  * BUFFER[1] = previous summary buffer
1389  * BUFFER[2] = next summary buffer or user-supplied buffer
1390  */
static void r600_create_query_result_shader(struct r600_common_context *rctx)
{
	/* Formats the TGSI template below with the screen's clock crystal
	 * frequency, translates the text to tokens and stores the resulting
	 * compute state in rctx->query_result_shader. If translation fails
	 * the function returns without assigning that field.
	 */

	/* TEMP[0].xy = accumulated result so far
	 * TEMP[0].z = result not available
	 *
	 * TEMP[1].x = current result index
	 * TEMP[1].y = current pair index
	 */
	static const char text_tmpl[] =
		"COMP\n"
		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
		"DCL BUFFER[0]\n"
		"DCL BUFFER[1]\n"
		"DCL BUFFER[2]\n"
		"DCL CONST[0][0..1]\n"
		"DCL TEMP[0..5]\n"
		"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
		"IMM[1] UINT32 {1, 2, 4, 8}\n"
		"IMM[2] UINT32 {16, 32, 64, 128}\n"
		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
		"IMM[4] UINT32 {256, 0, 0, 0}\n"

		"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
		"UIF TEMP[5]\n"
			/* Check result availability. */
			"LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
			"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
			"MOV TEMP[1], TEMP[0].zzzz\n"
			"NOT TEMP[0].z, TEMP[0].zzzz\n"

			/* Load result if available. */
			"UIF TEMP[1]\n"
				"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
			"ENDIF\n"
		"ELSE\n"
			/* Load previously accumulated result if requested. */
			"MOV TEMP[0], IMM[0].xxxx\n"
			"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
			"UIF TEMP[4]\n"
				"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
			"ENDIF\n"

			"MOV TEMP[1].x, IMM[0].xxxx\n"
			"BGNLOOP\n"
				/* Break if accumulated result so far is not available. */
				"UIF TEMP[0].zzzz\n"
					"BRK\n"
				"ENDIF\n"

				/* Break if result_index >= result_count. */
				"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
				"UIF TEMP[5]\n"
					"BRK\n"
				"ENDIF\n"

				/* Load fence and check result availability */
				"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
				"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
				"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
				"NOT TEMP[0].z, TEMP[0].zzzz\n"
				"UIF TEMP[0].zzzz\n"
					"BRK\n"
				"ENDIF\n"

				"MOV TEMP[1].y, IMM[0].xxxx\n"
				"BGNLOOP\n"
					/* Load start and end. */
					"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
					"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
					"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"

					"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
					"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

					"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"

					"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
					"UIF TEMP[5].zzzz\n"
						/* Load second start/end half-pair and
						 * take the difference
						 */
						"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
						"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
						"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

						"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
						"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
					"ENDIF\n"

					"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"

					/* Increment pair index */
					"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
					"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
					"UIF TEMP[5]\n"
						"BRK\n"
					"ENDIF\n"
				"ENDLOOP\n"

				/* Increment result index */
				"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
			"ENDLOOP\n"
		"ENDIF\n"

		"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
		"UIF TEMP[4]\n"
			/* Store accumulated data for chaining. */
			"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
		"ELSE\n"
			"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
			"UIF TEMP[4]\n"
				/* Store result availability. */
				"NOT TEMP[0].z, TEMP[0]\n"
				"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
				"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"

				"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
				"UIF TEMP[4]\n"
					"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
				"ENDIF\n"
			"ELSE\n"
				/* Store result if it is available. */
				"NOT TEMP[4], TEMP[0].zzzz\n"
				"UIF TEMP[4]\n"
					/* Apply timestamp conversion */
					"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
					"UIF TEMP[4]\n"
						"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
						"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
					"ENDIF\n"

					/* Convert to boolean */
					"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
					"UIF TEMP[4]\n"
						"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
						"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
						"MOV TEMP[0].y, IMM[0].xxxx\n"
					"ENDIF\n"

					"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
					"UIF TEMP[4]\n"
						"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
					"ELSE\n"
						/* Clamping */
						"UIF TEMP[0].yyyy\n"
							"MOV TEMP[0].x, IMM[0].wwww\n"
						"ENDIF\n"

						"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
						"UIF TEMP[4]\n"
							"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
						"ENDIF\n"

						"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
					"ENDIF\n"
				"ENDIF\n"
			"ENDIF\n"
		"ENDIF\n"

		"END\n";

	/* The template contains a single %u; the 32 spare bytes leave room
	 * for its expansion.
	 */
	char text[sizeof(text_tmpl) + 32];
	struct tgsi_token tokens[1024];
	struct pipe_compute_state state = {};

	/* Hard code the frequency into the shader so that the backend can
	 * use the full range of optimizations for divide-by-constant.
	 */
	snprintf(text, sizeof(text), text_tmpl,
		 rctx->screen->info.clock_crystal_freq);

	/* Translation failure leaves rctx->query_result_shader untouched. */
	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
		assert(false);
		return;
	}

	/* NOTE(review): tokens is a stack array; presumably
	 * create_compute_state consumes or copies it before returning --
	 * confirm against the driver's implementation.
	 */
	state.ir_type = PIPE_SHADER_IR_TGSI;
	state.prog = tokens;

	rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
}
1574 
r600_restore_qbo_state(struct r600_common_context * rctx,struct r600_qbo_state * st)1575 static void r600_restore_qbo_state(struct r600_common_context *rctx,
1576 				   struct r600_qbo_state *st)
1577 {
1578 	rctx->b.bind_compute_state(&rctx->b, st->saved_compute);
1579 
1580 	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
1581 	pipe_resource_reference(&st->saved_const0.buffer, NULL);
1582 
1583 	rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
1584 	for (unsigned i = 0; i < 3; ++i)
1585 		pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
1586 }
1587 
/* Compute a hardware query's result on the GPU and store it in a buffer
 * resource, by launching the query result compute shader once per buffer
 * in the query's chain.
 *
 * wait:        when true, a fence wait on the newest result is issued
 *              before the first launch
 * result_type: selects a 64-bit store, a signed 32-bit store or a plain
 *              32-bit store
 * index:       a negative value requests result availability instead of
 *              the value; otherwise it selects the counter for query types
 *              that store several
 * resource:    destination buffer
 * offset:      byte offset of the destination inside resource
 *
 * Returns silently, without writing anything, if the shader cannot be
 * created or the scratch buffer cannot be allocated. The compute state,
 * constant buffer and shader buffers are saved before and restored after.
 */
static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
                                              struct r600_query *rquery,
                                              bool wait,
                                              enum pipe_query_value_type result_type,
                                              int index,
                                              struct pipe_resource *resource,
                                              unsigned offset)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
	struct r600_query_buffer *qbuf;
	struct r600_query_buffer *qbuf_prev;
	struct pipe_resource *tmp_buffer = NULL;
	unsigned tmp_buffer_offset = 0;
	struct r600_qbo_state saved_state = {};
	struct pipe_grid_info grid = {};
	struct pipe_constant_buffer constant_buffer = {};
	struct pipe_shader_buffer ssbo[3];
	struct r600_hw_query_params params;
	/* Uploaded as the shader's constant buffer; the field order matches
	 * the CONST layout the shader reads (0.xyzw, then 1.xyz).
	 */
	struct {
		uint32_t end_offset;
		uint32_t result_stride;
		uint32_t result_count;
		uint32_t config;
		uint32_t fence_offset;
		uint32_t pair_stride;
		uint32_t pair_count;
	} consts;

	/* The shader is created lazily on first use. */
	if (!rctx->query_result_shader) {
		r600_create_query_result_shader(rctx);
		if (!rctx->query_result_shader)
			return;
	}

	/* A 16-byte scratch buffer carries the running sum between launches;
	 * it is only needed when the query has more than one buffer.
	 */
	if (query->buffer.previous) {
		u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
				     &tmp_buffer_offset, &tmp_buffer);
		if (!tmp_buffer)
			return;
	}

	rctx->save_qbo_state(&rctx->b, &saved_state);

	/* BUFFER[0] is bound starting at start_offset (see below), so the
	 * offsets handed to the shader are made relative to it.
	 */
	r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
	consts.end_offset = params.end_offset - params.start_offset;
	consts.fence_offset = params.fence_offset - params.start_offset;
	consts.result_stride = query->result_size;
	consts.pair_stride = params.pair_stride;
	consts.pair_count = params.pair_count;

	constant_buffer.buffer_size = sizeof(consts);
	constant_buffer.user_buffer = &consts;

	/* Slots 1 (previous summary) and 2 (next summary) both start out as
	 * the scratch buffer; slot 2 is redirected for the last launch.
	 */
	ssbo[1].buffer = tmp_buffer;
	ssbo[1].buffer_offset = tmp_buffer_offset;
	ssbo[1].buffer_size = 16;

	ssbo[2] = ssbo[1];

	rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);

	/* A single thread per launch. */
	grid.block[0] = 1;
	grid.block[1] = 1;
	grid.block[2] = 1;
	grid.grid[0] = 1;
	grid.grid[1] = 1;
	grid.grid[2] = 1;

	/* Shader config bits set here:
	 *   4: write result availability
	 *   8: convert result to boolean
	 *  32: apply timestamp conversion
	 * 256: SO_OVERFLOW mode
	 */
	consts.config = 0;
	if (index < 0)
		consts.config |= 4;
	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE)
		consts.config |= 8;
	else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
		 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
		consts.config |= 8 | 256;
	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
		consts.config |= 32;

	/* 64: store the full 64-bit result; 128: store a signed 32-bit result. */
	switch (result_type) {
	case PIPE_QUERY_TYPE_U64:
	case PIPE_QUERY_TYPE_I64:
		consts.config |= 64;
		break;
	case PIPE_QUERY_TYPE_I32:
		consts.config |= 128;
		break;
	case PIPE_QUERY_TYPE_U32:
		break;
	}

	rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;

	/* One launch per buffer, walking from the current buffer to the
	 * earlier ones.
	 */
	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
			qbuf_prev = qbuf->previous;
			consts.result_count = qbuf->results_end / query->result_size;
			/* Bit 1: read the previous summary (every launch but
			 * the first). Bit 2: write a summary for chaining
			 * (every launch but the last).
			 */
			consts.config &= ~3;
			if (qbuf != &query->buffer)
				consts.config |= 1;
			if (qbuf->previous)
				consts.config |= 2;
		} else {
			/* Only read the last timestamp. */
			qbuf_prev = NULL;
			consts.result_count = 0;
			/* Bit 16: read a single value and use it as the result. */
			consts.config |= 16;
			/* Point BUFFER[0] at the last result slot. */
			params.start_offset += qbuf->results_end - query->result_size;
		}

		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);

		ssbo[0].buffer = &qbuf->buf->b.b;
		ssbo[0].buffer_offset = params.start_offset;
		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;

		/* The last buffer in the chain writes to the caller's resource. */
		if (!qbuf->previous) {
			ssbo[2].buffer = resource;
			ssbo[2].buffer_offset = offset;
			ssbo[2].buffer_size = 8;

		}

		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);

		if (wait && qbuf == &query->buffer) {
			uint64_t va;

			/* Wait for result availability. Wait only for readiness
			 * of the last entry, since the fence writes should be
			 * serialized in the CP.
			 */
			va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
			va += params.fence_offset;

			r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
		}

		rctx->b.launch_grid(&rctx->b, &grid);
		rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
	}

	r600_restore_qbo_state(rctx, &saved_state);
	/* Drop the local reference to the scratch buffer (may be NULL). */
	pipe_resource_reference(&tmp_buffer, NULL);
}
1734 
/*
 * render_condition hook installed by r600_query_init().
 *
 * Stores the predicating query together with the condition and mode in the
 * context, sizes the render-condition atom accordingly and marks the atom
 * dirty only when a query is actually set.
 */
static void r600_render_condition(struct pipe_context *ctx,
				  struct pipe_query *query,
				  boolean condition,
				  enum pipe_render_cond_flag mode)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_atom *atom = &rctx->render_cond_atom;

	/* Size of the SET_PREDICATION packets; stays 0 without a query. */
	atom->num_dw = 0;

	if (query != NULL) {
		struct r600_query_hw *hw_query = (struct r600_query_hw *)query;
		struct r600_query_buffer *cur = &hw_query->buffer;

		/* Walk the chain of result buffers: 5 dwords per result slot. */
		while (cur) {
			atom->num_dw += (cur->results_end / hw_query->result_size) * 5;
			cur = cur->previous;
		}

		/* The "any stream" overflow predicate is scaled by the
		 * number of streams. */
		if (hw_query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
			atom->num_dw *= R600_MAX_STREAMS;
	}

	rctx->render_cond = query;
	rctx->render_cond_invert = condition;
	rctx->render_cond_mode = mode;

	rctx->set_atom_dirty(rctx, atom, query != NULL);
}
1761 
r600_suspend_queries(struct r600_common_context * ctx)1762 void r600_suspend_queries(struct r600_common_context *ctx)
1763 {
1764 	struct r600_query_hw *query;
1765 
1766 	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
1767 		r600_query_hw_emit_stop(ctx, query);
1768 	}
1769 	assert(ctx->num_cs_dw_queries_suspend == 0);
1770 }
1771 
r600_queries_num_cs_dw_for_resuming(struct r600_common_context * ctx,struct list_head * query_list)1772 static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
1773 						    struct list_head *query_list)
1774 {
1775 	struct r600_query_hw *query;
1776 	unsigned num_dw = 0;
1777 
1778 	LIST_FOR_EACH_ENTRY(query, query_list, list) {
1779 		/* begin + end */
1780 		num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;
1781 
1782 		/* Workaround for the fact that
1783 		 * num_cs_dw_nontimer_queries_suspend is incremented for every
1784 		 * resumed query, which raises the bar in need_cs_space for
1785 		 * queries about to be resumed.
1786 		 */
1787 		num_dw += query->num_cs_dw_end;
1788 	}
1789 	/* primitives generated query */
1790 	num_dw += ctx->streamout.enable_atom.num_dw;
1791 	/* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
1792 	num_dw += 13;
1793 
1794 	return num_dw;
1795 }
1796 
r600_resume_queries(struct r600_common_context * ctx)1797 void r600_resume_queries(struct r600_common_context *ctx)
1798 {
1799 	struct r600_query_hw *query;
1800 	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);
1801 
1802 	assert(ctx->num_cs_dw_queries_suspend == 0);
1803 
1804 	/* Check CS space here. Resuming must not be interrupted by flushes. */
1805 	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);
1806 
1807 	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
1808 		r600_query_hw_emit_start(ctx, query);
1809 	}
1810 }
1811 
1812 /* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */
r600_query_fix_enabled_rb_mask(struct r600_common_screen * rscreen)1813 void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
1814 {
1815 	struct r600_common_context *ctx =
1816 		(struct r600_common_context*)rscreen->aux_context;
1817 	struct radeon_winsys_cs *cs = ctx->gfx.cs;
1818 	struct r600_resource *buffer;
1819 	uint32_t *results;
1820 	unsigned i, mask = 0;
1821 	unsigned max_rbs;
1822 
1823 	if (ctx->family == CHIP_JUNIPER) {
1824 		/*
1825 		 * Fix for predication lockups - the chip can only ever have
1826 		 * 4 RBs, however it looks like the predication logic assumes
1827 		 * there's 8, trying to read results from query buffers never
1828 		 * written to. By increasing this number we'll write the
1829 		 * status bit for these as per the normal disabled rb logic.
1830 		 */
1831 		ctx->screen->info.num_render_backends = 8;
1832 	}
1833 	max_rbs = ctx->screen->info.num_render_backends;
1834 
1835 	assert(rscreen->chip_class <= CAYMAN);
1836 
1837 	/*
1838 	 * if backend_map query is supported by the kernel.
1839 	 * Note the kernel drm driver for a long time never filled in the
1840 	 * associated data on eg/cm, only on r600/r700, hence ignore the valid
1841 	 * bit there if the map is zero.
1842 	 * (Albeit some chips with just one active rb can have a valid 0 map.)
1843 	 */
1844 	if (rscreen->info.r600_gb_backend_map_valid &&
1845 	    (ctx->chip_class < EVERGREEN || rscreen->info.r600_gb_backend_map != 0)) {
1846 		unsigned num_tile_pipes = rscreen->info.num_tile_pipes;
1847 		unsigned backend_map = rscreen->info.r600_gb_backend_map;
1848 		unsigned item_width, item_mask;
1849 
1850 		if (ctx->chip_class >= EVERGREEN) {
1851 			item_width = 4;
1852 			item_mask = 0x7;
1853 		} else {
1854 			item_width = 2;
1855 			item_mask = 0x3;
1856 		}
1857 
1858 		while (num_tile_pipes--) {
1859 			i = backend_map & item_mask;
1860 			mask |= (1<<i);
1861 			backend_map >>= item_width;
1862 		}
1863 		if (mask != 0) {
1864 			rscreen->info.enabled_rb_mask = mask;
1865 			return;
1866 		}
1867 	}
1868 
1869 	/* otherwise backup path for older kernels */
1870 
1871 	/* create buffer for event data */
1872 	buffer = (struct r600_resource*)
1873 		pipe_buffer_create(ctx->b.screen, 0,
1874 				   PIPE_USAGE_STAGING, max_rbs * 16);
1875 	if (!buffer)
1876 		return;
1877 
1878 	/* initialize buffer with zeroes */
1879 	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
1880 	if (results) {
1881 		memset(results, 0, max_rbs * 4 * 4);
1882 
1883 		/* emit EVENT_WRITE for ZPASS_DONE */
1884 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1885 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
1886 		radeon_emit(cs, buffer->gpu_address);
1887 		radeon_emit(cs, buffer->gpu_address >> 32);
1888 
1889 		r600_emit_reloc(ctx, &ctx->gfx, buffer,
1890                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
1891 
1892 		/* analyze results */
1893 		results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
1894 		if (results) {
1895 			for(i = 0; i < max_rbs; i++) {
1896 				/* at least highest bit will be set if backend is used */
1897 				if (results[i*4 + 1])
1898 					mask |= (1<<i);
1899 			}
1900 		}
1901 	}
1902 
1903 	r600_resource_reference(&buffer, NULL);
1904 
1905 	if (mask) {
1906 		if (rscreen->debug_flags & DBG_INFO &&
1907 		    mask != rscreen->info.enabled_rb_mask) {
1908 			printf("enabled_rb_mask (fixed) = 0x%x\n", mask);
1909 		}
1910 		rscreen->info.enabled_rb_mask = mask;
1911 	}
1912 }
1913 
1914 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
1915 	{ \
1916 		.name = name_, \
1917 		.query_type = R600_QUERY_##query_type_, \
1918 		.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1919 		.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
1920 		.group_id = group_id_ \
1921 	}
1922 
1923 #define X(name_, query_type_, type_, result_type_) \
1924 	XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1925 
1926 #define XG(group_, name_, query_type_, type_, result_type_) \
1927 	XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
1928 
1929 static struct pipe_driver_query_info r600_driver_query_list[] = {
1930 	X("num-compilations",		NUM_COMPILATIONS,	UINT64, CUMULATIVE),
1931 	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
1932 	X("num-shader-cache-hits",	NUM_SHADER_CACHE_HITS,	UINT64, CUMULATIVE),
1933 	X("draw-calls",			DRAW_CALLS,		UINT64, AVERAGE),
1934 	X("decompress-calls",		DECOMPRESS_CALLS,	UINT64, AVERAGE),
1935 	X("MRT-draw-calls",		MRT_DRAW_CALLS,		UINT64, AVERAGE),
1936 	X("prim-restart-calls",		PRIM_RESTART_CALLS,	UINT64, AVERAGE),
1937 	X("spill-draw-calls",		SPILL_DRAW_CALLS,	UINT64, AVERAGE),
1938 	X("compute-calls",		COMPUTE_CALLS,		UINT64, AVERAGE),
1939 	X("spill-compute-calls",	SPILL_COMPUTE_CALLS,	UINT64, AVERAGE),
1940 	X("dma-calls",			DMA_CALLS,		UINT64, AVERAGE),
1941 	X("cp-dma-calls",		CP_DMA_CALLS,		UINT64, AVERAGE),
1942 	X("num-vs-flushes",		NUM_VS_FLUSHES,		UINT64, AVERAGE),
1943 	X("num-ps-flushes",		NUM_PS_FLUSHES,		UINT64, AVERAGE),
1944 	X("num-cs-flushes",		NUM_CS_FLUSHES,		UINT64, AVERAGE),
1945 	X("num-CB-cache-flushes",	NUM_CB_CACHE_FLUSHES,	UINT64, AVERAGE),
1946 	X("num-DB-cache-flushes",	NUM_DB_CACHE_FLUSHES,	UINT64, AVERAGE),
1947 	X("num-resident-handles",	NUM_RESIDENT_HANDLES,	UINT64, AVERAGE),
1948 	X("tc-offloaded-slots",		TC_OFFLOADED_SLOTS,     UINT64, AVERAGE),
1949 	X("tc-direct-slots",		TC_DIRECT_SLOTS,	UINT64, AVERAGE),
1950 	X("tc-num-syncs",		TC_NUM_SYNCS,		UINT64, AVERAGE),
1951 	X("CS-thread-busy",		CS_THREAD_BUSY,		UINT64, AVERAGE),
1952 	X("gallium-thread-busy",	GALLIUM_THREAD_BUSY,	UINT64, AVERAGE),
1953 	X("requested-VRAM",		REQUESTED_VRAM,		BYTES, AVERAGE),
1954 	X("requested-GTT",		REQUESTED_GTT,		BYTES, AVERAGE),
1955 	X("mapped-VRAM",		MAPPED_VRAM,		BYTES, AVERAGE),
1956 	X("mapped-GTT",			MAPPED_GTT,		BYTES, AVERAGE),
1957 	X("buffer-wait-time",		BUFFER_WAIT_TIME,	MICROSECONDS, CUMULATIVE),
1958 	X("num-mapped-buffers",		NUM_MAPPED_BUFFERS,	UINT64, AVERAGE),
1959 	X("num-GFX-IBs",		NUM_GFX_IBS,		UINT64, AVERAGE),
1960 	X("num-SDMA-IBs",		NUM_SDMA_IBS,		UINT64, AVERAGE),
1961 	X("GFX-BO-list-size",		GFX_BO_LIST_SIZE,	UINT64, AVERAGE),
1962 	X("num-bytes-moved",		NUM_BYTES_MOVED,	BYTES, CUMULATIVE),
1963 	X("num-evictions",		NUM_EVICTIONS,		UINT64, CUMULATIVE),
1964 	X("VRAM-CPU-page-faults",	NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1965 	X("VRAM-usage",			VRAM_USAGE,		BYTES, AVERAGE),
1966 	X("VRAM-vis-usage",		VRAM_VIS_USAGE,		BYTES, AVERAGE),
1967 	X("GTT-usage",			GTT_USAGE,		BYTES, AVERAGE),
1968 
1969 	/* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1970 	 * which use it as a fallback path to detect the GPU type.
1971 	 *
1972 	 * Note: The names of these queries are significant for GPUPerfStudio
1973 	 * (and possibly their order as well). */
1974 	XG(GPIN, "GPIN_000",		GPIN_ASIC_ID,		UINT, AVERAGE),
1975 	XG(GPIN, "GPIN_001",		GPIN_NUM_SIMD,		UINT, AVERAGE),
1976 	XG(GPIN, "GPIN_002",		GPIN_NUM_RB,		UINT, AVERAGE),
1977 	XG(GPIN, "GPIN_003",		GPIN_NUM_SPI,		UINT, AVERAGE),
1978 	XG(GPIN, "GPIN_004",		GPIN_NUM_SE,		UINT, AVERAGE),
1979 
1980 	X("temperature",		GPU_TEMPERATURE,	UINT64, AVERAGE),
1981 	X("shader-clock",		CURRENT_GPU_SCLK,	HZ, AVERAGE),
1982 	X("memory-clock",		CURRENT_GPU_MCLK,	HZ, AVERAGE),
1983 
1984 	/* The following queries must be at the end of the list because their
1985 	 * availability is adjusted dynamically based on the DRM version. */
1986 	X("GPU-load",			GPU_LOAD,		UINT64, AVERAGE),
1987 	X("GPU-shaders-busy",		GPU_SHADERS_BUSY,	UINT64, AVERAGE),
1988 	X("GPU-ta-busy",		GPU_TA_BUSY,		UINT64, AVERAGE),
1989 	X("GPU-gds-busy",		GPU_GDS_BUSY,		UINT64, AVERAGE),
1990 	X("GPU-vgt-busy",		GPU_VGT_BUSY,		UINT64, AVERAGE),
1991 	X("GPU-ia-busy",		GPU_IA_BUSY,		UINT64, AVERAGE),
1992 	X("GPU-sx-busy",		GPU_SX_BUSY,		UINT64, AVERAGE),
1993 	X("GPU-wd-busy",		GPU_WD_BUSY,		UINT64, AVERAGE),
1994 	X("GPU-bci-busy",		GPU_BCI_BUSY,		UINT64, AVERAGE),
1995 	X("GPU-sc-busy",		GPU_SC_BUSY,		UINT64, AVERAGE),
1996 	X("GPU-pa-busy",		GPU_PA_BUSY,		UINT64, AVERAGE),
1997 	X("GPU-db-busy",		GPU_DB_BUSY,		UINT64, AVERAGE),
1998 	X("GPU-cp-busy",		GPU_CP_BUSY,		UINT64, AVERAGE),
1999 	X("GPU-cb-busy",		GPU_CB_BUSY,		UINT64, AVERAGE),
2000 	X("GPU-sdma-busy",		GPU_SDMA_BUSY,		UINT64, AVERAGE),
2001 	X("GPU-pfp-busy",		GPU_PFP_BUSY,		UINT64, AVERAGE),
2002 	X("GPU-meq-busy",		GPU_MEQ_BUSY,		UINT64, AVERAGE),
2003 	X("GPU-me-busy",		GPU_ME_BUSY,		UINT64, AVERAGE),
2004 	X("GPU-surf-sync-busy",		GPU_SURF_SYNC_BUSY,	UINT64, AVERAGE),
2005 	X("GPU-cp-dma-busy",		GPU_CP_DMA_BUSY,	UINT64, AVERAGE),
2006 	X("GPU-scratch-ram-busy",	GPU_SCRATCH_RAM_BUSY,	UINT64, AVERAGE),
2007 };
2008 
2009 #undef X
2010 #undef XG
2011 #undef XFULL
2012 
r600_get_num_queries(struct r600_common_screen * rscreen)2013 static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
2014 {
2015 	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
2016 		return ARRAY_SIZE(r600_driver_query_list);
2017 	else
2018 		return ARRAY_SIZE(r600_driver_query_list) - 25;
2019 }
2020 
r600_get_driver_query_info(struct pipe_screen * screen,unsigned index,struct pipe_driver_query_info * info)2021 static int r600_get_driver_query_info(struct pipe_screen *screen,
2022 				      unsigned index,
2023 				      struct pipe_driver_query_info *info)
2024 {
2025 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
2026 	unsigned num_queries = r600_get_num_queries(rscreen);
2027 
2028 	if (!info) {
2029 		unsigned num_perfcounters =
2030 			r600_get_perfcounter_info(rscreen, 0, NULL);
2031 
2032 		return num_queries + num_perfcounters;
2033 	}
2034 
2035 	if (index >= num_queries)
2036 		return r600_get_perfcounter_info(rscreen, index - num_queries, info);
2037 
2038 	*info = r600_driver_query_list[index];
2039 
2040 	switch (info->query_type) {
2041 	case R600_QUERY_REQUESTED_VRAM:
2042 	case R600_QUERY_VRAM_USAGE:
2043 	case R600_QUERY_MAPPED_VRAM:
2044 		info->max_value.u64 = rscreen->info.vram_size;
2045 		break;
2046 	case R600_QUERY_REQUESTED_GTT:
2047 	case R600_QUERY_GTT_USAGE:
2048 	case R600_QUERY_MAPPED_GTT:
2049 		info->max_value.u64 = rscreen->info.gart_size;
2050 		break;
2051 	case R600_QUERY_GPU_TEMPERATURE:
2052 		info->max_value.u64 = 125;
2053 		break;
2054 	case R600_QUERY_VRAM_VIS_USAGE:
2055 		info->max_value.u64 = rscreen->info.vram_vis_size;
2056 		break;
2057 	}
2058 
2059 	if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
2060 		info->group_id += rscreen->perfcounters->num_groups;
2061 
2062 	return 1;
2063 }
2064 
2065 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
2066  * performance counter groups, so be careful when changing this and related
2067  * functions.
2068  */
r600_get_driver_query_group_info(struct pipe_screen * screen,unsigned index,struct pipe_driver_query_group_info * info)2069 static int r600_get_driver_query_group_info(struct pipe_screen *screen,
2070 					    unsigned index,
2071 					    struct pipe_driver_query_group_info *info)
2072 {
2073 	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
2074 	unsigned num_pc_groups = 0;
2075 
2076 	if (rscreen->perfcounters)
2077 		num_pc_groups = rscreen->perfcounters->num_groups;
2078 
2079 	if (!info)
2080 		return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;
2081 
2082 	if (index < num_pc_groups)
2083 		return r600_get_perfcounter_group_info(rscreen, index, info);
2084 
2085 	index -= num_pc_groups;
2086 	if (index >= R600_NUM_SW_QUERY_GROUPS)
2087 		return 0;
2088 
2089 	info->name = "GPIN";
2090 	info->max_active_queries = 5;
2091 	info->num_queries = 5;
2092 	return 1;
2093 }
2094 
r600_query_init(struct r600_common_context * rctx)2095 void r600_query_init(struct r600_common_context *rctx)
2096 {
2097 	rctx->b.create_query = r600_create_query;
2098 	rctx->b.create_batch_query = r600_create_batch_query;
2099 	rctx->b.destroy_query = r600_destroy_query;
2100 	rctx->b.begin_query = r600_begin_query;
2101 	rctx->b.end_query = r600_end_query;
2102 	rctx->b.get_query_result = r600_get_query_result;
2103 	rctx->b.get_query_result_resource = r600_get_query_result_resource;
2104 	rctx->render_cond_atom.emit = r600_emit_query_predication;
2105 
2106 	if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
2107 	    rctx->b.render_condition = r600_render_condition;
2108 
2109 	LIST_INITHEAD(&rctx->active_queries);
2110 }
2111 
r600_init_screen_query_functions(struct r600_common_screen * rscreen)2112 void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
2113 {
2114 	rscreen->b.get_driver_query_info = r600_get_driver_query_info;
2115 	rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
2116 }
2117