1 /*
2  * Copyright 2016 Red Hat Inc.
3  * Based on anv:
4  * Copyright © 2015 Intel Corporation
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23  * IN THE SOFTWARE.
24  */
25 
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <fcntl.h>
31 
32 #include "nir/nir_builder.h"
33 #include "radv_meta.h"
34 #include "radv_private.h"
35 #include "radv_cs.h"
36 #include "sid.h"
37 #include "util/u_atomic.h"
38 
/* Value a timestamp slot is compared against to decide whether the
 * timestamp has been written yet. */
#define TIMESTAMP_NOT_READY UINT64_MAX

/* Size in bytes of one block of pipeline-statistics counters:
 * 11 counters, each 8 bytes wide. */
static const int pipelinestat_block_size = 11 * 8;

/* Lookup table from the bit position of a pipeline-statistic flag to the
 * index of the matching 64-bit counter inside a counter block. */
static const unsigned pipeline_statistics_indices[] = {
	7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10,
};
43 
44 static unsigned
radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)45 radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
46 {
47 	int offset = ffs(flag) - 1;
48 	assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
49 	return pipeline_statistics_indices[offset];
50 }
51 
/* Emit NIR that ANDs flags with the immediate flag and converts the
 * result to a boolean (true when any of the tested bits is set). */
static nir_ssa_def *
nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
{
	nir_ssa_def *mask = nir_imm_int(b, flag);
	nir_ssa_def *masked = nir_iand(b, flags, mask);

	return nir_i2b(b, masked);
}
56 
/* Emit the bookkeeping of a counted NIR loop: break out of the enclosing
 * loop once the value held in var is >= count (unsigned comparison),
 * otherwise store var + 1 back into var. */
static void
radv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count)
{
	nir_ssa_def *current = nir_load_var(b, var);
	nir_ssa_def *limit_reached = nir_uge(b, current, count);

	nir_push_if(b, limit_reached);
	{
		nir_jump(b, nir_jump_break);
	}
	nir_pop_if(b, NULL);

	/* Not broken out of the loop: advance the counter. */
	nir_ssa_def *one = nir_imm_int(b, 1);
	nir_ssa_def *next = nir_iadd(b, current, one);
	nir_store_var(b, var, next, 0x1);
}
68 
69 static struct nir_ssa_def *
radv_load_push_int(nir_builder * b,unsigned offset,const char * name)70 radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
71 {
72 	nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
73 	nir_intrinsic_set_base(flags, 0);
74 	nir_intrinsic_set_range(flags, 16);
75 	flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
76 	flags->num_components = 1;
77 	nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
78 	nir_builder_instr_insert(b, &flags->instr);
79 	return &flags->dest.ssa;
80 }
81 
82 static void
radv_store_availability(nir_builder * b,nir_ssa_def * flags,nir_ssa_def * dst_buf,nir_ssa_def * offset,nir_ssa_def * value32)83 radv_store_availability(nir_builder *b, nir_ssa_def *flags, nir_ssa_def *dst_buf,
84                         nir_ssa_def *offset, nir_ssa_def *value32)
85 {
86 	nir_push_if(b, nir_test_flag(b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
87 
88 	nir_push_if(b, nir_test_flag(b, flags, VK_QUERY_RESULT_64_BIT));
89 
90 	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo);
91 	store->src[0] = nir_src_for_ssa(nir_vec2(b, value32, nir_imm_int(b, 0)));
92 	store->src[1] = nir_src_for_ssa(dst_buf);
93 	store->src[2] = nir_src_for_ssa(offset);
94 	nir_intrinsic_set_write_mask(store, 0x3);
95 	nir_intrinsic_set_align(store, 8, 0);
96 	store->num_components = 2;
97 	nir_builder_instr_insert(b, &store->instr);
98 
99 	nir_push_else(b, NULL);
100 
101 	store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_ssbo);
102 	store->src[0] = nir_src_for_ssa(value32);
103 	store->src[1] = nir_src_for_ssa(dst_buf);
104 	store->src[2] = nir_src_for_ssa(offset);
105 	nir_intrinsic_set_write_mask(store, 0x1);
106 	nir_intrinsic_set_align(store, 4, 0);
107 	store->num_components = 1;
108 	nir_builder_instr_insert(b, &store->instr);
109 
110 	nir_pop_if(b, NULL);
111 
112 	nir_pop_if(b, NULL);
113 }
114 
115 static nir_shader *
build_occlusion_query_shader(struct radv_device * device)116 build_occlusion_query_shader(struct radv_device *device) {
117 	/* the shader this builds is roughly
118 	 *
119 	 * push constants {
120 	 * 	uint32_t flags;
121 	 * 	uint32_t dst_stride;
122 	 * };
123 	 *
124 	 * uint32_t src_stride = 16 * db_count;
125 	 *
126 	 * location(binding = 0) buffer dst_buf;
127 	 * location(binding = 1) buffer src_buf;
128 	 *
129 	 * void main() {
130 	 * 	uint64_t result = 0;
131 	 * 	uint64_t src_offset = src_stride * global_id.x;
132 	 * 	uint64_t dst_offset = dst_stride * global_id.x;
133 	 * 	bool available = true;
134 	 * 	for (int i = 0; i < db_count; ++i) {
135 	 *		if (enabled_rb_mask & (1 << i)) {
136 	 *			uint64_t start = src_buf[src_offset + 16 * i];
137 	 *			uint64_t end = src_buf[src_offset + 16 * i + 8];
138 	 *			if ((start & (1ull << 63)) && (end & (1ull << 63)))
139 	 *				result += end - start;
140 	 *			else
141 	 *				available = false;
142 	 *		}
143 	 * 	}
144 	 * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
145 	 * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
146 	 * 		if (flags & VK_QUERY_RESULT_64_BIT)
147 	 * 			dst_buf[dst_offset] = result;
148 	 * 		else
149 	 * 			dst_buf[dst_offset] = (uint32_t)result.
150 	 * 	}
151 	 * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
152 	 * 		dst_buf[dst_offset + elem_size] = available;
153 	 * 	}
154 	 * }
155 	 */
156 	nir_builder b;
157 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
158 	b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
159 	b.shader->info.cs.local_size[0] = 64;
160 	b.shader->info.cs.local_size[1] = 1;
161 	b.shader->info.cs.local_size[2] = 1;
162 
163 	nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
164 	nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
165 	nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
166 	nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
167 	nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
168 	unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
169 	unsigned db_count = device->physical_device->rad_info.num_render_backends;
170 
171 	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
172 
173 	nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
174 	nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
175 
176 	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
177 	nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
178 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
179 	                                        b.shader->info.cs.local_size[0],
180 	                                        b.shader->info.cs.local_size[1],
181 	                                        b.shader->info.cs.local_size[2], 0);
182 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
183 	global_id = nir_channel(&b, global_id, 0); // We only care about x here.
184 
185 	nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
186 	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
187 	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
188 	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
189 
190 
191 	nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
192 	nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
193 	nir_store_var(&b, available, nir_imm_true(&b), 0x1);
194 
195 	nir_push_loop(&b);
196 
197 	nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
198 	radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
199 
200 	nir_ssa_def *enabled_cond =
201 		nir_iand(&b, nir_imm_int(&b, enabled_rb_mask),
202 			     nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count));
203 
204 	nir_push_if(&b, nir_i2b(&b, enabled_cond));
205 
206 	nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
207 	load_offset = nir_iadd(&b, input_base, load_offset);
208 
209 	nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
210 	load->src[0] = nir_src_for_ssa(src_buf);
211 	load->src[1] = nir_src_for_ssa(load_offset);
212 	nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
213 	load->num_components = 2;
214 	nir_intrinsic_set_align(load, 16, 0);
215 	nir_builder_instr_insert(&b, &load->instr);
216 
217 	nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
218 	nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);
219 
220 	nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
221 	nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
222 
223 	nir_push_if(&b, nir_iand(&b, start_done, end_done));
224 
225 	nir_store_var(&b, result,
226 	              nir_iadd(&b, nir_load_var(&b, result),
227 	                           nir_isub(&b, nir_load_var(&b, end),
228 	                                        nir_load_var(&b, start))), 0x1);
229 
230 	nir_push_else(&b, NULL);
231 
232 	nir_store_var(&b, available, nir_imm_false(&b), 0x1);
233 
234 	nir_pop_if(&b, NULL);
235 	nir_pop_if(&b, NULL);
236 	nir_pop_loop(&b, NULL);
237 
238 	/* Store the result if complete or if partial results have been requested. */
239 
240 	nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
241 	nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
242 	nir_push_if(&b,
243 		    nir_ior(&b,
244 			    nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
245 			    nir_load_var(&b, available)));
246 
247 	nir_push_if(&b, result_is_64bit);
248 
249 	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
250 	store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
251 	store->src[1] = nir_src_for_ssa(dst_buf);
252 	store->src[2] = nir_src_for_ssa(output_base);
253 	nir_intrinsic_set_write_mask(store, 0x1);
254 	nir_intrinsic_set_align(store, 8, 0);
255 	store->num_components = 1;
256 	nir_builder_instr_insert(&b, &store->instr);
257 
258 	nir_push_else(&b, NULL);
259 
260 	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
261 	store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
262 	store->src[1] = nir_src_for_ssa(dst_buf);
263 	store->src[2] = nir_src_for_ssa(output_base);
264 	nir_intrinsic_set_write_mask(store, 0x1);
265 	nir_intrinsic_set_align(store, 4, 0);
266 	store->num_components = 1;
267 	nir_builder_instr_insert(&b, &store->instr);
268 
269 	nir_pop_if(&b, NULL);
270 	nir_pop_if(&b, NULL);
271 
272 	radv_store_availability(&b, flags, dst_buf,
273 	                        nir_iadd(&b, result_size, output_base),
274 	                        nir_b2i32(&b, nir_load_var(&b, available)));
275 
276 	return b.shader;
277 }
278 
279 static nir_shader *
build_pipeline_statistics_query_shader(struct radv_device * device)280 build_pipeline_statistics_query_shader(struct radv_device *device) {
281 	/* the shader this builds is roughly
282 	 *
283 	 * push constants {
284 	 * 	uint32_t flags;
285 	 * 	uint32_t dst_stride;
286 	 * 	uint32_t stats_mask;
287 	 * 	uint32_t avail_offset;
288 	 * };
289 	 *
290 	 * uint32_t src_stride = pipelinestat_block_size * 2;
291 	 *
292 	 * location(binding = 0) buffer dst_buf;
293 	 * location(binding = 1) buffer src_buf;
294 	 *
295 	 * void main() {
296 	 * 	uint64_t src_offset = src_stride * global_id.x;
297 	 * 	uint64_t dst_base = dst_stride * global_id.x;
298 	 * 	uint64_t dst_offset = dst_base;
299 	 * 	uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
300 	 * 	uint32_t elem_count = stats_mask >> 16;
301 	 * 	uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
302 	 * 	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
303 	 * 		dst_buf[dst_offset + elem_count * elem_size] = available32;
304 	 * 	}
305 	 * 	if ((bool)available32) {
306 	 * 		// repeat 11 times:
307 	 * 		if (stats_mask & (1 << 0)) {
308 	 * 			uint64_t start = src_buf[src_offset + 8 * indices[0]];
309 	 * 			uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
310 	 * 			uint64_t result = end - start;
311 	 * 			if (flags & VK_QUERY_RESULT_64_BIT)
312 	 * 				dst_buf[dst_offset] = result;
313 	 * 			else
314 	 * 				dst_buf[dst_offset] = (uint32_t)result.
315 	 * 			dst_offset += elem_size;
316 	 * 		}
317 	 * 	} else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
318 	 *              // Set everything to 0 as we don't know what is valid.
319 	 * 		for (int i = 0; i < elem_count; ++i)
320 	 * 			dst_buf[dst_base + elem_size * i] = 0;
321 	 * 	}
322 	 * }
323 	 */
324 	nir_builder b;
325 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
326 	b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query");
327 	b.shader->info.cs.local_size[0] = 64;
328 	b.shader->info.cs.local_size[1] = 1;
329 	b.shader->info.cs.local_size[2] = 1;
330 
331 	nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
332 
333 	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
334 	nir_ssa_def *stats_mask = radv_load_push_int(&b, 8, "stats_mask");
335 	nir_ssa_def *avail_offset = radv_load_push_int(&b, 12, "avail_offset");
336 
337 	nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
338 	nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
339 
340 	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
341 	nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
342 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
343 	                                        b.shader->info.cs.local_size[0],
344 	                                        b.shader->info.cs.local_size[1],
345 	                                        b.shader->info.cs.local_size[2], 0);
346 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
347 	global_id = nir_channel(&b, global_id, 0); // We only care about x here.
348 
349 	nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
350 	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
351 	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
352 	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
353 
354 
355 	avail_offset = nir_iadd(&b, avail_offset,
356 	                            nir_imul(&b, global_id, nir_imm_int(&b, 4)));
357 
358 	nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
359 	load->src[0] = nir_src_for_ssa(src_buf);
360 	load->src[1] = nir_src_for_ssa(avail_offset);
361 	nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
362 	load->num_components = 1;
363 	nir_intrinsic_set_align(load, 4, 0);
364 	nir_builder_instr_insert(&b, &load->instr);
365 	nir_ssa_def *available32 = &load->dest.ssa;
366 
367 	nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
368 	nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
369 	nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16));
370 
371 	radv_store_availability(&b, flags, dst_buf,
372 	                        nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
373 				available32);
374 
375 	nir_push_if(&b, nir_i2b(&b, available32));
376 
377 	nir_store_var(&b, output_offset, output_base, 0x1);
378 	for (int i = 0; i < 11; ++i) {
379 		nir_push_if(&b, nir_test_flag(&b, stats_mask, 1u << i));
380 
381 		load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
382 		load->src[0] = nir_src_for_ssa(src_buf);
383 		load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
384 		                                            nir_imm_int(&b, pipeline_statistics_indices[i] * 8)));
385 		nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
386 		load->num_components = 1;
387 		nir_intrinsic_set_align(load, 8, 0);
388 		nir_builder_instr_insert(&b, &load->instr);
389 		nir_ssa_def *start = &load->dest.ssa;
390 
391 		load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
392 		load->src[0] = nir_src_for_ssa(src_buf);
393 		load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
394 		                                            nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size)));
395 		nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
396 		load->num_components = 1;
397 		nir_intrinsic_set_align(load, 8, 0);
398 		nir_builder_instr_insert(&b, &load->instr);
399 		nir_ssa_def *end = &load->dest.ssa;
400 
401 		nir_ssa_def *result = nir_isub(&b, end, start);
402 
403 		/* Store result */
404 		nir_push_if(&b, result_is_64bit);
405 
406 		nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
407 		store->src[0] = nir_src_for_ssa(result);
408 		store->src[1] = nir_src_for_ssa(dst_buf);
409 		store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
410 		nir_intrinsic_set_write_mask(store, 0x1);
411 		nir_intrinsic_set_align(store, 8, 0);
412 		store->num_components = 1;
413 		nir_builder_instr_insert(&b, &store->instr);
414 
415 		nir_push_else(&b, NULL);
416 
417 		store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
418 		store->src[0] = nir_src_for_ssa(nir_u2u32(&b, result));
419 		store->src[1] = nir_src_for_ssa(dst_buf);
420 		store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
421 		nir_intrinsic_set_write_mask(store, 0x1);
422 		nir_intrinsic_set_align(store, 4, 0);
423 		store->num_components = 1;
424 		nir_builder_instr_insert(&b, &store->instr);
425 
426 		nir_pop_if(&b, NULL);
427 
428 		nir_store_var(&b, output_offset,
429 		                  nir_iadd(&b, nir_load_var(&b, output_offset),
430 		                               elem_size), 0x1);
431 
432 		nir_pop_if(&b, NULL);
433 	}
434 
435 	nir_push_else(&b, NULL); /* nir_i2b(&b, available32) */
436 
437 	nir_push_if(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
438 
439 	/* Stores zeros in all outputs. */
440 
441 	nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
442 	nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
443 
444 	nir_loop *loop = nir_push_loop(&b);
445 
446 	nir_ssa_def *current_counter = nir_load_var(&b, counter);
447 	radv_break_on_count(&b, counter, elem_count);
448 
449 	nir_ssa_def *output_elem = nir_iadd(&b, output_base,
450 	                                        nir_imul(&b, elem_size, current_counter));
451 	nir_push_if(&b, result_is_64bit);
452 
453 	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
454 	store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0));
455 	store->src[1] = nir_src_for_ssa(dst_buf);
456 	store->src[2] = nir_src_for_ssa(output_elem);
457 	nir_intrinsic_set_write_mask(store, 0x1);
458 	nir_intrinsic_set_align(store, 8, 0);
459 	store->num_components = 1;
460 	nir_builder_instr_insert(&b, &store->instr);
461 
462 	nir_push_else(&b, NULL);
463 
464 	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
465 	store->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
466 	store->src[1] = nir_src_for_ssa(dst_buf);
467 	store->src[2] = nir_src_for_ssa(output_elem);
468 	nir_intrinsic_set_write_mask(store, 0x1);
469 	nir_intrinsic_set_align(store, 4, 0);
470 	store->num_components = 1;
471 	nir_builder_instr_insert(&b, &store->instr);
472 
473 	nir_pop_if(&b, NULL);
474 
475 	nir_pop_loop(&b, loop);
476 	nir_pop_if(&b, NULL); /* VK_QUERY_RESULT_PARTIAL_BIT */
477 	nir_pop_if(&b, NULL); /* nir_i2b(&b, available32) */
478 	return b.shader;
479 }
480 
481 static nir_shader *
build_tfb_query_shader(struct radv_device * device)482 build_tfb_query_shader(struct radv_device *device)
483 {
484 	/* the shader this builds is roughly
485 	 *
486 	 * uint32_t src_stride = 32;
487 	 *
488 	 * location(binding = 0) buffer dst_buf;
489 	 * location(binding = 1) buffer src_buf;
490 	 *
491 	 * void main() {
492 	 *	uint64_t result[2] = {};
493 	 *	bool available = false;
494 	 *	uint64_t src_offset = src_stride * global_id.x;
495 	 * 	uint64_t dst_offset = dst_stride * global_id.x;
496 	 * 	uint64_t *src_data = src_buf[src_offset];
497 	 *	uint32_t avail = (src_data[0] >> 32) &
498 	 *			 (src_data[1] >> 32) &
499 	 *			 (src_data[2] >> 32) &
500 	 *			 (src_data[3] >> 32);
501 	 *	if (avail & 0x80000000) {
502 	 *		result[0] = src_data[3] - src_data[1];
503 	 *		result[1] = src_data[2] - src_data[0];
504 	 *		available = true;
505 	 *	}
506 	 * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
507 	 * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
508 	 *		if (flags & VK_QUERY_RESULT_64_BIT) {
509 	 *			dst_buf[dst_offset] = result;
510 	 *		} else {
511 	 *			dst_buf[dst_offset] = (uint32_t)result;
512 	 *		}
513 	 *	}
514 	 *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
515 	 *		dst_buf[dst_offset + result_size] = available;
516 	 * 	}
517 	 * }
518 	 */
519 	nir_builder b;
520 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
521 	b.shader->info.name = ralloc_strdup(b.shader, "tfb_query");
522 	b.shader->info.cs.local_size[0] = 64;
523 	b.shader->info.cs.local_size[1] = 1;
524 	b.shader->info.cs.local_size[2] = 1;
525 
526 	/* Create and initialize local variables. */
527 	nir_variable *result =
528 		nir_local_variable_create(b.impl,
529 					  glsl_vector_type(GLSL_TYPE_UINT64, 2),
530 					  "result");
531 	nir_variable *available =
532 		nir_local_variable_create(b.impl, glsl_bool_type(), "available");
533 
534 	nir_store_var(&b, result,
535 		      nir_vec2(&b, nir_imm_int64(&b, 0),
536 				   nir_imm_int64(&b, 0)), 0x3);
537 	nir_store_var(&b, available, nir_imm_false(&b), 0x1);
538 
539 	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
540 
541 	/* Load resources. */
542 	nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
543 	nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
544 
545 	/* Compute global ID. */
546 	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
547 	nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
548 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
549 	                                        b.shader->info.cs.local_size[0],
550 	                                        b.shader->info.cs.local_size[1],
551 	                                        b.shader->info.cs.local_size[2], 0);
552 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
553 	global_id = nir_channel(&b, global_id, 0); // We only care about x here.
554 
555 	/* Compute src/dst strides. */
556 	nir_ssa_def *input_stride = nir_imm_int(&b, 32);
557 	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
558 	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
559 	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
560 
561 	/* Load data from the query pool. */
562 	nir_intrinsic_instr *load1 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
563 	load1->src[0] = nir_src_for_ssa(src_buf);
564 	load1->src[1] = nir_src_for_ssa(input_base);
565 	nir_ssa_dest_init(&load1->instr, &load1->dest, 4, 32, NULL);
566 	load1->num_components = 4;
567 	nir_intrinsic_set_align(load1, 32, 0);
568 	nir_builder_instr_insert(&b, &load1->instr);
569 
570 	nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
571 	load2->src[0] = nir_src_for_ssa(src_buf);
572 	load2->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, nir_imm_int(&b, 16)));
573 	nir_ssa_dest_init(&load2->instr, &load2->dest, 4, 32, NULL);
574 	load2->num_components = 4;
575 	nir_intrinsic_set_align(load2, 16, 0);
576 	nir_builder_instr_insert(&b, &load2->instr);
577 
578 	/* Check if result is available. */
579 	nir_ssa_def *avails[2];
580 	avails[0] = nir_iand(&b, nir_channel(&b, &load1->dest.ssa, 1),
581 				 nir_channel(&b, &load1->dest.ssa, 3));
582 	avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1),
583 				 nir_channel(&b, &load2->dest.ssa, 3));
584 	nir_ssa_def *result_is_available =
585 		nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]),
586 			                 nir_imm_int(&b, 0x80000000)));
587 
588 	/* Only compute result if available. */
589 	nir_push_if(&b, result_is_available);
590 
591 	/* Pack values. */
592 	nir_ssa_def *packed64[4];
593 	packed64[0] = nir_pack_64_2x32(&b, nir_vec2(&b,
594 						    nir_channel(&b, &load1->dest.ssa, 0),
595 						    nir_channel(&b, &load1->dest.ssa, 1)));
596 	packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b,
597 						    nir_channel(&b, &load1->dest.ssa, 2),
598 						    nir_channel(&b, &load1->dest.ssa, 3)));
599 	packed64[2] = nir_pack_64_2x32(&b, nir_vec2(&b,
600 						    nir_channel(&b, &load2->dest.ssa, 0),
601 						    nir_channel(&b, &load2->dest.ssa, 1)));
602 	packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b,
603 						    nir_channel(&b, &load2->dest.ssa, 2),
604 						    nir_channel(&b, &load2->dest.ssa, 3)));
605 
606 	/* Compute result. */
607 	nir_ssa_def *num_primitive_written =
608 		nir_isub(&b, packed64[3], packed64[1]);
609 	nir_ssa_def *primitive_storage_needed =
610 		nir_isub(&b, packed64[2], packed64[0]);
611 
612 	nir_store_var(&b, result,
613 		      nir_vec2(&b, num_primitive_written,
614 				   primitive_storage_needed), 0x3);
615 	nir_store_var(&b, available, nir_imm_true(&b), 0x1);
616 
617 	nir_pop_if(&b, NULL);
618 
619 	/* Determine if result is 64 or 32 bit. */
620 	nir_ssa_def *result_is_64bit =
621 		nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
622 	nir_ssa_def *result_size =
623 		nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16),
624 			  nir_imm_int(&b, 8));
625 
626 	/* Store the result if complete or partial results have been requested. */
627 	nir_push_if(&b,
628 		    nir_ior(&b,
629 			    nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
630 			    nir_load_var(&b, available)));
631 
632 	/* Store result. */
633 	nir_push_if(&b, result_is_64bit);
634 
635 	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
636 	store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
637 	store->src[1] = nir_src_for_ssa(dst_buf);
638 	store->src[2] = nir_src_for_ssa(output_base);
639 	nir_intrinsic_set_write_mask(store, 0x3);
640 	nir_intrinsic_set_align(store, 8, 0);
641 	store->num_components = 2;
642 	nir_builder_instr_insert(&b, &store->instr);
643 
644 	nir_push_else(&b, NULL);
645 
646 	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
647 	store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
648 	store->src[1] = nir_src_for_ssa(dst_buf);
649 	store->src[2] = nir_src_for_ssa(output_base);
650 	nir_intrinsic_set_write_mask(store, 0x3);
651 	nir_intrinsic_set_align(store, 4, 0);
652 	store->num_components = 2;
653 	nir_builder_instr_insert(&b, &store->instr);
654 
655 	nir_pop_if(&b, NULL);
656 	nir_pop_if(&b, NULL);
657 
658 	radv_store_availability(&b, flags, dst_buf,
659 	                        nir_iadd(&b, result_size, output_base),
660 	                        nir_b2i32(&b, nir_load_var(&b, available)));
661 
662 	return b.shader;
663 }
664 
665 static nir_shader *
build_timestamp_query_shader(struct radv_device * device)666 build_timestamp_query_shader(struct radv_device *device)
667 {
668 	/* the shader this builds is roughly
669 	 *
670 	 * uint32_t src_stride = 8;
671 	 *
672 	 * location(binding = 0) buffer dst_buf;
673 	 * location(binding = 1) buffer src_buf;
674 	 *
675 	 * void main() {
676 	 *	uint64_t result = 0;
677 	 *	bool available = false;
678 	 *	uint64_t src_offset = src_stride * global_id.x;
679 	 * 	uint64_t dst_offset = dst_stride * global_id.x;
680 	 * 	uint64_t timestamp = src_buf[src_offset];
681 	 *	if (timestamp != TIMESTAMP_NOT_READY) {
682 	 *		result = timestamp;
683 	 *		available = true;
684 	 *	}
685 	 * 	uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
686 	 * 	if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
687 	 *		if (flags & VK_QUERY_RESULT_64_BIT) {
688 	 *			dst_buf[dst_offset] = result;
689 	 *		} else {
690 	 *			dst_buf[dst_offset] = (uint32_t)result;
691 	 *		}
692 	 *	}
693 	 *	if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
694 	 *		dst_buf[dst_offset + result_size] = available;
695 	 * 	}
696 	 * }
697 	 */
698 	nir_builder b;
699 	nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
700 	b.shader->info.name = ralloc_strdup(b.shader, "timestamp_query");
701 	b.shader->info.cs.local_size[0] = 64;
702 	b.shader->info.cs.local_size[1] = 1;
703 	b.shader->info.cs.local_size[2] = 1;
704 
705 	/* Create and initialize local variables. */
706 	nir_variable *result =
707 		nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
708 	nir_variable *available =
709 		nir_local_variable_create(b.impl, glsl_bool_type(), "available");
710 
711 	nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
712 	nir_store_var(&b, available, nir_imm_false(&b), 0x1);
713 
714 	nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
715 
716 	/* Load resources. */
717 	nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
718 	nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
719 
720 	/* Compute global ID. */
721 	nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
722 	nir_ssa_def *wg_id = nir_load_work_group_id(&b, 32);
723 	nir_ssa_def *block_size = nir_imm_ivec4(&b,
724 	                                        b.shader->info.cs.local_size[0],
725 	                                        b.shader->info.cs.local_size[1],
726 	                                        b.shader->info.cs.local_size[2], 0);
727 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
728 	global_id = nir_channel(&b, global_id, 0); // We only care about x here.
729 
730 	/* Compute src/dst strides. */
731 	nir_ssa_def *input_stride = nir_imm_int(&b, 8);
732 	nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
733 	nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
734 	nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
735 
736 	/* Load data from the query pool. */
737 	nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
738 	load->src[0] = nir_src_for_ssa(src_buf);
739 	load->src[1] = nir_src_for_ssa(input_base);
740 	nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL);
741 	load->num_components = 2;
742 	nir_intrinsic_set_align(load, 8, 0);
743 	nir_builder_instr_insert(&b, &load->instr);
744 
745 	/* Pack the timestamp. */
746 	nir_ssa_def *timestamp;
747 	timestamp = nir_pack_64_2x32(&b, nir_vec2(&b,
748 						  nir_channel(&b, &load->dest.ssa, 0),
749 						  nir_channel(&b, &load->dest.ssa, 1)));
750 
751 	/* Check if result is available. */
752 	nir_ssa_def *result_is_available =
753 		nir_i2b(&b, nir_ine(&b, timestamp,
754 			            nir_imm_int64(&b, TIMESTAMP_NOT_READY)));
755 
756 	/* Only store result if available. */
757 	nir_push_if(&b, result_is_available);
758 
759 	nir_store_var(&b, result, timestamp, 0x1);
760 	nir_store_var(&b, available, nir_imm_true(&b), 0x1);
761 
762 	nir_pop_if(&b, NULL);
763 
764 	/* Determine if result is 64 or 32 bit. */
765 	nir_ssa_def *result_is_64bit =
766 		nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
767 	nir_ssa_def *result_size =
768 		nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8),
769 			  nir_imm_int(&b, 4));
770 
771 	/* Store the result if complete or partial results have been requested. */
772 	nir_push_if(&b, nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
773 				nir_load_var(&b, available)));
774 
775 	/* Store result. */
776 	nir_push_if(&b, result_is_64bit);
777 
778 	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
779 	store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
780 	store->src[1] = nir_src_for_ssa(dst_buf);
781 	store->src[2] = nir_src_for_ssa(output_base);
782 	nir_intrinsic_set_write_mask(store, 0x1);
783 	nir_intrinsic_set_align(store, 8, 0);
784 	store->num_components = 1;
785 	nir_builder_instr_insert(&b, &store->instr);
786 
787 	nir_push_else(&b, NULL);
788 
789 	store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
790 	store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
791 	store->src[1] = nir_src_for_ssa(dst_buf);
792 	store->src[2] = nir_src_for_ssa(output_base);
793 	nir_intrinsic_set_write_mask(store, 0x1);
794 	nir_intrinsic_set_align(store, 4, 0);
795 	store->num_components = 1;
796 	nir_builder_instr_insert(&b, &store->instr);
797 
798 	nir_pop_if(&b, NULL);
799 
800 	nir_pop_if(&b, NULL);
801 
802 	radv_store_availability(&b, flags, dst_buf,
803 	                        nir_iadd(&b, result_size, output_base),
804 	                        nir_b2i32(&b, nir_load_var(&b, available)));
805 
806 	return b.shader;
807 }
808 
radv_device_init_meta_query_state_internal(struct radv_device * device)809 static VkResult radv_device_init_meta_query_state_internal(struct radv_device *device)
810 {
811 	VkResult result;
812 	struct radv_shader_module occlusion_cs = { .nir = NULL };
813 	struct radv_shader_module pipeline_statistics_cs = { .nir = NULL };
814 	struct radv_shader_module tfb_cs = { .nir = NULL };
815 	struct radv_shader_module timestamp_cs = { .nir = NULL };
816 
817 	mtx_lock(&device->meta_state.mtx);
818 	if (device->meta_state.query.pipeline_statistics_query_pipeline) {
819 		mtx_unlock(&device->meta_state.mtx);
820 		return VK_SUCCESS;
821 	}
822 	occlusion_cs.nir = build_occlusion_query_shader(device);
823 	pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device);
824 	tfb_cs.nir = build_tfb_query_shader(device);
825 	timestamp_cs.nir = build_timestamp_query_shader(device);
826 
827 	VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
828 		.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
829 		.flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
830 		.bindingCount = 2,
831 		.pBindings = (VkDescriptorSetLayoutBinding[]) {
832 			{
833 				.binding = 0,
834 				.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
835 				.descriptorCount = 1,
836 				.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
837 				.pImmutableSamplers = NULL
838 			},
839 			{
840 				.binding = 1,
841 				.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
842 				.descriptorCount = 1,
843 				.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
844 				.pImmutableSamplers = NULL
845 			},
846 		}
847 	};
848 
849 	result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
850 						&occlusion_ds_create_info,
851 						&device->meta_state.alloc,
852 						&device->meta_state.query.ds_layout);
853 	if (result != VK_SUCCESS)
854 		goto fail;
855 
856 	VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
857 		.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
858 		.setLayoutCount = 1,
859 		.pSetLayouts = &device->meta_state.query.ds_layout,
860 		.pushConstantRangeCount = 1,
861 		.pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16},
862 	};
863 
864 	result = radv_CreatePipelineLayout(radv_device_to_handle(device),
865 					  &occlusion_pl_create_info,
866 					  &device->meta_state.alloc,
867 					  &device->meta_state.query.p_layout);
868 	if (result != VK_SUCCESS)
869 		goto fail;
870 
871 	VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
872 		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
873 		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
874 		.module = radv_shader_module_to_handle(&occlusion_cs),
875 		.pName = "main",
876 		.pSpecializationInfo = NULL,
877 	};
878 
879 	VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
880 		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
881 		.stage = occlusion_pipeline_shader_stage,
882 		.flags = 0,
883 		.layout = device->meta_state.query.p_layout,
884 	};
885 
886 	result = radv_CreateComputePipelines(radv_device_to_handle(device),
887 					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
888 					     1, &occlusion_vk_pipeline_info, NULL,
889 					     &device->meta_state.query.occlusion_query_pipeline);
890 	if (result != VK_SUCCESS)
891 		goto fail;
892 
893 	VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
894 		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
895 		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
896 		.module = radv_shader_module_to_handle(&pipeline_statistics_cs),
897 		.pName = "main",
898 		.pSpecializationInfo = NULL,
899 	};
900 
901 	VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
902 		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
903 		.stage = pipeline_statistics_pipeline_shader_stage,
904 		.flags = 0,
905 		.layout = device->meta_state.query.p_layout,
906 	};
907 
908 	result = radv_CreateComputePipelines(radv_device_to_handle(device),
909 					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
910 					     1, &pipeline_statistics_vk_pipeline_info, NULL,
911 					     &device->meta_state.query.pipeline_statistics_query_pipeline);
912 	if (result != VK_SUCCESS)
913 		goto fail;
914 
915 	VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
916 		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
917 		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
918 		.module = radv_shader_module_to_handle(&tfb_cs),
919 		.pName = "main",
920 		.pSpecializationInfo = NULL,
921 	};
922 
923 	VkComputePipelineCreateInfo tfb_pipeline_info = {
924 		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
925 		.stage = tfb_pipeline_shader_stage,
926 		.flags = 0,
927 		.layout = device->meta_state.query.p_layout,
928 	};
929 
930 	result = radv_CreateComputePipelines(radv_device_to_handle(device),
931 					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
932 					     1, &tfb_pipeline_info, NULL,
933 					     &device->meta_state.query.tfb_query_pipeline);
934 	if (result != VK_SUCCESS)
935 		goto fail;
936 
937 	VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
938 		.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
939 		.stage = VK_SHADER_STAGE_COMPUTE_BIT,
940 		.module = radv_shader_module_to_handle(&timestamp_cs),
941 		.pName = "main",
942 		.pSpecializationInfo = NULL,
943 	};
944 
945 	VkComputePipelineCreateInfo timestamp_pipeline_info = {
946 		.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
947 		.stage = timestamp_pipeline_shader_stage,
948 		.flags = 0,
949 		.layout = device->meta_state.query.p_layout,
950 	};
951 
952 	result = radv_CreateComputePipelines(radv_device_to_handle(device),
953 					     radv_pipeline_cache_to_handle(&device->meta_state.cache),
954 					     1, &timestamp_pipeline_info, NULL,
955 					     &device->meta_state.query.timestamp_query_pipeline);
956 
957 fail:
958 	if (result != VK_SUCCESS)
959 		radv_device_finish_meta_query_state(device);
960 	ralloc_free(occlusion_cs.nir);
961 	ralloc_free(pipeline_statistics_cs.nir);
962 	ralloc_free(tfb_cs.nir);
963 	ralloc_free(timestamp_cs.nir);
964 	mtx_unlock(&device->meta_state.mtx);
965 	return result;
966 }
967 
radv_device_init_meta_query_state(struct radv_device * device,bool on_demand)968 VkResult radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
969 {
970 	if (on_demand)
971 		return VK_SUCCESS;
972 
973 	return radv_device_init_meta_query_state_internal(device);
974 }
975 
radv_device_finish_meta_query_state(struct radv_device * device)976 void radv_device_finish_meta_query_state(struct radv_device *device)
977 {
978 	if (device->meta_state.query.tfb_query_pipeline)
979 		radv_DestroyPipeline(radv_device_to_handle(device),
980 				     device->meta_state.query.tfb_query_pipeline,
981 				     &device->meta_state.alloc);
982 
983 	if (device->meta_state.query.pipeline_statistics_query_pipeline)
984 		radv_DestroyPipeline(radv_device_to_handle(device),
985 				     device->meta_state.query.pipeline_statistics_query_pipeline,
986 				     &device->meta_state.alloc);
987 
988 	if (device->meta_state.query.occlusion_query_pipeline)
989 		radv_DestroyPipeline(radv_device_to_handle(device),
990 				     device->meta_state.query.occlusion_query_pipeline,
991 				     &device->meta_state.alloc);
992 
993 	if (device->meta_state.query.timestamp_query_pipeline)
994 		radv_DestroyPipeline(radv_device_to_handle(device),
995 				     device->meta_state.query.timestamp_query_pipeline,
996 				     &device->meta_state.alloc);
997 
998 	if (device->meta_state.query.p_layout)
999 		radv_DestroyPipelineLayout(radv_device_to_handle(device),
1000 					   device->meta_state.query.p_layout,
1001 					   &device->meta_state.alloc);
1002 
1003 	if (device->meta_state.query.ds_layout)
1004 		radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
1005 						device->meta_state.query.ds_layout,
1006 						&device->meta_state.alloc);
1007 }
1008 
/*
 * Record a compute dispatch that copies `count` query results from the
 * query pool BO into a destination BO using one of the query meta
 * pipelines.
 *
 * pipeline            - address of the meta pipeline handle to bind; if the
 *                       handle is still null the query meta state is built
 *                       first, and a failure there is stored in
 *                       cmd_buffer->record_result before returning.
 * src_bo/src_offset   - pool BO and byte offset of the first query.
 * dst_bo/dst_offset   - destination BO and byte offset of the first result.
 * src_stride          - bytes per query in the pool.
 * dst_stride          - bytes per result in the destination; also passed to
 *                       the shader as a push constant.
 * flags               - VkQueryResultFlags, passed to the shader.
 * pipeline_stats_mask - enabled statistics; masked to the low 11 bits, with
 *                       the number of set bits placed at bit 16 and above.
 * avail_offset        - byte offset of the availability data in the pool BO;
 *                       passed to the shader relative to src_offset.
 *
 * Compute pipeline, push constants and descriptors of the command buffer
 * are saved before and restored after the dispatch.
 */
static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
                              VkPipeline *pipeline,
                              struct radeon_winsys_bo *src_bo,
                              struct radeon_winsys_bo *dst_bo,
                              uint64_t src_offset, uint64_t dst_offset,
                              uint32_t src_stride, uint32_t dst_stride,
                              uint32_t count, uint32_t flags,
                              uint32_t pipeline_stats_mask, uint32_t avail_offset)
{
	struct radv_device *device = cmd_buffer->device;
	struct radv_meta_saved_state saved_state;

	/* Build the meta state lazily when the requested pipeline is missing. */
	if (!*pipeline) {
		VkResult ret = radv_device_init_meta_query_state_internal(device);

		if (ret != VK_SUCCESS) {
			cmd_buffer->record_result = ret;
			return;
		}
	}

	radv_meta_save(&saved_state, cmd_buffer,
		       RADV_META_SAVE_COMPUTE_PIPELINE |
		       RADV_META_SAVE_CONSTANTS |
		       RADV_META_SAVE_DESCRIPTORS);

	/* Per VK_EXT_conditional_rendering, copy commands must not be affected
	 * by conditional rendering, so predication is switched off for the
	 * duration of this dispatch.
	 */
	bool old_predicating = cmd_buffer->state.predicating;
	cmd_buffer->state.predicating = false;

	/* Temporary buffer objects wrapping the two BO ranges. */
	struct radv_buffer dst_buffer = {
		.bo = dst_bo,
		.offset = dst_offset,
		.size = dst_stride * count
	};

	struct radv_buffer src_buffer = {
		.bo = src_bo,
		.offset = src_offset,
		.size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset)
	};

	radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
			     VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);

	/* Binding 0 is the destination, binding 1 is the query pool. */
	VkDescriptorBufferInfo dst_info = {
		.buffer = radv_buffer_to_handle(&dst_buffer),
		.offset = 0,
		.range = VK_WHOLE_SIZE
	};
	VkDescriptorBufferInfo src_info = {
		.buffer = radv_buffer_to_handle(&src_buffer),
		.offset = 0,
		.range = VK_WHOLE_SIZE
	};
	VkWriteDescriptorSet writes[2] = {
		{
			.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
			.dstBinding = 0,
			.dstArrayElement = 0,
			.descriptorCount = 1,
			.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
			.pBufferInfo = &dst_info
		},
		{
			.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
			.dstBinding = 1,
			.dstArrayElement = 0,
			.descriptorCount = 1,
			.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
			.pBufferInfo = &src_info
		}
	};

	radv_meta_push_descriptor_set(cmd_buffer,
				      VK_PIPELINE_BIND_POINT_COMPUTE,
				      device->meta_state.query.p_layout,
				      0, /* set */
				      2, /* descriptorWriteCount */
				      writes);

	/* Pack the number of enabled statistics above the 11-bit mask so the
	 * shader can read it directly.
	 */
	uint32_t stats_mask = pipeline_stats_mask & 0x7ff;
	stats_mask |= util_bitcount(stats_mask) << 16;

	/* The shader addresses availability relative to the bound source range. */
	uint32_t rel_avail_offset = (uint32_t)(avail_offset - src_offset);

	struct {
		uint32_t flags;
		uint32_t dst_stride;
		uint32_t pipeline_stats_mask;
		uint32_t avail_offset;
	} push_constants = {
		.flags = flags,
		.dst_stride = dst_stride,
		.pipeline_stats_mask = stats_mask,
		.avail_offset = rel_avail_offset,
	};

	radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
				      device->meta_state.query.p_layout,
				      VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
				      &push_constants);

	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 |
	                                RADV_CMD_FLAG_INV_VCACHE;

	if (flags & VK_QUERY_RESULT_WAIT_BIT)
		cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;

	/* One invocation per query. */
	radv_unaligned_dispatch(cmd_buffer, count, 1, 1);

	/* Put conditional rendering back the way it was. */
	cmd_buffer->state.predicating = old_predicating;

	radv_meta_restore(&saved_state, cmd_buffer);
}
1124 
1125 static bool
radv_query_pool_needs_gds(struct radv_device * device,struct radv_query_pool * pool)1126 radv_query_pool_needs_gds(struct radv_device *device,
1127 			  struct radv_query_pool *pool)
1128 {
1129 	/* The number of primitives generated by geometry shader invocations is
1130 	 * only counted by the hardware if GS uses the legacy path. When NGG GS
1131 	 * is used, the hardware can't know the number of generated primitives
1132 	 * and we have to it manually inside the shader. To achieve that, the
1133 	 * driver does a plain GDS atomic to accumulate that value.
1134 	 * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
1135 	 * query.
1136 	 */
1137 	return device->physical_device->use_ngg &&
1138 	       (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
1139 }
1140 
1141 static void
radv_destroy_query_pool(struct radv_device * device,const VkAllocationCallbacks * pAllocator,struct radv_query_pool * pool)1142 radv_destroy_query_pool(struct radv_device *device,
1143 			const VkAllocationCallbacks *pAllocator,
1144 			struct radv_query_pool *pool)
1145 {
1146 	if (pool->bo)
1147 		device->ws->buffer_destroy(pool->bo);
1148 	vk_object_base_finish(&pool->base);
1149 	vk_free2(&device->vk.alloc, pAllocator, pool);
1150 }
1151 
radv_CreateQueryPool(VkDevice _device,const VkQueryPoolCreateInfo * pCreateInfo,const VkAllocationCallbacks * pAllocator,VkQueryPool * pQueryPool)1152 VkResult radv_CreateQueryPool(
1153 	VkDevice                                    _device,
1154 	const VkQueryPoolCreateInfo*                pCreateInfo,
1155 	const VkAllocationCallbacks*                pAllocator,
1156 	VkQueryPool*                                pQueryPool)
1157 {
1158 	RADV_FROM_HANDLE(radv_device, device, _device);
1159 	struct radv_query_pool *pool = vk_alloc2(&device->vk.alloc, pAllocator,
1160 					       sizeof(*pool), 8,
1161 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1162 
1163 	if (!pool)
1164 		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1165 
1166 	vk_object_base_init(&device->vk, &pool->base,
1167 			    VK_OBJECT_TYPE_QUERY_POOL);
1168 
1169 	switch(pCreateInfo->queryType) {
1170 	case VK_QUERY_TYPE_OCCLUSION:
1171 		pool->stride = 16 * device->physical_device->rad_info.num_render_backends;
1172 		break;
1173 	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1174 		pool->stride = pipelinestat_block_size * 2;
1175 		break;
1176 	case VK_QUERY_TYPE_TIMESTAMP:
1177 		pool->stride = 8;
1178 		break;
1179 	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1180 		pool->stride = 32;
1181 		break;
1182 	default:
1183 		unreachable("creating unhandled query type");
1184 	}
1185 
1186 	pool->type = pCreateInfo->queryType;
1187 	pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
1188 	pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1189 	pool->size = pool->availability_offset;
1190 	if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
1191 		pool->size += 4 * pCreateInfo->queryCount;
1192 
1193 	pool->bo = device->ws->buffer_create(device->ws, pool->size,
1194 					     64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING,
1195 					     RADV_BO_PRIORITY_QUERY_POOL);
1196 	if (!pool->bo) {
1197 		radv_destroy_query_pool(device, pAllocator, pool);
1198 		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1199 	}
1200 
1201 	pool->ptr = device->ws->buffer_map(pool->bo);
1202 	if (!pool->ptr) {
1203 		radv_destroy_query_pool(device, pAllocator, pool);
1204 		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1205 	}
1206 
1207 	*pQueryPool = radv_query_pool_to_handle(pool);
1208 	return VK_SUCCESS;
1209 }
1210 
/*
 * Destroy a query pool. A null pool handle is accepted and ignored.
 */
void radv_DestroyQueryPool(
	VkDevice                                    _device,
	VkQueryPool                                 _pool,
	const VkAllocationCallbacks*                pAllocator)
{
	RADV_FROM_HANDLE(radv_device, device, _device);
	RADV_FROM_HANDLE(radv_query_pool, pool, _pool);

	if (pool != NULL)
		radv_destroy_query_pool(device, pAllocator, pool);
}
1224 
radv_GetQueryPoolResults(VkDevice _device,VkQueryPool queryPool,uint32_t firstQuery,uint32_t queryCount,size_t dataSize,void * pData,VkDeviceSize stride,VkQueryResultFlags flags)1225 VkResult radv_GetQueryPoolResults(
1226 	VkDevice                                    _device,
1227 	VkQueryPool                                 queryPool,
1228 	uint32_t                                    firstQuery,
1229 	uint32_t                                    queryCount,
1230 	size_t                                      dataSize,
1231 	void*                                       pData,
1232 	VkDeviceSize                                stride,
1233 	VkQueryResultFlags                          flags)
1234 {
1235 	RADV_FROM_HANDLE(radv_device, device, _device);
1236 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1237 	char *data = pData;
1238 	VkResult result = VK_SUCCESS;
1239 
1240 	if (radv_device_is_lost(device))
1241 		return VK_ERROR_DEVICE_LOST;
1242 
1243 	for(unsigned i = 0; i < queryCount; ++i, data += stride) {
1244 		char *dest = data;
1245 		unsigned query = firstQuery + i;
1246 		char *src = pool->ptr + query * pool->stride;
1247 		uint32_t available;
1248 
1249 		switch (pool->type) {
1250 		case VK_QUERY_TYPE_TIMESTAMP: {
1251 			uint64_t const *src64 = (uint64_t const *)src;
1252 			uint64_t value;
1253 
1254 			do {
1255 				value = p_atomic_read(src64);
1256 			} while (value == TIMESTAMP_NOT_READY &&
1257 			         (flags & VK_QUERY_RESULT_WAIT_BIT));
1258 
1259 			available = value != TIMESTAMP_NOT_READY;
1260 
1261 			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1262 				result = VK_NOT_READY;
1263 
1264 			if (flags & VK_QUERY_RESULT_64_BIT) {
1265 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1266 					*(uint64_t*)dest = value;
1267 				dest += 8;
1268 			} else {
1269 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1270 					*(uint32_t*)dest = (uint32_t)value;
1271 				dest += 4;
1272 			}
1273 			break;
1274 		}
1275 		case VK_QUERY_TYPE_OCCLUSION: {
1276 			uint64_t const *src64 = (uint64_t const *)src;
1277 			uint32_t db_count = device->physical_device->rad_info.num_render_backends;
1278 			uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1279 			uint64_t sample_count = 0;
1280 			available = 1;
1281 
1282 			for (int i = 0; i < db_count; ++i) {
1283 				uint64_t start, end;
1284 
1285 				if (!(enabled_rb_mask & (1 << i)))
1286 					continue;
1287 
1288 				do {
1289 					start = p_atomic_read(src64 + 2 * i);
1290 					end = p_atomic_read(src64 + 2 * i + 1);
1291 				} while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
1292 
1293 				if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1294 					available = 0;
1295 				else {
1296 					sample_count += end - start;
1297 				}
1298 			}
1299 
1300 			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1301 				result = VK_NOT_READY;
1302 
1303 			if (flags & VK_QUERY_RESULT_64_BIT) {
1304 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1305 					*(uint64_t*)dest = sample_count;
1306 				dest += 8;
1307 			} else {
1308 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1309 					*(uint32_t*)dest = sample_count;
1310 				dest += 4;
1311 			}
1312 			break;
1313 		}
1314 		case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1315 			const uint32_t *avail_ptr = (const uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
1316 
1317 			do {
1318 				available = p_atomic_read(avail_ptr);
1319 			} while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
1320 
1321 			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1322 				result = VK_NOT_READY;
1323 
1324 			const uint64_t *start = (uint64_t*)src;
1325 			const uint64_t *stop = (uint64_t*)(src + pipelinestat_block_size);
1326 			if (flags & VK_QUERY_RESULT_64_BIT) {
1327 				uint64_t *dst = (uint64_t*)dest;
1328 				dest += util_bitcount(pool->pipeline_stats_mask) * 8;
1329 				for(int i = 0; i < 11; ++i) {
1330 					if(pool->pipeline_stats_mask & (1u << i)) {
1331 						if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1332 							*dst = stop[pipeline_statistics_indices[i]] -
1333 							       start[pipeline_statistics_indices[i]];
1334 						dst++;
1335 					}
1336 				}
1337 
1338 			} else {
1339 				uint32_t *dst = (uint32_t*)dest;
1340 				dest += util_bitcount(pool->pipeline_stats_mask) * 4;
1341 				for(int i = 0; i < 11; ++i) {
1342 					if(pool->pipeline_stats_mask & (1u << i)) {
1343 						if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1344 							*dst = stop[pipeline_statistics_indices[i]] -
1345 							       start[pipeline_statistics_indices[i]];
1346 						dst++;
1347 					}
1348 				}
1349 			}
1350 			break;
1351 		}
1352 		case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1353 			uint64_t const *src64 = (uint64_t const *)src;
1354 			uint64_t num_primitives_written;
1355 			uint64_t primitive_storage_needed;
1356 
1357 			/* SAMPLE_STREAMOUTSTATS stores this structure:
1358 			 * {
1359 			 *	u64 NumPrimitivesWritten;
1360 			 *	u64 PrimitiveStorageNeeded;
1361 			 * }
1362 			 */
1363 			available = 1;
1364 			for (int j = 0; j < 4; j++) {
1365 				if (!(p_atomic_read(src64 + j) & 0x8000000000000000UL))
1366 					available = 0;
1367 			}
1368 
1369 			if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1370 				result = VK_NOT_READY;
1371 
1372 			num_primitives_written = src64[3] - src64[1];
1373 			primitive_storage_needed = src64[2] - src64[0];
1374 
1375 			if (flags & VK_QUERY_RESULT_64_BIT) {
1376 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1377 					*(uint64_t *)dest = num_primitives_written;
1378 				dest += 8;
1379 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1380 					*(uint64_t *)dest = primitive_storage_needed;
1381 				dest += 8;
1382 			} else {
1383 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1384 					*(uint32_t *)dest = num_primitives_written;
1385 				dest += 4;
1386 				if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1387 					*(uint32_t *)dest = primitive_storage_needed;
1388 				dest += 4;
1389 			}
1390 			break;
1391 		}
1392 		default:
1393 			unreachable("trying to get results of unhandled query type");
1394 		}
1395 
1396 		if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1397 			if (flags & VK_QUERY_RESULT_64_BIT) {
1398 				*(uint64_t*)dest = available;
1399 			} else {
1400 				*(uint32_t*)dest = available;
1401 			}
1402 		}
1403 	}
1404 
1405 	return result;
1406 }
1407 
emit_query_flush(struct radv_cmd_buffer * cmd_buffer,struct radv_query_pool * pool)1408 static void emit_query_flush(struct radv_cmd_buffer *cmd_buffer,
1409 			     struct radv_query_pool *pool)
1410 {
1411 	if (cmd_buffer->pending_reset_query) {
1412 		if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1413 			/* Only need to flush caches if the query pool size is
1414 			 * large enough to be resetted using the compute shader
1415 			 * path. Small pools don't need any cache flushes
1416 			 * because we use a CP dma clear.
1417 			 */
1418 			si_emit_cache_flush(cmd_buffer);
1419 		}
1420 	}
1421 }
1422 
/*
 * Record a GPU copy of query results from a query pool into a buffer.
 *
 * Both BOs are added to the command stream. When VK_QUERY_RESULT_WAIT_BIT
 * is set, one CP memory wait per query (four per transform feedback query)
 * is emitted first so the copy only runs once the results have landed.
 * The copy itself is done by radv_query_shader() with the meta pipeline
 * matching the pool type.
 */
void radv_CmdCopyQueryPoolResults(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    dstBuffer,
    VkDeviceSize                                dstOffset,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
	RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	uint64_t va = radv_buffer_get_va(pool->bo);

	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);

	/* From the Vulkan spec 1.1.108:
	 *
	 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
	 *  previous uses of vkCmdResetQueryPool in the same queue, without any
	 *  additional synchronization."
	 *
	 * So, we have to flush the caches if the compute shader path was used.
	 */
	emit_query_flush(cmd_buffer, pool);

	switch (pool->type) {
	case VK_QUERY_TYPE_OCCLUSION:
		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
			for(unsigned i = 0; i < queryCount; ++i) {
				unsigned query = firstQuery + i;
				uint64_t src_va = va + query * pool->stride + pool->stride - 4;

				radeon_check_space(cmd_buffer->device->ws, cs, 7);

				/* Waits on the upper word of the last DB entry */
				radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
						 src_va, 0x80000000, 0xffffffff);
			}
		}
		radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
		                  pool->bo, dst_buffer->bo, firstQuery * pool->stride,
		                  dst_buffer->offset + dstOffset,
		                  pool->stride, stride,
		                  queryCount, flags, 0, 0);
		break;
	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
			for(unsigned i = 0; i < queryCount; ++i) {
				unsigned query = firstQuery + i;

				radeon_check_space(cmd_buffer->device->ws, cs, 7);

				uint64_t avail_va = va + pool->availability_offset + 4 * query;

				/* This waits on the ME. All copies below are done on the ME */
				radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL,
						 avail_va, 1, 0xffffffff);
			}
		}
		radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
		                  pool->bo, dst_buffer->bo, firstQuery * pool->stride,
		                  dst_buffer->offset + dstOffset,
		                  pool->stride, stride, queryCount, flags,
		                  pool->pipeline_stats_mask,
		                  pool->availability_offset + 4 * firstQuery);
		break;
	case VK_QUERY_TYPE_TIMESTAMP:
		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
			for(unsigned i = 0; i < queryCount; ++i) {
				unsigned query = firstQuery + i;
				uint64_t local_src_va = va  + query * pool->stride;

				radeon_check_space(cmd_buffer->device->ws, cs, 7);

				/* Wait on the high 32 bits of the timestamp in
				 * case the low part is 0xffffffff.
				 */
				radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL,
						 local_src_va + 4,
						 TIMESTAMP_NOT_READY >> 32,
						 0xffffffff);
			}
		}

		radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline,
		                  pool->bo, dst_buffer->bo,
				  firstQuery * pool->stride,
		                  dst_buffer->offset + dstOffset,
		                  pool->stride, stride,
				  queryCount, flags, 0, 0);
		break;
	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
		if (flags & VK_QUERY_RESULT_WAIT_BIT) {
			for(unsigned i = 0; i < queryCount; i++) {
				unsigned query = firstQuery + i;
				uint64_t src_va = va + query * pool->stride;

				radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);

				/* Wait on the upper word of all results. */
				for (unsigned j = 0; j < 4; j++, src_va += 8) {
					radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
							 src_va + 4, 0x80000000,
							 0xffffffff);
				}
			}
		}

		radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
		                  pool->bo, dst_buffer->bo,
				  firstQuery * pool->stride,
		                  dst_buffer->offset + dstOffset,
		                  pool->stride, stride,
				  queryCount, flags, 0, 0);
		break;
	default:
		unreachable("trying to get results of unhandled query type");
	}

}
1549 
/*
 * Record a reset of queries [firstQuery, firstQuery + queryCount) into the
 * given command buffer.
 *
 * The result slots of the range are filled with the "not ready" pattern for
 * timestamp pools and with zero for every other pool type. Pipeline
 * statistics pools additionally get their per-query availability dwords
 * cleared.
 */
void radv_CmdResetQueryPool(
	VkCommandBuffer                             commandBuffer,
	VkQueryPool                                 queryPool,
	uint32_t                                    firstQuery,
	uint32_t                                    queryCount)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
	const bool is_timestamp_pool = pool->type == VK_QUERY_TYPE_TIMESTAMP;
	const bool has_availability = pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS;
	uint32_t fill_value = 0;
	uint32_t fill_flush_bits;

	if (is_timestamp_pool)
		fill_value = (uint32_t)TIMESTAMP_NOT_READY;

	/* If this command buffer still has queries in flight, all previous
	 * work must be synchronized first; otherwise the GPU could write
	 * query data after the reset has happened.
	 */
	cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;

	/* Clear the result slots of the requested range. */
	fill_flush_bits = radv_fill_buffer(cmd_buffer, pool->bo,
					   firstQuery * pool->stride,
					   queryCount * pool->stride,
					   fill_value);

	/* Clear one availability dword per query. */
	if (has_availability) {
		fill_flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
						    pool->availability_offset + firstQuery * 4,
						    queryCount * 4, 0);
	}

	/* Nothing to flush unless the fill went through the compute shader
	 * path.
	 */
	if (!fill_flush_bits)
		return;

	cmd_buffer->pending_reset_query = true;
	cmd_buffer->state.flush_bits |= fill_flush_bits;
}
1584 
/*
 * Reset queries [firstQuery, firstQuery + queryCount) by writing directly
 * through pool->ptr.
 *
 * Every 32-bit word of the result slots is set to the "not ready" pattern
 * for timestamp pools and to zero otherwise. Pipeline statistics pools also
 * get their availability dwords zeroed.
 */
void radv_ResetQueryPool(
	VkDevice                                   _device,
	VkQueryPool                                 queryPool,
	uint32_t                                    firstQuery,
	uint32_t                                    queryCount)
{
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);

	uint32_t fill_word = 0;
	if (pool->type == VK_QUERY_TYPE_TIMESTAMP)
		fill_word = (uint32_t)TIMESTAMP_NOT_READY;

	uint32_t *cursor = (uint32_t*)(pool->ptr + firstQuery * pool->stride);
	uint32_t *const limit = (uint32_t*)(pool->ptr + (firstQuery + queryCount) * pool->stride);

	/* Word-wise fill of the whole result range. */
	while (cursor != limit)
		*cursor++ = fill_word;

	if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
		/* One availability dword per query. */
		memset(pool->ptr + pool->availability_offset + firstQuery * 4,
		       0, queryCount * 4);
	}
}
1606 
event_type_for_stream(unsigned stream)1607 static unsigned event_type_for_stream(unsigned stream)
1608 {
1609 	switch (stream) {
1610 	default:
1611 	case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
1612 	case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
1613 	case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
1614 	case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
1615 	}
1616 }
1617 
/*
 * Emit the command-stream packets that begin a query and update the
 * command buffer's query-tracking state.
 *
 * cmd_buffer: command buffer whose CS receives the packets.
 * pool:       query pool; only consulted to decide whether the extra GDS
 *             copy is needed for pipeline statistics queries.
 * va:         GPU address passed to the begin event.
 * query_type: type of query to begin; any type not handled below is
 *             unreachable.
 * flags:      only VK_QUERY_CONTROL_PRECISE_BIT is examined, and only for
 *             occlusion queries.
 * index:      stream index; only used for transform feedback queries.
 */
static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
			     struct radv_query_pool *pool,
			     uint64_t va,
			     VkQueryType query_type,
			     VkQueryControlFlags flags,
			     uint32_t index)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	switch (query_type) {
	case VK_QUERY_TYPE_OCCLUSION:
		radeon_check_space(cmd_buffer->device->ws, cs, 7);

		++cmd_buffer->state.active_occlusion_queries;
		if (cmd_buffer->state.active_occlusion_queries == 1) {
			if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
				/* This is the first occlusion query, enable
				 * the hint if the precision bit is set.
				 */
				cmd_buffer->state.perfect_occlusion_queries_enabled = true;
			}

			radv_set_db_count_control(cmd_buffer);
		} else {
			if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
			    !cmd_buffer->state.perfect_occlusion_queries_enabled) {
				/* This is not the first query, but this one
				 * needs to enable precision, DB_COUNT_CONTROL
				 * has to be updated accordingly.
				 */
				cmd_buffer->state.perfect_occlusion_queries_enabled = true;

				radv_set_db_count_control(cmd_buffer);
			}
		}

		/* ZPASS_DONE event carrying va as its 64-bit address. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		break;
	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
		/* NOTE(review): 4 dwords are reserved here, but the GDS path
		 * below emits 6 more dwords with radeon_emit — confirm that
		 * enough space is guaranteed at that point.
		 */
		radeon_check_space(cmd_buffer->device->ws, cs, 4);

		++cmd_buffer->state.active_pipeline_queries;
		if (cmd_buffer->state.active_pipeline_queries == 1) {
			/* First active pipeline statistics query: replace
			 * any pending STOP request with a START request.
			 */
			cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
		}

		/* SAMPLE_PIPELINESTAT event carrying va as its address. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);

		if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
			int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);

			/* Make sure GDS is idle before copying the value. */
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
							RADV_CMD_FLAG_INV_L2;
			si_emit_cache_flush(cmd_buffer);

			/* Advance va by idx 8-byte entries, i.e. to the entry
			 * of the geometry-shader-primitives statistic.
			 */
			va += 8 * idx;

			/* Copy from GDS to memory at va; the two zero dwords
			 * are presumably the source address (lo/hi) — confirm
			 * against the COPY_DATA packet layout.
			 */
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
					COPY_DATA_WR_CONFIRM);
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);

			/* Record that the command buffer needs GDS. */
			cmd_buffer->gds_needed = true;

			cmd_buffer->state.active_pipeline_gds_queries++;
		}
		break;
	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
		radeon_check_space(cmd_buffer->device->ws, cs, 4);

		assert(index < MAX_SO_STREAMS);

		/* Stream-specific streamout statistics event carrying va as
		 * its address.
		 */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		break;
	default:
		unreachable("beginning unhandled query type");
	}

}
1712 
/*
 * Emit the command-stream packets that end a query and update the command
 * buffer's query-tracking state.
 *
 * cmd_buffer: command buffer whose CS receives the packets.
 * pool:       query pool; only consulted to decide whether the extra GDS
 *             copy is needed for pipeline statistics queries.
 * va:         GPU address of the query's slot. The end event is addressed
 *             at an offset from it: +8 for occlusion,
 *             +pipelinestat_block_size for pipeline statistics, +16 for
 *             transform feedback.
 * avail_va:   GPU address of the availability dword; only used for
 *             pipeline statistics queries.
 * query_type: type of query to end; any type not handled below is
 *             unreachable.
 * index:      stream index; only used for transform feedback queries.
 *
 * After the type-specific packets, flush bits are accumulated into
 * cmd_buffer->active_query_flush_bits.
 */
static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
			   struct radv_query_pool *pool,
			   uint64_t va, uint64_t avail_va,
			   VkQueryType query_type, uint32_t index)
{
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	switch (query_type) {
	case VK_QUERY_TYPE_OCCLUSION:
		radeon_check_space(cmd_buffer->device->ws, cs, 14);

		cmd_buffer->state.active_occlusion_queries--;
		if (cmd_buffer->state.active_occlusion_queries == 0) {
			radv_set_db_count_control(cmd_buffer);

			/* Reset the perfect occlusion queries hint now that no
			 * queries are active.
			 */
			cmd_buffer->state.perfect_occlusion_queries_enabled = false;
		}

		/* ZPASS_DONE event addressed 8 bytes past the begin address. */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, va + 8);
		radeon_emit(cs, (va + 8) >> 32);

		break;
	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
		radeon_check_space(cmd_buffer->device->ws, cs, 16);

		cmd_buffer->state.active_pipeline_queries--;
		if (cmd_buffer->state.active_pipeline_queries == 0) {
			/* Last active pipeline statistics query: replace any
			 * pending START request with a STOP request.
			 */
			cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
		}
		/* The end sample goes one statistics block after the begin
		 * sample.
		 */
		va += pipelinestat_block_size;

		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);

		/* Bottom-of-pipe event that writes the 32-bit value 1 to
		 * avail_va.
		 */
		si_cs_emit_write_event_eop(cs,
					   cmd_buffer->device->physical_device->rad_info.chip_class,
					   radv_cmd_buffer_uses_mec(cmd_buffer),
					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
					   EOP_DST_SEL_MEM,
					   EOP_DATA_SEL_VALUE_32BIT,
					   avail_va, 1,
					   cmd_buffer->gfx9_eop_bug_va);

		if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
			int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);

			/* Make sure GDS is idle before copying the value. */
			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
							RADV_CMD_FLAG_INV_L2;
			si_emit_cache_flush(cmd_buffer);

			/* Advance va (already pointing at the end block) by
			 * idx 8-byte entries, i.e. to the entry of the
			 * geometry-shader-primitives statistic.
			 */
			va += 8 * idx;

			/* Copy from GDS to memory at va; the two zero dwords
			 * are presumably the source address (lo/hi) — confirm
			 * against the COPY_DATA packet layout.
			 */
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_GDS) |
					COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
					COPY_DATA_WR_CONFIRM);
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_emit(cs, va);
			radeon_emit(cs, va >> 32);

			cmd_buffer->state.active_pipeline_gds_queries--;
		}
		break;
	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
		radeon_check_space(cmd_buffer->device->ws, cs, 4);

		assert(index < MAX_SO_STREAMS);

		/* Stream-specific streamout statistics event addressed 16
		 * bytes past the begin address.
		 */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
		radeon_emit(cs, (va + 16));
		radeon_emit(cs, (va + 16) >> 32);
		break;
	default:
		unreachable("ending unhandled query type");
	}

	/* Accumulate the flushes associated with queries recorded in this
	 * command buffer; GFX9 and newer additionally get CB/DB
	 * flush-and-invalidate.
	 */
	cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
					       RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
					       RADV_CMD_FLAG_INV_L2 |
					       RADV_CMD_FLAG_INV_VCACHE;
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
						       RADV_CMD_FLAG_FLUSH_AND_INV_DB;
	}
}
1808 
/*
 * Begin query number `query` of the pool. `index` is forwarded to
 * emit_begin_query() together with the pool's query type and `flags`.
 */
void radv_CmdBeginQueryIndexedEXT(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
	const uint64_t pool_va = radv_buffer_get_va(pool->bo);

	/* Add the pool BO to the command stream's buffer list. */
	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);

	emit_query_flush(cmd_buffer, pool);

	/* Address of this query's slot inside the pool. */
	const uint64_t query_va = pool_va + pool->stride * query;

	emit_begin_query(cmd_buffer, pool, query_va, pool->type, flags, index);
}
1829 
/* Non-indexed begin: forwards to the indexed entry point with index 0. */
void radv_CmdBeginQuery(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
	const uint32_t default_index = 0;

	radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags,
				     default_index);
}
1838 
/*
 * End query number `query` of the pool. `index` is forwarded to
 * emit_end_query() together with the pool's query type.
 *
 * When the current subpass has a non-zero view mask, one extra query per
 * additional view is emitted as an empty begin/end pair in the following
 * consecutive slots.
 */
void radv_CmdEndQueryIndexedEXT(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
	uint64_t va = radv_buffer_get_va(pool->bo);
	/* The availability address is derived from the pool base address,
	 * so it is computed before va is advanced to the query's slot.
	 */
	uint64_t avail_va = va + pool->availability_offset + 4 * query;
	va += pool->stride * query;

	/* Do not need to add the pool BO to the list because the query must
	 * currently be active, which means the BO is already in the list.
	 */
	emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);

	/*
	 * For multiview we have to emit a query for each bit in the mask,
	 * however the first query we emit will get the totals for all the
	 * operations, so we don't want to get a real value in the other
	 * queries. This emits a fake begin/end sequence so the waiting
	 * code gets a completed query value and doesn't hang, but the
	 * query returns 0.
	 */
	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
		const unsigned view_count =
			util_bitcount(cmd_buffer->state.subpass->view_mask);

		/* Keep advancing the availability address computed above.
		 * Recomputing it from va here would be wrong, because va
		 * already includes the pool->stride * query slot offset.
		 */
		for (unsigned i = 1; i < view_count; i++) {
			va += pool->stride;
			avail_va += 4;
			emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
			emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
		}
	}
}
1876 
/* Non-indexed end: forwards to the indexed entry point with index 0. */
void radv_CmdEndQuery(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
	const uint32_t default_index = 0;

	radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query,
				   default_index);
}
1884 
/*
 * Record a timestamp write into slot `query` of the pool.
 *
 * For VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT the timestamp is written with a
 * COPY_DATA packet; every other stage uses a bottom-of-pipe end-of-pipe
 * event. When the current subpass has a non-zero view mask, one timestamp
 * is written per set bit, into consecutive slots.
 */
void radv_CmdWriteTimestamp(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
	const bool uses_mec = radv_cmd_buffer_uses_mec(cmd_buffer);
	struct radeon_cmdbuf *cs = cmd_buffer->cs;
	const uint64_t pool_va = radv_buffer_get_va(pool->bo);
	uint64_t dst_va = pool_va + pool->stride * query;

	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);

	emit_query_flush(cmd_buffer, pool);

	/* One timestamp per view when multiview is active, otherwise one. */
	unsigned write_count = 1;
	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
		write_count = util_bitcount(cmd_buffer->state.subpass->view_mask);

	ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * write_count);

	const bool top_of_pipe = pipelineStage == VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

	for (unsigned remaining = write_count; remaining > 0; remaining--) {
		if (top_of_pipe) {
			/* Copy the timestamp straight to memory. */
			radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
			radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
				    COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
				    COPY_DATA_DST_SEL(V_370_MEM));
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_emit(cs, dst_va);
			radeon_emit(cs, dst_va >> 32);
		} else {
			/* Timestamp written by a bottom-of-pipe event. */
			si_cs_emit_write_event_eop(cs,
						   cmd_buffer->device->physical_device->rad_info.chip_class,
						   uses_mec,
						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
						   EOP_DST_SEL_MEM,
						   EOP_DATA_SEL_TIMESTAMP,
						   dst_va, 0,
						   cmd_buffer->gfx9_eop_bug_va);
		}

		dst_va += pool->stride;
	}

	/* Accumulate the flushes associated with queries recorded in this
	 * command buffer; GFX9 and newer additionally get CB/DB
	 * flush-and-invalidate.
	 */
	uint32_t query_flush_bits = RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
				    RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
				    RADV_CMD_FLAG_INV_L2 |
				    RADV_CMD_FLAG_INV_VCACHE;
	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
		query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
				    RADV_CMD_FLAG_FLUSH_AND_INV_DB;
	}
	cmd_buffer->active_query_flush_bits |= query_flush_bits;

	assert(cmd_buffer->cs->cdw <= cdw_max);
}
1945