1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <inttypes.h>
25 
26 #include "radv_private.h"
27 #include "radv_cs.h"
28 #include "sid.h"
29 
/* The SQTT buffer address and size are right-shifted by this amount before
 * being written to the hardware registers, so both must be aligned to
 * (1 << SQTT_BUFFER_ALIGN_SHIFT) bytes. */
#define SQTT_BUFFER_ALIGN_SHIFT 12
31 
32 static uint64_t
radv_thread_trace_get_info_offset(unsigned se)33 radv_thread_trace_get_info_offset(unsigned se)
34 {
35 	return sizeof(struct radv_thread_trace_info) * se;
36 }
37 
38 static uint64_t
radv_thread_trace_get_data_offset(struct radv_device * device,unsigned se)39 radv_thread_trace_get_data_offset(struct radv_device *device, unsigned se)
40 {
41 	uint64_t data_offset;
42 
43 	data_offset = align64(sizeof(struct radv_thread_trace_info) * 4,
44 			      1 << SQTT_BUFFER_ALIGN_SHIFT);
45 	data_offset += device->thread_trace_buffer_size * se;
46 
47 	return data_offset;
48 }
49 
50 static uint64_t
radv_thread_trace_get_info_va(struct radv_device * device,unsigned se)51 radv_thread_trace_get_info_va(struct radv_device *device, unsigned se)
52 {
53 	uint64_t va = radv_buffer_get_va(device->thread_trace_bo);
54 	return va + radv_thread_trace_get_info_offset(se);
55 }
56 
57 static uint64_t
radv_thread_trace_get_data_va(struct radv_device * device,unsigned se)58 radv_thread_trace_get_data_va(struct radv_device *device, unsigned se)
59 {
60 	uint64_t va = radv_buffer_get_va(device->thread_trace_bo);
61 	return va + radv_thread_trace_get_data_offset(device, se);
62 }
63 
64 static void
radv_emit_thread_trace_start(struct radv_device * device,struct radeon_cmdbuf * cs,uint32_t queue_family_index)65 radv_emit_thread_trace_start(struct radv_device *device,
66 			     struct radeon_cmdbuf *cs,
67 			     uint32_t queue_family_index)
68 {
69 	uint32_t shifted_size = device->thread_trace_buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
70 	unsigned max_se = device->physical_device->rad_info.max_se;
71 
72 	assert(device->physical_device->rad_info.chip_class >= GFX8);
73 
74 	for (unsigned se = 0; se < max_se; se++) {
75 		uint64_t data_va = radv_thread_trace_get_data_va(device, se);
76 		uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
77 
78 		/* Target SEx and SH0. */
79 		radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
80 				       S_030800_SE_INDEX(se) |
81 				       S_030800_SH_INDEX(0) |
82 				       S_030800_INSTANCE_BROADCAST_WRITES(1));
83 
84 		if (device->physical_device->rad_info.chip_class == GFX10) {
85 			/* Order seems important for the following 2 registers. */
86 			radeon_set_privileged_config_reg(cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
87 							 S_008D04_SIZE(shifted_size) |
88 							 S_008D04_BASE_HI(shifted_va >> 32));
89 
90 			radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE,
91 							 S_008D00_BASE_LO(shifted_va));
92 
93 			radeon_set_privileged_config_reg(cs, R_008D14_SQ_THREAD_TRACE_MASK,
94 							 S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
95 							 S_008D14_SA_SEL(0) |
96 							 S_008D14_WGP_SEL(0) |
97 							 S_008D14_SIMD_SEL(0));
98 
99 			radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
100 							 S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |
101 									      V_008D18_REG_INCLUDE_SHDEC |
102 									      V_008D18_REG_INCLUDE_GFXUDEC |
103 									      V_008D18_REG_INCLUDE_CONTEXT |
104 									      V_008D18_REG_INCLUDE_COMP |
105 									      V_008D18_REG_INCLUDE_CONTEXT |
106 									      V_008D18_REG_INCLUDE_CONFIG) |
107 							 S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));
108 
109 			/* Should be emitted last (it enables thread traces). */
110 			radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
111 							 S_008D1C_MODE(1) |
112 							 S_008D1C_HIWATER(5) |
113 							 S_008D1C_UTIL_TIMER(1) |
114 							 S_008D1C_RT_FREQ(2) | /* 4096 clk */
115 							 S_008D1C_DRAW_EVENT_EN(1) |
116 							 S_008D1C_REG_STALL_EN(1) |
117 							 S_008D1C_SPI_STALL_EN(1) |
118 							 S_008D1C_SQ_STALL_EN(1) |
119 							 S_008D1C_REG_DROP_ON_STALL(0));
120 		} else {
121 			/* Order seems important for the following 4 registers. */
122 			radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2,
123 					       S_030CDC_ADDR_HI(shifted_va >> 32));
124 
125 			radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE,
126 					       S_030CC0_ADDR(shifted_va));
127 
128 			radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE,
129 					       S_030CC4_SIZE(shifted_size));
130 
131 			radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL,
132 					       S_030CD4_RESET_BUFFER(1));
133 
134 			uint32_t thread_trace_mask = S_030CC8_CU_SEL(2) |
135 						     S_030CC8_SH_SEL(0) |
136 						     S_030CC8_SIMD_EN(0xf) |
137 						     S_030CC8_VM_ID_MASK(0) |
138 						     S_030CC8_REG_STALL_EN(1) |
139 						     S_030CC8_SPI_STALL_EN(1) |
140 						     S_030CC8_SQ_STALL_EN(1);
141 
142 			if (device->physical_device->rad_info.chip_class < GFX9) {
143 				thread_trace_mask |= S_030CC8_RANDOM_SEED(0xffff);
144 			}
145 
146 			radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK,
147 					       thread_trace_mask);
148 
149 			/* Trace all tokens and registers. */
150 			radeon_set_uconfig_reg(cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
151 					       S_030CCC_TOKEN_MASK(0xbfff) |
152 					       S_030CCC_REG_MASK(0xff) |
153 					       S_030CCC_REG_DROP_ON_STALL(0));
154 
155 			/* Enable SQTT perf counters for all CUs. */
156 			radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
157 					       S_030CD0_SH0_MASK(0xffff) |
158 					       S_030CD0_SH1_MASK(0xffff));
159 
160 			radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2,
161 					       S_030CE0_INST_MASK(0xffffffff));
162 
163 			radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER,
164 					       S_030CEC_HIWATER(4));
165 
166 			if (device->physical_device->rad_info.chip_class == GFX9) {
167 				/* Reset thread trace status errors. */
168 				radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS,
169 						       S_030CE8_UTC_ERROR(0));
170 			}
171 
172 			/* Enable the thread trace mode. */
173 			uint32_t thread_trace_mode = S_030CD8_MASK_PS(1) |
174 						     S_030CD8_MASK_VS(1) |
175 						     S_030CD8_MASK_GS(1) |
176 						     S_030CD8_MASK_ES(1) |
177 						     S_030CD8_MASK_HS(1) |
178 						     S_030CD8_MASK_LS(1) |
179 						     S_030CD8_MASK_CS(1) |
180 						     S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
181 						     S_030CD8_MODE(1);
182 
183 			if (device->physical_device->rad_info.chip_class == GFX9) {
184 				/* Count SQTT traffic in TCC perf counters. */
185 				thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
186 			}
187 
188 			radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE,
189 					       thread_trace_mode);
190 		}
191 	}
192 
193 	/* Restore global broadcasting. */
194 	radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
195 		               S_030800_SE_BROADCAST_WRITES(1) |
196 			       S_030800_SH_BROADCAST_WRITES(1) |
197 			       S_030800_INSTANCE_BROADCAST_WRITES(1));
198 
199 	/* Start the thread trace with a different event based on the queue. */
200 	if (queue_family_index == RADV_QUEUE_COMPUTE &&
201 	    device->physical_device->rad_info.chip_class >= GFX7) {
202 		radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
203 				  S_00B878_THREAD_TRACE_ENABLE(1));
204 	} else {
205 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
206 		radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
207 	}
208 }
209 
210 static const uint32_t gfx8_thread_trace_info_regs[] =
211 {
212 	R_030CE4_SQ_THREAD_TRACE_WPTR,
213 	R_030CE8_SQ_THREAD_TRACE_STATUS,
214 	R_008E40_SQ_THREAD_TRACE_CNTR,
215 };
216 
217 static const uint32_t gfx9_thread_trace_info_regs[] =
218 {
219 	R_030CE4_SQ_THREAD_TRACE_WPTR,
220 	R_030CE8_SQ_THREAD_TRACE_STATUS,
221 	R_030CF0_SQ_THREAD_TRACE_CNTR,
222 };
223 
224 static const uint32_t gfx10_thread_trace_info_regs[] =
225 {
226 	R_008D10_SQ_THREAD_TRACE_WPTR,
227 	R_008D20_SQ_THREAD_TRACE_STATUS,
228 	R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
229 };
230 
231 static void
radv_copy_thread_trace_info_regs(struct radv_device * device,struct radeon_cmdbuf * cs,unsigned se_index)232 radv_copy_thread_trace_info_regs(struct radv_device *device,
233 				 struct radeon_cmdbuf *cs,
234 				 unsigned se_index)
235 {
236 	const uint32_t *thread_trace_info_regs = NULL;
237 
238 	switch (device->physical_device->rad_info.chip_class) {
239 	case GFX10:
240 		thread_trace_info_regs = gfx10_thread_trace_info_regs;
241 		break;
242 	case GFX9:
243 		thread_trace_info_regs = gfx9_thread_trace_info_regs;
244 		break;
245 	case GFX8:
246 		thread_trace_info_regs = gfx8_thread_trace_info_regs;
247 		break;
248 	default:
249 		unreachable("Unsupported chip_class");
250 	}
251 
252 	/* Get the VA where the info struct is stored for this SE. */
253 	uint64_t info_va = radv_thread_trace_get_info_va(device, se_index);
254 
255 	/* Copy back the info struct one DWORD at a time. */
256 	for (unsigned i = 0; i < 3; i++) {
257 		radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
258 		radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
259 				COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
260 				COPY_DATA_WR_CONFIRM);
261 		radeon_emit(cs, thread_trace_info_regs[i] >> 2);
262 		radeon_emit(cs, 0); /* unused */
263 		radeon_emit(cs, (info_va + i * 4));
264 		radeon_emit(cs, (info_va + i * 4) >> 32);
265 	}
266 }
267 
268 static void
radv_emit_thread_trace_stop(struct radv_device * device,struct radeon_cmdbuf * cs,uint32_t queue_family_index)269 radv_emit_thread_trace_stop(struct radv_device *device,
270 			    struct radeon_cmdbuf *cs,
271 			    uint32_t queue_family_index)
272 {
273 	unsigned max_se = device->physical_device->rad_info.max_se;
274 
275 	assert(device->physical_device->rad_info.chip_class >= GFX8);
276 
277 	/* Stop the thread trace with a different event based on the queue. */
278 	if (queue_family_index == RADV_QUEUE_COMPUTE &&
279 	    device->physical_device->rad_info.chip_class >= GFX7) {
280 		radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
281 				  S_00B878_THREAD_TRACE_ENABLE(0));
282 	} else {
283 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
284 		radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
285 	}
286 
287 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
288 	radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
289 
290 	for (unsigned se = 0; se < max_se; se++) {
291 		/* Target SEi and SH0. */
292 		radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
293 				       S_030800_SE_INDEX(se) |
294 				       S_030800_SH_INDEX(0) |
295 				       S_030800_INSTANCE_BROADCAST_WRITES(1));
296 
297 		if (device->physical_device->rad_info.chip_class == GFX10) {
298 			/* Make sure to wait for the trace buffer. */
299 			radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
300 			radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */
301 			radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2);  /* register */
302 			radeon_emit(cs, 0);
303 			radeon_emit(cs, 0); /* reference value */
304 			radeon_emit(cs, S_008D20_FINISH_DONE(1)); /* mask */
305 			radeon_emit(cs, 4); /* poll interval */
306 
307 			/* Disable the thread trace mode. */
308 			radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
309 							 S_008D1C_MODE(0));
310 
311 			/* Wait for thread trace completion. */
312 			radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
313 			radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
314 			radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2);  /* register */
315 			radeon_emit(cs, 0);
316 			radeon_emit(cs, 0); /* reference value */
317 			radeon_emit(cs, S_008D20_BUSY(1)); /* mask */
318 			radeon_emit(cs, 4); /* poll interval */
319 		} else {
320 			/* Disable the thread trace mode. */
321 			radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE,
322 					       S_030CD8_MODE(0));
323 
324 			/* Wait for thread trace completion. */
325 			radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
326 			radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
327 			radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2);  /* register */
328 			radeon_emit(cs, 0);
329 			radeon_emit(cs, 0); /* reference value */
330 			radeon_emit(cs, S_030CE8_BUSY(1)); /* mask */
331 			radeon_emit(cs, 4); /* poll interval */
332 		}
333 
334 		radv_copy_thread_trace_info_regs(device, cs, se);
335 	}
336 
337 	/* Restore global broadcasting. */
338 	radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
339 		               S_030800_SE_BROADCAST_WRITES(1) |
340 			       S_030800_SH_BROADCAST_WRITES(1) |
341 			       S_030800_INSTANCE_BROADCAST_WRITES(1));
342 }
343 
344 void
radv_emit_thread_trace_userdata(const struct radv_device * device,struct radeon_cmdbuf * cs,const void * data,uint32_t num_dwords)345 radv_emit_thread_trace_userdata(const struct radv_device *device,
346 				struct radeon_cmdbuf *cs,
347 				const void *data, uint32_t num_dwords)
348 {
349 	const uint32_t *dwords = (uint32_t *)data;
350 
351 	while (num_dwords > 0) {
352 		uint32_t count = MIN2(num_dwords, 2);
353 
354 		/* Without the perfctr bit the CP might not always pass the
355 		 * write on correctly. */
356 		if (device->physical_device->rad_info.chip_class >= GFX10)
357 			radeon_set_uconfig_reg_seq_perfctr(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
358 		else
359 			radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
360 		radeon_emit_array(cs, dwords, count);
361 
362 		dwords += count;
363 		num_dwords -= count;
364 	}
365 }
366 
367 static void
radv_emit_spi_config_cntl(struct radv_device * device,struct radeon_cmdbuf * cs,bool enable)368 radv_emit_spi_config_cntl(struct radv_device *device,
369 			  struct radeon_cmdbuf *cs, bool enable)
370 {
371 	if (device->physical_device->rad_info.chip_class >= GFX9) {
372 		uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
373 					   S_031100_EXP_PRIORITY_ORDER(3) |
374 					   S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
375 					   S_031100_ENABLE_SQG_BOP_EVENTS(enable);
376 
377 		if (device->physical_device->rad_info.chip_class == GFX10)
378 			spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);
379 
380 		radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
381 	} else {
382 		/* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
383 		radeon_set_privileged_config_reg(cs, R_009100_SPI_CONFIG_CNTL,
384 						 S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
385 						 S_009100_ENABLE_SQG_BOP_EVENTS(enable));
386 	}
387 }
388 
389 static void
radv_emit_wait_for_idle(struct radv_device * device,struct radeon_cmdbuf * cs,int family)390 radv_emit_wait_for_idle(struct radv_device *device,
391 			struct radeon_cmdbuf *cs, int family)
392 {
393 	enum rgp_flush_bits sqtt_flush_bits = 0;
394 	si_cs_emit_cache_flush(cs, device->physical_device->rad_info.chip_class,
395 			       NULL, 0,
396 			       family == RING_COMPUTE &&
397 			       device->physical_device->rad_info.chip_class >= GFX7,
398 			       (family == RADV_QUEUE_COMPUTE ?
399 				RADV_CMD_FLAG_CS_PARTIAL_FLUSH :
400 				(RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
401 			       RADV_CMD_FLAG_INV_ICACHE |
402 			       RADV_CMD_FLAG_INV_SCACHE |
403 			       RADV_CMD_FLAG_INV_VCACHE |
404 			       RADV_CMD_FLAG_INV_L2, &sqtt_flush_bits, 0);
405 }
406 
407 static void
radv_thread_trace_init_cs(struct radv_device * device)408 radv_thread_trace_init_cs(struct radv_device *device)
409 {
410 	struct radeon_winsys *ws = device->ws;
411 	VkResult result;
412 
413 	/* Thread trace start CS. */
414 	for (int family = 0; family < 2; ++family) {
415 		device->thread_trace_start_cs[family] = ws->cs_create(ws, family);
416 		if (!device->thread_trace_start_cs[family])
417 			return;
418 
419 		switch (family) {
420 		case RADV_QUEUE_GENERAL:
421 			radeon_emit(device->thread_trace_start_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
422 			radeon_emit(device->thread_trace_start_cs[family], CC0_UPDATE_LOAD_ENABLES(1));
423 			radeon_emit(device->thread_trace_start_cs[family], CC1_UPDATE_SHADOW_ENABLES(1));
424 			break;
425 		case RADV_QUEUE_COMPUTE:
426 			radeon_emit(device->thread_trace_start_cs[family], PKT3(PKT3_NOP, 0, 0));
427 			radeon_emit(device->thread_trace_start_cs[family], 0);
428 			break;
429 		}
430 
431 		radv_cs_add_buffer(ws, device->thread_trace_start_cs[family],
432 				   device->thread_trace_bo);
433 
434 		/* Make sure to wait-for-idle before starting SQTT. */
435 		radv_emit_wait_for_idle(device,
436 					device->thread_trace_start_cs[family],
437 					family);
438 
439 		/* Enable SQG events that collects thread trace data. */
440 		radv_emit_spi_config_cntl(device,
441 					  device->thread_trace_start_cs[family],
442 					  true);
443 
444 		radv_emit_thread_trace_start(device,
445 					     device->thread_trace_start_cs[family],
446 					     family);
447 
448 		result = ws->cs_finalize(device->thread_trace_start_cs[family]);
449 		if (result != VK_SUCCESS)
450 			return;
451 	}
452 
453 	/* Thread trace stop CS. */
454 	for (int family = 0; family < 2; ++family) {
455 		device->thread_trace_stop_cs[family] = ws->cs_create(ws, family);
456 		if (!device->thread_trace_stop_cs[family])
457 			return;
458 
459 		switch (family) {
460 		case RADV_QUEUE_GENERAL:
461 			radeon_emit(device->thread_trace_stop_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
462 			radeon_emit(device->thread_trace_stop_cs[family], CC0_UPDATE_LOAD_ENABLES(1));
463 			radeon_emit(device->thread_trace_stop_cs[family], CC1_UPDATE_SHADOW_ENABLES(1));
464 			break;
465 		case RADV_QUEUE_COMPUTE:
466 			radeon_emit(device->thread_trace_stop_cs[family], PKT3(PKT3_NOP, 0, 0));
467 			radeon_emit(device->thread_trace_stop_cs[family], 0);
468 			break;
469 		}
470 
471 		radv_cs_add_buffer(ws, device->thread_trace_stop_cs[family],
472 				   device->thread_trace_bo);
473 
474 		/* Make sure to wait-for-idle before stopping SQTT. */
475 		radv_emit_wait_for_idle(device,
476 					device->thread_trace_stop_cs[family],
477 					family);
478 
479 		radv_emit_thread_trace_stop(device,
480 					    device->thread_trace_stop_cs[family],
481 					    family);
482 
483 		/* Restore previous state by disabling SQG events. */
484 		radv_emit_spi_config_cntl(device,
485 					  device->thread_trace_stop_cs[family],
486 					  false);
487 
488 		result = ws->cs_finalize(device->thread_trace_stop_cs[family]);
489 		if (result != VK_SUCCESS)
490 			return;
491 	}
492 }
493 
494 static bool
radv_thread_trace_init_bo(struct radv_device * device)495 radv_thread_trace_init_bo(struct radv_device *device)
496 {
497 	struct radeon_winsys *ws = device->ws;
498 	uint64_t size;
499 
500 	/* The buffer size and address need to be aligned in HW regs. Align the
501 	 * size as early as possible so that we do all the allocation & addressing
502 	 * correctly. */
503 	device->thread_trace_buffer_size = align64(device->thread_trace_buffer_size,
504 	                                           1u << SQTT_BUFFER_ALIGN_SHIFT);
505 
506 	/* Compute total size of the thread trace BO for 4 SEs. */
507 	size = align64(sizeof(struct radv_thread_trace_info) * 4,
508 		       1 << SQTT_BUFFER_ALIGN_SHIFT);
509 	size += device->thread_trace_buffer_size * 4;
510 
511 	device->thread_trace_bo = ws->buffer_create(ws, size, 4096,
512 						    RADEON_DOMAIN_VRAM,
513 						    RADEON_FLAG_CPU_ACCESS |
514 						    RADEON_FLAG_NO_INTERPROCESS_SHARING |
515 						    RADEON_FLAG_ZERO_VRAM,
516 						    RADV_BO_PRIORITY_SCRATCH);
517 	if (!device->thread_trace_bo)
518 		return false;
519 
520 	device->thread_trace_ptr = ws->buffer_map(device->thread_trace_bo);
521 	if (!device->thread_trace_ptr)
522 		return false;
523 
524 	return true;
525 }
526 
527 bool
radv_thread_trace_init(struct radv_device * device)528 radv_thread_trace_init(struct radv_device *device)
529 {
530 	if (!radv_thread_trace_init_bo(device))
531 		return false;
532 
533 	radv_thread_trace_init_cs(device);
534 	return true;
535 }
536 
537 void
radv_thread_trace_finish(struct radv_device * device)538 radv_thread_trace_finish(struct radv_device *device)
539 {
540 	struct radeon_winsys *ws = device->ws;
541 
542 	if (unlikely(device->thread_trace_bo))
543 		ws->buffer_destroy(device->thread_trace_bo);
544 
545 	for (unsigned i = 0; i < 2; i++) {
546 		if (device->thread_trace_start_cs[i])
547 			ws->cs_destroy(device->thread_trace_start_cs[i]);
548 		if (device->thread_trace_stop_cs[i])
549 			ws->cs_destroy(device->thread_trace_stop_cs[i]);
550 	}
551 }
552 
553 bool
radv_begin_thread_trace(struct radv_queue * queue)554 radv_begin_thread_trace(struct radv_queue *queue)
555 {
556 	int family = queue->queue_family_index;
557 	struct radeon_cmdbuf *cs = queue->device->thread_trace_start_cs[family];
558 	return radv_queue_internal_submit(queue, cs);
559 }
560 
561 bool
radv_end_thread_trace(struct radv_queue * queue)562 radv_end_thread_trace(struct radv_queue *queue)
563 {
564 	int family = queue->queue_family_index;
565 	struct radeon_cmdbuf *cs = queue->device->thread_trace_stop_cs[family];
566 	return radv_queue_internal_submit(queue, cs);
567 }
568 
569 static bool
radv_is_thread_trace_complete(struct radv_device * device,const struct radv_thread_trace_info * info)570 radv_is_thread_trace_complete(struct radv_device *device,
571 			      const struct radv_thread_trace_info *info)
572 {
573 	if (device->physical_device->rad_info.chip_class == GFX10) {
574 		/* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the
575 		 * number of dropped bytes for all SEs via
576 		 * THREAD_TRACE_DROPPED_CNTR.
577 		 */
578 		return info->gfx10_dropped_cntr == 0;
579 	}
580 
581 	/* Otherwise, compare the current thread trace offset with the number
582 	 * of written bytes.
583 	 */
584 	return info->cur_offset == info->gfx9_write_counter;
585 }
586 
587 static uint32_t
radv_get_expected_buffer_size(struct radv_device * device,const struct radv_thread_trace_info * info)588 radv_get_expected_buffer_size(struct radv_device *device,
589 			      const struct radv_thread_trace_info *info)
590 {
591 	if (device->physical_device->rad_info.chip_class == GFX10) {
592 		uint32_t dropped_cntr_per_se = info->gfx10_dropped_cntr / device->physical_device->rad_info.max_se;
593 		return ((info->cur_offset * 32) + dropped_cntr_per_se) / 1024;
594 	}
595 
596 	return (info->gfx9_write_counter * 32) / 1024;
597 }
598 
599 bool
radv_get_thread_trace(struct radv_queue * queue,struct radv_thread_trace * thread_trace)600 radv_get_thread_trace(struct radv_queue *queue,
601 		      struct radv_thread_trace *thread_trace)
602 {
603 	struct radv_device *device = queue->device;
604 	unsigned max_se = device->physical_device->rad_info.max_se;
605 	void *thread_trace_ptr = device->thread_trace_ptr;
606 
607 	memset(thread_trace, 0, sizeof(*thread_trace));
608 	thread_trace->num_traces = max_se;
609 
610 	for (unsigned se = 0; se < max_se; se++) {
611 		uint64_t info_offset = radv_thread_trace_get_info_offset(se);
612 		uint64_t data_offset = radv_thread_trace_get_data_offset(device, se);
613 		void *info_ptr = thread_trace_ptr + info_offset;
614 		void *data_ptr = thread_trace_ptr + data_offset;
615 		struct radv_thread_trace_info *info =
616 			(struct radv_thread_trace_info *)info_ptr;
617 		struct radv_thread_trace_se thread_trace_se = {0};
618 
619 		if (!radv_is_thread_trace_complete(device, info)) {
620 			uint32_t expected_size =
621 				radv_get_expected_buffer_size(device, info);
622 			uint32_t available_size =
623 				(info->cur_offset * 32) / 1024;
624 
625 			fprintf(stderr, "Failed to get the thread trace "
626 					"because the buffer is too small. The "
627 					"hardware needs %d KB but the "
628 					"buffer size is %d KB.\n",
629 					expected_size, available_size);
630 			fprintf(stderr, "Please update the buffer size with "
631 					"RADV_THREAD_TRACE_BUFFER_SIZE=<size_in_bytes>\n");
632 			return false;
633 		}
634 
635 		thread_trace_se.data_ptr = data_ptr;
636 		thread_trace_se.info = *info;
637 		thread_trace_se.shader_engine = se;
638 		thread_trace_se.compute_unit = 0;
639 
640 		thread_trace->traces[se] = thread_trace_se;
641 	}
642 
643 	return true;
644 }
645